%matplotlib inline
%load_ext autoreload
   
import os
import requests
from IPython.display import display, Image, FileLink
#from jupyter_datatables import init_datatables_mode
#from ipydatagrid import DataGrid
from itables import init_notebook_mode
#init_notebook_mode(all_interactive=True)
init_notebook_mode(connected=True)
import itables.options as itablesopt
# display table options
#itablesopt.lengthMenu = [2, 5, 10, 20, 50, 100, 200, 500]
#itablesopt.maxBytes = 10000 , 0 to disable
#itablesopt.maxRows
#itablesopt.maxColumns

#%load_ext jupyter_require
#%requirejs d3 https://d3js.org/d3.v5.min
#init_datatables_mode()

%autoreload 2
import omics_api
from omics_api import *

# Flags to emable example
others=True
PLOT_PHEN=others
WGCNA=others
GWAS=others
MATRIXEQT=others
VERBOSE=True
MERGE=others
REQUERY_URL=False

CHECK_BE=True  # check and correct batch effect

# imputation strategy from https://doi.org/10.1038/s41598-023-30084-2
IMPUTE_M1=True # global mean
IMPUTE_M2=True # batch mean


DEBUGGNG=False

# strict setting
#maxp=1e-5

# threshold settings
maxpgwas=1e-20
maxpeqtl=1e-5
maxpwgcna=1e-5
mincorwgcna=0.7
cortopnwgcna=None
gstopnwgcna=None

top_dfgwas=[]
top_dfwgcna=[]
top_dfeqtl=[]
myunit='percent_of_dw'
#SHOW_ALL
#SHOW_DEBUG
#SHOW_ERRORONLY



if DEBUGGNG:
    setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
else:
    setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
#setVariables(loglevel=getSetting('SHOW_ERRORONLY'))

data = requests.get('https://icgrc.info/api/user/phenotype/dataset').json()
print('Phenotype datasets(phends) options')
display_table(data,columns=['phen_dataset','unit_name','samples_count','trait_count'])

Phenotype datasets(phends) options

#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1"]
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
#label_url=['Booth2020','Zager2019','GloerfeltTarp2023','BGZphenotypes']
api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
label_url=['Booth2020','Zager2019','BZphenotypes']
label2color=dict()
label2color['Azman2023']='red'
label2color['Booth2020']='blue'
label2color['GloerfeltTarp2023']='green'
label2color['Zager2019']='orange'
label2color['BZphenotypes']='yellow'

keep_unit='percent_of_dw'
#keep_unit='ug_per_gdw'
n_bins=10

icnt=0
dfs=[]
dfs_imputed_m2=[]
phenall=set()
keep_samples=set()
keep_phenotypes=set()
if PLOT_PHEN:
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
	
		if VERBOSE and icnt==0:
			display('in original units')
			display(drop_allnazero_rowcol(df_raw.loc[df_raw['datatype'].str.startswith('3 PHEN')]))

		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit='percent_of_dw')
		phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
		plot_histograms(c_converted_phenunits_values, 'changed_to_' + keep_unit, label_url[icnt]) #,label2color=label2color)
	
		if VERBOSE and icnt==0:
	
			display('coverted to ' +  keep_unit)
			display(drop_allnazero_rowcol(c_converted_phenunits.loc[c_converted_phenunits['datatype'].str.startswith('3 PHEN')]))
	
			display('coverted to (values only) ' +  keep_unit)
			display(drop_allnazero_rowcol(c_converted_phenunits_values.loc[c_converted_phenunits_values['datatype'].str.startswith('3 PHEN')]))
		
		dfs.append(c_converted_phenunits_values)
        
		if IMPUTE_M2:
			(df_corrected,dummy)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=False) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
			dfs_imputed_m2.append(df_corrected)
		icnt+=1

Booth2020

'in original units'

'coverted to percent_of_dw'

'coverted to (values only) percent_of_dw'

original data

becorrected_Booth2020.tsv

Zager2019
original data

becorrected_Zager2019.tsv

BZphenotypes
original data

becorrected_BZphenotypes.tsv

if PLOT_PHEN:
    phenminmaxbin=None
    plot_histograms_multi(dfs, keep_unit,label_url,phenminmaxbin,label2color=label2color,nbins=n_bins)

<Figure size 2000x2000 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

if PLOT_PHEN and IMPUTE_M2:
    phenminmaxbin=None
    df_orgcor=dfs + dfs_imputed_m2
    label_orgcor=[]
    for label in label_url:
        label_orgcor.append(label)
    for label in label_url:
        label_orgcor.append('m2_' + label)
    label2color['m2_Booth2020']='cyan'
    label2color['m2_Zager2019']='magenta'
    label2color['m2_BZphenotypes']='gray'        
    plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)

<Figure size 2000x2000 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

if CHECK_BE and IMPUTE_M1:
    #api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
    #label_url=['BGZphenotypes']
    api_url1 = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
    label_url1=['BZphenotypes']
    icnt=0
    df_raw= read_url(api_url1[0], label_url1[0],requery=REQUERY_URL)
    (c_converted_phenunits, dfphen)=convert_units_to(df_raw,to_unit='percent_of_dw')
    (df_corrected, map_corrected_bybatch)=check_batcheffects(dfphen, label_url1[0], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='ppca',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
    #print(list(map_corrected_bybatch.keys()))
    #plot_histograms_multi(list(map_corrected_bybatch.keys()), keep_unit,list(map_corrected_bybatch.values()),phenminmaxbin,label2color=label2color,nbins=n_bins)

Found 2 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data

original data

becorrected_BZphenotypes.tsv

/home/lmansu10/jupyter/omics_api_utils.py:4250: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/home/lmansu10/jupyter/omics_api_utils.py:4250: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

#dfs_imputed_m2_0=dfs_imputed_m2[0]
#dfs_imputed_m2_1=dfs_imputed_m2[1]
#dfs_imputed_m2_0=dfs_imputed_m2_0.index=dfs_imputed_m2_0['property']
#dfs_imputed_m2_1=dfs_imputed_m2_1.index=dfs_imputed_m2_1['property']
#dfs_imputed_m2_bz=pd.merge(dfs_imputed_m2_0,dfs_imputed_m2_1, left_index=True, right_index=True, how = 'outer')
dfs_imputed_m2_bz=pd.merge(dfs_imputed_m2[0],dfs_imputed_m2[1], left_on=['datatype','property'], right_on=['datatype','property'], how = 'outer')

#init_notebook_mode(connected=True)
init_notebook_mode(all_interactive=True)
display(dfs_imputed_m2_bz)

#init_notebook_mode(connected=True)
init_notebook_mode(all_interactive=True)

if CHECK_BE and IMPUTE_M2:
    (df_corrected, map_corrected_bybatch)=check_batcheffects(dfs_imputed_m2_bz, 'm2_BZphenotypes', TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='ppca',correct_BE=True) #properties=['phenotype_dataset'])

Found 2 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data

/home/lmansu10/miniconda3/envs/R4/lib/python3.10/site-packages/combat/pycombat.py:159: RuntimeWarning:

divide by zero encountered in divide

original data

becorrected_m2_BZphenotypes.tsv

/home/lmansu10/jupyter/omics_api_utils.py:4254: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/home/lmansu10/jupyter/omics_api_utils.py:4254: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

if PLOT_PHEN and IMPUTE_M2:
    phenminmaxbin=None
    df_orgcor=dfs + dfs_imputed_m2
    label_orgcor=[]
    for label in label_url:
        label_orgcor.append(label)
    for label in label_url:
        label_orgcor.append('m2_' + label)
    for batch in list(map_corrected_bybatch.keys()):
        label_orgcor.append('m2cb_' + batch)
        df_orgcor.append(map_corrected_bybatch[batch])
    label_orgcor.append('m2cb_BZphenotypes')
    df_orgcor.append(df_corrected)                        
    label2color['m2_Booth2020']='cyan'
    label2color['m2_Zager2019']='magenta'
    label2color['m2_BZphenotypes']='gray'        
    label2color['m2cb_Booth2020']='darkblue'
    label2color['m2cb_Zager2019']='pink'
    label2color['m2cb_BZphenotypes']='brown'    
    plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)

<Figure size 2000x2000 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

map_corrected_bybatch['BZphenotypes']=df_corrected
dfs_m1=dfs
dfs_m2_cb=dfs

for cor in list(map_corrected_bybatch.keys()):
    label_url.append('m2cb_' + cor)
    dfs_m2_cb.append(map_corrected_bybatch[cor])

label2color['m2cb_Booth2020']='cyan'
label2color['m2cb_GloerfeltTarp2023']='pink'
label2color['m2cb_Zager2019']='magenta'
label2color['m2cb_BZphenotypes']='gray'

setVariables(imgsize_xlarge_in=(15,30))
plot_histograms_multi(dfs, keep_unit,label_url,phenminmaxbin,label2color=label2color,nbins=n_bins,heatmap=False)

<Figure size 1500x3000 with 0 Axes>

data = requests.get('https://icgrc.info/api/user/expression/dataset').json()
print('Expression datasets (expds) options')
#print(pd.DataFrame(data,columns=['expds','analysis_name']).to_string(index=False))
#display(pd.DataFrame(data,columns=['expds','analysis_name']))
display_table(data,columns=['expds','analysis_name'])

data = requests.get('https://icgrc.info/api/user/expression/transet').json()
print('Transcriptome set (transet) options')
display_table(data,columns=['transet'])

Expression datasets (expds) options

Transcriptome set (transet) options

#import omics_api
#from omics_api import *

phends='Zager2019,Booth2020'
expds='21trichcs10'
transet='cs10'
#label_url=['21trichcs10_cs10_zagerbooth']
label_url=['21trichcs10_cs10_zagerbooth_be']

api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0']

#api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=all&expds=21trichpkclone109&phends=Zager2019%2CBooth2020&with_stdunits=0&tablelimit=100000&limit=100000']
#label_url=['21trichpk_zagerbooth']

#api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=all&expds=21trichcs10&phends=Zager2019&with_stdunits=0&tablelimit=100000&limit=100000']
#label_url=['21trichcs10_zager']

icnt=0
top_dfwgcna=[]

if WGCNA :
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
		if False: #CHECK_BE
			#display(c_converted_phenunits_values)
			(df_corrected, map_corrected_bybatch)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='ppca',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
			c_converted_phenunits_values=df_corrected

		moduletrait=[] #['darkred,cannabidiol', 'royalblue,alpha-pinene', 'royalblue,Delta9_tetrahydrocannabinol']
		topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=label_url[icnt],cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=True)
		break
		for itop in topmodules:
			moduletrait.append(itop[0]+","+itop[1])
		dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=label_url[icnt],moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna,annotate=True)
		#do_wgcna_visualize(c_converted_phenunits_values,dflabel=label_url[icnt],recalc=True,moduletrait=moduletrait)
		top_dfwgcna.append(dfwgcn)
		#display(Image(filename=label_url[0] + '.ModuleTraitRelationship.png'))
		#display(Image(filename=label_url[0] + '.ClusterDendogram.png'))

		icnt+=1

21trichcs10_cs10_zagerbooth_be

rm: cannot remove ‘21trichcs10_cs10_zagerbooth_be*’: No such file or directory

mydf

df_idprop

df_annot

mydf

df_idprop

df_annot

'Rscript Langfelder-01-dataInput.R 21trichcs10_cs10_zagerbooth_be 21trichcs10_cs10_zagerbooth_be.genes.csv 21trichcs10_cs10_zagerbooth_be 21trichcs10_cs10_zagerbooth_be.trait.csv > /dev/null'

Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: ‘fastcluster’

The following object is masked from ‘package:stats’:

    hclust


Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor

dev.new(): using pdf(file="Rplots1.pdf")
Warning message:
In numbers2colors(datTraits, signed = FALSE) :
  (some columns in) 'x' are constant. Their color will be the color of NA.

data = requests.get('https://icgrc.info/api/user/variant/dataset').json()
print('Variant reference (ref) and dataset (snpds) options')
display_table(data,columns=['ref','snpds'])

Variant reference (ref) and dataset (snpds) options

ref="cs10"
snpds='21trichs'
expds='21trichcs10'
transet='cs10'
querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'


api_url = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
label_url=["eqtlb_terpcan_" + snpds + "_" + expds]
#label_url=["eqtl_terpcan_" + snpds + "_allsnps_" + expds]

icnt=0
top_dfeqtl=[]
if MATRIXEQT:
#if True:
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
		(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,label_url[icnt],annot='cs10', recodeplink=False,recalc=True,maxp=maxpeqtl,expgeneloc_file='data/cs10xm_exp.genesloc.tsv')
		top_dfeqtl.append(dfeqtlall)
		#display(Image(filename=label_url[0] + ".eqtlpvalue.png"))
		#display(Image(filename=label_url[0] + ".eqtlqqplot.png"))
		icnt+=1

eqtlb_terpcan_21trichs_21trichcs10

'plink --bfile snp_eqtlb_terpcan_21trichs_21trichcs10 --recode A-transpose --out snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose --allow-extra-chr --noweb > /dev/null'

'cut -f1,4 snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw > snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.postmp'

'head -n1 snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw | cut -f1,2,3,4,5,6  --complement > /dev/null'

plink samples with exp data 63/63

"cut -f1,2,3,4,5,6 --complement snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw | tail -n +2 | sed 's/NA//g' > snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.allele"

'echo SAMN09747683\tSAMN09747684\tSAMN09747685\tSAMN09747686\tSAMN09747687\tSAMN09747688\tSAMN09747689\tSAMN09747690\tSAMN09747691\tSAMN10330896\tSAMN10330897\tSAMN10330898\tSAMN10330899\tSAMN10330900\tSAMN10330901\tSAMN10330902\tSAMN10330903\tSAMN10330904\tSAMN10330905\tSAMN10330906\tSAMN10330907\tSAMN10330908\tSAMN10330909\tSAMN10330910\tSAMN10330911\tSAMN10330912\tSAMN10330913\tSAMN10330914\tSAMN10330915\tSAMN10330916\tSAMN10330917\tSAMN10330918\tSAMN10330919\tSAMN10330920\tSAMN10330921\tSAMN10330922\tSAMN13503266\tSAMN13503268\tSAMN13503269\tSAMN13503270\tSAMN13503272\tSAMN13503274\tSAMN13503277\tSAMN13503278\tSAMN13503281\tSAMN13503282\tSAMN13503283\tSAMN13503285\tSAMN13750438\tSAMN13750439\tSAMN13750440\tSAMN13750441\tSAMN13750442\tSAMN13750443\tSAMN13750444\tSAMN13750445\tSAMN13750446\tSAMN13750447\tSAMN13750448\tSAMN13750449\tSAMN13750450\tSAMN13750451\tSAMN13750452 > snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.newheader'

'cat  snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.newheader snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.allele > eqtlb_terpcan_21trichs_21trichcs10_snp.tsvtmp'

'paste snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.pos eqtlb_terpcan_21trichs_21trichcs10_snp.tsvtmp > eqtlb_terpcan_21trichs_21trichcs10_snp.tsv'

'ln -s data/cs10xm_exp.genesloc.tsv eqtlb_terpcan_21trichs_21trichcs10_exp.genesloc.tsv > /dev/null'

if GWAS: # snp, phen
	snpds='21trichs'
	#phends='Booth2020'
	phends='Zager2019,Booth2020'
	querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
	geneslabel='cannterp'
	ref='cs10'
	label_url=['21trichs_cs10_zagerbooth_cannterp_covar']
	#label_url=['21trich_allsnps_cs10_zagerbooth_cannterp']
	
	api_url = ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]

dfs=[]
phenall=set()
icnt=0
if GWAS:
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
		moduletrait=[] #['darkred,cannabidiol', 'royalblue,alpha-pinene', 'royalblue,Delta9_tetrahydrocannabinol']    
		dfs.append(c_converted_phenunits_values)
		phenall.update(set(get_phenotypes(c_converted_phenunits_values)))

21trichs_cs10_zagerbooth_cannterp_covar

if GWAS:
	select_options(phenall, waitInput=False)

#selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55]
selectionid=[3,4,6,7]
GWAS_FROMPLINK=False

#%autoreload 2
setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
top_dfgwas=[]
if GWAS:
	selection=select_options(phenall, waitInput=False, selected=selectionid)
	print('selections: ' + str(selection))
	idf=0
	df_gene=pd.read_csv('data/gene_cs10.tsv',sep="\t")
	for df in dfs:
		dfgwas=do_gwas(df, label_url[idf],myunit,phenotypes=selection,rerunplink=True,maxp=maxpgwas,recalc=True,annot='cs10') #,covar=['phenotype_dataset'],assocmethod='linear') #,df_genes=df_gene,annot='cs10') 
		top_dfgwas.append(dfgwas)
		#printdisplay(MESSAGE_INFO,label_url[idf] + '.gwas.top.tsv') #Image(filename=label_url[idf] + '_gwasmp_' + myunit + '_' + label_url[idf] +  '.qassoc.counts.png'))
		idf+=1

selections: ['Delta9-tetrahydrocannabinol', 'Delta9-tetrahydrocannabinolic acid', 'Percent_Total_Cannabinoids_per_dry_weight', 'THCA_to_CBDA_ratio']

rm: cannot remove ‘21trichs_cs10_zagerbooth_cannterp_covar*’: No such file or directory

(1684, 44)

['datatype',
 'property',
 'SAMN10330896',
 'SAMN10330897',
 'SAMN10330898',
 'SAMN10330899',
 'SAMN10330900',
 'SAMN10330901',
 'SAMN10330902',
 'SAMN10330903',
 'SAMN10330904',
 'SAMN10330905',
 'SAMN10330906',
 'SAMN10330907',
 'SAMN10330908',
 'SAMN10330909',
 'SAMN10330910',
 'SAMN10330911',
 'SAMN10330912',
 'SAMN10330913',
 'SAMN10330914',
 'SAMN10330915',
 'SAMN10330916',
 'SAMN10330917',
 'SAMN10330918',
 'SAMN10330919',
 'SAMN10330920',
 'SAMN10330921',
 'SAMN10330922',
 'SAMN13750438',
 'SAMN13750439',
 'SAMN13750440',
 'SAMN13750441',
 'SAMN13750442',
 'SAMN13750443',
 'SAMN13750444',
 'SAMN13750445',
 'SAMN13750446',
 'SAMN13750447',
 'SAMN13750448',
 'SAMN13750449',
 'SAMN13750450',
 'SAMN13750451',
 'SAMN13750452']

'python HapMap_to_PLINK.py snp_21trichs_cs10_zagerbooth_cannterp_covar.hmp snp_21trichs_cs10_zagerbooth_cannterp_covar.map > snp_21trichs_cs10_zagerbooth_cannterp_covar.ped'

'plink --file snp_21trichs_cs10_zagerbooth_cannterp_covar --allow-extra-chr --make-bed --out snp_21trichs_cs10_zagerbooth_cannterp_covar'

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to snp_21trichs_cs10_zagerbooth_cannterp_covar.log.
Options in effect:
  --allow-extra-chr
  --file snp_21trichs_cs10_zagerbooth_cannterp_covar
  --make-bed
  --out snp_21trichs_cs10_zagerbooth_cannterp_covar

332375 MB RAM detected; reserving 166187 MB for main workspace.
.ped scan complete (for binary autoconversion).
Performing single-pass .bed write (1588 variants, 42 people).
--file: snp_21trichs_cs10_zagerbooth_cannterp_covar-temporary.bed +
snp_21trichs_cs10_zagerbooth_cannterp_covar-temporary.bim +
snp_21trichs_cs10_zagerbooth_cannterp_covar-temporary.fam written.
1588 variants loaded from .bim file.
42 people (0 males, 0 females, 42 ambiguous) loaded from .fam.
Ambiguous sex IDs written to snp_21trichs_cs10_zagerbooth_cannterp_covar.nosex
.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 42 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99% done.
Total genotyping rate is 0.692515.
1588 variants and 42 people pass filters and QC.
Note: No phenotypes present.
--make-bed to snp_21trichs_cs10_zagerbooth_cannterp_covar.bed +
snp_21trichs_cs10_zagerbooth_cannterp_covar.bim +
snp_21trichs_cs10_zagerbooth_cannterp_covar.fam ... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99%done.

Warning: Variant 6 triallelic; setting rarest alleles missing.
Warning: Variant 33 triallelic; setting rarest alleles missing.
Warning: Variant 134 triallelic; setting rarest alleles missing.
Warning: Variant 149 triallelic; setting rarest alleles missing.
Warning: Variant 206 triallelic; setting rarest alleles missing.
Warning: Variant 229 triallelic; setting rarest alleles missing.
Warning: Variant 281 triallelic; setting rarest alleles missing.
Warning: Variant 285 triallelic; setting rarest alleles missing.
Warning: Variant 310 triallelic; setting rarest alleles missing.
Warning: Variant 424 triallelic; setting rarest alleles missing.
Warning: Variant 450 triallelic; setting rarest alleles missing.
Warning: Variant 480 triallelic; setting rarest alleles missing.
Warning: Variant 481 triallelic; setting rarest alleles missing.
Warning: Variant 546 triallelic; setting rarest alleles missing.
Warning: Variant 626 triallelic; setting rarest alleles missing.
Warning: Variant 636 triallelic; setting rarest alleles missing.
Warning: Variant 664 triallelic; setting rarest alleles missing.
Warning: Variant 681 triallelic; setting rarest alleles missing.
Warning: Variant 1005 triallelic; setting rarest alleles missing.
Warning: Variant 1028 triallelic; setting rarest alleles missing.
Warning: Variant 1067 triallelic; setting rarest alleles missing.
Warning: Variant 1081 triallelic; setting rarest alleles missing.
Warning: Variant 1106 triallelic; setting rarest alleles missing.
Warning: Variant 1117 triallelic; setting rarest alleles missing.
Warning: Variant 1137 triallelic; setting rarest alleles missing.
Warning: Variant 1151 triallelic; setting rarest alleles missing.
Warning: Variant 1203 triallelic; setting rarest alleles missing.
Warning: Variant 1341 triallelic; setting rarest alleles missing.
Warning: Variant 1348 triallelic; setting rarest alleles missing.
Warning: Variant 1412 triallelic; setting rarest alleles missing.

'plink --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar  --assoc counts --pheno  phen_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt --out assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --allow-extra-chr --allow-no-sex && touch  assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.done &> assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.out  &'

PLINK v1.90b6.12 64-bit (28 Oct 2019)          www.cog-genomics.org/plink/1.9/
(C) 2005-2019 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.log.
Options in effect:
  --allow-extra-chr
  --allow-no-sex
  --assoc counts
  --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar
  --out assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts
  --pheno phen_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt

332375 MB RAM detected; reserving 166187 MB for main workspace.
1588 variants loaded from .bim file.
42 people (0 males, 0 females, 42 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.nosex
.
42 phenotype values present after --pheno.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 42 founders and 0 nonfounders present.
Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99% done.
Total genotyping rate is 0.692515.
1588 variants and 42 people pass filters and QC.
Phenotype data is quantitative.
Writing QT --assoc report to
assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc
... 0%4%13%18%27%46%48%57%80%96%done.

#%autoreload 2
#import omics_api
#from omics_api import *
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)

mergeoutfle='merged_21trichcs10_alltraits.tsv'
def printdflists(l):
	for df in l:
		print(df.columns.values.tolist())
#setVariables(loglevel=getSetting('SHOW_DEBUG'))
printdflists(top_dfeqtl)
printdflists(top_dfwgcna)
printdflists(top_dfgwas)
MERGE=True
if MERGE:
	df_genes=pd.read_csv('data/gene_cs10.tsv',sep="\t")
	merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=mergeoutfle,use_snpgene=True,df_genes=df_genes,recalc=False)

['snps', 'gene', 'statistic', 'pvalue', 'FDR', 'beta', 'pvalue_y', 'snpgenes', 'contig', 'start', 'end', 'note', 'goterm', 'iprterm', 'expgenes', 'note.1', 'goterm.1', 'iprterm.1']
['trait', 'snps', 'pvalue', 'snpgenes', 'note', 'goterm', 'iprterm']

'File df_eqtl_wgcna_onexpgenes_merged_21trichcs10_alltraits.tsv dont exists'

eqtl_wgcna_onexpgene_merged_21trichcs10_alltraits.tsv

df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait

triple_snpgenetrait_merged_21trichcs10_alltraits.tsv

tgzdownloads('alltgz1', txt=True, img=True)

'tar -cvzf alltgz1.tgz -T dwlist.txt > /dev/null'

#download gene annotation and save to data directory
datadir='data1'
GENEFILE=datadir+'/gene_cs10.tsv'
#pd.read_csv('https://icgrc.info/api/user/gene/list?annot=cs10',sep="\t").to_csv(GENEFILE,sep="\t",index=False)

#download expression matrix save to data directory
EXPFILE=datadir + '/21trichcs10_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?expds=21trichcs10',sep="\t").to_csv(EXPFILE,sep="\t",index=False)

#EXPFILE=datadir + '/GaoGuerrieroConneely_allexp.tsv'
#EXPFILE=datadir + '/Gao_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?transet=all&expds=Massimino2017&tablelimit=100000&limit=1000',sep="\t").to_csv(EXPFILE,sep="\t",index=False)


#download variant file in binary plink format at 
PLINKFILE='data/cs10-21trichomes-all-gatkgenotype-allsnps-samn-plink-nofam'
#PLINKFILE=None

PHENFILE=datadir+'/phen_all.tsv'
#pd.read_csv('https://icgrc.info/api/user/phenotype/list',sep="\t").to_csv(PHENFILE,sep="\t",index=False)

if False:
	import datetime
	setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
	# hold top results
	top_dfwgcna=[]
	top_dfeqtl=[]
	top_dfgwas=[]

	# enable module
	WGCNA=True
	EQTL=True
	GWAS=True
	MERGE=True
	REQUERY_URL=False

	phends='Zager2019,Booth2020'
	expds='21trichcs10'
	transet='cs10'
	label_url=None
	label_wgcna=['wgcna_21trichcs10allexp_cs10_zagerbooth_b']
	url_wgcna = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0&tablelimit=100000&limit=100000']


	icnt=0
	if WGCNA :
		print('STARTED WGCNA ' + str(datetime.datetime.now()))
		for iurl in url_wgcna:
			ilabel=label_wgcna[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
			moduletrait=[]
			topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=ilabel,cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=False,exp_file=EXPFILE)
			for itop in topmodules:
				moduletrait.append(itop[0]+","+itop[1])
			dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=ilabel,moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna)
			top_dfwgcna.append(dfwgcn)
			icnt+=1


	snpds='21trichs'
	phends='Zager2019,Booth2020'
	querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
	geneslabel='cannterp'
	ref='cs10'
	label_gwas=['gwas_21trich_allsnps_cs10_zagerbooth_cannterp_b']
	url_gwas= ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]

	phenall=set()
	icnt=0
	df_gwas=[]
	# collect available phenotypes first to make selection
	if GWAS:
		print('STARTED GWAS ' + str(datetime.datetime.now()))
		for iurl in url_gwas:
			ilabel=label_gwas[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
			df_gwas.append(c_converted_phenunits_values)
			phenall.update(set(get_phenotypes(c_converted_phenunits_values)))

		# show selection options, set to waitInput=True to get interactive response. For now selection selectionid is preselected
		select_options(phenall, waitInput=False)

		selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55] # most common cannabinoids/terpenes
		#selectionid=[3,4,6,7],  CBDA/THCA

	if GWAS:
		selection=select_options(phenall, waitInput=False, selected=selectionid)
		# verify selections as as intended
		print('selections: ' + str(selection))
		idf=0
		for df in df_gwas:
			dfgwas=do_gwas(df, label_gwas[idf],myunit,phenotypes=selection,rerunplink=False,maxp=maxpgwas,plink_file=PLINKFILE) #,assocmethod='linear')
			top_dfgwas.append(dfgwas)
			idf+=1


	ref="cs10"
	snpds='21trichs'
	expds='21trichcs10'
	transet='cs10'
	querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'

	url_eqtl = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
	label_eqtl=["eqtl_terpcan_" + snpds + "_allsnpsexps_" + expds+'_relax_b']

	# GENELOCFILE is derived from GENEFILE using only gene_id\tcontig\tfmin\tfmax
	GENELOCFILE=datadir+'/cs10xm_exp.genesloc.tsv'
	pd.read_csv(GENEFILE,sep="\t")[['name','contig','fmin','fmax']].rename(columns={'name':'geneid','contig':'chr','fmin':'s1','fmax':'s2'}).to_csv(GENELOCFILE,sep="\t",index=False)

	icnt=0
	if MATRIXEQT:
		print('STARTED EQTL ' + str(datetime.datetime.now()))
		for iurl in url_eqtl:
			ilabel=label_eqtl[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,ilabel,annot=None, recodeplink=False,maxp=maxpeqtl,plink_file=PLINKFILE,expgeneloc_file=GENELOCFILE,exp_file=EXPFILE)
			top_dfeqtl.append(dfeqtlall)
			icnt+=1

	'''
	top_dfeqtl=[]
	top_dfgwas=[]
	top_dfwgna=[]
	top_dfeqtl.append('eqtl_terpcan_21trichs_allsnpsexps_21trichcs10_relax.eqtl.all.top.tsv')
	top_dfwgcna.append('wgcna_21trichcs10allexp_cs10_zagerbooth_relax.wgcna.top.tsv')
	top_dfgwas.append('gwas_21trich_allsnps_cs10_zagerbooth_cannterp_relax.gwas.top.tsv')
	'''
	merge_output='merged_21trichcs10allsnpsallexps_trait_relax_b.tsv'

	if MERGE:
		print('STARTED MERGE ' + str(datetime.datetime.now()))
		df_gene=pd.read_csv(GENEFILE,sep="\t") #name    contig  fmin    fmax    strand  gb_keyword      go_term interpro_value)
		merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=merge_output,use_snpgene=True,df_genes=df_gene)

ICGRC Omics API Demo¶

Multiple data sources¶

Check batch effect, imputation and batch effect correction¶

WGCNA Analysis¶

eQTL analysis¶

Metabolite SNP Association (semi-mGWAS)¶

Merge multi tools¶

Running semi-offline using large dataset¶

	datatype	SAMN10330896	SAMN10330897	SAMN10330898	SAMN10330899	SAMN10330900	SAMN10330901	SAMN10330902	SAMN10330903	SAMN10330904	SAMN10330905	SAMN10330906	SAMN10330907	SAMN10330908	SAMN10330909	SAMN10330910	SAMN10330911	SAMN10330912	SAMN10330913	SAMN10330914	SAMN10330915	SAMN10330916	SAMN10330917	SAMN10330918	SAMN10330919	SAMN10330920	SAMN10330921	SAMN10330922	SAMN13750438	SAMN13750439	SAMN13750440	SAMN13750441	SAMN13750442	SAMN13750443	SAMN13750444	SAMN13750445	SAMN13750446	SAMN13750447	SAMN13750448	SAMN13750449	SAMN13750450	SAMN13750451	SAMN13750452
property
Loading... (need help?)

	datatype	SAMN10330896	SAMN10330897	SAMN10330898	SAMN10330899	SAMN10330900	SAMN10330901	SAMN10330902	SAMN10330903	SAMN10330904	SAMN10330905	SAMN10330906	SAMN10330907	SAMN10330908	SAMN10330909	SAMN10330910	SAMN10330911	SAMN10330912	SAMN10330913	SAMN10330914	SAMN10330915	SAMN10330916	SAMN10330917	SAMN10330918	SAMN10330919	SAMN10330920	SAMN10330921	SAMN10330922	SAMN13750438	SAMN13750439	SAMN13750440	SAMN13750441	SAMN13750442	SAMN13750443	SAMN13750444	SAMN13750445	SAMN13750446	SAMN13750447	SAMN13750448	SAMN13750449	SAMN13750450	SAMN13750451	SAMN13750452
property
Loading... (need help?)