%matplotlib inline
%load_ext autoreload
   
import os
import requests
from IPython.display import display, Image, FileLink
#from jupyter_datatables import init_datatables_mode
#from ipydatagrid import DataGrid
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
#init_notebook_mode(connected=True)
import itables.options as itablesopt
# display table options
#itablesopt.lengthMenu = [2, 5, 10, 20, 50, 100, 200, 500]
#itablesopt.maxBytes = 10000 , 0 to disable
#itablesopt.maxRows
#itablesopt.maxColumns

#%load_ext jupyter_require
#%requirejs d3 https://d3js.org/d3.v5.min
#init_datatables_mode()

%autoreload 2
import omics_api
from omics_api import *

# Flags to emable example
others=False
PLOT_PHEN=True
WGCNA=others
GWAS=others
MATRIXEQT=others
VERBOSE=True
MERGE=others
REQUERY_URL=False

CHECK_BE=True  # check and correct batch effect
# imputation strategy from https://doi.org/10.1038/s41598-023-30084-2
IMPUTE_M1=True # global mean
IMPUTE_M2=True # batch mean


DEBUGGNG=False

# strict setting
#maxp=1e-5

# threshold settings
maxpgwas=1e-20
maxpeqtl=1e-5
maxpwgcna=1e-5
mincorwgcna=0.7
cortopnwgcna=None
gstopnwgcna=None

top_dfgwas=[]
top_dfwgcna=[]
top_dfeqtl=[]
myunit='percent_of_dw'
#SHOW_ALL
#SHOW_DEBUG
#SHOW_ERRORONLY



if DEBUGGNG:
    setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
else:
    setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
#setVariables(loglevel=getSetting('SHOW_ERRORONLY'))

"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"

data = requests.get('https://icgrc.info/api/user/phenotype/dataset').json()
print('Phenotype datasets(phends) options')
display_table(data,columns=['phen_dataset','unit_name','samples_count','trait_count'])

Phenotype datasets(phends) options

#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1"]
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
#label_url=['Booth2020','Zager2019','GloerfeltTarp2023','BGZphenotypes']
api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
label_url=['Booth2020','Zager2019','BZphenotypes']
label2color=dict()
label2color['Azman2023']='red'
label2color['Booth2020']='blue'
label2color['GloerfeltTarp2023']='green'
label2color['Zager2019']='orange'
label2color['BZphenotypes']='yellow'

keep_unit='percent_of_dw'
#keep_unit='ug_per_gdw'
n_bins=10

icnt=0
dfs=[]
dfs_imputed_m2=[]
phenall=set()
keep_samples=set()
keep_phenotypes=set()
if PLOT_PHEN:
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
	
		if VERBOSE and icnt==0:
			display('in original units')
			display(drop_allnazero_rowcol(df_raw.loc[df_raw['datatype'].str.startswith('3 PHEN')]))

		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit='percent_of_dw')
		phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
		plot_histograms(c_converted_phenunits_values, 'changed_to_' + keep_unit, label_url[icnt]) #,label2color=label2color)
	
		if VERBOSE and icnt==0:
	
			display('coverted to ' +  keep_unit)
			display(drop_allnazero_rowcol(c_converted_phenunits.loc[c_converted_phenunits['datatype'].str.startswith('3 PHEN')]))
	
			display('coverted to (values only) ' +  keep_unit)
			display(drop_allnazero_rowcol(c_converted_phenunits_values.loc[c_converted_phenunits_values['datatype'].str.startswith('3 PHEN')]))
		
		dfs.append(c_converted_phenunits_values)
        
		if IMPUTE_M2:
			(df_corrected,dummy)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=False) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
			dfs_imputed_m2.append(df_corrected)
		icnt+=1

Booth2020

'in original units'

'coverted to percent_of_dw'

'coverted to (values only) percent_of_dw'

'cannot insert property, already exists'

original data

'generated becorrected_Booth2020.tsv'

becorrected_Booth2020.tsv

Zager2019

'cannot insert property, already exists'

if PLOT_PHEN:
    phenminmaxbin=None
    plot_histograms_multi(dfs, keep_unit,label_url,phenminmaxbin,label2color=label2color,nbins=n_bins,heatmap=True)

'file dont exists generated 3_-7202988356578264128_phenotypes_percent_of_dw.png'

'file dont exists generated 3_-7202988356578264128_phenotypes_percent_of_dw_density.png'

<Figure size 2000x2000 with 0 Axes>

'generated Booth2020-Zager2019-BZphenotypes.phen2samplesrc-by_sample_dataset.txt'

<Figure size 640x480 with 0 Axes>

'generated Booth2020-Zager2019-BZphenotypes.phensrc2sample-by_trait_dataset.txt'

'generated Booth2020-Zager2019-BZphenotypes.phensrc2sample-by_dataset_trait.txt'

<Figure size 640x480 with 0 Axes>

if PLOT_PHEN and IMPUTE_M2:
    phenminmaxbin=None
    df_orgcor=dfs + dfs_imputed_m2
    label_orgcor=[]
    for label in label_url:
        label_orgcor.append(label)
    for label in label_url:
        label_orgcor.append('m2_' + label)
    label2color['m2_Booth2020']='cyan'
    label2color['m2_Zager2019']='magenta'
    label2color['m2_BZphenotypes']='gray'        
    plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)

'file dont exists generated 6_-3390368805713138963_phenotypes_percent_of_dw.png'

'file dont exists generated 6_-3390368805713138963_phenotypes_percent_of_dw_density.png'

<Figure size 2000x2000 with 0 Axes>

if CHECK_BE and IMPUTE_M1:
    #api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
    #label_url=['BGZphenotypes']
    api_url_m1 = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
    label_url_m1=['BZphenotypes']
    icnt=0
    df_raw= read_url(api_url_m1[0], label_url_m1[0],requery=REQUERY_URL)
    (c_converted_phenunits, dfphen)=convert_units_to(df_raw,to_unit='percent_of_dw')
    (df_corrected, map_corrected_bybatch)=check_batcheffects(dfphen, label_url_m1[0], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
    #print(list(map_corrected_bybatch.keys()))
    #plot_histograms_multi(list(map_corrected_bybatch.keys()), keep_unit,list(map_corrected_bybatch.values()),phenminmaxbin,label2color=label2color,nbins=n_bins)

'for PCA has NaN'

'imputing using mean'

'correcting batch effect using rcombat'

'Rscript Langfelder-combat.R BZphenotypes BZphenotypes.rcombat.data.csv BZphenotypes.rcombat.annot.csv phenotype_dataset > /dev/null'

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data

'generated BZphenotypes.pca.png'

'generated BZphenotypes.pca.3d.png'

'generated BZphenotypes.pca.3d.html'

'generated BZphenotypes.be.pca.png'

'generated BZphenotypes.be.pca.3d.png'

setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
if CHECK_BE and IMPUTE_M2:
    dfs_imputed_m2_bz=pd.merge(dfs_imputed_m2[0],dfs_imputed_m2[1], left_on=['datatype','property'], right_on=['datatype','property'], how = 'outer')
    (df_corrected_rcb, map_corrected_bybatch_rcb)=check_batcheffects(dfs_imputed_m2_bz, 'm2rcb_BZphenotypes', TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True,be_method='rcombat') #properties=['phenotype_dataset'])
    (df_corrected_eblm, map_corrected_bybatch_eblm)=check_batcheffects(dfs_imputed_m2_bz, 'm2eblm_BZphenotypes', TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True,be_method='eblm') #properties=['phenotype_dataset'])

"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"

'for PCA has NaN'

'imputing using mean'

'correcting batch effect using rcombat'

'Rscript Langfelder-combat.R m2rcb_BZphenotypes m2rcb_BZphenotypes.rcombat.data.csv m2rcb_BZphenotypes.rcombat.annot.csv phenotype_dataset > /dev/null'

Loading required package: mgcv
Loading required package: nlme
This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Loading required package: BiocParallel
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data

'generated m2rcb_BZphenotypes.pca.png'

'generated m2rcb_BZphenotypes.pca.3d.png'

'generated m2rcb_BZphenotypes.pca.3d.html'

'generated m2rcb_BZphenotypes.be.pca.png'

if PLOT_PHEN and IMPUTE_M2:
    phenminmaxbin=None
    '''
    df_orgcor=dfs.copy() #+ dfs_imputed_m2
    label_orgcor=label_url.copy() # []
    '''
    label_orgcor=[]
    df_orgcor=dfs_imputed_m2.copy()
    for label in label_url:
        label_orgcor.append('m2_' + label)
    
    display(label_orgcor)

    if True:
        for batch in list(map_corrected_bybatch_rcb.keys()):
            label_orgcor.append('m2rcb_' + batch)
            df_orgcor.append(map_corrected_bybatch_rcb[batch])
        label_orgcor.append('m2rcb_BZphenotypes')
        df_orgcor.append(df_corrected_rcb)                       
        
    if True:
        for batch in list(map_corrected_bybatch_eblm.keys()):
            label_orgcor.append('m2eblm_' + batch)
            df_orgcor.append(map_corrected_bybatch_eblm[batch])
        label_orgcor.append('m2eblm_BZphenotypes')
        df_orgcor.append(df_corrected_eblm)                        

    display(label_orgcor)
    label2color['m2_Booth2020']='cyan'
    label2color['m2_Zager2019']='magenta'
    label2color['m2_BZphenotypes']='gray'        
    label2color['m2rcb_Booth2020']='darkblue'
    label2color['m2rcb_Zager2019']='pink'
    label2color['m2rcb_BZphenotypes']='brown'    
    label2color['m2eblm_Booth2020']='lightgreen'
    label2color['m2eblm_Zager2019']='darkviolet'
    label2color['m2eblm_BZphenotypes']='lightsalmon'    
    plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)

['m2_Booth2020', 'm2_Zager2019', 'm2_BZphenotypes']

['m2_Booth2020',
 'm2_Zager2019',
 'm2_BZphenotypes',
 'm2rcb_Booth2020',
 'm2rcb_Zager2019',
 'm2rcb_BZphenotypes',
 'm2eblm_Booth2020',
 'm2eblm_Zager2019',
 'm2eblm_BZphenotypes']

'file dont exists generated 9_-2897765732204754825_phenotypes_percent_of_dw.png'

'file dont exists generated 9_-2897765732204754825_phenotypes_percent_of_dw_density.png'

<Figure size 2000x2000 with 0 Axes>

data = requests.get('https://icgrc.info/api/user/expression/dataset').json()
print('Expression datasets (expds) options')
#print(pd.DataFrame(data,columns=['expds','analysis_name']).to_string(index=False))
#display(pd.DataFrame(data,columns=['expds','analysis_name']))
display_table(data,columns=['expds','analysis_name'])

data = requests.get('https://icgrc.info/api/user/expression/transet').json()
print('Transcriptome set (transet) options')
display_table(data,columns=['transet'])

Expression datasets (expds) options

Transcriptome set (transet) options

#import omics_api
#from omics_api import *

phends='Zager2019,Booth2020'
expds='21trichcs10'
transet='cs10'
#label_url=['21trichcs10_cs10_zagerbooth']
label_url=['21trichcs10_cs10_zagerbooth_be']

api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0']

#api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=all&expds=21trichpkclone109&phends=Zager2019%2CBooth2020&with_stdunits=0&tablelimit=100000&limit=100000']
#label_url=['21trichpk_zagerbooth']

#api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=all&expds=21trichcs10&phends=Zager2019&with_stdunits=0&tablelimit=100000&limit=100000']
#label_url=['21trichcs10_zager']

icnt=0
top_dfwgcna=[]

if WGCNA :
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
		if False: #CHECK_BE
			#display(c_converted_phenunits_values)
			(df_corrected, map_corrected_bybatch)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='ppca',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
			c_converted_phenunits_values=df_corrected

		moduletrait=[] #['darkred,cannabidiol', 'royalblue,alpha-pinene', 'royalblue,Delta9_tetrahydrocannabinol']
		topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=label_url[icnt],cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=True)
		break
		for itop in topmodules:
			moduletrait.append(itop[0]+","+itop[1])
		dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=label_url[icnt],moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna,annotate=True)
		#do_wgcna_visualize(c_converted_phenunits_values,dflabel=label_url[icnt],recalc=True,moduletrait=moduletrait)
		top_dfwgcna.append(dfwgcn)
		#display(Image(filename=label_url[0] + '.ModuleTraitRelationship.png'))
		#display(Image(filename=label_url[0] + '.ClusterDendogram.png'))

		icnt+=1

data = requests.get('https://icgrc.info/api/user/variant/dataset').json()
print('Variant reference (ref) and dataset (snpds) options')
display_table(data,columns=['ref','snpds'])

Variant reference (ref) and dataset (snpds) options

ref="cs10"
snpds='21trichs'
expds='21trichcs10'
transet='cs10'
querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'


api_url = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
label_url=["eqtlb_terpcan_" + snpds + "_" + expds]
#label_url=["eqtl_terpcan_" + snpds + "_allsnps_" + expds]

icnt=0
top_dfeqtl=[]
if MATRIXEQT:
#if True:
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
		(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,label_url[icnt],annot='cs10', recodeplink=False,recalc=True,maxp=maxpeqtl,expgeneloc_file='data/cs10xm_exp.genesloc.tsv')
		top_dfeqtl.append(dfeqtlall)
		#display(Image(filename=label_url[0] + ".eqtlpvalue.png"))
		#display(Image(filename=label_url[0] + ".eqtlqqplot.png"))
		icnt+=1

if GWAS: # snp, phen
	snpds='21trichs'
	#phends='Booth2020'
	phends='Zager2019,Booth2020'
	querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
	geneslabel='cannterp'
	ref='cs10'
	label_url=['21trichs_cs10_zagerbooth_cannterp_covar']
	#label_url=['21trich_allsnps_cs10_zagerbooth_cannterp']
	
	api_url = ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]

dfs=[]
phenall=set()
icnt=0
if GWAS:
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
		moduletrait=[] #['darkred,cannabidiol', 'royalblue,alpha-pinene', 'royalblue,Delta9_tetrahydrocannabinol']    
		dfs.append(c_converted_phenunits_values)
		phenall.update(set(get_phenotypes(c_converted_phenunits_values)))

if GWAS:
	select_options(phenall, waitInput=False)

#selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55]
selectionid=[3,4,6,7]
GWAS_FROMPLINK=False

#%autoreload 2
setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
top_dfgwas=[]
if GWAS:
	selection=select_options(phenall, waitInput=False, selected=selectionid)
	print('selections: ' + str(selection))
	idf=0
	df_gene=pd.read_csv('data/gene_cs10.tsv',sep="\t")
	for df in dfs:
		dfgwas=do_gwas(df, label_url[idf],myunit,phenotypes=selection,rerunplink=True,maxp=maxpgwas,recalc=True,annot='cs10') #,covar=['phenotype_dataset'],assocmethod='linear') #,df_genes=df_gene,annot='cs10') 
		top_dfgwas.append(dfgwas)
		#printdisplay(MESSAGE_INFO,label_url[idf] + '.gwas.top.tsv') #Image(filename=label_url[idf] + '_gwasmp_' + myunit + '_' + label_url[idf] +  '.qassoc.counts.png'))
		idf+=1

"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=2\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"

#%autoreload 2
#import omics_api
#from omics_api import *
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)

mergeoutfle='merged_21trichcs10_alltraits.tsv'
def printdflists(l):
	for df in l:
		print(df.columns.values.tolist())
#setVariables(loglevel=getSetting('SHOW_DEBUG'))
printdflists(top_dfeqtl)
printdflists(top_dfwgcna)
printdflists(top_dfgwas)
MERGE=True
if MERGE:
	df_genes=pd.read_csv('data/gene_cs10.tsv',sep="\t")
	merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=mergeoutfle,use_snpgene=True,df_genes=df_genes,recalc=False)

"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"

'File df_eqtl_wgcna_onexpgenes_merged_21trichcs10_alltraits.tsv dont exists'

'loaded df_eqtl_wgcna_onexpgenes_merged_21trichcs10_alltraits.tsv'

eqtl_wgcna_onexpgene_merged_21trichcs10_alltraits.tsv

'loaded merged_21trichcs10_alltraits.tsv'

df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait

triple_snpgenetrait_merged_21trichcs10_alltraits.tsv

tgzdownloads('alltgz1', txt=True, img=True)

'tar -cvzf alltgz1.tgz -T dwlist.txt > /dev/null'

'generated alltgz1.tgz'

#download gene annotation and save to data directory
datadir='data1'
GENEFILE=datadir+'/gene_cs10.tsv'
#pd.read_csv('https://icgrc.info/api/user/gene/list?annot=cs10',sep="\t").to_csv(GENEFILE,sep="\t",index=False)

#download expression matrix save to data directory
EXPFILE=datadir + '/21trichcs10_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?expds=21trichcs10',sep="\t").to_csv(EXPFILE,sep="\t",index=False)

#EXPFILE=datadir + '/GaoGuerrieroConneely_allexp.tsv'
#EXPFILE=datadir + '/Gao_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?transet=all&expds=Massimino2017&tablelimit=100000&limit=1000',sep="\t").to_csv(EXPFILE,sep="\t",index=False)


#download variant file in binary plink format at 
PLINKFILE='data/cs10-21trichomes-all-gatkgenotype-allsnps-samn-plink-nofam'
#PLINKFILE=None

PHENFILE=datadir+'/phen_all.tsv'
#pd.read_csv('https://icgrc.info/api/user/phenotype/list',sep="\t").to_csv(PHENFILE,sep="\t",index=False)

if False:
	import datetime
	setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
	# hold top results
	top_dfwgcna=[]
	top_dfeqtl=[]
	top_dfgwas=[]

	# enable module
	WGCNA=True
	EQTL=True
	GWAS=True
	MERGE=True
	REQUERY_URL=False

	phends='Zager2019,Booth2020'
	expds='21trichcs10'
	transet='cs10'
	label_url=None
	label_wgcna=['wgcna_21trichcs10allexp_cs10_zagerbooth_b']
	url_wgcna = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0&tablelimit=100000&limit=100000']


	icnt=0
	if WGCNA :
		print('STARTED WGCNA ' + str(datetime.datetime.now()))
		for iurl in url_wgcna:
			ilabel=label_wgcna[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
			moduletrait=[]
			topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=ilabel,cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=False,exp_file=EXPFILE)
			for itop in topmodules:
				moduletrait.append(itop[0]+","+itop[1])
			dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=ilabel,moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna)
			top_dfwgcna.append(dfwgcn)
			icnt+=1


	snpds='21trichs'
	phends='Zager2019,Booth2020'
	querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
	geneslabel='cannterp'
	ref='cs10'
	label_gwas=['gwas_21trich_allsnps_cs10_zagerbooth_cannterp_b']
	url_gwas= ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]

	phenall=set()
	icnt=0
	df_gwas=[]
	# collect available phenotypes first to make selection
	if GWAS:
		print('STARTED GWAS ' + str(datetime.datetime.now()))
		for iurl in url_gwas:
			ilabel=label_gwas[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
			df_gwas.append(c_converted_phenunits_values)
			phenall.update(set(get_phenotypes(c_converted_phenunits_values)))

		# show selection options, set to waitInput=True to get interactive response. For now selection selectionid is preselected
		select_options(phenall, waitInput=False)

		selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55] # most common cannabinoids/terpenes
		#selectionid=[3,4,6,7],  CBDA/THCA

	if GWAS:
		selection=select_options(phenall, waitInput=False, selected=selectionid)
		# verify selections as as intended
		print('selections: ' + str(selection))
		idf=0
		for df in df_gwas:
			dfgwas=do_gwas(df, label_gwas[idf],myunit,phenotypes=selection,rerunplink=False,maxp=maxpgwas,plink_file=PLINKFILE) #,assocmethod='linear')
			top_dfgwas.append(dfgwas)
			idf+=1


	ref="cs10"
	snpds='21trichs'
	expds='21trichcs10'
	transet='cs10'
	querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'

	url_eqtl = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
	label_eqtl=["eqtl_terpcan_" + snpds + "_allsnpsexps_" + expds+'_relax_b']

	# GENELOCFILE is derived from GENEFILE using only gene_id\tcontig\tfmin\tfmax
	GENELOCFILE=datadir+'/cs10xm_exp.genesloc.tsv'
	pd.read_csv(GENEFILE,sep="\t")[['name','contig','fmin','fmax']].rename(columns={'name':'geneid','contig':'chr','fmin':'s1','fmax':'s2'}).to_csv(GENELOCFILE,sep="\t",index=False)

	icnt=0
	if MATRIXEQT:
		print('STARTED EQTL ' + str(datetime.datetime.now()))
		for iurl in url_eqtl:
			ilabel=label_eqtl[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,ilabel,annot=None, recodeplink=False,maxp=maxpeqtl,plink_file=PLINKFILE,expgeneloc_file=GENELOCFILE,exp_file=EXPFILE)
			top_dfeqtl.append(dfeqtlall)
			icnt+=1

	'''
	top_dfeqtl=[]
	top_dfgwas=[]
	top_dfwgna=[]
	top_dfeqtl.append('eqtl_terpcan_21trichs_allsnpsexps_21trichcs10_relax.eqtl.all.top.tsv')
	top_dfwgcna.append('wgcna_21trichcs10allexp_cs10_zagerbooth_relax.wgcna.top.tsv')
	top_dfgwas.append('gwas_21trich_allsnps_cs10_zagerbooth_cannterp_relax.gwas.top.tsv')
	'''
	merge_output='merged_21trichcs10allsnpsallexps_trait_relax_b.tsv'

	if MERGE:
		print('STARTED MERGE ' + str(datetime.datetime.now()))
		df_gene=pd.read_csv(GENEFILE,sep="\t") #name    contig  fmin    fmax    strand  gb_keyword      go_term interpro_value)
		merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=merge_output,use_snpgene=True,df_genes=df_gene)

ICGRC Omics API Demo (imputations and batch effect detection and correction)¶

Multiple data sources¶

Check batch effect, imputation and batch effect correction¶

WGCNA Analysis¶

eQTL analysis¶

Metabolite SNP Association (semi-mGWAS)¶

Merge multi tools¶

Running semi-offline using large dataset¶