%matplotlib inline
%load_ext autoreload
   
import os
import requests
from IPython.display import display, Image, FileLink
#from jupyter_datatables import init_datatables_mode
#from ipydatagrid import DataGrid
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
#init_notebook_mode(connected=True)
import itables.options as itablesopt
# display table options
#itablesopt.lengthMenu = [2, 5, 10, 20, 50, 100, 200, 500]
#itablesopt.maxBytes = 10000 , 0 to disable
#itablesopt.maxRows
#itablesopt.maxColumns

#%load_ext jupyter_require
#%requirejs d3 https://d3js.org/d3.v5.min
#init_datatables_mode()

%autoreload 2
import omics_api
from omics_api import *

# Flags to emable example
others=True
PLOT_PHEN=False
WGCNA=others
GWAS=others
MATRIXEQT=others
VERBOSE=True
MERGE=others
REQUERY_URL=False

CHECK_BE=False  # check and correct batch effect
# imputation strategy from https://doi.org/10.1038/s41598-023-30084-2
IMPUTE_M1=False # global mean
IMPUTE_M2=False # batch mean


DEBUGGNG=False

# strict setting
#maxp=1e-5

# threshold settings
maxpgwas=1e-20
maxpeqtl=1e-5
maxpwgcna=1e-5
mincorwgcna=0.7
cortopnwgcna=None
gstopnwgcna=None

top_dfgwas=[]
top_dfwgcna=[]
top_dfeqtl=[]
myunit='percent_of_dw'
#SHOW_ALL
#SHOW_DEBUG
#SHOW_ERRORONLY



if DEBUGGNG:
    setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
else:
    setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
#setVariables(loglevel=getSetting('SHOW_ERRORONLY'))

"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"

data = requests.get('https://icgrc.info/api/user/phenotype/dataset').json()
print('Phenotype datasets(phends) options')
display_table(data,columns=['phen_dataset','unit_name','samples_count','trait_count'])

Phenotype datasets(phends) options
['phen_dataset', 'unit_name', 'samples_count', 'trait_count']

#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1"]
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
#label_url=['Booth2020','Zager2019','GloerfeltTarp2023','BGZphenotypes']
api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
label_url=['Booth2020','Zager2019','BZphenotypes']
label2color=dict()
label2color['Azman2023']='red'
label2color['Booth2020']='blue'
label2color['GloerfeltTarp2023']='green'
label2color['Zager2019']='orange'
label2color['BZphenotypes']='yellow'

keep_unit='percent_of_dw'
#keep_unit='ug_per_gdw'
n_bins=10

icnt=0
dfs=[]
dfs_imputed_m2=[]
phenall=set()
keep_samples=set()
keep_phenotypes=set()
if PLOT_PHEN:
	for iurl in api_url:
		print(label_url[icnt])
		df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
	
		if VERBOSE and icnt==0:
			display('in original units')
			display(drop_allnazero_rowcol(df_raw.loc[df_raw['datatype'].str.startswith('3 PHEN')]))

		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit='percent_of_dw')
		phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
		plot_histograms(c_converted_phenunits_values, 'changed_to_' + keep_unit, label_url[icnt]) #,label2color=label2color)
	
		if VERBOSE and icnt==0:
	
			display('coverted to ' +  keep_unit)
			display(drop_allnazero_rowcol(c_converted_phenunits.loc[c_converted_phenunits['datatype'].str.startswith('3 PHEN')]))
	
			display('coverted to (values only) ' +  keep_unit)
			display(drop_allnazero_rowcol(c_converted_phenunits_values.loc[c_converted_phenunits_values['datatype'].str.startswith('3 PHEN')]))
		
		dfs.append(c_converted_phenunits_values)
        
		if IMPUTE_M2:
			(df_corrected,dummy)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=False) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
			dfs_imputed_m2.append(df_corrected)
		icnt+=1

if PLOT_PHEN:
    phenminmaxbin=None
    plot_histograms_multi(dfs, keep_unit,label_url,phenminmaxbin,label2color=label2color,nbins=n_bins,heatmap=True)

if PLOT_PHEN and IMPUTE_M2:
    phenminmaxbin=None
    df_orgcor=dfs + dfs_imputed_m2
    label_orgcor=[]
    for label in label_url:
        label_orgcor.append(label)
    for label in label_url:
        label_orgcor.append('m2_' + label)
    label2color['m2_Booth2020']='cyan'
    label2color['m2_Zager2019']='magenta'
    label2color['m2_BZphenotypes']='gray'        
    plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)

if CHECK_BE and IMPUTE_M1:
    #api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
    #label_url=['BGZphenotypes']
    api_url_m1 = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
    label_url_m1=['BZphenotypes']
    icnt=0
    df_raw= read_url(api_url_m1[0], label_url_m1[0],requery=REQUERY_URL)
    (c_converted_phenunits, dfphen)=convert_units_to(df_raw,to_unit='percent_of_dw')
    (df_corrected, map_corrected_bybatch)=check_batcheffects(dfphen, label_url_m1[0], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
    #print(list(map_corrected_bybatch.keys()))
    #plot_histograms_multi(list(map_corrected_bybatch.keys()), keep_unit,list(map_corrected_bybatch.values()),phenminmaxbin,label2color=label2color,nbins=n_bins)

setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
if CHECK_BE and IMPUTE_M2:
    dfs_imputed_m2_bz=pd.merge(dfs_imputed_m2[0],dfs_imputed_m2[1], left_on=['datatype','property'], right_on=['datatype','property'], how = 'outer')
    (df_corrected_rcb, map_corrected_bybatch_rcb)=check_batcheffects(dfs_imputed_m2_bz, 'm2rcb_BZphenotypes', TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True,be_method='rcombat') #properties=['phenotype_dataset'])
    (df_corrected_eblm, map_corrected_bybatch_eblm)=check_batcheffects(dfs_imputed_m2_bz, 'm2eblm_BZphenotypes', TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True,be_method='eblm') #properties=['phenotype_dataset'])

"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"

if PLOT_PHEN and IMPUTE_M2:
    phenminmaxbin=None
    '''
    df_orgcor=dfs.copy() #+ dfs_imputed_m2
    label_orgcor=label_url.copy() # []
    '''
    label_orgcor=[]
    df_orgcor=dfs_imputed_m2.copy()
    for label in label_url:
        label_orgcor.append('m2_' + label)
    
    display(label_orgcor)

    if True:
        for batch in list(map_corrected_bybatch_rcb.keys()):
            label_orgcor.append('m2rcb_' + batch)
            df_orgcor.append(map_corrected_bybatch_rcb[batch])
        label_orgcor.append('m2rcb_BZphenotypes')
        df_orgcor.append(df_corrected_rcb)                       
        
    if True:
        for batch in list(map_corrected_bybatch_eblm.keys()):
            label_orgcor.append('m2eblm_' + batch)
            df_orgcor.append(map_corrected_bybatch_eblm[batch])
        label_orgcor.append('m2eblm_BZphenotypes')
        df_orgcor.append(df_corrected_eblm)                        

    display(label_orgcor)
    label2color['m2_Booth2020']='cyan'
    label2color['m2_Zager2019']='magenta'
    label2color['m2_BZphenotypes']='gray'        
    label2color['m2rcb_Booth2020']='darkblue'
    label2color['m2rcb_Zager2019']='pink'
    label2color['m2rcb_BZphenotypes']='brown'    
    label2color['m2eblm_Booth2020']='lightgreen'
    label2color['m2eblm_Zager2019']='darkviolet'
    label2color['m2eblm_BZphenotypes']='lightsalmon'    
    plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)

data = requests.get('https://icgrc.info/api/user/expression/dataset').json()
print('Expression datasets (expds) options')
#print(pd.DataFrame(data,columns=['expds','analysis_name']).to_string(index=False))
#display(pd.DataFrame(data,columns=['expds','analysis_name']))
display_table(data,columns=['expds','analysis_name'])

data = requests.get('https://icgrc.info/api/user/expression/transet').json()
print('Transcriptome set (transet) options')
display_table(data,columns=['transet'])

Expression datasets (expds) options
['expds', 'analysis_name']

Transcriptome set (transet) options
['transet']

import datetime

DATADIR='data'
EXPFILE=DATADIR + '/21trichcs10_allexp.tsv'

# download if not available
#pd.read_csv('https://icgrc.info/api/user/expression/list?expds=21trichcs10',sep="\t").to_csv(EXPFILE,sep="\t",index=False)


phends=['Zager2019','Booth2020']
expds='21trichcs10'
transet='cs10'
label_wgcna=[]
url_wgcna=[]
for i in range(len(phends)):
	label_wgcna.append('wgcna_21trichcs10allexp_cs10_' + phends[i])
	url_wgcna.append('https://icgrc.info/api/user/phenotype/list?phends=' + phends[i] + '&with_stdunits=1&tablelimit=100000&limit=100000')

    
icnt=0

top_dfwgcna=[]
if WGCNA :
	print('STARTED WGCNA ' + str(datetime.datetime.now()))
	for iurl in url_wgcna:
		ilabel=label_wgcna[icnt]
		df_raw= read_url(iurl, ilabel,requery=False)
		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
		moduletrait=[]
		topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=ilabel,cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=False,exp_file=EXPFILE)
		for itop in topmodules:
			moduletrait.append(itop[0]+","+itop[1])
		dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=ilabel,moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna)
		top_dfwgcna.append(dfwgcn)
		icnt+=1

STARTED WGCNA 2023-11-23 22:42:36.324218

'Rscript Langfelder-03a-relateModsToExt.R wgcna_21trichcs10allexp_cs10_Zager2019 > /dev/null'

Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: ‘fastcluster’

The following object is masked from ‘package:stats’:

    hclust


Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor

dev.new(): using pdf(file="Rplots1.pdf")
Warning message:
In greenWhiteRed(50) :
  WGCNA::greenWhiteRed: this palette is not suitable for people
with green-red color blindness (the most common kind of color blindness).
Consider using the function blueWhiteRed instead.

top expressed gene-trait wgcna_21trichcs10allexp_cs10_Zager2019.wgcna.top.tsv

'Rscript Langfelder-03a-relateModsToExt.R wgcna_21trichcs10allexp_cs10_Booth2020 > /dev/null'

Loading required package: dynamicTreeCut
Loading required package: fastcluster

Attaching package: ‘fastcluster’

The following object is masked from ‘package:stats’:

    hclust


Attaching package: ‘WGCNA’

The following object is masked from ‘package:stats’:

    cor

dev.new(): using pdf(file="Rplots1.pdf")
Warning message:
In greenWhiteRed(50) :
  WGCNA::greenWhiteRed: this palette is not suitable for people
with green-red color blindness (the most common kind of color blindness).
Consider using the function blueWhiteRed instead.

top expressed gene-trait wgcna_21trichcs10allexp_cs10_Booth2020.wgcna.top.tsv

data = requests.get('https://icgrc.info/api/user/variant/dataset').json()
print('Variant reference (ref) and dataset (snpds) options')
display_table(data,columns=['ref','snpds'])

Variant reference (ref) and dataset (snpds) options
['ref', 'snpds']

PLINKFILE=DATADIR + '/cs10-21trichomes-all-gatkgenotype-allsnps-samn-plink-nofam'

GENEFILE=DATADIR+'/gene_cs10.tsv'
#pd.read_csv('https://icgrc.info/api/user/gene/list?annot=cs10',sep="\t").to_csv(GENEFILE,sep="\t",index=False)

# GENELOCFILE is derived from GENEFILE using only gene_id\tcontig\tfmin\tfmax
GENELOCFILE=DATADIR+'/cs10xm_exp.genesloc.tsv'

icnt=0
top_dfeqtl=[]
top_dfeqtltrans=[]
top_dfeqtlcis=[]
if MATRIXEQT:
	print('STARTED EQTL ' + str(datetime.datetime.now()))

	(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(None,'21trichcs10_allexp_allsnps',annot=None, recodeplink=False,maxp=maxpeqtl,plink_file=PLINKFILE,expgeneloc_file=GENELOCFILE,exp_file=EXPFILE)
	top_dfeqtl.append(dfeqtlall)
	top_dfeqtlcis.append(dfeqtlcis)
	top_dfeqtltrans.append(dfeqtltrans)

STARTED EQTL 2023-11-23 22:42:53.747486

top snp-expressed cis gene

top snp-expressed trans gene

top snp-expressed all gene 21trichcs10_allexp_allsnps.eqtl.all.top.tsv

if GWAS:
	snpds='21trichs'
	phends=['Zager2019','Booth2020']
	geneslabel='all'
	ref='cs10'
	label_gwas=[]
	url_gwas=[]
	for i in range(len(phends)):
		label_gwas.append('gwas_21trich_allsnps_cs10_' + phends[i] + '_' + geneslabel)
		url_gwas.append("https://icgrc.info/api/user/phenotype/list?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends[i] + "&with_stdunits=1")

phenall=set()
icnt=0
df_gwas=[]
# collect available phenotypes then make selection
if GWAS:
	print('STARTED GWAS ' + str(datetime.datetime.now()))
	for iurl in url_gwas:
		ilabel=label_gwas[icnt]
		df_raw= read_url(iurl, ilabel,requery=False)
		(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
		df_gwas.append(c_converted_phenunits_values)
		phenall.update(set(get_phenotypes(c_converted_phenunits_values)))

# select all traits
selectionid=list(range(1,len(phenall)+1))
selection=select_options(phenall, waitInput=False, selected=selectionid)

STARTED GWAS 2023-11-23 22:43:11.586554

'[25] nerolidol\t\t[26] plus-terpinene\t\t[27] terpinolene'

"pre-selected ['1,8-cineole', 'Delta9-tetrahydrocannabinol', 'Delta9-tetrahydrocannabinolic acid', 'Total_cannabinoids', 'Total_monoterpenes', 'Total_sesquiterpenes', 'Total_terpenoids', 'alpha-humulene', 'alpha-pinene', 'beta-caryophyllene', 'beta-myrcene', 'beta-ocimene', 'beta-pinene', 'borneol', 'camphene', 'camphor', 'cannabichromene', 'cannabidiol', 'cannabidiolic acid', 'cannabigerol', 'cannabinol', 'gamma-3-carene', 'limonene', 'linalool', 'nerolidol', 'plus-terpinene', 'terpinolene']"

top_dfgwas=[]
if GWAS:
	idf=0
	for df in df_gwas:
		dfgwas=do_gwas(df, label_gwas[idf],myunit,phenotypes=selection,rerunplink=False,maxp=maxpgwas,plink_file=PLINKFILE, plotgwasi=False ) #,assocmethod='linear')
		top_dfgwas.append(dfgwas)
		idf+=1

common samples of 3 PHEN with data/cs10-21trichomes-all-gatkgenotype-allsnps-samn-plink-nofam = 27	{'SAMN10330900', 'SAMN10330922', 'SAMN10330912', 'SAMN10330913', 'SAMN10330921', 'SAMN10330905', 'SAMN10330918', 'SAMN10330899', 'SAMN10330901', 'SAMN10330919', 'SAMN10330920', 'SAMN10330908', 'SAMN10330897', 'SAMN10330898', 'SAMN10330910', 'SAMN10330906', 'SAMN10330909', 'SAMN10330904', 'SAMN10330914', 'SAMN10330917', 'SAMN10330907', 'SAMN10330902', 'SAMN10330915', 'SAMN10330916', 'SAMN10330903', 'SAMN10330911', 'SAMN10330896'}

rm: cannot remove ‘*.qassoc.done’: No such file or directory

top snp-trait gwas_21trich_allsnps_cs10_Zager2019_all.gwas.top.tsv

common samples of 3 PHEN with data/cs10-21trichomes-all-gatkgenotype-allsnps-samn-plink-nofam = 27	{'SAMN10330900', 'SAMN10330922', 'SAMN10330912', 'SAMN10330913', 'SAMN10330921', 'SAMN10330905', 'SAMN10330918', 'SAMN10330899', 'SAMN10330901', 'SAMN10330919', 'SAMN10330920', 'SAMN10330908', 'SAMN10330897', 'SAMN10330898', 'SAMN10330910', 'SAMN10330906', 'SAMN10330909', 'SAMN10330904', 'SAMN10330914', 'SAMN10330917', 'SAMN10330907', 'SAMN10330902', 'SAMN10330915', 'SAMN10330916', 'SAMN10330903', 'SAMN10330911', 'SAMN10330896'}

rm: cannot remove ‘*.qassoc.done’: No such file or directory

top snp-trait gwas_21trich_allsnps_cs10_Booth2020_all.gwas.top.tsv

#%autoreload 2
#import omics_api
#from omics_api import *
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)

mergeoutfle='merged_21trichcs10_byds.tsv'
def printdflists(l):
	for df in l:
		print(df.columns.values.tolist())
#setVariables(loglevel=getSetting('SHOW_DEBUG'))
printdflists(top_dfeqtl)
printdflists(top_dfwgcna)
printdflists(top_dfgwas)

if MERGE:
	df_genes=pd.read_csv(GENEFILE,sep="\t")
	merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=mergeoutfle,use_snpgene=True,df_genes=df_genes,recalc=False)

"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"

['snps', 'gene', 'statistic', 'pvalue', 'FDR', 'beta']
['expgenes', 'trait', 'GS', 'p.GS']
['expgenes', 'trait', 'GS', 'p.GS']
['trait', 'CHR', 'snps', 'BP', 'NMISS', 'BETA', 'SE', 'R2', 'T', 'pvalue']
['trait', 'CHR', 'snps', 'BP', 'NMISS', 'BETA', 'SE', 'R2', 'T', 'pvalue']

'File df_eqtl_wgcna_onexpgenes_merged_21trichcs10_byds.tsv dont exists'

'loaded df_eqtl_wgcna_onexpgenes_merged_21trichcs10_byds.tsv'

eqtl_wgcna_onexpgene_merged_21trichcs10_byds.tsv
['expgenes', 'pvalue', 'snps', 'snpgenes', 'locus', 'note', 'goterm', 'iprterm', 'trait', 'GS', 'p.GS']

'skipping eqtl_gwas_onsnpgene_merged_21trichcs10_byds.tsv  shape=(447041346, 15)'

/home/lmansu10/jupyter/omics_api_utils.py:3365: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.
  df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait=pd.merge(df_eqtl_wgcna_onexpgenes,df_gwass, on=['trait','snpgenes']) #.drop_duplicates()

"df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait (74046, 9)\n[('trait', ''), ('snpgenes', ''), ('pvalue_eqtl', 'count'), ('pvalue_eqtl', 'min'), ('GS_wgcna', 'max'), ('pGS_wgcna', 'count'), ('pGS_wgcna', 'min'), ('pvalue_gwas', 'count'), ('pvalue_gwas', 'min')]"

'using unfiltered merged_21trichcs10_byds.tsv'

'generated merged_21trichcs10_byds.tsv'

tgzdownloads('alltgz_21trich_allsnpallexp', txt=True, img=True)

'tar -cvzf alltgz_21trich_allsnpallexp.tgz -T dwlist.txt > /dev/null'

'generated alltgz_21trich_allsnpallexp.tgz'

#download gene annotation and save to data directory
datadir='data1'
GENEFILE=datadir+'/gene_cs10.tsv'
#pd.read_csv('https://icgrc.info/api/user/gene/list?annot=cs10',sep="\t").to_csv(GENEFILE,sep="\t",index=False)

#download expression matrix save to data directory
EXPFILE=datadir + '/21trichcs10_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?expds=21trichcs10',sep="\t").to_csv(EXPFILE,sep="\t",index=False)

#EXPFILE=datadir + '/GaoGuerrieroConneely_allexp.tsv'
#EXPFILE=datadir + '/Gao_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?transet=all&expds=Massimino2017&tablelimit=100000&limit=1000',sep="\t").to_csv(EXPFILE,sep="\t",index=False)


#download variant file in binary plink format at 
PLINKFILE='data/cs10-21trichomes-all-gatkgenotype-allsnps-samn-plink-nofam'
#PLINKFILE=None

PHENFILE=datadir+'/phen_all.tsv'
#pd.read_csv('https://icgrc.info/api/user/phenotype/list',sep="\t").to_csv(PHENFILE,sep="\t",index=False)

if False:
	import datetime
	setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
	# hold top results
	top_dfwgcna=[]
	top_dfeqtl=[]
	top_dfgwas=[]

	# enable module
	WGCNA=True
	EQTL=True
	GWAS=True
	MERGE=True
	REQUERY_URL=False

	phends='Zager2019,Booth2020'
	expds='21trichcs10'
	transet='cs10'
	label_url=None
	label_wgcna=['wgcna_21trichcs10allexp_cs10_zagerbooth_b']
	url_wgcna = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0&tablelimit=100000&limit=100000']


	icnt=0
	if WGCNA :
		print('STARTED WGCNA ' + str(datetime.datetime.now()))
		for iurl in url_wgcna:
			ilabel=label_wgcna[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
			moduletrait=[]
			topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=ilabel,cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=False,exp_file=EXPFILE)
			for itop in topmodules:
				moduletrait.append(itop[0]+","+itop[1])
			dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=ilabel,moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna)
			top_dfwgcna.append(dfwgcn)
			icnt+=1


	snpds='21trichs'
	phends='Zager2019,Booth2020'
	querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
	geneslabel='cannterp'
	ref='cs10'
	label_gwas=['gwas_21trich_allsnps_cs10_zagerbooth_cannterp_b']
	url_gwas= ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]

	phenall=set()
	icnt=0
	df_gwas=[]
	# collect available phenotypes first to make selection
	if GWAS:
		print('STARTED GWAS ' + str(datetime.datetime.now()))
		for iurl in url_gwas:
			ilabel=label_gwas[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
			df_gwas.append(c_converted_phenunits_values)
			phenall.update(set(get_phenotypes(c_converted_phenunits_values)))

		# show selection options, set to waitInput=True to get interactive response. For now selection selectionid is preselected
		select_options(phenall, waitInput=False)

		selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55] # most common cannabinoids/terpenes
		#selectionid=[3,4,6,7],  CBDA/THCA

	if GWAS:
		selection=select_options(phenall, waitInput=False, selected=selectionid)
		# verify selections as as intended
		print('selections: ' + str(selection))
		idf=0
		for df in df_gwas:
			dfgwas=do_gwas(df, label_gwas[idf],myunit,phenotypes=selection,rerunplink=False,maxp=maxpgwas,plink_file=PLINKFILE) #,assocmethod='linear')
			top_dfgwas.append(dfgwas)
			idf+=1


	ref="cs10"
	snpds='21trichs'
	expds='21trichcs10'
	transet='cs10'
	querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'

	url_eqtl = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
	label_eqtl=["eqtl_terpcan_" + snpds + "_allsnpsexps_" + expds+'_relax_b']

	# GENELOCFILE is derived from GENEFILE using only gene_id\tcontig\tfmin\tfmax
	GENELOCFILE=datadir+'/cs10xm_exp.genesloc.tsv'
	pd.read_csv(GENEFILE,sep="\t")[['name','contig','fmin','fmax']].rename(columns={'name':'geneid','contig':'chr','fmin':'s1','fmax':'s2'}).to_csv(GENELOCFILE,sep="\t",index=False)

	icnt=0
	if MATRIXEQT:
		print('STARTED EQTL ' + str(datetime.datetime.now()))
		for iurl in url_eqtl:
			ilabel=label_eqtl[icnt]
			df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
			(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,ilabel,annot=None, recodeplink=False,maxp=maxpeqtl,plink_file=PLINKFILE,expgeneloc_file=GENELOCFILE,exp_file=EXPFILE)
			top_dfeqtl.append(dfeqtlall)
			icnt+=1

	'''
	top_dfeqtl=[]
	top_dfgwas=[]
	top_dfwgna=[]
	top_dfeqtl.append('eqtl_terpcan_21trichs_allsnpsexps_21trichcs10_relax.eqtl.all.top.tsv')
	top_dfwgcna.append('wgcna_21trichcs10allexp_cs10_zagerbooth_relax.wgcna.top.tsv')
	top_dfgwas.append('gwas_21trich_allsnps_cs10_zagerbooth_cannterp_relax.gwas.top.tsv')
	'''
	merge_output='merged_21trichcs10allsnpsallexps_trait_relax_b.tsv'

	if MERGE:
		print('STARTED MERGE ' + str(datetime.datetime.now()))
		df_gene=pd.read_csv(GENEFILE,sep="\t") #name    contig  fmin    fmax    strand  gb_keyword      go_term interpro_value)
		merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=merge_output,use_snpgene=True,df_genes=df_gene)

ICGRC Omics API Demo (using batch offline data)¶

Multiple data sources¶

Check batch effect, imputation and batch effect correction¶

WGCNA Analysis¶

eQTL analysis¶

Metabolite SNP Association (semi-mGWAS)¶

Merge multi tools¶

Running semi-offline using large dataset¶