ICGRC Omics API Demo (imputations and batch effect detection and correction)¶
About this notebook Merging datasets from different sources can introduce biases to give false signals in the analysis. Methods and tools are avaiable to detect these biases and correct the data to remove them. Although these methods work well with abundant datasets, we still implemented them here using our limited data to demonstrate the capability. We look forward to fully utilize these when more datasets are avaiable, or on other Tripal sites for heavily studied species with many datasets.
%matplotlib inline
%load_ext autoreload
import os
import requests
from IPython.display import display, Image, FileLink
#from jupyter_datatables import init_datatables_mode
#from ipydatagrid import DataGrid
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
#init_notebook_mode(connected=True)
import itables.options as itablesopt
# display table options
#itablesopt.lengthMenu = [2, 5, 10, 20, 50, 100, 200, 500]
#itablesopt.maxBytes = 10000 , 0 to disable
#itablesopt.maxRows
#itablesopt.maxColumns
#%load_ext jupyter_require
#%requirejs d3 https://d3js.org/d3.v5.min
#init_datatables_mode()
%autoreload 2
import omics_api
from omics_api import *
# Flags to emable example
others=False
PLOT_PHEN=True
WGCNA=others
GWAS=others
MATRIXEQT=others
VERBOSE=True
MERGE=others
REQUERY_URL=False
CHECK_BE=True # check and correct batch effect
# imputation strategy from https://doi.org/10.1038/s41598-023-30084-2
IMPUTE_M1=True # global mean
IMPUTE_M2=True # batch mean
DEBUGGNG=False
# strict setting
#maxp=1e-5
# threshold settings
maxpgwas=1e-20
maxpeqtl=1e-5
maxpwgcna=1e-5
mincorwgcna=0.7
cortopnwgcna=None
gstopnwgcna=None
top_dfgwas=[]
top_dfwgcna=[]
top_dfeqtl=[]
myunit='percent_of_dw'
#SHOW_ALL
#SHOW_DEBUG
#SHOW_ERRORONLY
if DEBUGGNG:
setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
else:
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
#setVariables(loglevel=getSetting('SHOW_ERRORONLY'))
"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"
Multiple data sources¶
This demo plots dstribution of datasets from multiple sources
Avalable options for phenotypes
data = requests.get('https://icgrc.info/api/user/phenotype/dataset').json()
print('Phenotype datasets(phends) options')
display_table(data,columns=['phen_dataset','unit_name','samples_count','trait_count'])
Phenotype datasets(phends) options
phen_dataset | unit_name | samples_count | trait_count |
---|---|---|---|
Loading... (need help?) |
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1"]
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
#label_url=['Booth2020','Zager2019','GloerfeltTarp2023','BGZphenotypes']
api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
label_url=['Booth2020','Zager2019','BZphenotypes']
label2color=dict()
label2color['Azman2023']='red'
label2color['Booth2020']='blue'
label2color['GloerfeltTarp2023']='green'
label2color['Zager2019']='orange'
label2color['BZphenotypes']='yellow'
keep_unit='percent_of_dw'
#keep_unit='ug_per_gdw'
n_bins=10
icnt=0
dfs=[]
dfs_imputed_m2=[]
phenall=set()
keep_samples=set()
keep_phenotypes=set()
if PLOT_PHEN:
for iurl in api_url:
print(label_url[icnt])
df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
if VERBOSE and icnt==0:
display('in original units')
display(drop_allnazero_rowcol(df_raw.loc[df_raw['datatype'].str.startswith('3 PHEN')]))
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit='percent_of_dw')
phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
plot_histograms(c_converted_phenunits_values, 'changed_to_' + keep_unit, label_url[icnt]) #,label2color=label2color)
if VERBOSE and icnt==0:
display('coverted to ' + keep_unit)
display(drop_allnazero_rowcol(c_converted_phenunits.loc[c_converted_phenunits['datatype'].str.startswith('3 PHEN')]))
display('coverted to (values only) ' + keep_unit)
display(drop_allnazero_rowcol(c_converted_phenunits_values.loc[c_converted_phenunits_values['datatype'].str.startswith('3 PHEN')]))
dfs.append(c_converted_phenunits_values)
if IMPUTE_M2:
(df_corrected,dummy)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=False) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
dfs_imputed_m2.append(df_corrected)
icnt+=1
Booth2020
'in original units'
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'coverted to percent_of_dw'
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'coverted to (values only) percent_of_dw'
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'cannot insert property, already exists'
original data
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'generated becorrected_Booth2020.tsv'
becorrected_Booth2020.tsv
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
Zager2019
'cannot insert property, already exists'
original data
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'generated becorrected_Zager2019.tsv'
becorrected_Zager2019.tsv
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
BZphenotypes
'for PCA has NaN'
'imputing using mean'
'cannot insert property, already exists'
original data
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'generated becorrected_BZphenotypes.tsv'
becorrected_BZphenotypes.tsv
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
/home/lmansu10/jupyter/omics_api_utils.py:4403: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.loc[:,'datatype']=df1.loc[:,'datatype'] /home/lmansu10/jupyter/omics_api_utils.py:4403: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.loc[:,'datatype']=df1.loc[:,'datatype']
if PLOT_PHEN:
phenminmaxbin=None
plot_histograms_multi(dfs, keep_unit,label_url,phenminmaxbin,label2color=label2color,nbins=n_bins,heatmap=True)
'file dont exists generated 3_-7202988356578264128_phenotypes_percent_of_dw.png'
'file dont exists generated 3_-7202988356578264128_phenotypes_percent_of_dw_density.png'
<Figure size 2000x2000 with 0 Axes>
'generated Booth2020-Zager2019-BZphenotypes.phen2samplesrc-by_sample_dataset.txt'
<Figure size 640x480 with 0 Axes>
'generated Booth2020-Zager2019-BZphenotypes.phensrc2sample-by_trait_dataset.txt'
'generated Booth2020-Zager2019-BZphenotypes.phensrc2sample-by_dataset_trait.txt'
<Figure size 640x480 with 0 Axes>
The imputation and batch correction strategies described in (Hui 2023, https://doi.org/10.1038/s41598-023-30084-2) are used here. Briefly, M1 uses global mean imputation, M2 uses batch mean imputation, and M3 uses cross-batch mean imputation. They found M2 gave the best performance. M1 and M2 strategies are tried here.
Plot original data after unit conversion data, and data imputed by batch
if PLOT_PHEN and IMPUTE_M2:
phenminmaxbin=None
df_orgcor=dfs + dfs_imputed_m2
label_orgcor=[]
for label in label_url:
label_orgcor.append(label)
for label in label_url:
label_orgcor.append('m2_' + label)
label2color['m2_Booth2020']='cyan'
label2color['m2_Zager2019']='magenta'
label2color['m2_BZphenotypes']='gray'
plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)
'file dont exists generated 6_-3390368805713138963_phenotypes_percent_of_dw.png'
'file dont exists generated 6_-3390368805713138963_phenotypes_percent_of_dw_density.png'
<Figure size 2000x2000 with 0 Axes>
Check batch effect, imputation and batch effect correction¶
Check for batch effects using Principal Component Analysis, and correction by ComBat or EmpericalBayesLM. PCA and ComBat require imputation accomplished by scikit-learn SimpleImputer (most_frequent, mean, median) and KNNImputer. Another option is Probabilistic PCA, PPCA, can compute PCA with missing values, and returns imputed data at the same time.
The available options are:
on_missing [mean,median,most_frequent,KNNImputer,ppca]
be_method [rcombat, eblm]
These tools are typically used on high throughput datasets like transcriptomic expression, proteomics or metabolomics with hundreds to thousands of data points. Here we use it on published, low-throughput metabolite concentration datasets.
Required python modules
- scikit-learn
- pyppca
- plotly
Required R packages
- sva
- WGCNA
M1 imputation (global mean)
if CHECK_BE and IMPUTE_M1:
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
#label_url=['BGZphenotypes']
api_url_m1 = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
label_url_m1=['BZphenotypes']
icnt=0
df_raw= read_url(api_url_m1[0], label_url_m1[0],requery=REQUERY_URL)
(c_converted_phenunits, dfphen)=convert_units_to(df_raw,to_unit='percent_of_dw')
(df_corrected, map_corrected_bybatch)=check_batcheffects(dfphen, label_url_m1[0], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
#print(list(map_corrected_bybatch.keys()))
#plot_histograms_multi(list(map_corrected_bybatch.keys()), keep_unit,list(map_corrected_bybatch.values()),phenminmaxbin,label2color=label2color,nbins=n_bins)
'for PCA has NaN'
'imputing using mean'
'correcting batch effect using rcombat'
'Rscript Langfelder-combat.R BZphenotypes BZphenotypes.rcombat.data.csv BZphenotypes.rcombat.annot.csv phenotype_dataset > /dev/null'
Loading required package: mgcv Loading required package: nlme This is mgcv 1.9-0. For overview type 'help("mgcv-package")'. Loading required package: genefilter Loading required package: BiocParallel Found2batches Adjusting for0covariate(s) or covariate level(s) Standardizing Data across genes Fitting L/S model and finding priors Finding parametric adjustments Adjusting the Data
'generated BZphenotypes.pca.png'
'generated BZphenotypes.pca.3d.png'
'generated BZphenotypes.pca.3d.html'
'generated BZphenotypes.be.pca.png'
'generated BZphenotypes.be.pca.3d.png'
'generated BZphenotypes.be.pca.3d.html'
original data
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'generated becorrected_BZphenotypes.tsv'
becorrected_BZphenotypes.tsv
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
/home/lmansu10/jupyter/omics_api_utils.py:4403: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /home/lmansu10/jupyter/omics_api_utils.py:4403: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
M2 imputation (batch mean)
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
if CHECK_BE and IMPUTE_M2:
dfs_imputed_m2_bz=pd.merge(dfs_imputed_m2[0],dfs_imputed_m2[1], left_on=['datatype','property'], right_on=['datatype','property'], how = 'outer')
(df_corrected_rcb, map_corrected_bybatch_rcb)=check_batcheffects(dfs_imputed_m2_bz, 'm2rcb_BZphenotypes', TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True,be_method='rcombat') #properties=['phenotype_dataset'])
(df_corrected_eblm, map_corrected_bybatch_eblm)=check_batcheffects(dfs_imputed_m2_bz, 'm2eblm_BZphenotypes', TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=True,be_method='eblm') #properties=['phenotype_dataset'])
"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"
'for PCA has NaN'
'imputing using mean'
'correcting batch effect using rcombat'
'Rscript Langfelder-combat.R m2rcb_BZphenotypes m2rcb_BZphenotypes.rcombat.data.csv m2rcb_BZphenotypes.rcombat.annot.csv phenotype_dataset > /dev/null'
Loading required package: mgcv Loading required package: nlme This is mgcv 1.9-0. For overview type 'help("mgcv-package")'. Loading required package: genefilter Loading required package: BiocParallel Found2batches Adjusting for0covariate(s) or covariate level(s) Standardizing Data across genes Fitting L/S model and finding priors Finding parametric adjustments Adjusting the Data
'generated m2rcb_BZphenotypes.pca.png'
'generated m2rcb_BZphenotypes.pca.3d.png'
'generated m2rcb_BZphenotypes.pca.3d.html'
'generated m2rcb_BZphenotypes.be.pca.png'
'generated m2rcb_BZphenotypes.be.pca.3d.png'
'generated m2rcb_BZphenotypes.be.pca.3d.html'
original data
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'generated becorrected_m2rcb_BZphenotypes.tsv'
becorrected_m2rcb_BZphenotypes.tsv
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
/home/lmansu10/jupyter/omics_api_utils.py:4403: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /home/lmansu10/jupyter/omics_api_utils.py:4403: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
'for PCA has NaN'
'imputing using mean'
'correcting batch effect using eblm'
'Rscript Langfelder-eblm.R m2eblm_BZphenotypes m2eblm_BZphenotypes.eblm.data.csv m2eblm_BZphenotypes.eblm.annot.csv phenotype_dataset > /dev/null'
Loading required package: mgcv Loading required package: nlme This is mgcv 1.9-0. For overview type 'help("mgcv-package")'. Loading required package: genefilter Loading required package: BiocParallel Loading required package: dynamicTreeCut Loading required package: fastcluster Attaching package: ‘fastcluster’ The following object is masked from ‘package:stats’: hclust Attaching package: ‘WGCNA’ The following object is masked from ‘package:stats’: cor
'generated m2eblm_BZphenotypes.pca.png'
'generated m2eblm_BZphenotypes.pca.3d.png'
'generated m2eblm_BZphenotypes.pca.3d.html'
'generated m2eblm_BZphenotypes.be.pca.png'
'generated m2eblm_BZphenotypes.be.pca.3d.png'
'generated m2eblm_BZphenotypes.be.pca.3d.html'
original data
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'generated becorrected_m2eblm_BZphenotypes.tsv'
becorrected_m2eblm_BZphenotypes.tsv
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
/home/lmansu10/jupyter/omics_api_utils.py:4403: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /home/lmansu10/jupyter/omics_api_utils.py:4403: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Plot and compare the batch imputed, ComBat corrected and EmpericalBayesLM corrected data
if PLOT_PHEN and IMPUTE_M2:
phenminmaxbin=None
'''
df_orgcor=dfs.copy() #+ dfs_imputed_m2
label_orgcor=label_url.copy() # []
'''
label_orgcor=[]
df_orgcor=dfs_imputed_m2.copy()
for label in label_url:
label_orgcor.append('m2_' + label)
display(label_orgcor)
if True:
for batch in list(map_corrected_bybatch_rcb.keys()):
label_orgcor.append('m2rcb_' + batch)
df_orgcor.append(map_corrected_bybatch_rcb[batch])
label_orgcor.append('m2rcb_BZphenotypes')
df_orgcor.append(df_corrected_rcb)
if True:
for batch in list(map_corrected_bybatch_eblm.keys()):
label_orgcor.append('m2eblm_' + batch)
df_orgcor.append(map_corrected_bybatch_eblm[batch])
label_orgcor.append('m2eblm_BZphenotypes')
df_orgcor.append(df_corrected_eblm)
display(label_orgcor)
label2color['m2_Booth2020']='cyan'
label2color['m2_Zager2019']='magenta'
label2color['m2_BZphenotypes']='gray'
label2color['m2rcb_Booth2020']='darkblue'
label2color['m2rcb_Zager2019']='pink'
label2color['m2rcb_BZphenotypes']='brown'
label2color['m2eblm_Booth2020']='lightgreen'
label2color['m2eblm_Zager2019']='darkviolet'
label2color['m2eblm_BZphenotypes']='lightsalmon'
plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)
['m2_Booth2020', 'm2_Zager2019', 'm2_BZphenotypes']
['m2_Booth2020', 'm2_Zager2019', 'm2_BZphenotypes', 'm2rcb_Booth2020', 'm2rcb_Zager2019', 'm2rcb_BZphenotypes', 'm2eblm_Booth2020', 'm2eblm_Zager2019', 'm2eblm_BZphenotypes']
'file dont exists generated 9_-2897765732204754825_phenotypes_percent_of_dw.png'
'file dont exists generated 9_-2897765732204754825_phenotypes_percent_of_dw_density.png'
<Figure size 2000x2000 with 0 Axes>
WGCNA Analysis¶
This demo uses expression and phenotype data for Weighted Gene Co-expression Network Analysis (WGCNA), using the WGCNA R package (https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA). The output is the correlation of Gene Modules to Traits, were modules are a set of genes with similar expresson patterns. The top trait-module pairs are returned with the set number or p-value cutoff.
Requirements
- R
- R packages: WGCNA
Avalable options for expression
data = requests.get('https://icgrc.info/api/user/expression/dataset').json()
print('Expression datasets (expds) options')
#print(pd.DataFrame(data,columns=['expds','analysis_name']).to_string(index=False))
#display(pd.DataFrame(data,columns=['expds','analysis_name']))
display_table(data,columns=['expds','analysis_name'])
data = requests.get('https://icgrc.info/api/user/expression/transet').json()
print('Transcriptome set (transet) options')
display_table(data,columns=['transet'])
#import omics_api
#from omics_api import *
phends='Zager2019,Booth2020'
expds='21trichcs10'
transet='cs10'
#label_url=['21trichcs10_cs10_zagerbooth']
label_url=['21trichcs10_cs10_zagerbooth_be']
api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0']
#api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=all&expds=21trichpkclone109&phends=Zager2019%2CBooth2020&with_stdunits=0&tablelimit=100000&limit=100000']
#label_url=['21trichpk_zagerbooth']
#api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=all&expds=21trichcs10&phends=Zager2019&with_stdunits=0&tablelimit=100000&limit=100000']
#label_url=['21trichcs10_zager']
icnt=0
top_dfwgcna=[]
if WGCNA :
for iurl in api_url:
print(label_url[icnt])
df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
if False: #CHECK_BE
#display(c_converted_phenunits_values)
(df_corrected, map_corrected_bybatch)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='ppca',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
c_converted_phenunits_values=df_corrected
moduletrait=[] #['darkred,cannabidiol', 'royalblue,alpha-pinene', 'royalblue,Delta9_tetrahydrocannabinol']
topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=label_url[icnt],cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=True)
break
for itop in topmodules:
moduletrait.append(itop[0]+","+itop[1])
dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=label_url[icnt],moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna,annotate=True)
#do_wgcna_visualize(c_converted_phenunits_values,dflabel=label_url[icnt],recalc=True,moduletrait=moduletrait)
top_dfwgcna.append(dfwgcn)
#display(Image(filename=label_url[0] + '.ModuleTraitRelationship.png'))
#display(Image(filename=label_url[0] + '.ClusterDendogram.png'))
icnt+=1
eQTL analysis¶
This demo uses expression and variant data for eQTL analysis using Matrix eQTL R package (https://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/runit.html). The analysis can be between any SNP-gene pair, neighboring pairs (cis), or distant pairs (trans). The output are the top SNP-Gene pairs the set number or p-value cutoff.
Requirements
- R
- R packages: MatrixEQTL
Avalaible options for variants
data = requests.get('https://icgrc.info/api/user/variant/dataset').json()
print('Variant reference (ref) and dataset (snpds) options')
display_table(data,columns=['ref','snpds'])
Variant reference (ref) and dataset (snpds) options
ref | snpds |
---|---|
Loading... (need help?) |
Select options
ref="cs10"
snpds='21trichs'
expds='21trichcs10'
transet='cs10'
querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'
api_url = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
label_url=["eqtlb_terpcan_" + snpds + "_" + expds]
#label_url=["eqtl_terpcan_" + snpds + "_allsnps_" + expds]
icnt=0
top_dfeqtl=[]
if MATRIXEQT:
#if True:
for iurl in api_url:
print(label_url[icnt])
df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,label_url[icnt],annot='cs10', recodeplink=False,recalc=True,maxp=maxpeqtl,expgeneloc_file='data/cs10xm_exp.genesloc.tsv')
top_dfeqtl.append(dfeqtlall)
#display(Image(filename=label_url[0] + ".eqtlpvalue.png"))
#display(Image(filename=label_url[0] + ".eqtlqqplot.png"))
icnt+=1
Metabolite SNP Association (semi-mGWAS)¶
This demonstration uses traits/metabolite associaton with SNPs using plink --assoc feature. This generates the Manhattan Plot for each trait on the genomic location of genes that satify the functional annotation or accession constraints. The result is the list of top SNP-Trait pairs according to the set number or p-value cutoff.
Requirements
- R
- Python package: qqman
- plink
if GWAS: # snp, phen
snpds='21trichs'
#phends='Booth2020'
phends='Zager2019,Booth2020'
querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
geneslabel='cannterp'
ref='cs10'
label_url=['21trichs_cs10_zagerbooth_cannterp_covar']
#label_url=['21trich_allsnps_cs10_zagerbooth_cannterp']
api_url = ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]
dfs=[]
phenall=set()
icnt=0
if GWAS:
for iurl in api_url:
print(label_url[icnt])
df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
moduletrait=[] #['darkred,cannabidiol', 'royalblue,alpha-pinene', 'royalblue,Delta9_tetrahydrocannabinol']
dfs.append(c_converted_phenunits_values)
phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
if GWAS:
select_options(phenall, waitInput=False)
Select phenotypes to process
Set the of list numbers for phenotypes to perform mGWAS
#selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55]
selectionid=[3,4,6,7]
GWAS_FROMPLINK=False
#%autoreload 2
setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
top_dfgwas=[]
if GWAS:
selection=select_options(phenall, waitInput=False, selected=selectionid)
print('selections: ' + str(selection))
idf=0
df_gene=pd.read_csv('data/gene_cs10.tsv',sep="\t")
for df in dfs:
dfgwas=do_gwas(df, label_url[idf],myunit,phenotypes=selection,rerunplink=True,maxp=maxpgwas,recalc=True,annot='cs10') #,covar=['phenotype_dataset'],assocmethod='linear') #,df_genes=df_gene,annot='cs10')
top_dfgwas.append(dfgwas)
#printdisplay(MESSAGE_INFO,label_url[idf] + '.gwas.top.tsv') #Image(filename=label_url[idf] + '_gwasmp_' + myunit + '_' + label_url[idf] + '.qassoc.counts.png'))
idf+=1
"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=2\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"
Merge multi tools¶
Merge the result from the tools above to find multiple evedince of association. The steps include
- eQTL gives top snp-expression pairs, and snp gene-expressed gene pairs
- WGCNA gives top expressed gene - trait pairs
- mGWAS gives top snp-trait, and snp gene-trait pairs
The merging connects the expression genes common from the eQTL and WGCNA results, giving new snp gene-trait pairs
- The new snp gene-trait pairs are then intersected with the mGWAS result.
The intersecting pairs have two evidence paths of gene-gtrait association. The result below gives 144 gene-trait pairs, with 23 genes
#%autoreload 2
#import omics_api
#from omics_api import *
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
mergeoutfle='merged_21trichcs10_alltraits.tsv'
def printdflists(l):
for df in l:
print(df.columns.values.tolist())
#setVariables(loglevel=getSetting('SHOW_DEBUG'))
printdflists(top_dfeqtl)
printdflists(top_dfwgcna)
printdflists(top_dfgwas)
MERGE=True
if MERGE:
df_genes=pd.read_csv('data/gene_cs10.tsv',sep="\t")
merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=mergeoutfle,use_snpgene=True,df_genes=df_genes,recalc=False)
"set variables: \nkeep_unit=percent_of_dw\nr_path=Rscript\nplink_path=plink\nLOG_LEVEL=1\nunit_converter={('percent_of_dw', 'ug_per_gdw'): 10000.0, ('ug_per_gdw', 'percent_of_dw'): 0.0001}"
'File df_eqtl_wgcna_onexpgenes_merged_21trichcs10_alltraits.tsv dont exists'
'loaded df_eqtl_wgcna_onexpgenes_merged_21trichcs10_alltraits.tsv'
eqtl_wgcna_onexpgene_merged_21trichcs10_alltraits.tsv
snps | snpgenes | expgenes | gene | pvalue | trait | GS | p.GS |
---|---|---|---|---|---|---|---|
Loading... (need help?) |
'loaded merged_21trichcs10_alltraits.tsv'
df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait
trait | snpgenes | pvalue_eqtl | pvalue_eqtl.1 | GS_wgcna | pGS_wgcna | pGS_wgcna.1 | pvalue_gwas | pvalue_gwas.1 |
---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
triple_snpgenetrait_merged_21trichcs10_alltraits.tsv
trait | snpgenes | pvalue_eqtl_count | pvalue_eqtl_min | GS_wgcna_max | pGS_wgcna_count | pGS_wgcna_min | pvalue_gwas_count | pvalue_gwas_min | gene | contig | fmin | fmax | strand | locus | gb_keyword | go_term | interpro_value |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
tgzdownloads('alltgz1', txt=True, img=True)
'tar -cvzf alltgz1.tgz -T dwlist.txt > /dev/null'
'generated alltgz1.tgz'
Running semi-offline using large dataset¶
Performing genome-wide or transcriptome-wide analyses requires large datasets. It is not practical to use retail web-service API query but the datasets may be downloaded in bulk, stored, and accessed locally. The API has feature to use local file instead of constantly querying the web-server. Most of these files are still downloadable using the web-service but requested in bulk by datasets. These datasets include:
- gene files by reference and annotation from api/user/gene/list
- expression matrix by dataset from api/user/expression/list
- variant SNPs data in binary Plink format by dataset avaialable at
This section demonstrates bulk downloading the necessary datasets, and repeating the analyses described above using whole-genome SNPs and whole-transcriptome expression data. Since these runs take time, run this as a python script from the command line outside of jupyter.
#download gene annotation and save to data directory
datadir='data1'
GENEFILE=datadir+'/gene_cs10.tsv'
#pd.read_csv('https://icgrc.info/api/user/gene/list?annot=cs10',sep="\t").to_csv(GENEFILE,sep="\t",index=False)
#download expression matrix save to data directory
EXPFILE=datadir + '/21trichcs10_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?expds=21trichcs10',sep="\t").to_csv(EXPFILE,sep="\t",index=False)
#EXPFILE=datadir + '/GaoGuerrieroConneely_allexp.tsv'
#EXPFILE=datadir + '/Gao_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?transet=all&expds=Massimino2017&tablelimit=100000&limit=1000',sep="\t").to_csv(EXPFILE,sep="\t",index=False)
#download variant file in binary plink format at
PLINKFILE='data/cs10-21trichomes-all-gatkgenotype-allsnps-samn-plink-nofam'
#PLINKFILE=None
PHENFILE=datadir+'/phen_all.tsv'
#pd.read_csv('https://icgrc.info/api/user/phenotype/list',sep="\t").to_csv(PHENFILE,sep="\t",index=False)
if False:
import datetime
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
# hold top results
top_dfwgcna=[]
top_dfeqtl=[]
top_dfgwas=[]
# enable module
WGCNA=True
EQTL=True
GWAS=True
MERGE=True
REQUERY_URL=False
phends='Zager2019,Booth2020'
expds='21trichcs10'
transet='cs10'
label_url=None
label_wgcna=['wgcna_21trichcs10allexp_cs10_zagerbooth_b']
url_wgcna = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0&tablelimit=100000&limit=100000']
icnt=0
if WGCNA :
print('STARTED WGCNA ' + str(datetime.datetime.now()))
for iurl in url_wgcna:
ilabel=label_wgcna[icnt]
df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
moduletrait=[]
topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=ilabel,cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=False,exp_file=EXPFILE)
for itop in topmodules:
moduletrait.append(itop[0]+","+itop[1])
dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=ilabel,moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna)
top_dfwgcna.append(dfwgcn)
icnt+=1
snpds='21trichs'
phends='Zager2019,Booth2020'
querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
geneslabel='cannterp'
ref='cs10'
label_gwas=['gwas_21trich_allsnps_cs10_zagerbooth_cannterp_b']
url_gwas= ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]
phenall=set()
icnt=0
df_gwas=[]
# collect available phenotypes first to make selection
if GWAS:
print('STARTED GWAS ' + str(datetime.datetime.now()))
for iurl in url_gwas:
ilabel=label_gwas[icnt]
df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
df_gwas.append(c_converted_phenunits_values)
phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
# show selection options, set to waitInput=True to get interactive response. For now selection selectionid is preselected
select_options(phenall, waitInput=False)
selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55] # most common cannabinoids/terpenes
#selectionid=[3,4,6,7], CBDA/THCA
if GWAS:
selection=select_options(phenall, waitInput=False, selected=selectionid)
# verify selections as as intended
print('selections: ' + str(selection))
idf=0
for df in df_gwas:
dfgwas=do_gwas(df, label_gwas[idf],myunit,phenotypes=selection,rerunplink=False,maxp=maxpgwas,plink_file=PLINKFILE) #,assocmethod='linear')
top_dfgwas.append(dfgwas)
idf+=1
ref="cs10"
snpds='21trichs'
expds='21trichcs10'
transet='cs10'
querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'
url_eqtl = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
label_eqtl=["eqtl_terpcan_" + snpds + "_allsnpsexps_" + expds+'_relax_b']
# GENELOCFILE is derived from GENEFILE using only gene_id\tcontig\tfmin\tfmax
GENELOCFILE=datadir+'/cs10xm_exp.genesloc.tsv'
pd.read_csv(GENEFILE,sep="\t")[['name','contig','fmin','fmax']].rename(columns={'name':'geneid','contig':'chr','fmin':'s1','fmax':'s2'}).to_csv(GENELOCFILE,sep="\t",index=False)
icnt=0
if MATRIXEQT:
print('STARTED EQTL ' + str(datetime.datetime.now()))
for iurl in url_eqtl:
ilabel=label_eqtl[icnt]
df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,ilabel,annot=None, recodeplink=False,maxp=maxpeqtl,plink_file=PLINKFILE,expgeneloc_file=GENELOCFILE,exp_file=EXPFILE)
top_dfeqtl.append(dfeqtlall)
icnt+=1
'''
top_dfeqtl=[]
top_dfgwas=[]
top_dfwgna=[]
top_dfeqtl.append('eqtl_terpcan_21trichs_allsnpsexps_21trichcs10_relax.eqtl.all.top.tsv')
top_dfwgcna.append('wgcna_21trichcs10allexp_cs10_zagerbooth_relax.wgcna.top.tsv')
top_dfgwas.append('gwas_21trich_allsnps_cs10_zagerbooth_cannterp_relax.gwas.top.tsv')
'''
merge_output='merged_21trichcs10allsnpsallexps_trait_relax_b.tsv'
if MERGE:
print('STARTED MERGE ' + str(datetime.datetime.now()))
df_gene=pd.read_csv(GENEFILE,sep="\t") #name contig fmin fmax strand gb_keyword go_term interpro_value)
merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=merge_output,use_snpgene=True,df_genes=df_gene)