ICGRC Omics API Demo¶
This Jupyter notebook demonstrates the utility of the ICGRC Omics API (https://icgrc.info/api_doc) to serve multi-omics datasets from a Tripal database. The API is designed for merging multi-omics datasets, and from multiple data sources. The use-cases here include:
- Merging phenotype data from multiple sources, transforming them into compatiple units, and plotting their distribution
- Batch effect correction and imputation
- Weighted Gene Co-expression Network Analysis (WGCNA) to list top Trait-Gene pairs using Phenotype and Expression data
- Matrix eQTL to list top SNP-Gene pairs using Variant and Expression data
- mGWAS to list top SNP-Metabolite pairs using Variant and Phenotype data
- Combine results of eQTL, WGCNA and mGWAS to find shared genes and traits to trace multiple evidence paths.
- Semi-offline mode using pre-downloaded large datasets like whole-genome SNPs and gene annotatons, and whole-transcriptome expression data
This demo uses the omics_api library designed to interface with the ICGRC omics API web service. These python libraries are use in all applications: matplotlib,numpy, wand. Additional software need to be installed by the client like plink, R and and python packages used by specific use-cases described in the sections below.
Goal oriented notebooks
This notebook is a short demo more focused on the API features, the results presented may not be statistically robust. The following notebooks are goal-oriented examples with actual/reliable results.
%matplotlib inline
%load_ext autoreload
import os
import requests
from IPython.display import display, Image, FileLink
#from jupyter_datatables import init_datatables_mode
#from ipydatagrid import DataGrid
from itables import init_notebook_mode
#init_notebook_mode(all_interactive=True)
init_notebook_mode(connected=True)
import itables.options as itablesopt
# display table options
#itablesopt.lengthMenu = [2, 5, 10, 20, 50, 100, 200, 500]
#itablesopt.maxBytes = 10000 , 0 to disable
#itablesopt.maxRows
#itablesopt.maxColumns
#%load_ext jupyter_require
#%requirejs d3 https://d3js.org/d3.v5.min
#init_datatables_mode()
%autoreload 2
import omics_api
from omics_api import *
# Flags to emable example
others=True
PLOT_PHEN=others
WGCNA=others
GWAS=others
MATRIXEQT=others
VERBOSE=True
MERGE=others
REQUERY_URL=False
CHECK_BE=True # check and correct batch effect
# imputation strategy from https://doi.org/10.1038/s41598-023-30084-2
IMPUTE_M1=True # global mean
IMPUTE_M2=True # batch mean
DEBUGGNG=False
# strict setting
#maxp=1e-5
# threshold settings
maxpgwas=1e-20
maxpeqtl=1e-5
maxpwgcna=1e-5
mincorwgcna=0.7
cortopnwgcna=None
gstopnwgcna=None
top_dfgwas=[]
top_dfwgcna=[]
top_dfeqtl=[]
myunit='percent_of_dw'
#SHOW_ALL
#SHOW_DEBUG
#SHOW_ERRORONLY
if DEBUGGNG:
setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
else:
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
#setVariables(loglevel=getSetting('SHOW_ERRORONLY'))
Multiple data sources¶
This demo plots dstribution of datasets from multiple sources
Avalable options for phenotypes
data = requests.get('https://icgrc.info/api/user/phenotype/dataset').json()
print('Phenotype datasets(phends) options')
display_table(data,columns=['phen_dataset','unit_name','samples_count','trait_count'])
Phenotype datasets(phends) options
phen_dataset | unit_name | samples_count | trait_count |
---|---|---|---|
Loading... (need help?) |
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&private=1&byacc=1&hassnp=1&with_stdunits=1"]
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=GloerfeltTarp2023&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
#label_url=['Booth2020','Zager2019','GloerfeltTarp2023','BGZphenotypes']
api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Zager2019&private=1&with_stdunits=1","https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
label_url=['Booth2020','Zager2019','BZphenotypes']
label2color=dict()
label2color['Azman2023']='red'
label2color['Booth2020']='blue'
label2color['GloerfeltTarp2023']='green'
label2color['Zager2019']='orange'
label2color['BZphenotypes']='yellow'
keep_unit='percent_of_dw'
#keep_unit='ug_per_gdw'
n_bins=10
icnt=0
dfs=[]
dfs_imputed_m2=[]
phenall=set()
keep_samples=set()
keep_phenotypes=set()
if PLOT_PHEN:
for iurl in api_url:
print(label_url[icnt])
df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
if VERBOSE and icnt==0:
display('in original units')
display(drop_allnazero_rowcol(df_raw.loc[df_raw['datatype'].str.startswith('3 PHEN')]))
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit='percent_of_dw')
phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
plot_histograms(c_converted_phenunits_values, 'changed_to_' + keep_unit, label_url[icnt]) #,label2color=label2color)
if VERBOSE and icnt==0:
display('coverted to ' + keep_unit)
display(drop_allnazero_rowcol(c_converted_phenunits.loc[c_converted_phenunits['datatype'].str.startswith('3 PHEN')]))
display('coverted to (values only) ' + keep_unit)
display(drop_allnazero_rowcol(c_converted_phenunits_values.loc[c_converted_phenunits_values['datatype'].str.startswith('3 PHEN')]))
dfs.append(c_converted_phenunits_values)
if IMPUTE_M2:
(df_corrected,dummy)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='mean',correct_BE=False) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
dfs_imputed_m2.append(df_corrected)
icnt+=1
Booth2020
'in original units'
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'coverted to percent_of_dw'
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
'coverted to (values only) percent_of_dw'
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
original data
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
becorrected_Booth2020.tsv
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
Zager2019 original data
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
becorrected_Zager2019.tsv
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
BZphenotypes original data
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
becorrected_BZphenotypes.tsv
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
/home/lmansu10/jupyter/omics_api_utils.py:4254: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /home/lmansu10/jupyter/omics_api_utils.py:4254: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
if PLOT_PHEN:
phenminmaxbin=None
plot_histograms_multi(dfs, keep_unit,label_url,phenminmaxbin,label2color=label2color,nbins=n_bins)
<Figure size 2000x2000 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
if PLOT_PHEN and IMPUTE_M2:
phenminmaxbin=None
df_orgcor=dfs + dfs_imputed_m2
label_orgcor=[]
for label in label_url:
label_orgcor.append(label)
for label in label_url:
label_orgcor.append('m2_' + label)
label2color['m2_Booth2020']='cyan'
label2color['m2_Zager2019']='magenta'
label2color['m2_BZphenotypes']='gray'
plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)
<Figure size 2000x2000 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
Check batch effect, imputation and batch effect correction¶
Check for batch effects by Principal Component Analysis, and correction by pyComBat. PCA and ComBat require imputation accomplished by scikit-learn SimpleImputer (most_frequent, mean, median) and KNNImputer. Another option is Probabilistic PCA, PPCA, can compute PCA with missing values, and returns imputed data at the same time
Required python modules
- scikit-learn
- pyppca
- plotly
- pyComBat
if CHECK_BE and IMPUTE_M1:
#api_url = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019,GloerfeltTarp2023&with_stdunits=1"]
#label_url=['BGZphenotypes']
api_url1 = ["https://icgrc.info/api/user/phenotype/all?phends=Booth2020,Zager2019&with_stdunits=1"]
label_url1=['BZphenotypes']
icnt=0
df_raw= read_url(api_url1[0], label_url1[0],requery=REQUERY_URL)
(c_converted_phenunits, dfphen)=convert_units_to(df_raw,to_unit='percent_of_dw')
(df_corrected, map_corrected_bybatch)=check_batcheffects(dfphen, label_url1[0], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='ppca',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
#print(list(map_corrected_bybatch.keys()))
#plot_histograms_multi(list(map_corrected_bybatch.keys()), keep_unit,list(map_corrected_bybatch.values()),phenminmaxbin,label2color=label2color,nbins=n_bins)
Found 2 batches. Adjusting for 0 covariate(s) or covariate level(s). Standardizing Data across genes. Fitting L/S model and finding priors. Finding parametric adjustments. Adjusting the Data
original data
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
becorrected_BZphenotypes.tsv
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
/home/lmansu10/jupyter/omics_api_utils.py:4250: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /home/lmansu10/jupyter/omics_api_utils.py:4250: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
#dfs_imputed_m2_0=dfs_imputed_m2[0]
#dfs_imputed_m2_1=dfs_imputed_m2[1]
#dfs_imputed_m2_0=dfs_imputed_m2_0.index=dfs_imputed_m2_0['property']
#dfs_imputed_m2_1=dfs_imputed_m2_1.index=dfs_imputed_m2_1['property']
#dfs_imputed_m2_bz=pd.merge(dfs_imputed_m2_0,dfs_imputed_m2_1, left_index=True, right_index=True, how = 'outer')
dfs_imputed_m2_bz=pd.merge(dfs_imputed_m2[0],dfs_imputed_m2[1], left_on=['datatype','property'], right_on=['datatype','property'], how = 'outer')
#init_notebook_mode(connected=True)
init_notebook_mode(all_interactive=True)
display(dfs_imputed_m2_bz)
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
#init_notebook_mode(connected=True)
init_notebook_mode(all_interactive=True)
if CHECK_BE and IMPUTE_M2:
(df_corrected, map_corrected_bybatch)=check_batcheffects(dfs_imputed_m2_bz, 'm2_BZphenotypes', TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='ppca',correct_BE=True) #properties=['phenotype_dataset'])
Found 2 batches. Adjusting for 0 covariate(s) or covariate level(s). Standardizing Data across genes. Fitting L/S model and finding priors. Finding parametric adjustments. Adjusting the Data
/home/lmansu10/miniconda3/envs/R4/lib/python3.10/site-packages/combat/pycombat.py:159: RuntimeWarning: divide by zero encountered in divide
original data
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
becorrected_m2_BZphenotypes.tsv
datatype | property | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
/home/lmansu10/jupyter/omics_api_utils.py:4254: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /home/lmansu10/jupyter/omics_api_utils.py:4254: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
if PLOT_PHEN and IMPUTE_M2:
phenminmaxbin=None
df_orgcor=dfs + dfs_imputed_m2
label_orgcor=[]
for label in label_url:
label_orgcor.append(label)
for label in label_url:
label_orgcor.append('m2_' + label)
for batch in list(map_corrected_bybatch.keys()):
label_orgcor.append('m2cb_' + batch)
df_orgcor.append(map_corrected_bybatch[batch])
label_orgcor.append('m2cb_BZphenotypes')
df_orgcor.append(df_corrected)
label2color['m2_Booth2020']='cyan'
label2color['m2_Zager2019']='magenta'
label2color['m2_BZphenotypes']='gray'
label2color['m2cb_Booth2020']='darkblue'
label2color['m2cb_Zager2019']='pink'
label2color['m2cb_BZphenotypes']='brown'
plot_histograms_multi(df_orgcor, keep_unit,label_orgcor,phenminmaxbin,label2color=label2color,nbins=n_bins)
<Figure size 2000x2000 with 0 Axes>
<Figure size 640x480 with 0 Axes>
<Figure size 640x480 with 0 Axes>
map_corrected_bybatch['BZphenotypes']=df_corrected
dfs_m1=dfs
dfs_m2_cb=dfs
for cor in list(map_corrected_bybatch.keys()):
label_url.append('m2cb_' + cor)
dfs_m2_cb.append(map_corrected_bybatch[cor])
label2color['m2cb_Booth2020']='cyan'
label2color['m2cb_GloerfeltTarp2023']='pink'
label2color['m2cb_Zager2019']='magenta'
label2color['m2cb_BZphenotypes']='gray'
setVariables(imgsize_xlarge_in=(15,30))
plot_histograms_multi(dfs, keep_unit,label_url,phenminmaxbin,label2color=label2color,nbins=n_bins,heatmap=False)
<Figure size 1500x3000 with 0 Axes>
WGCNA Analysis¶
This demo uses expression and phenotype data for Weighted Gene Co-expression Network Analysis (WGCNA), using the WGCNA R package (https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA). The output is the correlation of Gene Modules to Traits, were modules are a set of genes with similar expresson patterns. The top trait-module pairs are returned with the set number or p-value cutoff.
Requirements
- R
- R packages: WGCNA
Avalable options for expression
data = requests.get('https://icgrc.info/api/user/expression/dataset').json()
print('Expression datasets (expds) options')
#print(pd.DataFrame(data,columns=['expds','analysis_name']).to_string(index=False))
#display(pd.DataFrame(data,columns=['expds','analysis_name']))
display_table(data,columns=['expds','analysis_name'])
data = requests.get('https://icgrc.info/api/user/expression/transet').json()
print('Transcriptome set (transet) options')
display_table(data,columns=['transet'])
#import omics_api
#from omics_api import *
phends='Zager2019,Booth2020'
expds='21trichcs10'
transet='cs10'
#label_url=['21trichcs10_cs10_zagerbooth']
label_url=['21trichcs10_cs10_zagerbooth_be']
api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0']
#api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=all&expds=21trichpkclone109&phends=Zager2019%2CBooth2020&with_stdunits=0&tablelimit=100000&limit=100000']
#label_url=['21trichpk_zagerbooth']
#api_url = ['https://icgrc.info/api/user/expression,phenotype/list?transet=all&expds=21trichcs10&phends=Zager2019&with_stdunits=0&tablelimit=100000&limit=100000']
#label_url=['21trichcs10_zager']
icnt=0
top_dfwgcna=[]
if WGCNA :
for iurl in api_url:
print(label_url[icnt])
df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
if False: #CHECK_BE
#display(c_converted_phenunits_values)
(df_corrected, map_corrected_bybatch)=check_batcheffects(c_converted_phenunits_values, label_url[icnt], TYPE_PHEN, whiten=True, batch_id='phenotype_dataset',on_missing='ppca',correct_BE=True) #properties=['phenotype_dataset']) #properties=['NCBI BioProject'])
c_converted_phenunits_values=df_corrected
moduletrait=[] #['darkred,cannabidiol', 'royalblue,alpha-pinene', 'royalblue,Delta9_tetrahydrocannabinol']
topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=label_url[icnt],cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=True)
break
for itop in topmodules:
moduletrait.append(itop[0]+","+itop[1])
dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=label_url[icnt],moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna,annotate=True)
#do_wgcna_visualize(c_converted_phenunits_values,dflabel=label_url[icnt],recalc=True,moduletrait=moduletrait)
top_dfwgcna.append(dfwgcn)
#display(Image(filename=label_url[0] + '.ModuleTraitRelationship.png'))
#display(Image(filename=label_url[0] + '.ClusterDendogram.png'))
icnt+=1
21trichcs10_cs10_zagerbooth_be
rm: cannot remove ‘21trichcs10_cs10_zagerbooth_be*’: No such file or directory
mydf
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
df_idprop
datatype | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
property | |||||||||||||||||||||||||||||||||||||||||||
Loading... (need help?) |
df_annot
property | phenotype_dataset | NCBI BioProject |
---|---|---|
Loading... (need help?) |
mydf
datatype | property | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
df_idprop
datatype | SAMN10330896 | SAMN10330897 | SAMN10330898 | SAMN10330899 | SAMN10330900 | SAMN10330901 | SAMN10330902 | SAMN10330903 | SAMN10330904 | SAMN10330905 | SAMN10330906 | SAMN10330907 | SAMN10330908 | SAMN10330909 | SAMN10330910 | SAMN10330911 | SAMN10330912 | SAMN10330913 | SAMN10330914 | SAMN10330915 | SAMN10330916 | SAMN10330917 | SAMN10330918 | SAMN10330919 | SAMN10330920 | SAMN10330921 | SAMN10330922 | SAMN13750438 | SAMN13750439 | SAMN13750440 | SAMN13750441 | SAMN13750442 | SAMN13750443 | SAMN13750444 | SAMN13750445 | SAMN13750446 | SAMN13750447 | SAMN13750448 | SAMN13750449 | SAMN13750450 | SAMN13750451 | SAMN13750452 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
property | |||||||||||||||||||||||||||||||||||||||||||
Loading... (need help?) |
df_annot
property | NCBI BioProject | expression_dataset |
---|---|---|
Loading... (need help?) |
'Rscript Langfelder-01-dataInput.R 21trichcs10_cs10_zagerbooth_be 21trichcs10_cs10_zagerbooth_be.genes.csv 21trichcs10_cs10_zagerbooth_be 21trichcs10_cs10_zagerbooth_be.trait.csv > /dev/null'
Loading required package: dynamicTreeCut Loading required package: fastcluster Attaching package: ‘fastcluster’ The following object is masked from ‘package:stats’: hclust Attaching package: ‘WGCNA’ The following object is masked from ‘package:stats’: cor dev.new(): using pdf(file="Rplots1.pdf") Warning message: In numbers2colors(datTraits, signed = FALSE) : (some columns in) 'x' are constant. Their color will be the color of NA.
'Rscript Langfelder-02-networkConstr-auto.R 21trichcs10_cs10_zagerbooth_be > /dev/null'
Loading required package: dynamicTreeCut Loading required package: fastcluster Attaching package: ‘fastcluster’ The following object is masked from ‘package:stats’: hclust Attaching package: ‘WGCNA’ The following object is masked from ‘package:stats’: cor dev.new(): using pdf(file="Rplots1.pdf") dev.new(): using pdf(file="Rplots2.pdf")
'Rscript Langfelder-03a-relateModsToExt.R 21trichcs10_cs10_zagerbooth_be > /dev/null'
Loading required package: dynamicTreeCut Loading required package: fastcluster Attaching package: ‘fastcluster’ The following object is masked from ‘package:stats’: hclust Attaching package: ‘WGCNA’ The following object is masked from ‘package:stats’: cor dev.new(): using pdf(file="Rplots1.pdf") Warning message: In greenWhiteRed(50) : WGCNA::greenWhiteRed: this palette is not suitable for people with green-red color blindness (the most common kind of color blindness). Consider using the function blueWhiteRed instead.
eQTL analysis¶
This demo uses expression and variant data for eQTL analysis using Matrix eQTL R package (https://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/runit.html). The analysis can be between any SNP-gene pair, neighboring pairs (cis), or distant pairs (trans). The output are the top SNP-Gene pairs the set number or p-value cutoff.
Requirements
- R
- R packages: MatrixEQTL
Avalaible options for variants
data = requests.get('https://icgrc.info/api/user/variant/dataset').json()
print('Variant reference (ref) and dataset (snpds) options')
display_table(data,columns=['ref','snpds'])
Variant reference (ref) and dataset (snpds) options
ref | snpds |
---|---|
Loading... (need help?) |
Select options
ref="cs10"
snpds='21trichs'
expds='21trichcs10'
transet='cs10'
querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'
api_url = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
label_url=["eqtlb_terpcan_" + snpds + "_" + expds]
#label_url=["eqtl_terpcan_" + snpds + "_allsnps_" + expds]
icnt=0
top_dfeqtl=[]
if MATRIXEQT:
#if True:
for iurl in api_url:
print(label_url[icnt])
df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,label_url[icnt],annot='cs10', recodeplink=False,recalc=True,maxp=maxpeqtl,expgeneloc_file='data/cs10xm_exp.genesloc.tsv')
top_dfeqtl.append(dfeqtlall)
#display(Image(filename=label_url[0] + ".eqtlpvalue.png"))
#display(Image(filename=label_url[0] + ".eqtlqqplot.png"))
icnt+=1
eqtlb_terpcan_21trichs_21trichcs10
'plink --bfile snp_eqtlb_terpcan_21trichs_21trichcs10 --recode A-transpose --out snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose --allow-extra-chr --noweb > /dev/null'
'cut -f1,4 snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw > snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.postmp'
'head -n1 snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw | cut -f1,2,3,4,5,6 --complement > /dev/null'
plink samples with exp data 63/63
"cut -f1,2,3,4,5,6 --complement snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw | tail -n +2 | sed 's/NA//g' > snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.allele"
'echo SAMN09747683\tSAMN09747684\tSAMN09747685\tSAMN09747686\tSAMN09747687\tSAMN09747688\tSAMN09747689\tSAMN09747690\tSAMN09747691\tSAMN10330896\tSAMN10330897\tSAMN10330898\tSAMN10330899\tSAMN10330900\tSAMN10330901\tSAMN10330902\tSAMN10330903\tSAMN10330904\tSAMN10330905\tSAMN10330906\tSAMN10330907\tSAMN10330908\tSAMN10330909\tSAMN10330910\tSAMN10330911\tSAMN10330912\tSAMN10330913\tSAMN10330914\tSAMN10330915\tSAMN10330916\tSAMN10330917\tSAMN10330918\tSAMN10330919\tSAMN10330920\tSAMN10330921\tSAMN10330922\tSAMN13503266\tSAMN13503268\tSAMN13503269\tSAMN13503270\tSAMN13503272\tSAMN13503274\tSAMN13503277\tSAMN13503278\tSAMN13503281\tSAMN13503282\tSAMN13503283\tSAMN13503285\tSAMN13750438\tSAMN13750439\tSAMN13750440\tSAMN13750441\tSAMN13750442\tSAMN13750443\tSAMN13750444\tSAMN13750445\tSAMN13750446\tSAMN13750447\tSAMN13750448\tSAMN13750449\tSAMN13750450\tSAMN13750451\tSAMN13750452 > snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.newheader'
'cat snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.newheader snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.allele > eqtlb_terpcan_21trichs_21trichcs10_snp.tsvtmp'
'paste snp_eqtlb_terpcan_21trichs_21trichcs10_recodeAtranspose.traw.pos eqtlb_terpcan_21trichs_21trichcs10_snp.tsvtmp > eqtlb_terpcan_21trichs_21trichcs10_snp.tsv'
'ln -s data/cs10xm_exp.genesloc.tsv eqtlb_terpcan_21trichs_21trichcs10_exp.genesloc.tsv > /dev/null'
'Rscript matrix_eqtl.R eqtlb_terpcan_21trichs_21trichcs10 1e-05 > /dev/null'
rm: cannot remove ‘Rplots*.pdf’: No such file or directory Rows read: 1356 done. Rows read: 17 done. Processing covariates Task finished in 0.001 seconds Processing gene expression data (imputation, residualization) Task finished in 0.002 seconds Creating output file(s) Task finished in 0.015 seconds Performing eQTL analysis 100.00% done, 759 eQTLs Task finished in 0.042 seconds
rm: cannot remove ‘Rplots*.pdf’: No such file or directory
'Rscript matrix_eqtl_cistrans.R eqtlb_terpcan_21trichs_21trichcs10 1e-05 > /dev/null'
rm: cannot remove ‘Rplots*.pdf’: No such file or directory Rows read: 1356 done. Rows read: 17 done. Matching data files and location files 17 of 17 genes matched 1356 of 1356 SNPs matched Task finished in 0.004 seconds Reordering genes Task finished in 0.069 seconds Processing covariates Task finished in 0.001 seconds Processing gene expression data (imputation, residualization) Task finished in 0.001 seconds Creating output file(s) Task finished in 0.025 seconds Performing eQTL analysis 100.00% done, 69 cis-eQTLs, 690 trans-eQTLs Task finished in 0.046 seconds
top snp-expressed cis gene annotated
pvalue | snps | snpgenes | contig | start | end | note | goterm | iprterm | expgenes | note.1 | goterm.1 | iprterm.1 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
top snp-expressed trans gene
snps | gene | statistic | pvalue | FDR | beta | pvalue_y | snpgenes | contig | start | end | note | goterm | iprterm | expgenes | note.1 | goterm.1 | iprterm.1 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
top snp-expressed all gene annotated
snps | gene | statistic | pvalue | FDR | beta | pvalue_y | snpgenes | contig | start | end | note | goterm | iprterm | expgenes | note.1 | goterm.1 | iprterm.1 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
Metabolite SNP Association (semi-mGWAS)¶
This demonstration uses traits/metabolite associaton with SNPs using plink --assoc feature. This generates the Manhattan Plot for each trait on the genomic location of genes that satify the functional annotation or accession constraints. The result is the list of top SNP-Trait pairs according to the set number or p-value cutoff.
Requirements
- R
- Python package: qqman
- plink
if GWAS: # snp, phen
snpds='21trichs'
#phends='Booth2020'
phends='Zager2019,Booth2020'
querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
geneslabel='cannterp'
ref='cs10'
label_url=['21trichs_cs10_zagerbooth_cannterp_covar']
#label_url=['21trich_allsnps_cs10_zagerbooth_cannterp']
api_url = ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]
dfs=[]
phenall=set()
icnt=0
if GWAS:
for iurl in api_url:
print(label_url[icnt])
df_raw= read_url(iurl, label_url[icnt],requery=REQUERY_URL)
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
moduletrait=[] #['darkred,cannabidiol', 'royalblue,alpha-pinene', 'royalblue,Delta9_tetrahydrocannabinol']
dfs.append(c_converted_phenunits_values)
phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
21trichs_cs10_zagerbooth_cannterp_covar
if GWAS:
select_options(phenall, waitInput=False)
Select phenotypes to process
Set the of list numbers for phenotypes to perform mGWAS
#selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55]
selectionid=[3,4,6,7]
GWAS_FROMPLINK=False
#%autoreload 2
setVariables(loglevel=getSetting('SHOW_DEBUG'),keep_unit=myunit)
top_dfgwas=[]
if GWAS:
selection=select_options(phenall, waitInput=False, selected=selectionid)
print('selections: ' + str(selection))
idf=0
df_gene=pd.read_csv('data/gene_cs10.tsv',sep="\t")
for df in dfs:
dfgwas=do_gwas(df, label_url[idf],myunit,phenotypes=selection,rerunplink=True,maxp=maxpgwas,recalc=True,annot='cs10') #,covar=['phenotype_dataset'],assocmethod='linear') #,df_genes=df_gene,annot='cs10')
top_dfgwas.append(dfgwas)
#printdisplay(MESSAGE_INFO,label_url[idf] + '.gwas.top.tsv') #Image(filename=label_url[idf] + '_gwasmp_' + myunit + '_' + label_url[idf] + '.qassoc.counts.png'))
idf+=1
selections: ['Delta9-tetrahydrocannabinol', 'Delta9-tetrahydrocannabinolic acid', 'Percent_Total_Cannabinoids_per_dry_weight', 'THCA_to_CBDA_ratio']
rm: cannot remove ‘21trichs_cs10_zagerbooth_cannterp_covar*’: No such file or directory
(1684, 44)
['datatype', 'property', 'SAMN10330896', 'SAMN10330897', 'SAMN10330898', 'SAMN10330899', 'SAMN10330900', 'SAMN10330901', 'SAMN10330902', 'SAMN10330903', 'SAMN10330904', 'SAMN10330905', 'SAMN10330906', 'SAMN10330907', 'SAMN10330908', 'SAMN10330909', 'SAMN10330910', 'SAMN10330911', 'SAMN10330912', 'SAMN10330913', 'SAMN10330914', 'SAMN10330915', 'SAMN10330916', 'SAMN10330917', 'SAMN10330918', 'SAMN10330919', 'SAMN10330920', 'SAMN10330921', 'SAMN10330922', 'SAMN13750438', 'SAMN13750439', 'SAMN13750440', 'SAMN13750441', 'SAMN13750442', 'SAMN13750443', 'SAMN13750444', 'SAMN13750445', 'SAMN13750446', 'SAMN13750447', 'SAMN13750448', 'SAMN13750449', 'SAMN13750450', 'SAMN13750451', 'SAMN13750452']
'python HapMap_to_PLINK.py snp_21trichs_cs10_zagerbooth_cannterp_covar.hmp snp_21trichs_cs10_zagerbooth_cannterp_covar.map > snp_21trichs_cs10_zagerbooth_cannterp_covar.ped'
'plink --file snp_21trichs_cs10_zagerbooth_cannterp_covar --allow-extra-chr --make-bed --out snp_21trichs_cs10_zagerbooth_cannterp_covar'
PLINK v1.90b6.12 64-bit (28 Oct 2019) www.cog-genomics.org/plink/1.9/ (C) 2005-2019 Shaun Purcell, Christopher Chang GNU General Public License v3 Logging to snp_21trichs_cs10_zagerbooth_cannterp_covar.log. Options in effect: --allow-extra-chr --file snp_21trichs_cs10_zagerbooth_cannterp_covar --make-bed --out snp_21trichs_cs10_zagerbooth_cannterp_covar 332375 MB RAM detected; reserving 166187 MB for main workspace. .ped scan complete (for binary autoconversion). Performing single-pass .bed write (1588 variants, 42 people). --file: snp_21trichs_cs10_zagerbooth_cannterp_covar-temporary.bed + snp_21trichs_cs10_zagerbooth_cannterp_covar-temporary.bim + snp_21trichs_cs10_zagerbooth_cannterp_covar-temporary.fam written. 1588 variants loaded from .bim file. 42 people (0 males, 0 females, 42 ambiguous) loaded from .fam. Ambiguous sex IDs written to snp_21trichs_cs10_zagerbooth_cannterp_covar.nosex . Using 1 thread (no multithreaded calculations invoked). Before main variant filters, 42 founders and 0 nonfounders present. Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99% done. Total genotyping rate is 0.692515. 1588 variants and 42 people pass filters and QC. Note: No phenotypes present. --make-bed to snp_21trichs_cs10_zagerbooth_cannterp_covar.bed + snp_21trichs_cs10_zagerbooth_cannterp_covar.bim + snp_21trichs_cs10_zagerbooth_cannterp_covar.fam ... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99%done.
Warning: Variant 6 triallelic; setting rarest alleles missing. Warning: Variant 33 triallelic; setting rarest alleles missing. Warning: Variant 134 triallelic; setting rarest alleles missing. Warning: Variant 149 triallelic; setting rarest alleles missing. Warning: Variant 206 triallelic; setting rarest alleles missing. Warning: Variant 229 triallelic; setting rarest alleles missing. Warning: Variant 281 triallelic; setting rarest alleles missing. Warning: Variant 285 triallelic; setting rarest alleles missing. Warning: Variant 310 triallelic; setting rarest alleles missing. Warning: Variant 424 triallelic; setting rarest alleles missing. Warning: Variant 450 triallelic; setting rarest alleles missing. Warning: Variant 480 triallelic; setting rarest alleles missing. Warning: Variant 481 triallelic; setting rarest alleles missing. Warning: Variant 546 triallelic; setting rarest alleles missing. Warning: Variant 626 triallelic; setting rarest alleles missing. Warning: Variant 636 triallelic; setting rarest alleles missing. Warning: Variant 664 triallelic; setting rarest alleles missing. Warning: Variant 681 triallelic; setting rarest alleles missing. Warning: Variant 1005 triallelic; setting rarest alleles missing. Warning: Variant 1028 triallelic; setting rarest alleles missing. Warning: Variant 1067 triallelic; setting rarest alleles missing. Warning: Variant 1081 triallelic; setting rarest alleles missing. Warning: Variant 1106 triallelic; setting rarest alleles missing. Warning: Variant 1117 triallelic; setting rarest alleles missing. Warning: Variant 1137 triallelic; setting rarest alleles missing. Warning: Variant 1151 triallelic; setting rarest alleles missing. Warning: Variant 1203 triallelic; setting rarest alleles missing. Warning: Variant 1341 triallelic; setting rarest alleles missing. Warning: Variant 1348 triallelic; setting rarest alleles missing. Warning: Variant 1412 triallelic; setting rarest alleles missing.
'plink --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar --assoc counts --pheno phen_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt --out assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --allow-extra-chr --allow-no-sex && touch assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.done &> assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.out &'
PLINK v1.90b6.12 64-bit (28 Oct 2019) www.cog-genomics.org/plink/1.9/ (C) 2005-2019 Shaun Purcell, Christopher Chang GNU General Public License v3 Logging to assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.log. Options in effect: --allow-extra-chr --allow-no-sex --assoc counts --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar --out assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --pheno phen_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt 332375 MB RAM detected; reserving 166187 MB for main workspace. 1588 variants loaded from .bim file. 42 people (0 males, 0 females, 42 ambiguous) loaded from .fam. Ambiguous sex IDs written to assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.nosex . 42 phenotype values present after --pheno. Using 1 thread (no multithreaded calculations invoked). Before main variant filters, 42 founders and 0 nonfounders present. Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99% done. Total genotyping rate is 0.692515. 1588 variants and 42 people pass filters and QC. Phenotype data is quantitative. Writing QT --assoc report to assoc_Delta9-tetrahydrocannabinolic_acid_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc ... 0%4%13%18%27%46%48%57%80%96%done.
'plink --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar --assoc counts --pheno phen_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt --out assoc_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --allow-extra-chr --allow-no-sex && touch assoc_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.done &> assoc_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.out &'
PLINK v1.90b6.12 64-bit (28 Oct 2019) www.cog-genomics.org/plink/1.9/ (C) 2005-2019 Shaun Purcell, Christopher Chang GNU General Public License v3 Logging to assoc_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.log. Options in effect: --allow-extra-chr --allow-no-sex --assoc counts --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar --out assoc_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --pheno phen_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt 332375 MB RAM detected; reserving 166187 MB for main workspace. 1588 variants loaded from .bim file. 42 people (0 males, 0 females, 42 ambiguous) loaded from .fam. Ambiguous sex IDs written to assoc_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.nosex . 27 phenotype values present after --pheno. Using 1 thread (no multithreaded calculations invoked). Before main variant filters, 42 founders and 0 nonfounders present. Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99% done. Total genotyping rate is 0.692515. 1588 variants and 42 people pass filters and QC. Phenotype data is quantitative. Writing QT --assoc report to assoc_Delta9-tetrahydrocannabinol_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc ... 0%4%13%18%27%46%48%57%80%96%done.
'plink --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar --assoc counts --pheno phen_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt --out assoc_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --allow-extra-chr --allow-no-sex && touch assoc_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.done &> assoc_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.out &'
PLINK v1.90b6.12 64-bit (28 Oct 2019) www.cog-genomics.org/plink/1.9/ (C) 2005-2019 Shaun Purcell, Christopher Chang GNU General Public License v3 Logging to assoc_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.log. Options in effect: --allow-extra-chr --allow-no-sex --assoc counts --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar --out assoc_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --pheno phen_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt 332375 MB RAM detected; reserving 166187 MB for main workspace. 1588 variants loaded from .bim file. 42 people (0 males, 0 females, 42 ambiguous) loaded from .fam. Ambiguous sex IDs written to assoc_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.nosex . 15 phenotype values present after --pheno. Using 1 thread (no multithreaded calculations invoked). Before main variant filters, 42 founders and 0 nonfounders present. Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99% done. Total genotyping rate is 0.692515. 1588 variants and 42 people pass filters and QC. Phenotype data is quantitative. Writing QT --assoc report to assoc_Percent_Total_Cannabinoids_per_dry_weight_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc ... 0%4%13%18%27%46%48%57%80%96%done.
'plink --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar --assoc counts --pheno phen_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt --out assoc_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --allow-extra-chr --allow-no-sex && touch assoc_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.done &> assoc_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc.out &'
PLINK v1.90b6.12 64-bit (28 Oct 2019) www.cog-genomics.org/plink/1.9/ (C) 2005-2019 Shaun Purcell, Christopher Chang GNU General Public License v3 Logging to assoc_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.log. Options in effect: --allow-extra-chr --allow-no-sex --assoc counts --bfile snp_21trichs_cs10_zagerbooth_cannterp_covar --out assoc_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts --pheno phen_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.txt 332375 MB RAM detected; reserving 166187 MB for main workspace. 1588 variants loaded from .bim file. 42 people (0 males, 0 females, 42 ambiguous) loaded from .fam. Ambiguous sex IDs written to assoc_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.nosex . 15 phenotype values present after --pheno. Using 1 thread (no multithreaded calculations invoked). Before main variant filters, 42 founders and 0 nonfounders present. Calculating allele frequencies... 0%1%2%3%4%5%6%7%8%9%10%11%12%13%14%15%16%17%18%19%20%21%22%23%24%25%26%27%28%29%30%31%32%33%34%35%36%37%38%39%40%41%42%43%44%45%46%47%48%49%50%51%52%53%54%55%56%57%58%59%60%61%62%63%64%65%66%67%68%69%70%71%72%73%74%75%76%77%78%79%80%81%82%83%84%85%86%87%88%89%90%91%92%93%94%95%96%97%98%99% done. Total genotyping rate is 0.692515. 1588 variants and 42 people pass filters and QC. Phenotype data is quantitative. Writing QT --assoc report to assoc_THCA_to_CBDA_ratio_percent_of_dw_21trichs_cs10_zagerbooth_cannterp_covar.qassoc.counts.qassoc ... 0%4%13%18%27%46%48%57%80%96%done.
top snp-trait 21trichs_cs10_zagerbooth_cannterp_covar.gwas.top.snpgene.tsv
trait | snps | pvalue | snpgenes | note | goterm | iprterm |
---|---|---|---|---|---|---|
Loading... (need help?) |
Merge multi tools¶
Merge the result from the tools above to find multiple evedince of association. The steps include
- eQTL gives top snp-expression pairs, and snp gene-expressed gene pairs
- WGCNA gives top expressed gene - trait pairs
- mGWAS gives top snp-trait, and snp gene-trait pairs
The merging connects the expression genes common from the eQTL and WGCNA results, giving new snp gene-trait pairs
- The new snp gene-trait pairs are then intersected with the mGWAS result.
The intersecting pairs have two evidence paths of gene-gtrait association. The result below gives 144 gene-trait pairs, with 23 genes
#%autoreload 2
#import omics_api
#from omics_api import *
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
mergeoutfle='merged_21trichcs10_alltraits.tsv'
def printdflists(l):
for df in l:
print(df.columns.values.tolist())
#setVariables(loglevel=getSetting('SHOW_DEBUG'))
printdflists(top_dfeqtl)
printdflists(top_dfwgcna)
printdflists(top_dfgwas)
MERGE=True
if MERGE:
df_genes=pd.read_csv('data/gene_cs10.tsv',sep="\t")
merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=mergeoutfle,use_snpgene=True,df_genes=df_genes,recalc=False)
['snps', 'gene', 'statistic', 'pvalue', 'FDR', 'beta', 'pvalue_y', 'snpgenes', 'contig', 'start', 'end', 'note', 'goterm', 'iprterm', 'expgenes', 'note.1', 'goterm.1', 'iprterm.1'] ['trait', 'snps', 'pvalue', 'snpgenes', 'note', 'goterm', 'iprterm']
'File df_eqtl_wgcna_onexpgenes_merged_21trichcs10_alltraits.tsv dont exists'
eqtl_wgcna_onexpgene_merged_21trichcs10_alltraits.tsv
snps | snpgenes | expgenes | gene | pvalue | trait | GS | p.GS |
---|---|---|---|---|---|---|---|
Loading... (need help?) |
df_eqtl_wgcna_onexpgenes_gwas_onsnpgenetrait
trait | snpgenes | pvalue_eqtl | pvalue_eqtl.1 | GS_wgcna | pGS_wgcna | pGS_wgcna.1 | pvalue_gwas | pvalue_gwas.1 |
---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
triple_snpgenetrait_merged_21trichcs10_alltraits.tsv
trait | snpgenes | pvalue_eqtl_count | pvalue_eqtl_min | GS_wgcna_max | pGS_wgcna_count | pGS_wgcna_min | pvalue_gwas_count | pvalue_gwas_min | gene | contig | fmin | fmax | strand | locus | gb_keyword | go_term | interpro_value |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading... (need help?) |
tgzdownloads('alltgz1', txt=True, img=True)
'tar -cvzf alltgz1.tgz -T dwlist.txt > /dev/null'
Running semi-offline using large dataset¶
Performing genome-wide or transcriptome-wide analyses requires large datasets. It is not practical to use retail web-service API query but the datasets may be downloaded in bulk, stored, and accessed locally. The API has feature to use local file instead of constantly querying the web-server. Most of these files are still downloadable using the web-service but requested in bulk by datasets. These datasets include:
- gene files by reference and annotation from api/user/gene/list
- expression matrix by dataset from api/user/expression/list
- variant SNPs data in binary Plink format by dataset avaialable at
This section demonstrates bulk downloading the necessary datasets, and repeating the analyses described above using whole-genome SNPs and whole-transcriptome expression data. Since these runs take time, run this as a python script from the command line outside of jupyter.
#download gene annotation and save to data directory
datadir='data1'
GENEFILE=datadir+'/gene_cs10.tsv'
#pd.read_csv('https://icgrc.info/api/user/gene/list?annot=cs10',sep="\t").to_csv(GENEFILE,sep="\t",index=False)
#download expression matrix save to data directory
EXPFILE=datadir + '/21trichcs10_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?expds=21trichcs10',sep="\t").to_csv(EXPFILE,sep="\t",index=False)
#EXPFILE=datadir + '/GaoGuerrieroConneely_allexp.tsv'
#EXPFILE=datadir + '/Gao_allexp.tsv'
#pd.read_csv('https://icgrc.info/api/user/expression/list?transet=all&expds=Massimino2017&tablelimit=100000&limit=1000',sep="\t").to_csv(EXPFILE,sep="\t",index=False)
#download variant file in binary plink format at
PLINKFILE='data/cs10-21trichomes-all-gatkgenotype-allsnps-samn-plink-nofam'
#PLINKFILE=None
PHENFILE=datadir+'/phen_all.tsv'
#pd.read_csv('https://icgrc.info/api/user/phenotype/list',sep="\t").to_csv(PHENFILE,sep="\t",index=False)
if False:
import datetime
setVariables(loglevel=getSetting('SHOW_ERRORONLY'),keep_unit=myunit)
# hold top results
top_dfwgcna=[]
top_dfeqtl=[]
top_dfgwas=[]
# enable module
WGCNA=True
EQTL=True
GWAS=True
MERGE=True
REQUERY_URL=False
phends='Zager2019,Booth2020'
expds='21trichcs10'
transet='cs10'
label_url=None
label_wgcna=['wgcna_21trichcs10allexp_cs10_zagerbooth_b']
url_wgcna = ['https://icgrc.info/api/user/expression,phenotype/list?transet=' + transet + '&expds=' + expds + '&phends=' + phends + '&with_stdunits=0&tablelimit=100000&limit=100000']
icnt=0
if WGCNA :
print('STARTED WGCNA ' + str(datetime.datetime.now()))
for iurl in url_wgcna:
ilabel=label_wgcna[icnt]
df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
moduletrait=[]
topmodules=do_wgcna_prepare(c_converted_phenunits_values,dflabel=ilabel,cortopn=cortopnwgcna,maxp=maxpwgcna,mincor=mincorwgcna,recalc=False,exp_file=EXPFILE)
for itop in topmodules:
moduletrait.append(itop[0]+","+itop[1])
dfwgcn=do_wgcna_modulesgenes(c_converted_phenunits_values,dflabel=ilabel,moduletrait=moduletrait,maxp=maxpwgcna,gstopn=gstopnwgcna,mings=mincorwgcna)
top_dfwgcna.append(dfwgcn)
icnt+=1
snpds='21trichs'
phends='Zager2019,Booth2020'
querygenes="GO:0006721,GO:0008299,GO:0030639,terpene,isopentenyl,cannabid,cannabic,cannabiv,cannabin"
geneslabel='cannterp'
ref='cs10'
label_gwas=['gwas_21trich_allsnps_cs10_zagerbooth_cannterp_b']
url_gwas= ["https://icgrc.info/api/user/variant,phenotype/"+ querygenes + "?ref=" + ref + "&snpds=" + snpds + "&tablelimit=10000000000&limit=2000000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000&phends=" + phends + "&with_stdunits=1"]
phenall=set()
icnt=0
df_gwas=[]
# collect available phenotypes first to make selection
if GWAS:
print('STARTED GWAS ' + str(datetime.datetime.now()))
for iurl in url_gwas:
ilabel=label_gwas[icnt]
df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
(c_converted_phenunits, c_converted_phenunits_values)=convert_units_to(df_raw,to_unit=myunit)
df_gwas.append(c_converted_phenunits_values)
phenall.update(set(get_phenotypes(c_converted_phenunits_values)))
# show selection options, set to waitInput=True to get interactive response. For now selection selectionid is preselected
select_options(phenall, waitInput=False)
selectionid=[3,4,6,7,8,9,10,11,15,16,17,19,21,28,29,30,31,32,33,34,45,46,47,53,55] # most common cannabinoids/terpenes
#selectionid=[3,4,6,7], CBDA/THCA
if GWAS:
selection=select_options(phenall, waitInput=False, selected=selectionid)
# verify selections as as intended
print('selections: ' + str(selection))
idf=0
for df in df_gwas:
dfgwas=do_gwas(df, label_gwas[idf],myunit,phenotypes=selection,rerunplink=False,maxp=maxpgwas,plink_file=PLINKFILE) #,assocmethod='linear')
top_dfgwas.append(dfgwas)
idf+=1
ref="cs10"
snpds='21trichs'
expds='21trichcs10'
transet='cs10'
querygenes='cannabid,cannabic,cannabiv,cannabin,terpene'
url_eqtl = ["https://icgrc.info/api/user/expression,variant/" + querygenes + "?ref=" + ref + "&snpds=" + snpds + "&transet=" + transet + "&expds=" + expds + "&tablelimit=100000000&limit=100000&fmissing_lt=0.7&maf_gt=0.2&upstream=1000"]
label_eqtl=["eqtl_terpcan_" + snpds + "_allsnpsexps_" + expds+'_relax_b']
# GENELOCFILE is derived from GENEFILE using only gene_id\tcontig\tfmin\tfmax
GENELOCFILE=datadir+'/cs10xm_exp.genesloc.tsv'
pd.read_csv(GENEFILE,sep="\t")[['name','contig','fmin','fmax']].rename(columns={'name':'geneid','contig':'chr','fmin':'s1','fmax':'s2'}).to_csv(GENELOCFILE,sep="\t",index=False)
icnt=0
if MATRIXEQT:
print('STARTED EQTL ' + str(datetime.datetime.now()))
for iurl in url_eqtl:
ilabel=label_eqtl[icnt]
df_raw= read_url(iurl, ilabel,requery=REQUERY_URL)
(dfeqtlall,dfeqtlcis,dfeqtltrans)=do_matrixeqtl(df_raw,ilabel,annot=None, recodeplink=False,maxp=maxpeqtl,plink_file=PLINKFILE,expgeneloc_file=GENELOCFILE,exp_file=EXPFILE)
top_dfeqtl.append(dfeqtlall)
icnt+=1
'''
top_dfeqtl=[]
top_dfgwas=[]
top_dfwgna=[]
top_dfeqtl.append('eqtl_terpcan_21trichs_allsnpsexps_21trichcs10_relax.eqtl.all.top.tsv')
top_dfwgcna.append('wgcna_21trichcs10allexp_cs10_zagerbooth_relax.wgcna.top.tsv')
top_dfgwas.append('gwas_21trich_allsnps_cs10_zagerbooth_cannterp_relax.gwas.top.tsv')
'''
merge_output='merged_21trichcs10allsnpsallexps_trait_relax_b.tsv'
if MERGE:
print('STARTED MERGE ' + str(datetime.datetime.now()))
df_gene=pd.read_csv(GENEFILE,sep="\t") #name contig fmin fmax strand gb_keyword go_term interpro_value)
merge_matrixeqtl_wgcna_gwas(eqtls=top_dfeqtl, wgcnas=top_dfwgcna,gwass=top_dfgwas,outfile=merge_output,use_snpgene=True,df_genes=df_gene)