1. What are the most present protein domains per biological function?¶
from intermine.webservice import Service
import numpy as np
import scipy.io
import seaborn as sns
from scipy import stats, optimize, interpolate
import pandas as pd
from collections import defaultdict
import math
import matplotlib.pyplot as plt
from scipy.stats import norm, lognorm
from scipy import stats
import matplotlib.cm as cm
import matplotlib.mlab as mlab
import os, fnmatch
1.1. Selecting the modules to evaluate the protein domains¶
import os
script_dir = os.path.dirname('__file__') #<-- absolute dir the script is in
rel_path_domains="datasets/proteins-domains-from-Pfam.xlsx"
abs_file_path_domains = os.path.join(script_dir, rel_path_domains)
# os.chdir('../') #<-- for binder os.chdir('../')
my_path_domains=abs_file_path_domains
data_domains=pd.read_excel(my_path_domains,header=0,index_col='Unnamed: 0')
data_domains=data_domains.dropna()
data_domains.head()
name | domain-name | domain-descrip | domain-start | domain-end | domain-method | domain-id | |
---|---|---|---|---|---|---|---|
0 | AAC1 | PF00153 | Mito_carr; Mitochondrial substrate/solute carrier | 10 | 106 | Pfam | 2056375 |
1 | AAC1 | PF00153 | Mito_carr; Mitochondrial substrate/solute carrier | 116 | 210 | Pfam | 2056376 |
2 | AAC1 | PF00153 | Mito_carr; Mitochondrial substrate/solute carrier | 217 | 304 | Pfam | 2056377 |
3 | AAC3 | PF00153 | Mito_carr; Mitochondrial substrate/solute carrier | 9 | 105 | Pfam | 2062638 |
4 | AAC3 | PF00153 | Mito_carr; Mitochondrial substrate/solute carrier | 115 | 208 | Pfam | 2062639 |
pathways=[ 'galactose degradation','phospholipid biosynthesis', "ethanol degradation"]
biological_processes_slim=["cell budding","lipid binding","cytokinesis"]
biological_processes_goterm=["cell morphogenesis involved in conjugation with cellular fusion","budding cell apical bud growth","establishment of cell polarity","cytoskeleton organization"]
def from_go_to_genes(go,label):
#label=["GOTerm" or "GOSlimTerm"]
service = Service('https://yeastmine.yeastgenome.org/yeastmine/service', token = 'K1P7y5Oeb608Hcu06e44')
query = service.new_query("Gene")
query.add_constraint("goAnnotation.ontologyTerm.parents", label)
query.add_view(
"symbol", "goAnnotation.evidence.code.annotType",
"goAnnotation.ontologyTerm.parents.name"
)
query.add_constraint("goAnnotation.qualifier", "!=", "NOT", code = "C")
query.add_constraint("goAnnotation.qualifier", "IS NULL", code = "D")
query.add_constraint("goAnnotation.evidence.code.annotType", "=", "manually curated", code = "F")
query.add_constraint("goAnnotation.ontologyTerm.parents.name", "=", go, code = "G")
query.set_logic("(C or D) and F and G")
data_toy=defaultdict(dict)
for row,counter in zip(query.rows(),np.arange(0,len(query.rows()))):
data_toy['gene-name'][counter]=row["symbol"]
data_toy['evidence'][counter]=row["goAnnotation.evidence.code.annotType"]
data_toy['annotation'][counter]=row["goAnnotation.ontologyTerm.parents.name"]
data_toy_pd=pd.DataFrame(data_toy)
data=data_toy_pd.drop_duplicates()
data.index=np.arange(0,len(data))
return data
go=biological_processes_goterm[2]
data=from_go_to_genes(go,label='GOTerm')
big_data=[]
for i in np.arange(0,len(biological_processes_goterm)):
go=biological_processes_goterm[i]
data=from_go_to_genes(go,label='GOTerm')
big_data.append(data['gene-name'].tolist())
flat_list = [item for sublist in big_data for item in sublist]
res = [i for i in flat_list if i]
flat_list_unique=np.unique(res)
# Selecting the meaningful columns in the respective dataset
domain_id_list=data_domains['domain-name']
#query_gene=data['gene-name']
query_gene=flat_list_unique
# Initialising the arrays
protein_a_list=[]
population = np.arange(0,len(query_gene))
for m in population:
protein_a=data_domains[data_domains['name']==query_gene[m]]
protein_a_list.append(protein_a['domain-name'].tolist())
def remove_empty_domains(protein_list_search):
index=[]
for i in np.arange(0,len(protein_list_search)):
if protein_list_search[i]==[]:
index.append(i) ## index of empty values for the protein_a_list meaning they dont have any annotated domain
y=[x for x in np.arange(0,len(protein_list_search)) if x not in index] # a list with non empty values from protein_a list
protein_list_search_new=[]
for i in y:
protein_list_search_new.append(protein_list_search[i])
return protein_list_search_new
## evaluating the function
protein_a_list_new=remove_empty_domains(protein_a_list)
print('The empty domain in the data were:', len(protein_a_list)-len(protein_a_list_new), 'out of', len(protein_a_list),'domains')
The empty domain in the data were: 39 out of 287 domains
protein_module=[]
for i in np.arange(0,len(protein_a_list_new)):
protein_module.append(np.unique(protein_a_list_new[i]))# just taking the unique domains per protein in the module
protein_module=np.concatenate(protein_module, axis=0 )
unique_domains,index_domains,index_counts=np.unique(protein_module,return_index=True,return_counts=True) # counting how many proteins have the same domain per module
stats_module=pd.DataFrame()
stats_module['unique-domains']=unique_domains
stats_module['counts-each-domains']=index_counts
stats_module['index-for-original-domains-array']=index_domains
stats_module['ratio-of-sharing']=(index_counts-1)/len(unique_domains)
stats_module.set_index=np.arange(0,len(stats_module))
1.2. Sorting the dataframe by the ratio of sharing¶
stats_module['domain-descrip']=np.zeros_like(unique_domains)
stats_module['protein-name']=np.zeros_like(unique_domains)
#stats_module['module-name']=np.zeros_like(unique_domains)
for i in np.arange(0,len(unique_domains)):
stats_module['domain-descrip'][i]=data_domains[data_domains['domain-name']==unique_domains[i]]['domain-descrip'].tolist()[0]
names=data_domains[data_domains['domain-name']==unique_domains[i]]['name'].tolist()
inter=set(names).intersection(query_gene)
stats_module['protein-name'][i]=list(inter) # print the intresection of all proteins with those domains with the protein from the module
#stats_module['module-name'][i]=go
stats_module_sorted=stats_module.sort_values(by=['ratio-of-sharing'],ascending=False)
stats_module_sorted.head()
c:\users\linigodelacruz\appdata\local\continuum\anaconda3\envs\wintest\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""
c:\users\linigodelacruz\appdata\local\continuum\anaconda3\envs\wintest\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
unique-domains | counts-each-domains | index-for-original-domains-array | ratio-of-sharing | domain-descrip | protein-name | |
---|---|---|---|---|---|---|
8 | PF00069 | 20 | 18 | 0.074510 | Pkinase; Protein kinase domain | [CLA4, SPS1, CDC5, PBS2, SSK2, KCC4, BCK1, ARK... |
3 | PF00018 | 15 | 0 | 0.054902 | SH3_1; SH3 domain | [YSC84, LSB1, RVS167, MYO3, BZZ1, BEM1, BOI1, ... |
17 | PF00169 | 8 | 39 | 0.027451 | PH; Pleckstrin homology domain | [BEM3, BUD4, CLA4, SLM1, BOI1, SLM2, BEM2, BOI2] |
9 | PF00071 | 7 | 91 | 0.023529 | Ras; Small GTPase superfamily | [RHO3, RHO2, TEM1, CDC42, RHO1, RSR1, RHO4] |
14 | PF00134 | 6 | 116 | 0.019608 | Cyclin_N; Cyclin, N-terminal | [PCL1, CLB4, CLB3, PCL2, CLB1, CLB5] |
stats_module_sorted_diff_zero=stats_module_sorted[stats_module_sorted['ratio-of-sharing']!=0]
plt.bar(x=stats_module_sorted_diff_zero['domain-descrip'],height=stats_module_sorted_diff_zero['ratio-of-sharing']);
plt.xticks(rotation=90)
plt.ylabel('ratio of sharing of domains')
plt.title('Module:' + go)
Text(0.5, 1.0, 'Module:cytoskeleton organization')