1. What are the most present protein domains per biological function?

from intermine.webservice import Service
import numpy as np
import scipy.io
import seaborn as sns
from scipy import stats, optimize, interpolate
import pandas as pd
from collections import defaultdict 
import math
import matplotlib.pyplot as plt
from scipy.stats import norm, lognorm
from scipy import stats
import matplotlib.cm as cm
import matplotlib.mlab as mlab
import os, fnmatch

1.1. Selecting the modules to evaluate the protein domains

import os
script_dir = os.path.dirname('__file__') #<-- absolute dir the script is in
rel_path_domains="datasets/proteins-domains-from-Pfam.xlsx"

abs_file_path_domains = os.path.join(script_dir, rel_path_domains)

# os.chdir('../') #<-- for binder os.chdir('../')
my_path_domains=abs_file_path_domains
data_domains=pd.read_excel(my_path_domains,header=0,index_col='Unnamed: 0')
data_domains=data_domains.dropna()
data_domains.head()
name domain-name domain-descrip domain-start domain-end domain-method domain-id
0 AAC1 PF00153 Mito_carr; Mitochondrial substrate/solute carrier 10 106 Pfam 2056375
1 AAC1 PF00153 Mito_carr; Mitochondrial substrate/solute carrier 116 210 Pfam 2056376
2 AAC1 PF00153 Mito_carr; Mitochondrial substrate/solute carrier 217 304 Pfam 2056377
3 AAC3 PF00153 Mito_carr; Mitochondrial substrate/solute carrier 9 105 Pfam 2062638
4 AAC3 PF00153 Mito_carr; Mitochondrial substrate/solute carrier 115 208 Pfam 2062639
pathways=[ 'galactose degradation','phospholipid biosynthesis', "ethanol degradation"]
biological_processes_slim=["cell budding","lipid binding","cytokinesis"]
biological_processes_goterm=["cell morphogenesis involved in conjugation with cellular fusion","budding cell apical bud growth","establishment of cell polarity","cytoskeleton organization"]
def from_go_to_genes(go,label):
    #label=["GOTerm" or "GOSlimTerm"]
    service = Service('https://yeastmine.yeastgenome.org/yeastmine/service', token = 'K1P7y5Oeb608Hcu06e44')
    query = service.new_query("Gene")
    query.add_constraint("goAnnotation.ontologyTerm.parents", label)
    query.add_view(
        "symbol", "goAnnotation.evidence.code.annotType",
        "goAnnotation.ontologyTerm.parents.name"
    )
    query.add_constraint("goAnnotation.qualifier", "!=", "NOT", code = "C")
    query.add_constraint("goAnnotation.qualifier", "IS NULL", code = "D")
    query.add_constraint("goAnnotation.evidence.code.annotType", "=", "manually curated", code = "F")
    query.add_constraint("goAnnotation.ontologyTerm.parents.name", "=", go, code = "G")
    query.set_logic("(C or D) and F and G")

    data_toy=defaultdict(dict)

    for row,counter in zip(query.rows(),np.arange(0,len(query.rows()))):

        data_toy['gene-name'][counter]=row["symbol"]
        data_toy['evidence'][counter]=row["goAnnotation.evidence.code.annotType"]
        data_toy['annotation'][counter]=row["goAnnotation.ontologyTerm.parents.name"]


    data_toy_pd=pd.DataFrame(data_toy)
    data=data_toy_pd.drop_duplicates()

    data.index=np.arange(0,len(data))
    return data
go=biological_processes_goterm[2]
data=from_go_to_genes(go,label='GOTerm')
big_data=[]


for i in np.arange(0,len(biological_processes_goterm)):
    go=biological_processes_goterm[i]
    data=from_go_to_genes(go,label='GOTerm')
    big_data.append(data['gene-name'].tolist())
    
flat_list = [item for sublist in big_data for item in sublist]
res = [i for i in flat_list if i] 
flat_list_unique=np.unique(res)
# Selecting the meaningful columns in the respective dataset
domain_id_list=data_domains['domain-name']
#query_gene=data['gene-name']
query_gene=flat_list_unique

# Initialising the arrays
protein_a_list=[]

population = np.arange(0,len(query_gene))


for m in population:

    protein_a=data_domains[data_domains['name']==query_gene[m]]
    protein_a_list.append(protein_a['domain-name'].tolist())
    

    
def remove_empty_domains(protein_list_search):
    index=[]
    for i in np.arange(0,len(protein_list_search)):
        if protein_list_search[i]==[]:
            index.append(i) ## index of empty values for the protein_a_list meaning they dont have any annotated domain

    y=[x for x in np.arange(0,len(protein_list_search)) if x not in index] # a list with non empty values from protein_a list

    protein_list_search_new=[]
    
    for i in y:
        protein_list_search_new.append(protein_list_search[i])
        
    return protein_list_search_new

## evaluating the function

protein_a_list_new=remove_empty_domains(protein_a_list)
print('The empty domain in the data were:', len(protein_a_list)-len(protein_a_list_new), 'out of', len(protein_a_list),'domains')
The empty domain in the data were: 39 out of 287 domains
protein_module=[]

for i in np.arange(0,len(protein_a_list_new)):
    protein_module.append(np.unique(protein_a_list_new[i]))# just taking the unique domains per protein in the module 
protein_module=np.concatenate(protein_module, axis=0 )
unique_domains,index_domains,index_counts=np.unique(protein_module,return_index=True,return_counts=True) # counting how many proteins have the same domain per module 
stats_module=pd.DataFrame()
stats_module['unique-domains']=unique_domains
stats_module['counts-each-domains']=index_counts
stats_module['index-for-original-domains-array']=index_domains
stats_module['ratio-of-sharing']=(index_counts-1)/len(unique_domains)

stats_module.set_index=np.arange(0,len(stats_module))

1.2. Sorting the dataframe by the ratio of sharing

stats_module['domain-descrip']=np.zeros_like(unique_domains)
stats_module['protein-name']=np.zeros_like(unique_domains)
#stats_module['module-name']=np.zeros_like(unique_domains)
for i in np.arange(0,len(unique_domains)):
    stats_module['domain-descrip'][i]=data_domains[data_domains['domain-name']==unique_domains[i]]['domain-descrip'].tolist()[0]
    names=data_domains[data_domains['domain-name']==unique_domains[i]]['name'].tolist()
    inter=set(names).intersection(query_gene)
    stats_module['protein-name'][i]=list(inter) # print the intresection of all proteins with those domains with the protein from the module 
    #stats_module['module-name'][i]=go
stats_module_sorted=stats_module.sort_values(by=['ratio-of-sharing'],ascending=False)
stats_module_sorted.head()
c:\users\linigodelacruz\appdata\local\continuum\anaconda3\envs\wintest\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
c:\users\linigodelacruz\appdata\local\continuum\anaconda3\envs\wintest\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
unique-domains counts-each-domains index-for-original-domains-array ratio-of-sharing domain-descrip protein-name
8 PF00069 20 18 0.074510 Pkinase; Protein kinase domain [CLA4, SPS1, CDC5, PBS2, SSK2, KCC4, BCK1, ARK...
3 PF00018 15 0 0.054902 SH3_1; SH3 domain [YSC84, LSB1, RVS167, MYO3, BZZ1, BEM1, BOI1, ...
17 PF00169 8 39 0.027451 PH; Pleckstrin homology domain [BEM3, BUD4, CLA4, SLM1, BOI1, SLM2, BEM2, BOI2]
9 PF00071 7 91 0.023529 Ras; Small GTPase superfamily [RHO3, RHO2, TEM1, CDC42, RHO1, RSR1, RHO4]
14 PF00134 6 116 0.019608 Cyclin_N; Cyclin, N-terminal [PCL1, CLB4, CLB3, PCL2, CLB1, CLB5]
stats_module_sorted_diff_zero=stats_module_sorted[stats_module_sorted['ratio-of-sharing']!=0]
plt.bar(x=stats_module_sorted_diff_zero['domain-descrip'],height=stats_module_sorted_diff_zero['ratio-of-sharing']);
plt.xticks(rotation=90)
plt.ylabel('ratio of sharing of domains')
plt.title('Module:' + go)
Text(0.5, 1.0, 'Module:cytoskeleton organization')
_images/evaluating-protein-domains-per-module_17_1.png