1. What are the most present protein domains per biological function?¶

from intermine.webservice import Service
import numpy as np
import scipy.io
import seaborn as sns
from scipy import stats, optimize, interpolate
import pandas as pd
from collections import defaultdict 
import math
import matplotlib.pyplot as plt
from scipy.stats import norm, lognorm
from scipy import stats
import matplotlib.cm as cm
import matplotlib.mlab as mlab
import os, fnmatch

1.1. Selecting the modules to evaluate the protein domains¶

import os
script_dir = os.path.dirname('__file__') #<-- absolute dir the script is in
rel_path_domains="datasets/proteins-domains-from-Pfam.xlsx"

abs_file_path_domains = os.path.join(script_dir, rel_path_domains)

# os.chdir('../') #<-- for binder os.chdir('../')
my_path_domains=abs_file_path_domains
data_domains=pd.read_excel(my_path_domains,header=0,index_col='Unnamed: 0')
data_domains=data_domains.dropna()

data_domains.head()

	name	domain-name	domain-descrip	domain-start	domain-end	domain-method	domain-id
0	AAC1	PF00153	Mito_carr; Mitochondrial substrate/solute carrier	10	106	Pfam	2056375
1	AAC1	PF00153	Mito_carr; Mitochondrial substrate/solute carrier	116	210	Pfam	2056376
2	AAC1	PF00153	Mito_carr; Mitochondrial substrate/solute carrier	217	304	Pfam	2056377
3	AAC3	PF00153	Mito_carr; Mitochondrial substrate/solute carrier	9	105	Pfam	2062638
4	AAC3	PF00153	Mito_carr; Mitochondrial substrate/solute carrier	115	208	Pfam	2062639

pathways=[ 'galactose degradation','phospholipid biosynthesis', "ethanol degradation"]
biological_processes_slim=["cell budding","lipid binding","cytokinesis"]
biological_processes_goterm=["cell morphogenesis involved in conjugation with cellular fusion","budding cell apical bud growth","establishment of cell polarity","cytoskeleton organization"]

def from_go_to_genes(go,label):
    #label=["GOTerm" or "GOSlimTerm"]
    service = Service('https://yeastmine.yeastgenome.org/yeastmine/service', token = 'K1P7y5Oeb608Hcu06e44')
    query = service.new_query("Gene")
    query.add_constraint("goAnnotation.ontologyTerm.parents", label)
    query.add_view(
        "symbol", "goAnnotation.evidence.code.annotType",
        "goAnnotation.ontologyTerm.parents.name"
    )
    query.add_constraint("goAnnotation.qualifier", "!=", "NOT", code = "C")
    query.add_constraint("goAnnotation.qualifier", "IS NULL", code = "D")
    query.add_constraint("goAnnotation.evidence.code.annotType", "=", "manually curated", code = "F")
    query.add_constraint("goAnnotation.ontologyTerm.parents.name", "=", go, code = "G")
    query.set_logic("(C or D) and F and G")

    data_toy=defaultdict(dict)

    for row,counter in zip(query.rows(),np.arange(0,len(query.rows()))):

        data_toy['gene-name'][counter]=row["symbol"]
        data_toy['evidence'][counter]=row["goAnnotation.evidence.code.annotType"]
        data_toy['annotation'][counter]=row["goAnnotation.ontologyTerm.parents.name"]


    data_toy_pd=pd.DataFrame(data_toy)
    data=data_toy_pd.drop_duplicates()

    data.index=np.arange(0,len(data))
    return data

go=biological_processes_goterm[2]
data=from_go_to_genes(go,label='GOTerm')

big_data=[]


for i in np.arange(0,len(biological_processes_goterm)):
    go=biological_processes_goterm[i]
    data=from_go_to_genes(go,label='GOTerm')
    big_data.append(data['gene-name'].tolist())
    

flat_list = [item for sublist in big_data for item in sublist]
res = [i for i in flat_list if i] 
flat_list_unique=np.unique(res)

# Selecting the meaningful columns in the respective dataset
domain_id_list=data_domains['domain-name']
#query_gene=data['gene-name']
query_gene=flat_list_unique

# Initialising the arrays
protein_a_list=[]

population = np.arange(0,len(query_gene))

for m in population:

    protein_a=data_domains[data_domains['name']==query_gene[m]]
    protein_a_list.append(protein_a['domain-name'].tolist())

def remove_empty_domains(protein_list_search):
    index=[]
    for i in np.arange(0,len(protein_list_search)):
        if protein_list_search[i]==[]:
            index.append(i) ## index of empty values for the protein_a_list meaning they dont have any annotated domain

    y=[x for x in np.arange(0,len(protein_list_search)) if x not in index] # a list with non empty values from protein_a list

    protein_list_search_new=[]
    
    for i in y:
        protein_list_search_new.append(protein_list_search[i])
        
    return protein_list_search_new

## evaluating the function

protein_a_list_new=remove_empty_domains(protein_a_list)

print('The empty domain in the data were:', len(protein_a_list)-len(protein_a_list_new), 'out of', len(protein_a_list),'domains')

The empty domain in the data were: 39 out of 287 domains

protein_module=[]

for i in np.arange(0,len(protein_a_list_new)):
    protein_module.append(np.unique(protein_a_list_new[i]))# just taking the unique domains per protein in the module 
protein_module=np.concatenate(protein_module, axis=0 )
unique_domains,index_domains,index_counts=np.unique(protein_module,return_index=True,return_counts=True) # counting how many proteins have the same domain per module 

stats_module=pd.DataFrame()
stats_module['unique-domains']=unique_domains
stats_module['counts-each-domains']=index_counts
stats_module['index-for-original-domains-array']=index_domains
stats_module['ratio-of-sharing']=(index_counts-1)/len(unique_domains)

stats_module.set_index=np.arange(0,len(stats_module))

1.2. Sorting the dataframe by the ratio of sharing¶

stats_module['domain-descrip']=np.zeros_like(unique_domains)
stats_module['protein-name']=np.zeros_like(unique_domains)
#stats_module['module-name']=np.zeros_like(unique_domains)
for i in np.arange(0,len(unique_domains)):
    stats_module['domain-descrip'][i]=data_domains[data_domains['domain-name']==unique_domains[i]]['domain-descrip'].tolist()[0]
    names=data_domains[data_domains['domain-name']==unique_domains[i]]['name'].tolist()
    inter=set(names).intersection(query_gene)
    stats_module['protein-name'][i]=list(inter) # print the intresection of all proteins with those domains with the protein from the module 
    #stats_module['module-name'][i]=go
stats_module_sorted=stats_module.sort_values(by=['ratio-of-sharing'],ascending=False)
stats_module_sorted.head()

c:\users\linigodelacruz\appdata\local\continuum\anaconda3\envs\wintest\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
c:\users\linigodelacruz\appdata\local\continuum\anaconda3\envs\wintest\lib\site-packages\ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

	unique-domains	counts-each-domains	index-for-original-domains-array	ratio-of-sharing	domain-descrip	protein-name
8	PF00069	20	18	0.074510	Pkinase; Protein kinase domain	[CLA4, SPS1, CDC5, PBS2, SSK2, KCC4, BCK1, ARK...
3	PF00018	15	0	0.054902	SH3_1; SH3 domain	[YSC84, LSB1, RVS167, MYO3, BZZ1, BEM1, BOI1, ...
17	PF00169	8	39	0.027451	PH; Pleckstrin homology domain	[BEM3, BUD4, CLA4, SLM1, BOI1, SLM2, BEM2, BOI2]
9	PF00071	7	91	0.023529	Ras; Small GTPase superfamily	[RHO3, RHO2, TEM1, CDC42, RHO1, RSR1, RHO4]
14	PF00134	6	116	0.019608	Cyclin_N; Cyclin, N-terminal	[PCL1, CLB4, CLB3, PCL2, CLB1, CLB5]

stats_module_sorted_diff_zero=stats_module_sorted[stats_module_sorted['ratio-of-sharing']!=0]
plt.bar(x=stats_module_sorted_diff_zero['domain-descrip'],height=stats_module_sorted_diff_zero['ratio-of-sharing']);
plt.xticks(rotation=90)
plt.ylabel('ratio of sharing of domains')
plt.title('Module:' + go)

Text(0.5, 1.0, 'Module:cytoskeleton organization')

_images/evaluating-protein-domains-per-module_17_1.png

3. Feature postprocessing from paper. 2. What is the contribution of paralogs to SL pairs that share protein domains?