BIOPAX Dataset basic metrics computation¶

We demonstarte here how to use Patterns to compute simple metrics ro compare BIOPAX datasets

In [1]:
import logging
 
logging.getLogger().setLevel(logging.INFO)
In [2]:
from biopax_explorer.pattern.rack import Rack
from biopax_explorer.pattern.pattern import PatternExecutor, Pattern
from biopax_explorer.query import  EntityNode
from biopax_explorer.biopax.utils import gen_utils as gu
from biopax_explorer.biopax import *

import matplotlib.pyplot as plt
plt.set_loglevel("info")
import pathlib

import numpy as np
In [3]:
#reactome biopax file level 3 : https://reactome.org/download-data
#imported in fuseki triple store that serves them as sparql endpoints

dataset1 = "reactome_homo"
dataset2 = "reactome_mus"

dataset1_label = "Homo sapiens"
dataset2_label = "Mus musculus"

db = "http://db:3030" # with local triple store deployed with docker-compose
 
r = Rack()
pe1 = PatternExecutor(db,dataset1) # create a Pattern executor for a dataset
pe2 = PatternExecutor(db,dataset2) 
In [4]:
def entitiesByClass(obj):
    p=Pattern()    
    en = EntityNode("ENTITY", obj)
    p.define(en) 
    return p
def definePatterns():
    objlist=[Protein(),Complex(),SmallMolecule(),
             Control(),BiochemicalReaction()] 
    return definePatternsFromObj(objlist)

def definePatteyBiosource():
    objlist=[BioSource] 
    return definePatternsFromObj(objlist)
def definePatternsFromObj(objlist):
    
    pdict={}
    for obj in objlist:
      pdict[obj.__class__.__name__]=entitiesByClass(obj)
    return pdict
def countSummarize(result):
  uniqbyuri={}
  i=0
  for entity_row in result:
    for entity in entity_row:
      i=i+1 
      #print("%s uri:%s" %(i,entity.pk))
      uniqbyuri[entity.pk]=1
  return len(uniqbyuri.keys())
In [5]:
 

 

def executeQueriesByDataset(pe,patternMap):
  counts_ds={}
  for clsn, p in patternMap.items():
    print("""  execute  pattern on %s""" %(clsn))  
    result = pe.executePattern(p)
    count=countSummarize(result)
    counts_ds[clsn]=count
  return counts_ds
In [6]:
print("""  ds1""",dataset1)
patternMap=definePatterns()
query_counts_ds1=executeQueriesByDataset(pe1,patternMap)
print(query_counts_ds1)
print("""#------------------""")
  ds1 reactome_homo
  execute  pattern on Protein
  execute  pattern on Complex
  execute  pattern on SmallMolecule
  execute  pattern on Control
  execute  pattern on BiochemicalReaction
{'Protein': 31649, 'Complex': 15700, 'SmallMolecule': 5196, 'Control': 8838, 'BiochemicalReaction': 14709}
#------------------
In [7]:
print("""  ds2""",dataset2)
patternMap=definePatterns()

query_counts_ds2=executeQueriesByDataset(pe2,patternMap)
print(query_counts_ds2)
print("""#------------------""")
  ds2 reactome_mus
  execute  pattern on Protein
  execute  pattern on Complex
  execute  pattern on SmallMolecule
  execute  pattern on Control
  execute  pattern on BiochemicalReaction
{'Protein': 23605, 'Complex': 11216, 'SmallMolecule': 4460, 'Control': 5906, 'BiochemicalReaction': 9615}
#------------------
In [8]:
# Sample datasets with counts for each query
#query_counts_ds1 = {'Protein': 31649, 'Complex': 15700, 'SmallMolecule': 5196, 'Control': 8838, 'BiochemicalReaction': 14709, 'Gene': 0}
#query_counts_ds2 = {'Protein': 31649, 'Complex': 15700, 'SmallMolecule': 5196, 'Control': 8838, 'BiochemicalReaction': 14709, 'Gene': 0}
In [9]:
def define_figure(query_counts_ds1,query_counts_ds2,
                  dataset1_label,dataset2_label,xlabel,ylabel, title):

  # Extracting queries and counts for each dataset
  queries = list(query_counts_ds1.keys())
  counts_ds1 = list(query_counts_ds1.values())
  counts_ds2 = list(query_counts_ds2.values())

  # Setting the width of the bars
  bar_width = 0.35

  # Creating index for the x-axis
  index = np.arange(len(queries))

  # Plotting
  plt.figure(figsize=(10, 6))

  # Plotting bars for dataset DS1
  plt.bar(index, counts_ds1, bar_width, label=dataset1_label, color='skyblue')

  # Plotting bars for dataset DS2
  plt.bar(index + bar_width, counts_ds2, bar_width, label=dataset2_label, color='orange')

  # Adding labels and titles
  plt.xlabel(xlabel, fontsize=18)

  plt.ylabel(ylabel, fontsize=18)
  plt.title(title, fontsize=18)
    
  plt.xticks(index + bar_width / 2, queries, rotation=45)

  # Adding legend
  plt.legend( fontsize=18)
  plt.xticks(fontsize=16)  # Adjust the font size of x-axis labels
  plt.yticks(fontsize=16)  # Adjust the font size of y-axis labels

  plt.tight_layout()
  
  return plt
In [10]:
 
exfile="data/output/metrics_simple.pdf"  
xlabel="BioPAX classes"
ylabel="Count of Matches"
title='Number of BioPAX entities in Reactome Datasets'
plt=define_figure(query_counts_ds1,query_counts_ds2,
                  dataset1_label,dataset2_label,xlabel,ylabel,title)
# Save the plot as a PDF file
plt.savefig(exfile)
 
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [11]:
dataset_list=[dataset1,dataset2]
In [12]:
parentClasses=["PhysicalEntity","EntityReference","Interaction"]

sma={}
for cl, children in gu.classes_children().items():
   if cl in parentClasses:
      sma[cl.lower()]=1
 
      for child in children:
         sma[child.lower()]=1
   
selectedClasses=list(sma.keys())
In [13]:
def count_uniq(matches,pmetalabels): 
  global selectedClasses
  uniq={}
  for entity_row in matches:
    for entity in entity_row:
      if entity.cls.lower() in selectedClasses:
        if entity.meta_label is not None and entity.meta_label in pmetalabels:
            if entity.pk in uniq.keys():
             uniq[entity.pk]=uniq[entity.pk]+1
            else:
             uniq[entity.pk]=1
  return len(list(uniq.keys()))   
In [14]:
def prp(data,pe,p,label,pmetalabels):
  res = pe.executePattern(p)

  #data={label:{"matches":count_uniq(res,pmetalabels)}}
  data[label]=count_uniq(res,pmetalabels) 
  print(data)
  querylist=pe.queries(p)


#  for q in querylist:
#      print("#---generated sparql query---\n\n")
#      print(q)


  return data
In [15]:
ds_data={}
for dataset in dataset_list:
  data={}  
  pe = PatternExecutor(db,dataset) # create a Pattern executor for a dataset
 
  data= prp(data,pe,r.controlsStateChange(),"controls-state-change-of",["C1"])
 
  #data= prp(data,pe,r.stateChange(),"state-change",["C1","c"])
   
  data=prp(data,pe,r.controlsPhosphorylation(),"controls-phosphorylation",["C1"])
 
  data=prp(data,pe,r.reactsWith(),"reacts-with",["SMR1"])
 
  data=prp(data,pe,r.usedToProduce(),"used-to-produce",["BR"])
 
  data=prp(data,pe,r.controlsExpressionWithTemplateReac(),"controls-expression-of",["Control"])
 
  data=prp(data,pe,r.inComplexWith(),"in-complex-with",["COMPLEX"])
 
  data=prp(data,pe,r.controlsMetabolicCatalysis(),"control-metabolic-catalysis",["Control"])
  ds_data[dataset]=data
{'controls-state-change-of': 1616}
{'controls-state-change-of': 1616, 'controls-phosphorylation': 707}
{'controls-state-change-of': 1616, 'controls-phosphorylation': 707, 'reacts-with': 104}
{'controls-state-change-of': 1616, 'controls-phosphorylation': 707, 'reacts-with': 104, 'used-to-produce': 4281}
{'controls-state-change-of': 1616, 'controls-phosphorylation': 707, 'reacts-with': 104, 'used-to-produce': 4281, 'controls-expression-of': 5}
{'controls-state-change-of': 1616, 'controls-phosphorylation': 707, 'reacts-with': 104, 'used-to-produce': 4281, 'controls-expression-of': 5, 'in-complex-with': 3026}
{'controls-state-change-of': 1616, 'controls-phosphorylation': 707, 'reacts-with': 104, 'used-to-produce': 4281, 'controls-expression-of': 5, 'in-complex-with': 3026, 'control-metabolic-catalysis': 4462}
{'controls-state-change-of': 1316}
{'controls-state-change-of': 1316, 'controls-phosphorylation': 540}
{'controls-state-change-of': 1316, 'controls-phosphorylation': 540, 'reacts-with': 96}
{'controls-state-change-of': 1316, 'controls-phosphorylation': 540, 'reacts-with': 96, 'used-to-produce': 3341}
{'controls-state-change-of': 1316, 'controls-phosphorylation': 540, 'reacts-with': 96, 'used-to-produce': 3341, 'controls-expression-of': 32}
{'controls-state-change-of': 1316, 'controls-phosphorylation': 540, 'reacts-with': 96, 'used-to-produce': 3341, 'controls-expression-of': 32, 'in-complex-with': 2542}
{'controls-state-change-of': 1316, 'controls-phosphorylation': 540, 'reacts-with': 96, 'used-to-produce': 3341, 'controls-expression-of': 32, 'in-complex-with': 2542, 'control-metabolic-catalysis': 3424}
In [16]:
print(ds_data)
{'reactome_homo': {'controls-state-change-of': 1616, 'controls-phosphorylation': 707, 'reacts-with': 104, 'used-to-produce': 4281, 'controls-expression-of': 5, 'in-complex-with': 3026, 'control-metabolic-catalysis': 4462}, 'reactome_mus': {'controls-state-change-of': 1316, 'controls-phosphorylation': 540, 'reacts-with': 96, 'used-to-produce': 3341, 'controls-expression-of': 32, 'in-complex-with': 2542, 'control-metabolic-catalysis': 3424}}
In [17]:
subset={}
selectedIndex=[0,1,3,5,6,7,8,9]
for ds, mapd in ds_data.items():
  subset[ds]={}  
  i=-1  
  for n,ct in mapd.items():
      i=i+1
      if i in selectedIndex:
           subset[ds][n]=ct
          
In [18]:
exfile="data/output/metrics_adv.pdf"  
xlabel="Pattern occurences count"
ylabel="Count of Matches"
title='Number of Pattern matches in Reactome Datasets'
plt=define_figure(subset[dataset1],subset[dataset2],
                  dataset1_label,dataset2_label,xlabel,ylabel,title)
# Save the plot as a PDF file
plt.savefig(exfile)
No description has been provided for this image
In [ ]:
 
In [ ]: