From 99bb3aa9a899defabadd207958b3dc08f5d8e676 Mon Sep 17 00:00:00 2001
From: "samira.vandenbogaard" <samira.vandenbogaard@gmail.com>
Date: Fri, 17 Jan 2025 17:07:09 +0100
Subject: [PATCH] Update PAM generation to support more complex GPRs

---
 src/PAModelpy/EnzymeSectors.py        | 13 ++++++++-----
 src/PAModelpy/utils/pam_generation.py | 23 ++++++++++++++++++-----
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/PAModelpy/EnzymeSectors.py b/src/PAModelpy/EnzymeSectors.py
index 3ca7847..cf7e910 100644
--- a/src/PAModelpy/EnzymeSectors.py
+++ b/src/PAModelpy/EnzymeSectors.py
@@ -193,11 +193,14 @@ def add(self, model):
                 if enzyme_id in model.enzyme_variables and not self._enzyme_is_enzyme_complex(protein_reaction, enzyme_id):
                     enzyme = model.enzymes.get_by_id(enzyme_id)
                     self._add_reaction_to_enzyme(model, enzyme, rxn_id, kcat)
-                    self.rxn2protein[rxn_id] = {**self.rxn2protein[rxn_id],
-                                                **{enzyme_id: {
-                                                    **kcat,
-                                                    'genes': enzyme.genes,
-                                                    'protein_reaction_association': protein_reaction}}}
+                    if rxn_id == 'ALDD3y_copy1':print(rxn_id, self.rxn2protein[rxn_id])
+                    self.rxn2protein[rxn_id].update({
+                        enzyme_id: {
+                            **kcat,
+                            'genes': enzyme.genes,
+                            'protein_reaction_association': protein_reaction
+                        }
+                    })
 
                 else:
                     if self.protein2gene != {}:
diff --git a/src/PAModelpy/utils/pam_generation.py b/src/PAModelpy/utils/pam_generation.py
index 731f835..e187be6 100644
--- a/src/PAModelpy/utils/pam_generation.py
+++ b/src/PAModelpy/utils/pam_generation.py
@@ -4,6 +4,7 @@
 from typing import TypedDict, Literal, Union, Tuple, Iterable
 import re
 import os
+import ast
 
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -85,14 +86,16 @@ def parse_gpr_information(gpr_info:str,
 
     # #only get the genes associated with this enzyme
     gpr_list = _parse_gpr(gpr_info)
-    gpr_list = _filter_sublists(gpr_list, genes)
-
     if genes is None: return gpr_list
 
+    gpr_list = _filter_sublists(gpr_list, genes)
+
     #convert the genes to the associated proteins
     # enzyme_relations = []
-    # if '_'in enzyme_id:
-    enzyme_relations = [enzyme_id.split('_')]
+    if any([len(info)>1 for info in gpr_list]):
+        enzyme_relations = [enzyme_id.split('_')]
+    else:
+        enzyme_relations = [[enzyme_id]]
     # for sublist in gpr_list:
     #     enz_sublist = []
     #     for item in sublist:
@@ -283,7 +286,7 @@ def _order_enzyme_complex_id(enz_id:str,
 
 def parse_reaction2protein(enzyme_db: pd.DataFrame,
                            model: cobra.Model,
-                           other_enzyme_id_pattern: str = r'E[0-9][0-9]*') -> dict:
+                           other_enzyme_id_pattern: str = r'(E[0-9][0-9]*|Enzyme_[A-Za-z0-9_]+)') -> dict:
     rxn_info2protein = {}
     protein2gpr = defaultdict(list)
     #remove copy number substrings from the reaction to make it matchable to enzyme information
@@ -312,7 +315,14 @@ def parse_reaction2protein(enzyme_db: pd.DataFrame,
         rxn_info = rxn_info2protein.setdefault(rxn_id, ReactionInformation(rxn_id))
         #sometimes, multiple copies are associated with a single reaction
         rxns = rxn_info.get_reaction_from_model(model)
+        # The gene column usually holds a list serialized as a string; parse it back into a real list.
         genes = catalytic_reaction_info.gene.iloc[0]
+        if isinstance(genes, str) and genes[-1] == ']':
+            genes = ast.literal_eval(catalytic_reaction_info.gene.iloc[0])
+        elif isinstance(genes, str):
+            genes = [genes]
+
+
         for rxn in rxns:
             # If no genes are associated with the reaction, this reaction is not catalyzed by an enzyme
             if (not len(rxn.genes) > 0) and (not isinstance(genes, list)):  continue
@@ -324,6 +334,7 @@ def parse_reaction2protein(enzyme_db: pd.DataFrame,
                                                                                       enzyme_id,
                                                                                       gene2protein)
 
+
             protein2gpr[enzyme_id]+= gene_reaction_relation
 
             enzyme_info = enzyme_information(rxn_id=rxn.id,
@@ -338,6 +349,8 @@ def parse_reaction2protein(enzyme_db: pd.DataFrame,
             rxn_info.enzymes[enzyme_id] = enzyme_info
             rxn_info2protein[rxn.id] = rxn_info
 
+
+
     # if no enzyme info is found, add dummy enzyme with median kcat and molmass
     rxn_info2protein, protein2gpr = _check_if_all_model_reactions_are_in_rxn_info2protein(model,
                                                                                           rxn_info2protein,