improvements in parsing gene-protein-reaction mapping

iAMB-RWTH-Aachen · Jan 16, 2025 · 08d5c69 · 08d5c69
1 parent d4c86d5
commit 08d5c69
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 24 deletions.
diff --git a/src/PAModelpy/Enzyme.py b/src/PAModelpy/Enzyme.py
@@ -86,6 +86,7 @@ def __init__(
         self.annotation = {
             "type": "Constraint"
         }  # you can add an annotation for an enzyme
+
     @property
     def kcat_values(self):
         """Returns a dictionary with kcat values for each associated reaction.
@@ -120,7 +121,7 @@ def concentration(
         return concentration
 
     @concentration.setter
-    def concentration(self, conc):
+    def concentration(self, conc:float):
         self.enzyme_variable.concentration = conc
 
     @property
@@ -132,10 +133,10 @@ def model(self, model):
         self._model = model
         self.enzyme_id_regex = model.ENZYME_ID_REGEX
 
-    def set_forward_concentration(self, conc):
+    def set_forward_concentration(self, conc:float):
         self.enzyme_variable.set_forward_concentration(conc)
 
-    def set_reverse_concentration(self, conc):
+    def set_reverse_concentration(self, conc:float):
         self.enzyme_variable.set_reverse_concentration(conc)
 
     def create_constraint(self, extension: str = None):
@@ -345,6 +346,7 @@ def __getstate__(self):
         # Return the state to be pickled
         state = self.__dict__.copy()
         state['catalytic_events'] = list(self.catalytic_events)
+        state['transcripts'] = list(self.transcripts)
         # Handle any non-serializable attributes here
         return state
 
@@ -379,11 +381,11 @@ def __init__(
         self,
         id: str,
         rxn2kcat: Dict,
-            enzymes: DictList = DictList(),
+        enzymes: DictList = DictList(),
         genes: list = [],
-            upper_bound: Union[int, float] = 1000.0,
-            name: Optional[str] = None,
-            molmass: Union[int, float] = DEFAULT_ENZYME_MOL_MASS, ):
+        upper_bound: Union[int, float] = 1000.0,
+        name: Optional[str] = None,
+        molmass: Union[int, float] = DEFAULT_ENZYME_MOL_MASS, ):
         super().__init__(
             id=id,
             rxn2kcat=rxn2kcat,
@@ -398,6 +400,10 @@ def __init__(
         self.add_enzymes(enzymes)
 
     def add_enzymes(self, enzymes: DictList):
+        #if required: recover dictlist after pickle
+        if not isinstance(self.enzymes, DictList):
+            self.enzymes = DictList(self.enzymes)
+
         for enzyme in enzymes:
             if enzyme not in self.enzymes:# and not isinstance(enzyme, EnzymeComplex):
                 self.enzymes.append(enzyme)
@@ -450,7 +456,7 @@ def __getstate__(self):
         # Return the state to be pickled
         state = self.__dict__.copy()
         state['catalytic_events'] = list(self.catalytic_events)
-
+        state['enzymes'] = list(self.enzymes)
         # Handle any non-serializable attributes here
         return state
 

diff --git a/src/PAModelpy/EnzymeSectors.py b/src/PAModelpy/EnzymeSectors.py
@@ -219,7 +219,7 @@ def add(self, model):
                                 #e.g.: B1_B2_B3 should not be added if B3_B1_B2 is already in the model
                                 enzyme_complex_id = '_'.join(sorted(pr))
 
-                                if enzyme_complex_id not in model.enzymes:
+                                if enzyme_complex_id not in model.enzyme_variables:
                                     enzyme = EnzymeComplex(
                                         id=enzyme_complex_id,
                                         rxn2kcat={rxn_id: kcat},
@@ -245,6 +245,7 @@ def add(self, model):
                                     enz_complex.add_enzymes([enzyme])
 
                     else:
+
                         model.add_enzymes([enzyme])
 
                         self.constraints += [enzyme]
@@ -342,7 +343,12 @@ def _add_reaction_to_enzyme(self, model, enzyme:Union[Enzyme, EnzymeComplex], rx
             model.add_catalytic_events([enzyme.catalytic_events.get_by_id(f'CE_{rxn_id}')])
 
     def _enzyme_is_enzyme_complex(self, protein_reaction, enzyme_id):
-        return any([((enzyme_id in pr) & (len(pr) > 1)) for pr in protein_reaction])
+        return any(
+            [all(
+                [sorted(pr) == sorted(enzyme_id.split('_')) and  #enzyme should take part in enzyme complex
+                 len(pr)>1] # enzyme complex needs to have at least 2 proteins
+            ) for pr in protein_reaction]
+        )
 
     def _get_model_genes_from_enzyme(self, enzyme_id: str, model: Model) -> list:
         """
@@ -360,6 +366,7 @@ def _get_model_genes_from_enzyme(self, enzyme_id: str, model: Model) -> list:
         gene_list = []
         for genes_or in gene_id_list:
             genes_and_list = []
+            # print(genes_or, enzyme_id)
             for gene_and in genes_or:
                 #check if there is an and relation (then gene and should be a list and not a string)
                 if isinstance(gene_and, list):

diff --git a/src/PAModelpy/PAModel.py b/src/PAModelpy/PAModel.py
@@ -1339,14 +1339,14 @@ def change_reaction_bounds(
     def change_reaction_ub(self, rxn_id: str, upper_bound: float = None):
         if self._sensitivity:
             self.constraints[rxn_id + "_ub"].ub = upper_bound
-            self.reactions.get_by_id(rxn_id).upper_bound = upper_bound*0.01
+            self.reactions.get_by_id(rxn_id).upper_bound = upper_bound*1.01
         else:
             self.reactions.get_by_id(rxn_id).upper_bound = upper_bound
 
     def change_reaction_lb(self, rxn_id: str, lower_bound: float = None):
         if self._sensitivity:
             self.constraints[rxn_id + "_lb"].ub = -lower_bound
-            self.reactions.get_by_id(rxn_id).lower_bound = lower_bound*0.01
+            self.reactions.get_by_id(rxn_id).lower_bound = lower_bound*1.01
         else:
             self.reactions.get_by_id(rxn_id).lower_bound = lower_bound
 

diff --git a/src/PAModelpy/utils/pam_generation.py b/src/PAModelpy/utils/pam_generation.py
@@ -102,7 +102,7 @@ def parse_gpr_information(gpr_info:str,
                     enzyme_relations += [enz_sublist]
                 elif gene2protein[item].split('_') not in enzyme_relations:
                     enzyme_relations += [gene2protein[item].split('_')]
-        enzyme_relations = _filter_sublists(enzyme_relations, enzyme_id.split('_'))
+        enzyme_relations = _filter_sublists(enzyme_relations, enzyme_id.split('_'), how='all')
     return sorted(gpr_list), sorted(enzyme_relations)
 
 def get_protein_gene_mapping(enzyme_db: pd.DataFrame, model) -> tuple[dict, dict]:
@@ -115,7 +115,7 @@ def get_protein_gene_mapping(enzyme_db: pd.DataFrame, model) -> tuple[dict, dict
         rxn = model.reactions.get_by_id(rxn_id)
         # get the identifiers and replace nan values by dummy placeholders
         enzyme_id = row['enzyme_id']
-        gene_id = row['gene']
+        gene_id = row['genes']
 
         # check if there are genes associates with the reaction
         if len(rxn.genes) > 0 or isinstance(gene_id, str):
@@ -144,7 +144,8 @@ def _parse_gpr(gpr_info):
         parsed_gpr.append(and_genes)
     return parsed_gpr
 
-def _filter_sublists(nested_list:list[list[str]], target_list:list[str]) -> list[str]:
+def _filter_sublists(nested_list:list[list[str]], target_list:list[str],
+                     how: Literal['any', 'all'] = 'any') -> list[str]:
     """
     Keeps only sublists from `nested_list` that contain at least one identifier present in `target_list`.
 
@@ -155,7 +156,10 @@ def _filter_sublists(nested_list:list[list[str]], target_list:list[str]) -> list
     Returns:
         list of lists: The filtered nested list.
     """
-    filtered_list = [sublist for sublist in nested_list if any([item in target_list for item in sublist])]
+    if how == 'any':
+        filtered_list = [sublist for sublist in nested_list if any([item in target_list for item in sublist])]
+    else:
+        filtered_list = [sublist for sublist in nested_list if all([item in target_list for item in sublist])]
 
     return filtered_list
 
@@ -189,7 +193,8 @@ def _extract_reaction_id(input_str):
 
 def _check_if_all_model_reactions_are_in_rxn_info2protein(model: cobra.Model,
                                                           rxn_info2protein:dict[str, ReactionInformation],
-                                                          protein2gpr:defaultdict[str,list]
+                                                          protein2gpr:defaultdict[str,list],
+                                                          gene2protein:dict[str, str]
                                                           ) -> Tuple[dict[str, ReactionInformation],defaultdict[str,list]]:
     for rxn in model.reactions:
         rxn_id = _extract_reaction_id(
@@ -205,27 +210,31 @@ def _check_if_all_model_reactions_are_in_rxn_info2protein(model: cobra.Model,
         print('No enzyme information found for reaction: ' + rxn.id)
 
         rxn_info = ReactionInformation(rxn.id)
-        rxn_info.model_reactions = rxn
+        rxn_info.model_reactions = [rxn]
         kwargs = {}
         if rxn_info.check_reaction_reversibility() > 0:
             kwargs = {'kcat_b':0}
         elif rxn_info.check_reaction_reversibility() < 0:
             kwargs = {'kcat_f': 0}
 
 
-        gpr_info = parse_gpr_information(rxn.gpr)
-        if gpr_info is None: gpr_info = [[f'gene_{rxn.id}']]
         enzyme_info = enzyme_information(rxn.id, rxn._genes, **kwargs)
         rxn_info.enzymes[enzyme_info['enzyme_id']] = enzyme_info
 
+        gpr_info,_ = parse_gpr_information(str(rxn.gpr),
+                                         genes=[g.id for g in rxn._genes],
+                                         enzyme_id=enzyme_info['enzyme_id'],
+                                         gene2protein=gene2protein)
+        if gpr_info is None: gpr_info = [[f'gene_{rxn.id}']]
+
         rxn_info2protein[rxn.id] = rxn_info
         protein2gpr[enzyme_info['enzyme_id']] = gpr_info
 
     return rxn_info2protein, protein2gpr
 
 def _get_genes_for_proteins(enzyme_db: pd.DataFrame, model) -> dict:
     protein2gene = defaultdict(list)
-    gene2protein = {}
+    gene2protein = defaultdict(list)
     for index, row in enzyme_db.iterrows():
         # Parse data from the row
         rxn_id = row['rxn_id']
@@ -243,7 +252,7 @@ def _get_genes_for_proteins(enzyme_db: pd.DataFrame, model) -> dict:
             genes = [gene.id for gene in rxn.genes]
 
         for gene in genes:
-            gene2protein[gene] = enzyme_id
+            gene2protein[gene].append(enzyme_id)
         # Create gene-protein-reaction associations
         protein2gene[enzyme_id].append(genes)
         # assume that there is a single protein if the previous relation was not assigned a gene
@@ -260,13 +269,27 @@ def _get_kcat_value_from_catalytic_reaction_df(catalytic_reaction_df: pd.DataFra
         return catalytic_reaction_df.kcat_values[
             catalytic_reaction_df.direction == direction].iloc[0]
 
+def _order_enzyme_complex_id(enz_id:str,
+                              other_enzyme_id_pattern: str = r'E[0-9][0-9]*')->str:
+    # Define the regex pattern for protein IDs, obtained from UniProtKB, 2024-08-07
+    # https://www.uniprot.org/help/accession_numbers
+    protein_id_pattern = r'(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})'
+
+    # Define the regex pattern to match protein IDs
+    protein_id_regex = re.compile(protein_id_pattern + r'|' + other_enzyme_id_pattern)
+    proteins = [obj.group(0).strip('_') for obj in re.finditer(protein_id_regex, enz_id)]
+    return "_".join(sorted(proteins))
+
 
 def parse_reaction2protein(enzyme_db: pd.DataFrame, model: cobra.Model) -> dict:
     rxn_info2protein = {}
     protein2gpr = defaultdict(list)
     #remove copy number substrings from the reaction to make it matchable to enzyme information
     filtered_model_reactions = [_extract_reaction_id(r.id) for r in model.reactions]
 
+    #make sure all enzyme complexes have an id ordered in a structured way
+    enzyme_db['enzyme_id'] = enzyme_db['enzyme_id'].map(_order_enzyme_complex_id, na_action='ignore')
+
     # replace NaN values with unique identifiers
     enzyme_db.loc[enzyme_db['enzyme_id'].isnull(), 'enzyme_id'] = [f'E{i}' for i in
                                                                    range(enzyme_db['enzyme_id'].isnull().sum())]
@@ -312,11 +335,14 @@ def parse_reaction2protein(enzyme_db: pd.DataFrame, model: cobra.Model) -> dict:
             rxn_info2protein[rxn.id] = rxn_info
 
     # if no enzyme info is found, add dummy enzyme with median kcat and molmass
-    rxn_info2protein, protein2gpr = _check_if_all_model_reactions_are_in_rxn_info2protein(model, rxn_info2protein, protein2gpr)
+    rxn_info2protein, protein2gpr = _check_if_all_model_reactions_are_in_rxn_info2protein(model,
+                                                                                          rxn_info2protein,
+                                                                                          protein2gpr,
+                                                                                          gene2protein)
+
 
     #convert the dataobject dict to a normal dict for correct parsing by PAModelpy
     rxn2protein = {rxn_id: dict(rxn_info.enzymes) for rxn_id, rxn_info in rxn_info2protein.items()}
-
     return rxn2protein, dict(protein2gpr)
 
 
@@ -384,6 +410,7 @@ def set_up_pam(pam_info_file:str = '',
                 enzyme_db['rxn_id'] = enzyme_db['rxn_id'].apply(_check_rxn_identifier_format)
         # create enzyme objects for each gene-associated reaction
         rxn2protein, protein2gene = parse_reaction2protein(enzyme_db, model)
+
         active_enzyme_info = ActiveEnzymeSector(rxn2protein=rxn2protein, protein2gene=protein2gene,
                                                   configuration=config)
     else: