From 785b39d971bd328ce038a4184a68dd780049aa25 Mon Sep 17 00:00:00 2001
From: "samira.vandenbogaard" <samira.vandenbogaard@gmail.com>
Date: Mon, 3 Feb 2025 12:57:13 +0100
Subject: [PATCH] remade the parse_gpr function to be more generally applicable
 and added documentation

---
 src/PAModelpy/utils/pam_generation.py | 177 +++++++++++++++++++-------
 1 file changed, 128 insertions(+), 49 deletions(-)

diff --git a/src/PAModelpy/utils/pam_generation.py b/src/PAModelpy/utils/pam_generation.py
index cde3dc5..5891a81 100644
--- a/src/PAModelpy/utils/pam_generation.py
+++ b/src/PAModelpy/utils/pam_generation.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import numpy as np
 import cobra
-from typing import TypedDict, Literal, Union, Tuple, Iterable
+from typing import TypedDict, Literal, Union, Tuple, Iterable, List, Dict, Optional
 import re
 import os
 import ast
@@ -75,37 +75,134 @@ def enzyme_information(rxn_id: str,
             'molmass':molmass
             }
 
-def parse_gpr_information(gpr_info:str,
-                          genes:list=None,
-                          enzyme_id:str=None,
-                          gene2protein: dict[str, str] = None) -> tuple[list,list]:
-    #filter out nan entries
-    if not isinstance(gpr_info, str):
+# def parse_gpr_information(gpr_info:str,
+#                           genes:list=None,
+#                           enzyme_id:str=None,
+#                           gene2protein: dict[str, str] = None,
+#                           convert_to_complexes= False) -> tuple[list,list]:
+#     #filter out nan entries
+#     if not isinstance(gpr_info, str) or isinstance(enzyme_id, float):
+#         return None, None
+#
+#     # #only get the genes associated with this enzyme
+#     gpr_list = _parse_gpr(gpr_info)
+#     if genes is None: return gpr_list
+#
+#     gpr_list = _filter_sublists(gpr_list, genes)
+#
+#     #convert the genes to the associated proteins for a database with single protein names
+#     enzyme_relations = []
+#     if convert_to_complexes:
+#         for sublist in gpr_list:
+#             enz_sublist = []
+#             for item in sublist:
+#                 if item in gene2protein.keys():
+#                     if '_' not in gene2protein[item]:
+#                         enz_sublist.append(gene2protein[item])
+#                     elif gene2protein[item].split('_') not in enzyme_relations:
+#                         enzyme_relations += gene2protein[item].split('_')
+#
+#             enzyme_relations += [enz_sublist]
+#     # convert genes to proteins for building a model
+#     elif any([len(info)>1 for info in gpr_list]):
+#         enzyme_relations = [enzyme_id.split('_')]
+#     else:
+#         enzyme_relations = [[enzyme_id]]
+#         # enzyme_relations = _filter_sublists(enzyme_relations, enzyme_id.split('_'), how='all')
+#     return sorted(gpr_list), sorted(enzyme_relations)
+
+def parse_gpr_information(
+    gpr_info: str,
+    genes: Optional[List[str]] = None,
+    enzyme_id: Optional[str] = None,
+    gene2protein: Optional[Dict[str, str]] = None,
+    convert_to_complexes: bool = False
+) -> Tuple[List[List[str]], List[List[str]]]:
+    """
+    Parses Gene-Protein-Reaction (GPR) information and maps genes to their associated proteins or enzyme complexes.
+
+    Args:
+    gpr_info : str
+        The GPR string representing the gene associations.
+    genes : list[str], optional
+        A list of genes to filter the GPR associations.
+    enzyme_id : str, optional
+        The enzyme ID associated with this reaction.
+    gene2protein : dict[str, str], optional
+        A dictionary mapping genes to proteins.
+    convert_to_complexes : bool, default=False
+        Whether to convert genes to protein complexes.
+
+    Returns:
+    tuple[list[list[str]], list[list[str]]]
+        - A sorted list of gene associations (GPR structure).
+        - A sorted list of enzyme relations (protein mappings).
+    """
+
+    # Filter out invalid entries
+    if not isinstance(gpr_info, str) or isinstance(enzyme_id, float):
         return None, None
 
-    # #only get the genes associated with this enzyme
+    # Step 1: Parse the GPR string
     gpr_list = _parse_gpr(gpr_info)
-    if genes is None: return gpr_list
 
-    gpr_list = _filter_sublists(gpr_list, genes)
+    # Step 2: Filter genes if provided
+    if genes is not None:
+        gpr_list = _filter_sublists(gpr_list, genes)
+
+    # Step 3: Convert genes to associated proteins
+    enzyme_relations = _map_genes_to_proteins(gpr_list, gene2protein, enzyme_id, convert_to_complexes)
+
+    return sorted(gpr_list), sorted(enzyme_relations)
+
 
-    #convert the genes to the associated proteins
-    # enzyme_relations = []
-    if any([len(info)>1 for info in gpr_list]):
-        enzyme_relations = [enzyme_id.split('_')]
+def _map_genes_to_proteins(
+    gpr_list: List[List[str]],
+    gene2protein: Optional[Dict[str, str]],
+    enzyme_id: Optional[str],
+    convert_to_complexes: bool
+) -> List[List[str]]:
+    """
+    Converts genes in a GPR list to their associated proteins or enzyme complexes.
+
+    Args:
+    gpr_list : list[list[str]]
+        Parsed GPR information as a nested list.
+    gene2protein : dict[str, str], optional
+        Dictionary mapping genes to proteins.
+    enzyme_id : str, optional
+        The enzyme ID for this reaction.
+    convert_to_complexes : bool
+        Whether to convert genes to enzyme complexes.
+
+    Returns:
+    list[list[str]]
+        A nested list representing the enzyme relationships.
+    """
+
+    if gene2protein is None:
+        gene2protein = {}
+
+    enzyme_relations = []
+
+    if convert_to_complexes:
+        for sublist in gpr_list:
+            enz_sublist = []
+            for item in sublist:
+                if not item in gene2protein: continue
+                protein = gene2protein[item]
+                if "_" not in protein:
+                    enz_sublist.append(protein)
+                elif protein.split("_") not in enzyme_relations:
+                    enzyme_relations.extend(protein.split("_"))  # Corrected list handling
+            enzyme_relations.append(enz_sublist)
+    elif any(len(info) > 1 for info in gpr_list):  # Complex enzymes
+        enzyme_relations = [enzyme_id.split("_")]
     else:
         enzyme_relations = [[enzyme_id]]
-    # for sublist in gpr_list:
-    #     enz_sublist = []
-    #     for item in sublist:
-    #         if item in gene2protein.keys():
-    #             if '_' not in gene2protein[item]:
-    #                 enz_sublist.append(gene2protein[item])
-    #                 enzyme_relations += enz_sublist
-    #             elif gene2protein[item].split('_') not in enzyme_relations:
-    #                 enzyme_relations += gene2protein[item].split('_')
-    #     enzyme_relations = _filter_sublists(enzyme_relations, enzyme_id.split('_'), how='all')
-    return sorted(gpr_list), sorted(enzyme_relations)
+
+    return enzyme_relations
+
 
 def get_protein_gene_mapping(enzyme_db: pd.DataFrame,
                              model) -> tuple[dict, dict]:
@@ -369,21 +466,23 @@ def merge_enzyme_complexes(df, gene2protein):
 
     for rxn_id, group in df.groupby('rxn_id'):
         for _, row in group.iterrows():
+            #skip nan entries
+            if isinstance(row.GPR, float) or isinstance(row.gene, float):
+                continue
             # Parse GPR
             gpr_list, enzyme_relations = parse_gpr_information(
-                row['GPR'], row['gene'], row['enzyme_id'], gene2protein
+                row['GPR'], row['gene'], row['enzyme_id'], gene2protein, convert_to_complexes=True
             )
             # Collapse "and" relationships into multimer ID
-            if enzyme_relations:
+            if enzyme_relations and not all(all([isinstance(e, float) for e in er]) for er in enzyme_relations):
                 for gene_list, enzyme_list in zip(gpr_list, enzyme_relations):
+                    print(enzyme_list, enzyme_relations, gpr_list)
                     row_copy = row.copy()
                     row_copy['enzyme_id'] = "_".join(enzyme_list)  # Replace gene with multimer
                     row_copy['gene'] = gene_list  # add all the annotations to the corresponding gene
-
                     collapsed_rows.append(row_copy)
             else:
                 collapsed_rows.append(row)
-
     # Create a new dataframe with collapsed rows
     collapsed_df = pd.DataFrame(collapsed_rows)
     return collapsed_df
@@ -395,8 +494,6 @@ def set_up_pam(pam_info_file:str = '',
                active_enzymes: bool = True,
                translational_enzymes: bool = True,
                unused_enzymes: bool = True,
-               membrane_sector: bool = False,
-               max_membrane_area:float = 0.27,
                sensitivity:bool = True,
                enzyme_db:pd.DataFrame = None,
                adjust_reaction_ids:bool = False) -> PAModel:
@@ -459,24 +556,6 @@ def set_up_pam(pam_info_file:str = '',
     else:
         unused_enzyme_info = None
 
-    if membrane_sector:
-        membrane_info = pd.read_excel(pam_info_file, sheet_name='Membrane').set_index('Parameter')
-        active_membrane_info = pd.read_excel(pam_info_file, sheet_name='MembraneEnzymes').set_index('enzyme_id')
-
-        area_avail_0 = membrane_info.at['area_avail_0','Value']
-        area_avail_mu = membrane_info.at['area_avail_mu','Value']
-        alpha_numbers_dict = active_membrane_info.alpha_numbers.to_dict()
-        enzyme_location = active_membrane_info.location.to_dict()
-
-        membrane_sector = MembraneSector(area_avail_0=area_avail_0,
-                                         area_avail_mu=area_avail_mu,
-                                         alpha_numbers_dict=alpha_numbers_dict,
-                                         enzyme_location=enzyme_location,
-                                         max_area=max_membrane_area)
-
-    else:
-        membrane_sector = None
-
 
     if total_protein: total_protein = TOTAL_PROTEIN_CONCENTRATION