From 785b39d971bd328ce038a4184a68dd780049aa25 Mon Sep 17 00:00:00 2001 From: "samira.vandenbogaard" Date: Mon, 3 Feb 2025 12:57:13 +0100 Subject: [PATCH] remade the parse_gpr function to be more generally applicable and added documentation --- src/PAModelpy/utils/pam_generation.py | 177 +++++++++++++++++++------- 1 file changed, 128 insertions(+), 49 deletions(-) diff --git a/src/PAModelpy/utils/pam_generation.py b/src/PAModelpy/utils/pam_generation.py index cde3dc5..5891a81 100644 --- a/src/PAModelpy/utils/pam_generation.py +++ b/src/PAModelpy/utils/pam_generation.py @@ -1,7 +1,7 @@ import pandas as pd import numpy as np import cobra -from typing import TypedDict, Literal, Union, Tuple, Iterable +from typing import TypedDict, Literal, Union, Tuple, Iterable, List, Dict, Optional import re import os import ast @@ -75,37 +75,134 @@ def enzyme_information(rxn_id: str, 'molmass':molmass } -def parse_gpr_information(gpr_info:str, - genes:list=None, - enzyme_id:str=None, - gene2protein: dict[str, str] = None) -> tuple[list,list]: - #filter out nan entries - if not isinstance(gpr_info, str): +# def parse_gpr_information(gpr_info:str, +# genes:list=None, +# enzyme_id:str=None, +# gene2protein: dict[str, str] = None, +# convert_to_complexes= False) -> tuple[list,list]: +# #filter out nan entries +# if not isinstance(gpr_info, str) or isinstance(enzyme_id, float): +# return None, None +# +# # #only get the genes associated with this enzyme +# gpr_list = _parse_gpr(gpr_info) +# if genes is None: return gpr_list +# +# gpr_list = _filter_sublists(gpr_list, genes) +# +# #convert the genes to the associated proteins for a database with single protein names +# enzyme_relations = [] +# if convert_to_complexes: +# for sublist in gpr_list: +# enz_sublist = [] +# for item in sublist: +# if item in gene2protein.keys(): +# if '_' not in gene2protein[item]: +# enz_sublist.append(gene2protein[item]) +# elif gene2protein[item].split('_') not in enzyme_relations: +# enzyme_relations += gene2protein[item].split('_') +# +# enzyme_relations += [enz_sublist] +# # convert genes to proteins for building a model +# elif any([len(info)>1 for info in gpr_list]): +# enzyme_relations = [enzyme_id.split('_')] +# else: +# enzyme_relations = [[enzyme_id]] +# # enzyme_relations = _filter_sublists(enzyme_relations, enzyme_id.split('_'), how='all') +# return sorted(gpr_list), sorted(enzyme_relations) + +def parse_gpr_information( + gpr_info: str, + genes: Optional[List[str]] = None, + enzyme_id: Optional[str] = None, + gene2protein: Optional[Dict[str, str]] = None, + convert_to_complexes: bool = False +) -> Tuple[List[List[str]], List[List[str]]]: + """ + Parses Gene-Protein-Reaction (GPR) information and maps genes to their associated proteins or enzyme complexes. + + Args: + gpr_info : str + The GPR string representing the gene associations. + genes : list[str], optional + A list of genes to filter the GPR associations. + enzyme_id : str, optional + The enzyme ID associated with this reaction. + gene2protein : dict[str, str], optional + A dictionary mapping genes to proteins. + convert_to_complexes : bool, default=False + Whether to convert genes to protein complexes. + + Returns: + tuple[list[list[str]], list[list[str]]] + - A sorted list of gene associations (GPR structure). + - A sorted list of enzyme relations (protein mappings). + """ + + # Filter out invalid entries + if not isinstance(gpr_info, str) or isinstance(enzyme_id, float): return None, None - # #only get the genes associated with this enzyme + # Step 1: Parse the GPR string gpr_list = _parse_gpr(gpr_info) - if genes is None: return gpr_list - gpr_list = _filter_sublists(gpr_list, genes) + # Step 2: Filter genes if provided + if genes is not None: + gpr_list = _filter_sublists(gpr_list, genes) + + # Step 3: Convert genes to associated proteins + enzyme_relations = _map_genes_to_proteins(gpr_list, gene2protein, enzyme_id, convert_to_complexes) + + return sorted(gpr_list), sorted(enzyme_relations) + - #convert the genes to the associated proteins - # enzyme_relations = [] - if any([len(info)>1 for info in gpr_list]): - enzyme_relations = [enzyme_id.split('_')] +def _map_genes_to_proteins( + gpr_list: List[List[str]], + gene2protein: Optional[Dict[str, str]], + enzyme_id: Optional[str], + convert_to_complexes: bool +) -> List[List[str]]: + """ + Converts genes in a GPR list to their associated proteins or enzyme complexes. + + Args: + gpr_list : list[list[str]] + Parsed GPR information as a nested list. + gene2protein : dict[str, str], optional + Dictionary mapping genes to proteins. + enzyme_id : str, optional + The enzyme ID for this reaction. + convert_to_complexes : bool + Whether to convert genes to enzyme complexes. + + Returns: + list[list[str]] + A nested list representing the enzyme relationships. + """ + + if gene2protein is None: + gene2protein = {} + + enzyme_relations = [] + + if convert_to_complexes: + for sublist in gpr_list: + enz_sublist = [] + for item in sublist: + if not item in gene2protein: continue + protein = gene2protein[item] + if "_" not in protein: + enz_sublist.append(protein) + elif protein.split("_") not in enzyme_relations: + enzyme_relations.extend(protein.split("_")) # Corrected list handling + enzyme_relations.append(enz_sublist) + elif any(len(info) > 1 for info in gpr_list): # Complex enzymes + enzyme_relations = [enzyme_id.split("_")] else: enzyme_relations = [[enzyme_id]] - # for sublist in gpr_list: - # enz_sublist = [] - # for item in sublist: - # if item in gene2protein.keys(): - # if '_' not in gene2protein[item]: - # enz_sublist.append(gene2protein[item]) - # enzyme_relations += enz_sublist - # elif gene2protein[item].split('_') not in enzyme_relations: - # enzyme_relations += gene2protein[item].split('_') - # enzyme_relations = _filter_sublists(enzyme_relations, enzyme_id.split('_'), how='all') - return sorted(gpr_list), sorted(enzyme_relations) + + return enzyme_relations + def get_protein_gene_mapping(enzyme_db: pd.DataFrame, model) -> tuple[dict, dict]: @@ -369,21 +466,23 @@ def merge_enzyme_complexes(df, gene2protein): for rxn_id, group in df.groupby('rxn_id'): for _, row in group.iterrows(): + #skip nan entries + if isinstance(row.GPR, float) or isinstance(row.gene, float): + continue # Parse GPR gpr_list, enzyme_relations = parse_gpr_information( - row['GPR'], row['gene'], row['enzyme_id'], gene2protein + row['GPR'], row['gene'], row['enzyme_id'], gene2protein, convert_to_complexes=True ) # Collapse "and" relationships into multimer ID - if enzyme_relations: + if enzyme_relations and not all(all([isinstance(e, float) for e in er]) for er in enzyme_relations): for gene_list, enzyme_list in zip(gpr_list, enzyme_relations): + print(enzyme_list, enzyme_relations, gpr_list) row_copy = row.copy() row_copy['enzyme_id'] = "_".join(enzyme_list) # Replace gene with multimer row_copy['gene'] = gene_list # add all the annotations to the corresponding gene - collapsed_rows.append(row_copy) else: collapsed_rows.append(row) - # Create a new dataframe with collapsed rows collapsed_df = pd.DataFrame(collapsed_rows) return collapsed_df @@ -395,8 +494,6 @@ def set_up_pam(pam_info_file:str = '', active_enzymes: bool = True, translational_enzymes: bool = True, unused_enzymes: bool = True, - membrane_sector: bool = False, - max_membrane_area:float = 0.27, sensitivity:bool = True, enzyme_db:pd.DataFrame = None, adjust_reaction_ids:bool = False) -> PAModel: @@ -459,24 +556,6 @@ def set_up_pam(pam_info_file:str = '', else: unused_enzyme_info = None - if membrane_sector: - membrane_info = pd.read_excel(pam_info_file, sheet_name='Membrane').set_index('Parameter') - active_membrane_info = pd.read_excel(pam_info_file, sheet_name='MembraneEnzymes').set_index('enzyme_id') - - area_avail_0 = membrane_info.at['area_avail_0','Value'] - area_avail_mu = membrane_info.at['area_avail_mu','Value'] - alpha_numbers_dict = active_membrane_info.alpha_numbers.to_dict() - enzyme_location = active_membrane_info.location.to_dict() - - membrane_sector = MembraneSector(area_avail_0=area_avail_0, - area_avail_mu=area_avail_mu, - alpha_numbers_dict=alpha_numbers_dict, - enzyme_location=enzyme_location, - max_area=max_membrane_area) - - else: - membrane_sector = None - if total_protein: total_protein = TOTAL_PROTEIN_CONCENTRATION