some updates

iAMB-RWTH-Aachen · Mar 27, 2024 · 80da693 · 80da693
1 parent 146fe83
commit 80da693
Show file tree

Hide file tree

Showing 21 changed files with 1,453 additions and 617 deletions.
diff --git a/Data/TAModel/.~lock.2024-02-27_gene_enzyme_reaction_relation_Ecoli.xlsx# b/Data/TAModel/.~lock.2024-02-27_gene_enzyme_reaction_relation_Ecoli.xlsx#
@@ -0,0 +1 @@
+,samiralvdb,QPE-IAMBPC156,27.03.2024 10:16,file:///home/samiralvdb/.config/libreoffice/4;
diff --git a/Data/TAModel/.~lock.Sinha-etal_2021_transcript-data.xlsx# b/Data/TAModel/.~lock.Sinha-etal_2021_transcript-data.xlsx#
diff --git a/Data/TAModel/2024-02-27_gene_enzyme_reaction_relation_Ecoli.xlsx b/Data/TAModel/2024-02-27_gene_enzyme_reaction_relation_Ecoli.xlsx
diff --git a/Data/proteome_data_extract_schmidt2016.xlsx b/Data/proteome_data_extract_schmidt2016.xlsx
diff --git a/Scripts/.ipynb_checkpoints/Translation_parameters_estimation-checkpoint.ipynb b/Scripts/.ipynb_checkpoints/Translation_parameters_estimation-checkpoint.ipynb
diff --git a/Scripts/Translation_parameters_estimation.ipynb b/Scripts/Translation_parameters_estimation.ipynb
diff --git a/Scripts/__pycache__/pam_generation.cpython-310.pyc b/Scripts/__pycache__/pam_generation.cpython-310.pyc
diff --git a/Scripts/ecolicore_tam_incl_transcript_info.py b/Scripts/ecolicore_tam_incl_transcript_info.py
@@ -1,6 +1,7 @@
 import matplotlib.pyplot as plt
 import pandas as pd
 import os
+import cobra
 
 from Scripts.tam_generation import set_up_toy_tam, set_up_ecolicore_tam
 from Scripts.pam_generation import set_up_ecolicore_pam
@@ -14,8 +15,8 @@
 FLUX_FILE_PATH = os.path.join('Data', 'TAModel', 'Sinha-etal_2021_flux-data.xlsx')
 
 
-mrna_vs_mu_slope = 2.64E-10
-mrna_vs_mu_intercept = 3.59E-11
+mrna_vs_mu_slope = 0.00013049558330984208
+mrna_vs_mu_intercept = 1.7750480089801658e-05
 
 
 def get_transcript_data(transcript_file_path:str = TRANSCRIPT_FILE_PATH, mmol = True,
@@ -43,25 +44,71 @@ def get_pam_fluxes(substrate_uptake_rate):
     pam.change_reaction_bounds('EX_glc__D_e', lower_bound=-substrate_uptake_rate, upper_bound=0)
     sol = pam.optimize()
     pam_fluxes = sol.fluxes
-    return pam_fluxes
+    return pam_fluxes,pam
 
-def set_up_tamodel(strain ='REF'):
+def set_up_tamodel(substrate_uptake_rate, strain ='REF'):
     tam = set_up_ecolicore_tam()
-    tam.change_reaction_bounds('EX_glc__D_e', lower_bound=-1e6, upper_bound=0)
+    tam.change_reaction_bounds('EX_glc__D_e', lower_bound=-substrate_uptake_rate, upper_bound=0)
     transcript_data_mmol = get_transcript_data()
-
-    for gene, expression_data in transcript_data_mmol.iterrows():
-        transcript_id = 'mRNA_' + gene
-        if not transcript_id in tam.transcripts: continue
-        transcript = tam.transcripts.get_by_id('mRNA_' + gene)
-        # testing wildtype condition
-        transcript.change_concentration(concentration=expression_data[strain],
-                                        error=expression_data[strain] * 0.01)
+    i=0
+    infeasible = []
+    for transcript in tam.transcripts:
+        # print(enzyme, transcript)
+        conc = []
+        for gene in transcript.genes:
+            if gene.id in transcript_data_mmol.index: #and gene.id != 'b2987' and gene.id!= 'b3493': #genes related to phosphate transport seem to mess up the calculations
+                conc += [transcript_data_mmol.loc[gene.id, strain]]
+        if len(conc) > 0:
+            transcript.change_concentration(concentration=min(conc) * 1e-3, error= max(conc)*1e-3* 0.01)
+
+            tam.change_reaction_bounds('EX_glc__D_e', lower_bound=-9, upper_bound=-9)
+            tam.optimize()
+            if tam.solver.status != 'optimal': infeasible += [transcript]
+            else: print(tam.objective.value)
+
+        elif gene.id != 'gene_dummy':
+            print(transcript)
+
+    print(infeasible)
+            # tam.test()
+            # print(transcript.enzymes)
+            # print(tam.solver.status)
+    # for gene, expression_data in transcript_data_mmol.iterrows():
+    #     if i<10000:
+    #         transcript_id = 'mRNA_' + gene
+    #         if not transcript_id in tam.transcripts: continue
+    #
+    #         transcript = tam.transcripts.get_by_id('mRNA_' + gene)
+    #         # testing wildtype condition
+    #         transcript.change_concentration(concentration=expression_data[strain]*1e-3,
+    #                                             error=expression_data[strain] *1e-3* 0.01)
+    #         i+=1
+    #         tam.test()
+    #         print(transcript.enzymes)
+    #         print(tam.solver.status)
+    #         print(transcript)
+    #         if tam.solver.status != 'optimal':
+    #             print('number of transcripts which were be added to the model: ',i)
+    #             for name, cons in transcript._constraints.items():
+    #                 print(name, cons, expression_data[strain])
+    #             break
+    # print('number of transcripts which could be added to the model: ', len(tam.transcripts))
+    # enz = tam.enzymes.get_by_id('2.7.3.9')
+    # print(enz.name, enz.enzyme_variable.concentration)
+    # print(infeasible)
     return tam
 
 def get_tam_fluxes(tam,substrate_uptake_rate):
     tam.change_reaction_bounds('EX_glc__D_e', lower_bound=-substrate_uptake_rate, upper_bound=0)
     sol = tam.optimize()
+    if tam.solver.status != 'optimal': return None
+    for transcript in tam.transcripts:
+        for constr in transcript._constraints.values():
+            if constr.dual !=0:
+                print(transcript, transcript.mrna_variable.primal, constr.primal)
+                print(transcript.f_min*transcript.mrna_variable.primal, transcript.f_max*transcript.mrna_variable.primal)
+                for enzyme in transcript.enzymes: print(enzyme.id, enzyme.concentration)
+                print(constr)
     tam_fluxes = sol.fluxes
     return tam_fluxes
 
@@ -92,21 +139,46 @@ def compare_flux_data(flux_data, pam_fluxes, tam_fluxes, strain ='REF', abs=True
     print(flux_results_percentage.to_markdown())
     return flux_results_percentage
 
-def compare_fluxes_holm_reference(strain = 'REF'):
+def compare_fluxes_holm_reference(strain = 'REF', plot = True):
     flux_data =get_flux_data()
     substrate_uptake_rate = flux_data[strain]['GLCptspp']
 
-    pam_fluxes = get_pam_fluxes(substrate_uptake_rate=substrate_uptake_rate)
+    pam_fluxes, pam = get_pam_fluxes(substrate_uptake_rate=substrate_uptake_rate)
 
-    tam = set_up_tamodel(strain)
+    tam = set_up_tamodel(substrate_uptake_rate,strain)
     tam_fluxes = get_tam_fluxes(tam, substrate_uptake_rate=substrate_uptake_rate)
+    if tam_fluxes is None:
+        print('TAModel was infeasible')
+        return
     for i,row in tam.capacity_sensitivity_coefficients.iterrows():
         if row.coefficient > 0: print(row)
 
+    # for transcript in tam.transcripts:
+    #     if transcript.mrna_variable.lb != 0:
+    #         for gene in transcript.genes:
+    #             gene_id = gene.id
+    #             enzymes = tam.get_enzymes_by_gene_id(gene_id)
+    #             for enzyme in enzymes:
+    #                 print(enzyme.id, 'TAM',enzyme.concentration,'PAM' ,pam.enzymes.get_by_id(enzyme.id).concentration)
+    #                 print()
+    # print('RNA constraints primal', tam.constraints[tam.TOTAL_MRNA_CONSTRAINT_ID].primal)
+    # print('RNA constraints dual', tam.constraints[tam.TOTAL_MRNA_CONSTRAINT_ID].dual)
+    # for enzyme in pam.enzyme_variables[:9]:
+    #     tam_enzyme = tam.enzyme_variables.get_by_id(enzyme.id)
+    #
+    #     print(enzyme.id, enzyme.concentration, tam_enzyme.concentration)
+    #     tam_transcripts = tam.get_transcripts_associated_with_enzyme(tam.enzymes.get_by_id(enzyme.id), output='list')
+    #     for transcript in tam_transcripts:
+    #         print(transcript.id, transcript.mrna_variable.primal)
+    #         print('bounds',transcript.mrna_variable.lb, transcript.mrna_variable.ub)
+    #         for constraint in transcript._constraints.values():
+    #             if constraint.dual>0: print(constraint, constraint.dual)
+
+
     flux_relative = compare_flux_data(flux_data, pam_fluxes, tam_fluxes,  strain,abs = False)
     flux_absolute = compare_flux_data(flux_data, pam_fluxes, tam_fluxes,  strain)
 
-    plot_flux_comparison(flux_absolute, flux_relative, strain)
+    if plot: plot_flux_comparison(flux_absolute, flux_relative, strain)
 
 def plot_flux_comparison(flux_df_abs, flux_df_rel, strain):
     fig, ax = plt.subplots(1,2)
@@ -136,11 +208,12 @@ def plot_flux_comparison(flux_df_abs, flux_df_rel, strain):
 
 
 if __name__ == '__main__':
+
     print('Reference condition')
-    compare_fluxes_holm_reference()
+    compare_fluxes_holm_reference(plot =False)
     print('\n-------------------------------------------------------------------------------------------------')
-    print('mutation 1: NOX strain (overexpression of NADH oxidase)\n')
-    compare_fluxes_holm_reference('NOX')
+    # print('mutation 1: NOX strain (overexpression of NADH oxidase)\n')
+    # compare_fluxes_holm_reference('NOX', plot=False)
     # TODO print mRNA and protein concentrations to compare with lb
     # TODO print shadowprices of mRNA (are lbs hit? how far can I constrain?)
 

diff --git a/Scripts/pam_generation.py b/Scripts/pam_generation.py
@@ -9,6 +9,7 @@
 from src.PAModelpy.EnzymeSectors import ActiveEnzymeSector, UnusedEnzymeSector, TransEnzymeSector
 from src.PAModelpy.configuration import Config
 
+from Scripts.tam_generation import parse_reaction2protein2gene2transcript
 from Scripts.toy_ec_pam import build_toy_gem, build_active_enzyme_sector, build_translational_protein_sector, build_unused_protein_sector
 
 'Function library for making Protein Allocation Models as described in the publication'
@@ -43,7 +44,9 @@ def set_up_ecolicore_pam(total_protein:bool = True, active_enzymes: bool = True,
 
     # some other constants
     BIOMASS_REACTION = 'BIOMASS_Ecoli_core_w_GAM'
-    TOTAL_PROTEIN_CONCENTRATION = 0.16995  # [g_prot/g_cdw]
+    # TOTAL_PROTEIN_CONCENTRATION = 0.16995  # [g_prot/g_cdw]
+    TOTAL_PROTEIN_CONCENTRATION = 0.185  # [g_prot/g_cdw]
+
 
     config = Config()
     config.reset()
@@ -57,7 +60,7 @@ def set_up_ecolicore_pam(total_protein:bool = True, active_enzymes: bool = True,
         enzyme_db = _parse_enzyme_information_from_file(TAM_DATA_FILE_PATH)
 
         # create enzyme objects for each gene-associated reaction
-        rxn2protein, protein2gene, gene2transcript = parse_reaction2protein(enzyme_db, model)
+        rxn2protein, protein2gene, gene2transcript = parse_reaction2protein2gene2transcript(enzyme_db, model)
 
         # create active enzymes sector
         active_enzyme_sector = ActiveEnzymeSector(rxn2protein=rxn2protein,
@@ -252,7 +255,6 @@ def parse_coefficients(pamodel):
 def parse_esc(pamodel):
     return pamodel.enzyme_sensitivity_coefficients.coefficient.to_list()
 
-
 def _parse_enzyme_information_from_file(file_path:str):
     # load active enzyme sector information
     enzyme_db = pd.read_excel(file_path, sheet_name='enzyme-gene-reaction')
@@ -376,7 +378,6 @@ def _get_fwd_bckw_kcat(rxn_id: str, kcat:float, model:PAModel) -> Union[list, No
 
     # Iterate over each identifier in the input
     if base_id in model.reactions:
-        if not model.reactions.get_by_id(base_id).genes: return None
         # Determine the form of the identifier
         if rxn_id.endswith('_f'):
             kcat_fwd = kcat
@@ -391,7 +392,6 @@ def _get_fwd_bckw_kcat(rxn_id: str, kcat:float, model:PAModel) -> Union[list, No
         else:
             return None
     elif rxn_id in model.reactions:
-        if not model.reactions.get_by_id(rxn_id).genes: return None
         kcat_fwd = kcat
         kcat_rev = kcat
     else:

diff --git a/Scripts/tam_generation.py b/Scripts/tam_generation.py
@@ -67,8 +67,17 @@ def set_up_ecolicore_tam(total_protein:bool = True, active_enzymes: bool = True,
         enzyme_db = _parse_enzyme_information_from_file(TAM_DATA_FILE_PATH)
 
         # create enzyme objects for each gene-associated reaction
-        rxn2protein, protein2gene, gene2transcript = parse_reaction2protein(enzyme_db, model)
-
+        rxn2protein, protein2gene, gene2transcript = parse_reaction2protein2gene2transcript(enzyme_db, model)
+
+        # import random
+        # gene2transcript2 ={}
+        # for i in range(50):
+        #     gene, transcript = random.choice(list(gene2transcript.items()))
+        #     gene2transcript2 = {**gene2transcript2, gene:transcript}
+        # gene2, transcript2 = random.choice(list(gene2transcript.items()))
+        # gene3, transcript3 = random.choice(list(gene2transcript.items()))
+        #
+        # gene2transcript = {gene:transcript, gene2:transcript2, gene3:transcript3}
         # create active enzymes sector
         active_enzyme_sector = ActiveEnzymeSector(rxn2protein=rxn2protein,
                                                   protein2gene = protein2gene,
@@ -284,7 +293,7 @@ def _parse_enzyme_information_from_file(file_path:str):
     return enzyme_db
 
 
-def parse_reaction2protein(enzyme_db: pd.DataFrame, model:cobra.Model) -> dict:
+def parse_reaction2protein2gene2transcript(enzyme_db: pd.DataFrame, model:cobra.Model) -> dict:
     # Initialize dictionaries
     rxn2protein = {}
     protein2gene = {}
@@ -301,24 +310,25 @@ def parse_reaction2protein(enzyme_db: pd.DataFrame, model:cobra.Model) -> dict:
         #check if there is a forward or backward string which needs to be removed from the rxn id
         if any([rxn_id.endswith('_f'), rxn_id.endswith('_b')]): rxn_id = rxn_id[:-2]
 
+        rxn = model.reactions.get_by_id(rxn_id)
         #get the identifiers and replace nan values by dummy placeholders
         enzyme_id = row['EC_nmbr']
-        if not isinstance(enzyme_id, str): enzyme_id = 'Enzyme_'+rxn_id
         gene_id = row['gene_id']
-        if not isinstance(gene_id, str): gene_id = 'gene_dummy'
-        mrna_length = row['mrna_length']
+        if len(rxn.genes)>0:
+            if not isinstance(enzyme_id, str): enzyme_id = 'Enzyme_'+rxn_id
+            if not isinstance(gene_id, str): gene_id = 'gene_dummy'
+            mrna_length = row['mrna_length']
 
-        #get the gene-protein-reaction-associations
-        gpr = parse_gpr_information_for_protein2genes(row['gpr'])
+            #get the gene-protein-reaction-associations
+            gpr = parse_gpr_information_for_protein2genes(row['gpr'])
 
-        # Create gene-protein-reaction associations
-        if enzyme_id not in protein2gene:
-            protein2gene[enzyme_id] = []
-        protein2gene[enzyme_id].append(gpr)
+            # Create gene-protein-reaction associations
+            if enzyme_id not in protein2gene:
+                protein2gene[enzyme_id] = gpr
 
-        # Create gene2transcript dictionary
-        if gene_id not in gene2transcript.keys():
-            gene2transcript[gene_id] = {'id': f'mRNA_{gene_id}', 'length': mrna_length}
+            # Create gene2transcript dictionary
+            if gene_id not in gene2transcript.keys():
+                gene2transcript[gene_id] = {'id': f'mRNA_{gene_id}', 'length': mrna_length}
 
         # Create rxn2protein dictionary
         if rxn_id not in rxn2protein:

diff --git a/docs/source/example.md b/docs/source/example.md
@@ -42,7 +42,7 @@ First, we'll define the paths we'll download the data
 
 ```python
 protein_sector_info_path = 'Data/proteinAllocationModel_iML1515_EnzymaticData_py.xls'
-active_enzyme_data = pd.read_excel(protein_sector_info_path, sheet_name='ActiveEnzymes'))
+active_enzyme_info = pd.read_excel(protein_sector_info_path, sheet_name='ActiveEnzymes'))
 ```
 
 The data is now in a dataframe with the following columns:
@@ -52,9 +52,6 @@ rxn_id - rxnName - rxnEquat - EC_nmbr - molMass
 First, let's add an identifier to the reactions for which the enzyme is unknown, in order to distinguish between the 
 enzymes
 ```python
-#load active enzyme sector information
-active_enzyme_info = pd.read_excel(pam_info_file, sheet_name='ActiveEnzymes')
-
 # replace NaN values with unique identifiers
 #select the NaN values 
 nan_values = active_enzyme_info['EC_nmbr'].isnull()

diff --git a/src/PAModelpy/Enzyme.py b/src/PAModelpy/Enzyme.py
@@ -5,7 +5,7 @@
 import PAModelpy.Enzyme
 import cobra.core
 import cobra
-from cobra import Reaction, Object
+from cobra import Reaction, Object, Gene
 from cobra.exceptions import OptimizationError
 from cobra.util.solver import check_solver_status
 from copy import copy, deepcopy
@@ -87,6 +87,7 @@ def __init__(
         self._model = None
         self.enzyme_complex = [] #is the enzyme in a complex?
         self.genes = genes
+        self.transcripts = DictList()
         self.annotation = {'type':'Constraint'}#you can add an annotation for an enzyme
 
     @property
@@ -114,9 +115,7 @@ def concentration(self, units:str = 'mmol/gDW', return_units:bool = False) -> fl
         """
 
         # sum up concentrations (aka fluxes) of all enzyme objects
-        concentration = 0.0
-        for catalytic_event in self.catalytic_events:
-            concentration += catalytic_event.flux()
+        concentration = self.enzyme_variable.concentration
         if units == 'g/gDW':
             #converting mmol to grams of protein:
             # [g] = [mmol]* 1e-3 [mol/mmol] * MW[g/mol]
@@ -147,6 +146,40 @@ def add_catalytic_event(self, ce: CatalyticEvent, kcats: Dict):
         self.catalytic_events += [ce]
         self.enzyme_variable.add_catalytic_events([ce],[kcats])
 
+    def add_genes(self, gene_list: list, gene_length:list, relation:str = 'OR') -> None:
+        """
+            Add genes to the enzyme and the model related to the enzyme if applicable
+
+            Args:
+                gene_list (list): A list of gene identifiers to be added.
+                gene_length (list): A list of lengths corresponding to each gene.
+                relation (str, optional): The relationship between genes in gene_list.
+                    Defaults to 'OR'. Possible values: 'OR' or 'AND'.
+
+            Raises:
+                ValueError: If an invalid relation is provided.
+
+            Note:
+                If relation is 'OR', each gene in gene_list will be treated as coding for an individual isozyme
+                If relation is 'AND', all genes in gene_list will be treated as coding for peptides in an enzyme complex
+
+            """
+        # check/correct type of arguments
+        if isinstance(gene_list, str): gene_list = [gene_list]
+        if not isinstance(gene_length, list): gene_length = [gene_length]
+
+        if relation == 'OR':
+            genes_to_add = [[Gene(gene)] for gene in gene_list][0]
+        elif relation == 'AND':
+            genes_to_add = [Gene(gene) for gene in gene_list]
+        else:
+            raise ValueError("Invalid relation. Supported values are 'OR' or 'AND'.")
+        self.genes.append(genes_to_add)
+
+        if self._model is not None:
+            self._model.add_genes(genes = genes_to_add, enzymes = [self], gene_lengths=gene_length)
+
+
     def create_catalytic_event(self, rxn_id: str, kcats: Dict):
         """creates enzyme variables that link to reactions
 
@@ -633,6 +666,7 @@ def add_reactions(self, reaction_kcat_dict: dict):
                         self.reverse_variable: -1
                     })
 
+
     def remove_catalytic_event(self, catalytic_event: Union[CatalyticEvent, str]):
         """
         Function to remove a catalytic event from an enzyme
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		,samiralvdb,QPE-IAMBPC156,27.03.2024 10:16,file:///home/samiralvdb/.config/libreoffice/4;