Skip to content

Commit

Permalink
improvements in parsing gene-protein-reaction mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
SamiralVdB committed Jan 16, 2025
1 parent d4c86d5 commit 08d5c69
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 24 deletions.
22 changes: 14 additions & 8 deletions src/PAModelpy/Enzyme.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def __init__(
self.annotation = {
"type": "Constraint"
} # you can add an annotation for an enzyme

@property
def kcat_values(self):
"""Returns a dictionary with kcat values for each associated reaction.
Expand Down Expand Up @@ -120,7 +121,7 @@ def concentration(
return concentration

@concentration.setter
def concentration(self, conc):
def concentration(self, conc:float):
self.enzyme_variable.concentration = conc

@property
Expand All @@ -132,10 +133,10 @@ def model(self, model):
self._model = model
self.enzyme_id_regex = model.ENZYME_ID_REGEX

def set_forward_concentration(self, conc):
def set_forward_concentration(self, conc:float):
self.enzyme_variable.set_forward_concentration(conc)

def set_reverse_concentration(self, conc):
def set_reverse_concentration(self, conc:float):
self.enzyme_variable.set_reverse_concentration(conc)

def create_constraint(self, extension: str = None):
Expand Down Expand Up @@ -345,6 +346,7 @@ def __getstate__(self):
# Return the state to be pickled
state = self.__dict__.copy()
state['catalytic_events'] = list(self.catalytic_events)
state['transcripts'] = list(self.transcripts)
# Handle any non-serializable attributes here
return state

Expand Down Expand Up @@ -379,11 +381,11 @@ def __init__(
self,
id: str,
rxn2kcat: Dict,
enzymes: DictList = DictList(),
enzymes: DictList = DictList(),
genes: list = [],
upper_bound: Union[int, float] = 1000.0,
name: Optional[str] = None,
molmass: Union[int, float] = DEFAULT_ENZYME_MOL_MASS, ):
upper_bound: Union[int, float] = 1000.0,
name: Optional[str] = None,
molmass: Union[int, float] = DEFAULT_ENZYME_MOL_MASS, ):
super().__init__(
id=id,
rxn2kcat=rxn2kcat,
Expand All @@ -398,6 +400,10 @@ def __init__(
self.add_enzymes(enzymes)

def add_enzymes(self, enzymes: DictList):
#if required: recover dictlist after pickle
if not isinstance(self.enzymes, DictList):
self.enzymes = DictList(self.enzymes)

for enzyme in enzymes:
if enzyme not in self.enzymes:# and not isinstance(enzyme, EnzymeComplex):
self.enzymes.append(enzyme)
Expand Down Expand Up @@ -450,7 +456,7 @@ def __getstate__(self):
# Return the state to be pickled
state = self.__dict__.copy()
state['catalytic_events'] = list(self.catalytic_events)

state['enzymes'] = list(self.enzymes)
# Handle any non-serializable attributes here
return state

Expand Down
11 changes: 9 additions & 2 deletions src/PAModelpy/EnzymeSectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def add(self, model):
#e.g.: B1_B2_B3 should not be added if B3_B1_B2 is already in the model
enzyme_complex_id = '_'.join(sorted(pr))

if enzyme_complex_id not in model.enzymes:
if enzyme_complex_id not in model.enzyme_variables:
enzyme = EnzymeComplex(
id=enzyme_complex_id,
rxn2kcat={rxn_id: kcat},
Expand All @@ -245,6 +245,7 @@ def add(self, model):
enz_complex.add_enzymes([enzyme])

else:

model.add_enzymes([enzyme])

self.constraints += [enzyme]
Expand Down Expand Up @@ -342,7 +343,12 @@ def _add_reaction_to_enzyme(self, model, enzyme:Union[Enzyme, EnzymeComplex], rx
model.add_catalytic_events([enzyme.catalytic_events.get_by_id(f'CE_{rxn_id}')])

def _enzyme_is_enzyme_complex(self, protein_reaction, enzyme_id):
return any([((enzyme_id in pr) & (len(pr) > 1)) for pr in protein_reaction])
return any(
[all(
[sorted(pr) == sorted(enzyme_id.split('_')) and #enzyme should take part in enzyme complex
len(pr)>1] # enzyme complex needs to have at least 2 proteins
) for pr in protein_reaction]
)

def _get_model_genes_from_enzyme(self, enzyme_id: str, model: Model) -> list:
"""
Expand All @@ -360,6 +366,7 @@ def _get_model_genes_from_enzyme(self, enzyme_id: str, model: Model) -> list:
gene_list = []
for genes_or in gene_id_list:
genes_and_list = []
# print(genes_or, enzyme_id)
for gene_and in genes_or:
#check if there is an and relation (then gene and should be a list and not a string)
if isinstance(gene_and, list):
Expand Down
4 changes: 2 additions & 2 deletions src/PAModelpy/PAModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1339,14 +1339,14 @@ def change_reaction_bounds(
def change_reaction_ub(self, rxn_id: str, upper_bound: float = None):
if self._sensitivity:
self.constraints[rxn_id + "_ub"].ub = upper_bound
self.reactions.get_by_id(rxn_id).upper_bound = upper_bound*0.01
self.reactions.get_by_id(rxn_id).upper_bound = upper_bound*1.01
else:
self.reactions.get_by_id(rxn_id).upper_bound = upper_bound

def change_reaction_lb(self, rxn_id: str, lower_bound: float = None):
if self._sensitivity:
self.constraints[rxn_id + "_lb"].ub = -lower_bound
self.reactions.get_by_id(rxn_id).lower_bound = lower_bound*0.01
self.reactions.get_by_id(rxn_id).lower_bound = lower_bound*1.01
else:
self.reactions.get_by_id(rxn_id).lower_bound = lower_bound

Expand Down
51 changes: 39 additions & 12 deletions src/PAModelpy/utils/pam_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def parse_gpr_information(gpr_info:str,
enzyme_relations += [enz_sublist]
elif gene2protein[item].split('_') not in enzyme_relations:
enzyme_relations += [gene2protein[item].split('_')]
enzyme_relations = _filter_sublists(enzyme_relations, enzyme_id.split('_'))
enzyme_relations = _filter_sublists(enzyme_relations, enzyme_id.split('_'), how='all')
return sorted(gpr_list), sorted(enzyme_relations)

def get_protein_gene_mapping(enzyme_db: pd.DataFrame, model) -> tuple[dict, dict]:
Expand All @@ -115,7 +115,7 @@ def get_protein_gene_mapping(enzyme_db: pd.DataFrame, model) -> tuple[dict, dict
rxn = model.reactions.get_by_id(rxn_id)
# get the identifiers and replace nan values by dummy placeholders
enzyme_id = row['enzyme_id']
gene_id = row['gene']
gene_id = row['genes']

# check if there are genes associates with the reaction
if len(rxn.genes) > 0 or isinstance(gene_id, str):
Expand Down Expand Up @@ -144,7 +144,8 @@ def _parse_gpr(gpr_info):
parsed_gpr.append(and_genes)
return parsed_gpr

def _filter_sublists(nested_list:list[list[str]], target_list:list[str]) -> list[str]:
def _filter_sublists(nested_list:list[list[str]], target_list:list[str],
how: Literal['any', 'all'] = 'any') -> list[str]:
"""
Keeps only sublists from `nested_list` that contain at least one identifier present in `target_list`.
Expand All @@ -155,7 +156,10 @@ def _filter_sublists(nested_list:list[list[str]], target_list:list[str]) -> list
Returns:
list of lists: The filtered nested list.
"""
filtered_list = [sublist for sublist in nested_list if any([item in target_list for item in sublist])]
if how == 'any':
filtered_list = [sublist for sublist in nested_list if any([item in target_list for item in sublist])]
else:
filtered_list = [sublist for sublist in nested_list if all([item in target_list for item in sublist])]

return filtered_list

Expand Down Expand Up @@ -189,7 +193,8 @@ def _extract_reaction_id(input_str):

def _check_if_all_model_reactions_are_in_rxn_info2protein(model: cobra.Model,
rxn_info2protein:dict[str, ReactionInformation],
protein2gpr:defaultdict[str,list]
protein2gpr:defaultdict[str,list],
gene2protein:dict[str, str]
) -> Tuple[dict[str, ReactionInformation],defaultdict[str,list]]:
for rxn in model.reactions:
rxn_id = _extract_reaction_id(
Expand All @@ -205,27 +210,31 @@ def _check_if_all_model_reactions_are_in_rxn_info2protein(model: cobra.Model,
print('No enzyme information found for reaction: ' + rxn.id)

rxn_info = ReactionInformation(rxn.id)
rxn_info.model_reactions = rxn
rxn_info.model_reactions = [rxn]
kwargs = {}
if rxn_info.check_reaction_reversibility() > 0:
kwargs = {'kcat_b':0}
elif rxn_info.check_reaction_reversibility() < 0:
kwargs = {'kcat_f': 0}


gpr_info = parse_gpr_information(rxn.gpr)
if gpr_info is None: gpr_info = [[f'gene_{rxn.id}']]
enzyme_info = enzyme_information(rxn.id, rxn._genes, **kwargs)
rxn_info.enzymes[enzyme_info['enzyme_id']] = enzyme_info

gpr_info,_ = parse_gpr_information(str(rxn.gpr),
genes=[g.id for g in rxn._genes],
enzyme_id=enzyme_info['enzyme_id'],
gene2protein=gene2protein)
if gpr_info is None: gpr_info = [[f'gene_{rxn.id}']]

rxn_info2protein[rxn.id] = rxn_info
protein2gpr[enzyme_info['enzyme_id']] = gpr_info

return rxn_info2protein, protein2gpr

def _get_genes_for_proteins(enzyme_db: pd.DataFrame, model) -> dict:
protein2gene = defaultdict(list)
gene2protein = {}
gene2protein = defaultdict(list)
for index, row in enzyme_db.iterrows():
# Parse data from the row
rxn_id = row['rxn_id']
Expand All @@ -243,7 +252,7 @@ def _get_genes_for_proteins(enzyme_db: pd.DataFrame, model) -> dict:
genes = [gene.id for gene in rxn.genes]

for gene in genes:
gene2protein[gene] = enzyme_id
gene2protein[gene].append(enzyme_id)
# Create gene-protein-reaction associations
protein2gene[enzyme_id].append(genes)
# assume that there is a single protein if the previous relation was not assigned a gene
Expand All @@ -260,13 +269,27 @@ def _get_kcat_value_from_catalytic_reaction_df(catalytic_reaction_df: pd.DataFra
return catalytic_reaction_df.kcat_values[
catalytic_reaction_df.direction == direction].iloc[0]

def _order_enzyme_complex_id(enz_id:str,
other_enzyme_id_pattern: str = r'E[0-9][0-9]*')->str:
# Define the regex pattern for protein IDs, obtained from UniProtKB, 2024-08-07
# https://www.uniprot.org/help/accession_numbers
protein_id_pattern = r'(?:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})'

# Define the regex pattern to match protein IDs
protein_id_regex = re.compile(protein_id_pattern + r'|' + other_enzyme_id_pattern)
proteins = [obj.group(0).strip('_') for obj in re.finditer(protein_id_regex, enz_id)]
return "_".join(sorted(proteins))


def parse_reaction2protein(enzyme_db: pd.DataFrame, model: cobra.Model) -> dict:
rxn_info2protein = {}
protein2gpr = defaultdict(list)
#remove copy number substrings from the reaction to make it matchable to enzyme information
filtered_model_reactions = [_extract_reaction_id(r.id) for r in model.reactions]

#make sure all enzyme complexes have an id ordered in a structured way
enzyme_db['enzyme_id'] = enzyme_db['enzyme_id'].map(_order_enzyme_complex_id, na_action='ignore')

# replace NaN values with unique identifiers
enzyme_db.loc[enzyme_db['enzyme_id'].isnull(), 'enzyme_id'] = [f'E{i}' for i in
range(enzyme_db['enzyme_id'].isnull().sum())]
Expand Down Expand Up @@ -312,11 +335,14 @@ def parse_reaction2protein(enzyme_db: pd.DataFrame, model: cobra.Model) -> dict:
rxn_info2protein[rxn.id] = rxn_info

# if no enzyme info is found, add dummy enzyme with median kcat and molmass
rxn_info2protein, protein2gpr = _check_if_all_model_reactions_are_in_rxn_info2protein(model, rxn_info2protein, protein2gpr)
rxn_info2protein, protein2gpr = _check_if_all_model_reactions_are_in_rxn_info2protein(model,
rxn_info2protein,
protein2gpr,
gene2protein)


#convert the dataobject dict to a normal dict for correct parsing by PAModelpy
rxn2protein = {rxn_id: dict(rxn_info.enzymes) for rxn_id, rxn_info in rxn_info2protein.items()}

return rxn2protein, dict(protein2gpr)


Expand Down Expand Up @@ -384,6 +410,7 @@ def set_up_pam(pam_info_file:str = '',
enzyme_db['rxn_id'] = enzyme_db['rxn_id'].apply(_check_rxn_identifier_format)
# create enzyme objects for each gene-associated reaction
rxn2protein, protein2gene = parse_reaction2protein(enzyme_db, model)

active_enzyme_info = ActiveEnzymeSector(rxn2protein=rxn2protein, protein2gene=protein2gene,
configuration=config)
else:
Expand Down

0 comments on commit 08d5c69

Please sign in to comment.