SysBioChalmers · haowang-bioinfo · Jun 21, 2022 · Jan 21, 2022 · Jan 21, 2022 · Jan 21, 2022
diff --git a/code/GPRs/getGenesFromGrRules.m b/code/GPRs/getGenesFromGrRules.m
@@ -1,9 +1,9 @@
-function [genes,rxnGeneMat] = getGenesFromGrRules(grRules)
+function [genes,rxnGeneMat] = getGenesFromGrRules(grRules, originalGenes)
 %getGenesFromGrRules  Extract gene list and rxnGeneMat from grRules array.
 %
 % USAGE:
 %
-%   [genes,rxnGeneMat] = getGenesFromGrRules(grRules);
+%   [genes,rxnGeneMat] = getGenesFromGrRules(grRules, originalGenes);
 %
 % INPUTS:
 %
@@ -12,6 +12,7 @@
 %               NOTE: Boolean operators can be text ("and", "or") or
 %                     symbolic ("&", "|"), but there must be a space
 %                     between operators and gene names/IDs.
+%   originalGenes     The original gene list from the model as reference
 %
 % OUTPUTS:
 %
@@ -24,6 +25,11 @@
 %
 
 
+% handle input arguments
+if nargin < 2
+    originalGenes = [];
+end
+
 % check if the grRules use written or symbolic boolean operators
 if any(contains(grRules,{'&','|'}))
     % fix some potential missing spaces between parentheses and &/|
@@ -50,6 +56,14 @@
 nonEmpty = ~cellfun(@isempty,rxnGenes);
 genes = unique([rxnGenes{nonEmpty}]');
 
+if ~isempty(originalGenes)
+    % compare as columns so that a row-oriented reference list still matches
+    if ~isequal(sort(originalGenes(:)), sort(genes(:)))
+        error('The grRules and original gene list are inconsistent!');
+    end
+    genes = reshape(originalGenes, size(genes));  % reference order, same shape
+end
+
 % construct new rxnGeneMat (if requested)
 if nargout > 1
     rxnGeneCell = cellfun(@(rg) ismember(genes,rg),rxnGenes,'UniformOutput',false);

diff --git a/code/addBoundaryMets.m b/code/addBoundaryMets.m
@@ -104,9 +104,15 @@
 % add new boundary mets to the model
 metsToAdd.mets = add_bound_met_IDs;
 metsToAdd.metNames = add_bound_mets;
-metsToAdd.compartments = 'b';
+metsToAdd.compartments = repmat({'b'}, size(add_bound_mets));
 metsToAdd.unconstrained = ones(size(add_bound_mets));
-new_model = addMets(model,metsToAdd);
+if isempty(add_bound_met_IDs)
+    % nothing to add: report it and hand back the input model unchanged
+    fprintf('No Boundary metabolites were added to the model!\n');
+    new_model = model;
+    return
+end
+new_model = addMets(model,metsToAdd);
 
 % now add the boundary mets to the model S-matrix
 S = new_model.S;

diff --git a/code/annotateGEM.m b/code/annotateGEM.m
@@ -1,10 +1,13 @@
-function annModel = annotateGEM(model,annType,addMiriams,addFields,overwrite)
+function annModel = annotateGEM(model,annPath,annType,addMiriams,addFields,overwrite)
 % Add reaction, metabolite, and/or gene annotation to a model.
 %
 % Input:
 %
 %   model        Model structure.
 %
+%   annPath      Path to the annotation files, which are expected to be named
+%                'reactions.tsv', 'metabolites.tsv', and 'genes.tsv'.
+%
 %   annType      String or cell array of strings specifying the type(s) of 
 %                annotation data to add: 'rxn', 'met', and/or 'gene'. To
 %                add all annotation types, use 'all'.
@@ -35,27 +38,32 @@
 %
 % Usage:
 %
-%   annModel = annotateGEM(model,annType,addMiriams,addFields,overwrite);
+%   annModel = annotateGEM(model,annPath,annType,addMiriams,addFields,overwrite);
 %
 
 
 %% Inputs and setup
 
-if nargin < 2 || isempty(annType) || strcmpi(annType,'all')
+if nargin < 2 || isempty(annPath)
+    [ST, I] = dbstack('-completenames');
+    annPath = fullfile(fileparts(ST(I).file),'../model');  % default location
+end
+
+if nargin < 3 || isempty(annType) || isequal(annType,'all')
     annType = {'rxn','met','gene'};
 elseif ~all(ismember(annType,{'rxn','met','gene','reaction','metabolite'}))
     error('annType input(s) not recognized. Valid options are "rxn", "met", and/or "gene", or "all"');
 end
 
-if nargin < 3 || isempty(addMiriams)
+if nargin < 4 || isempty(addMiriams)
     addMiriams = true;
 end
 
-if nargin < 4 || isempty(addFields)
+if nargin < 5 || isempty(addFields)
     addFields = true;
 end
 
-if nargin < 5
+if nargin < 6
     overwrite = true;
 end
 
@@ -96,9 +104,7 @@
 
 % load reaction annotation data
 if any(ismember({'rxn','reaction'},lower(annType)))
-    [ST, I] = dbstack('-completenames');
-    path = fileparts(ST(I).file);
-    tmpfile = fullfile(path,'../model','reactions.tsv');
+    tmpfile = fullfile(annPath,'reactions.tsv');
     rxnAssoc = importTsvFile(tmpfile);
 
     % strip "RHEA:" prefix from Rhea IDs since it should not be included in
@@ -119,9 +125,7 @@
 
 % load metabolite annotation data
 if any(ismember({'met','metabolite'},lower(annType)))
-    [ST, I] = dbstack('-completenames');
-    path = fileparts(ST(I).file);
-    tmpfile = fullfile(path,'../model','metabolites.tsv');
+    tmpfile = fullfile(annPath,'metabolites.tsv');
     metAssoc = importTsvFile(tmpfile);
 
     % ChEBI IDs should be of the form "CHEBI:#####"
@@ -140,9 +144,7 @@
 
 % load and organize gene annotation data
 if ismember('gene',lower(annType))
-    [ST, I] = dbstack('-completenames');
-    path = fileparts(ST(I).file);
-    tmpfile = fullfile(path,'../model','genes.tsv');
+    tmpfile = fullfile(annPath,'genes.tsv');
     geneAssoc = importTsvFile(tmpfile);
 
     % add geneEnsemblID field if missing
@@ -275,9 +277,18 @@
 
     % get fields and their types
     f = fieldnames(allAssoc);
-    fieldType = repmat({'rxn'}, numel(f), 1);
-    fieldType(ismember(f, fieldnames(metAssoc))) = {'met'};
-    fieldType(ismember(f, fieldnames(geneAssoc))) = {'gene'};
+
+    % Default every field to 'rxn'. This must happen unconditionally:
+    % if fieldType were only created when rxnAssoc is non-empty, the
+    % indexed assignments below would build it implicitly and could
+    % leave it shorter than f.
+    fieldType = repmat({'rxn'}, numel(f), 1);
+    if ~isempty(metAssoc)
+        fieldType(ismember(f, fieldnames(metAssoc))) = {'met'};
+    end
+    if ~isempty(geneAssoc)
+        fieldType(ismember(f, fieldnames(geneAssoc))) = {'gene'};
+    end
 
     % add individual ID fields to the model
     for i = 1:numel(f)

diff --git a/code/curateReactionNames.py b/code/curateReactionNames.py
@@ -0,0 +1,68 @@
+"""Fetch Human-GEM reaction names from KEGG
+Original file is located at
+    https://colab.research.google.com/drive/17X0Qx0H4pwjZjLLWHnpp5ac2daH9hOxs
+"""
+
+import requests
+import re
+import yaml
+import pandas
+
+"""Get all the KEGG reactions via their API, and save the result to a file."""
+
+KEGG_REACTIONS = 'kegg_reactions.txt'
+HG_YAML = '../model/Human-GEM.yml'
+F_YAML = '../model/curated-Human-GEM.yml'
+
+with open(KEGG_REACTIONS,'w') as f:
+  r = requests.get('http://rest.kegg.jp/list/reaction/')
+  f.write(r.text)
+
+"""Extract the KEGG reactions as key-value pairs."""
+
+raw_reactions = open(KEGG_REACTIONS, 'r')
+raw_reaction_lines = raw_reactions.readlines()
+
+reaction_id = re.compile('(?:^rn\:)(R\d+)')
+reaction_name = re.compile('(?:\t)([^;]+)(?:;)')
+kegg_reactions = {}
+for line in raw_reaction_lines:
+  try:
+    kegg_reactions[reaction_id.search(line).group(1)] = reaction_name.search(line).group(1)
+  except:
+    kegg_reactions[reaction_id.search(line).group(1)] = ''
+# print(kegg_reactions[])
+
+"""Fetch Human-GEM reactions from the TSV annotation."""
+
+hg_annotation = pandas.read_csv('../model/reactions.tsv', sep='\t', index_col=0)
+
+""" Traverse the YAML, and for each line that looks like a reaction definition, extract the reaction identifier, and get the matching KEGG id. Then, change the next line that contains the reaction name to the name provided by KEGG."""
+
+with open(HG_YAML, 'r') as inputf:
+  with open(F_YAML, 'w') as outputf:
+    count = 0
+    count_blank = 0
+    while True:
+      reaction_id = re.compile('(?:^      - id: ")(MAR\d+)')
+      reaction_name = re.compile('(?:^      - name: ")()("$)')
+      try:
+        line = inputf.readline()
+        r_id = reaction_id.search(line).group(1)
+        outputf.write(line)
+        line = inputf.readline()
+        r_name = reaction_name.search(line).group(1)
+        kegg_id = hg_annotation.loc[r_id]['rxnKEGGID']
+        if kegg_id and r_name == "":
+            if kegg_reactions[kegg_id] == "":
+                count_blank = count_blank + 1
+            else:
+                line = '      - name: "' + kegg_reactions[kegg_id] + '"\n'
+                count = count + 1
+      except:
+        None
+      outputf.write(line)
+      if not line:
+        break
+    print('Reaction names adopted from KEGG: ' + str(count))
+    print('Blank names also blank in KEGG: ' + str(count_blank))
diff --git a/code/gapfill4EssentialTasks.m b/code/gapfill4EssentialTasks.m
@@ -57,7 +57,7 @@
 % load metabolic task for growth under Ham's media
 [ST, I] = dbstack('-completenames');
 path = fileparts(ST(I).file);
-essentialTasks = fullfile(path,'../data/metabolicTasks','metabolicTasks_Essential.xlsx');
+essentialTasks = fullfile(path,'../data/metabolicTasks','metabolicTasks_Essential.txt');
 taskStruct = parseTaskList(essentialTasks);
 %taskStruct = taskStruct(end);
 
@@ -120,10 +120,10 @@
 
 outputModel = inputModel;
 
-% block all biomass equations
-%ind = find(startsWith(outputModel.rxns,'biomass'));
-%outputModel.ub(ind) = 0;
-%outputModel.lb(ind) = 0;
+% block human biomass equations
+ind = find(strcmp(outputModel.rxns,'MAR13082'));
+outputModel.ub(ind) = 0;
+outputModel.lb(ind) = 0;
 outputModel.c(:)  = 0;
 
 % reset object function to "biomass_components"

diff --git a/code/getModelFromOrthology.m b/code/getModelFromOrthology.m
@@ -43,7 +43,8 @@
 templateModel.description = '';
 templateModel.version = '';
 templateModel.annotation = structfun(@(x) '',templateModel.annotation,'UniformOutput',0);
-
+templateModel.annotation.defaultLB = -1000;
+templateModel.annotation.defaultUB = 1000;
 
 % find the index of non-empty grRules before replacing genes
 preNonEmptyRuleInd = find(~cellfun(@isempty, templateModel.grRules));