Changes to ParseModifications (can now ignore mods at termini), the PositionFrequencyAnalysis UtilProtein class (now updates peptide mod positions to protein positions), and the PFA argument (list of named tuples for clarity)
smith-chem-wisc · Feb 16, 2025 · 42e308f · 42e308f
1 parent 665df75
commit 42e308f
Show file tree

Hide file tree

Showing 4 changed files with 484 additions and 0 deletions.
diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs
@@ -347,6 +347,28 @@ public void CalculateProteinResultsTop3(bool useSharedPeptides)
                 }
             }
         }
+        /// <summary>
+        /// Calculates peptide-level PTM occupancy and stores the result in ModInfo.
+        /// Only sequences that are both marked for quantification and present in PeptideModifiedSequences are used.
+        /// </summary>
+        /// <param name="quantifiedPeptides"> Optional dictionary whose keys are peptide full sequences (as used in PeptideModifiedSequences) and whose values are the quantity to use for that peptide.
+        /// Peptides without an entry, or all peptides when this is null, use the value returned by GetTotalIntensity().</param>
+        /// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
+        /// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
+        public void CalculatePTMOccupancy(Dictionary<string, double> quantifiedPeptides=null, bool modOnNTerminus=true, bool modOnCTerminus=true)
+        {
+            quantifiedPeptides ??= new Dictionary<string, double>();
+
+            var peptides = new List<(string fullSeq, string baseSeq, List<string> proteinGroup, double intensity)>();
+            foreach (var fullSequence in _peptideModifiedSequencesToQuantify)
+            {
+                // One lookup per sequence; sequences that were never identified are skipped.
+                if (!PeptideModifiedSequences.TryGetValue(fullSequence, out var peptide))
+                {
+                    continue;
+                }
+
+                // A caller-supplied quantity wins; the total intensity is only computed when it is needed.
+                double quantity = quantifiedPeptides.TryGetValue(fullSequence, out double supplied)
+                    ? supplied
+                    : peptide.GetTotalIntensity();
+
+                peptides.Add((peptide.Sequence,
+                              peptide.BaseSequence,
+                              peptide.ProteinGroups.Select(pg => pg.ProteinGroupName).ToList(),
+                              quantity));
+            }
+
+            PositionFrequencyAnalysis pfa = new PositionFrequencyAnalysis();
+            pfa.ProteinGroupsOccupancyByPeptide(peptides, modOnNTerminus, modOnCTerminus);
+            ModInfo = pfa.Occupancy;
+        }
 
         /// <summary>
         /// This method uses the median polish algorithm to calculate protein quantities in each biological replicate.

diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs
@@ -25,6 +25,91 @@ namespace MzLibUtil
 {
     public static class ClassExtensions
     {
+        /// <summary>
+        /// Parses a full sequence and returns the bracketed modifications it contains, keyed by residue position.
+        /// Any '|' characters are removed before parsing. Positions are counted in non-modification characters,
+        /// so a modification written after the first residue has position 1 and a modification written before
+        /// the first residue has position 0.
+        /// </summary>
+        /// <param name="fullSequence"> Full sequence of the peptide in question</param>
+        /// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
+        /// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
+        /// <param name="ignoreTerminusMod"> If true, modifications at either terminus are left out of the result. A modification counts as N-terminal when no residue precedes it,
+        /// and as C-terminal when its closing bracket is the last character of the sequence.</param>
+        /// <returns> Dictionary with the key being the amino acid position of the mod and the value being the list of strings (bracket contents) representing the mods at that position</returns>
+        public static Dictionary<int, List<string>> ParseModifications(this string fullSequence, bool modOnNTerminus=false, bool modOnCTerminus=false, bool ignoreTerminusMod=false)
+        {
+            // use a regex to get all modifications
+            string pattern = @"\[(.+?)\](?<!\[I+\])"; //The "look-behind" condition prevents matching ] for metal ion modifications
+            Regex regex = new(pattern);
+
+            Dictionary<int, List<string>> modDict = new();
+
+            string fullSeq = fullSequence;
+            RemoveSpecialCharacters(ref fullSeq);
+
+            // Total length (brackets included) of every modification seen so far, skipped ones included.
+            int captureLengthSum = 0;
+            foreach (Match match in regex.Matches(fullSeq))
+            {
+                string val = match.Groups[1].Value;
+                int startIndex = match.Index;
+                int captureLength = match.Length;
+
+                // The start index minus the length of all earlier modifications is the number of
+                // non-modification characters (amino acids) that precede this modification.
+                int positionToAddToDict = startIndex - captureLengthSum;
+
+                // Account for this match before any early exit. Otherwise an ignored N-terminal
+                // modification would inflate the position of every modification after it.
+                captureLengthSum += captureLength;
+
+                bool atNTerminus = positionToAddToDict == 0;
+                bool atCTerminus = fullSeq.Length == startIndex + captureLength;
+
+                if (ignoreTerminusMod && (atNTerminus || atCTerminus))
+                {
+                    continue;
+                }
+
+                // Handle N terminus indexing
+                if (atNTerminus && !modOnNTerminus)
+                {
+                    positionToAddToDict++;
+                }
+
+                // Handle C terminus indexing
+                if (atCTerminus && modOnCTerminus)
+                {
+                    positionToAddToDict++;
+                }
+
+                // Several modifications can share one position; collect them in a single list.
+                if (!modDict.TryGetValue(positionToAddToDict, out List<string> modsAtPosition))
+                {
+                    modsAtPosition = new List<string>();
+                    modDict.Add(positionToAddToDict, modsAtPosition);
+                }
+                modsAtPosition.Add(val);
+            }
+            return modDict;
+        }
+
+        /// <summary>
+        /// Rewrites the given sequence in place, replacing every match of a regular-expression pattern.
+        /// With the default arguments every '|' is deleted; that character appears when multiple mods sit
+        /// on a single amino acid and would otherwise throw off position numbering.
+        /// </summary>
+        /// <param name="fullSequence"> Sequence to rewrite; the result is written back through the ref parameter.</param>
+        /// <param name="replacement"> Regex replacement pattern substituted for each match. Defaults to the empty string.</param>
+        /// <param name="specialCharacter"> Regular-expression pattern to search for. Defaults to an escaped pipe.</param>
+        public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|")
+        {
+            // e.g. multiple modifications on a missed cleavage Lysine (K) are separated by '|'
+            fullSequence = Regex.Replace(fullSequence, specialCharacter, replacement);
+        }
+
         public static double[] BoxCarSmooth(this double[] data, int points)
         {
             // Force to be odd

diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs
@@ -0,0 +1,238 @@
+using System;
+using System.Collections.Generic;
+using System.Text.RegularExpressions;
+using Easy.Common.Extensions;
+
+namespace MzLibUtil
+{
+    // Should this have all of the parent data (i.e. protein group, protein, peptide, peptide position)? Unnecessary for now, but probably useful later.
+    /// <summary>
+    /// One named modification at one position, together with the intensity accumulated for it.
+    /// </summary>
+    public class UtilModification
+    {
+        /// <summary> Name of the modification as it was supplied to the constructor. </summary>
+        public string IdWithMotif { get; set; }
+
+        /// <summary>
+        /// Position of the modification. UtilPeptide.PeptideToProteinPositions overwrites this with the
+        /// shifted (protein-level) position.
+        /// </summary>
+        //NEED TO ENFORCE THIS EVERYWHERE OR CHECK IF ZERO OR ONE
+        public int PeptidePositionZeroIsNTerminus { get; set; }
+
+        /// <summary> Intensity attributed to this modification at this position. </summary>
+        public double Intensity { get; set; }
+
+        /// <summary> Creates a modification record from its name, position and starting intensity. </summary>
+        public UtilModification(string name, int position, double intensity)
+        {
+            (IdWithMotif, PeptidePositionZeroIsNTerminus, Intensity) = (name, position, intensity);
+        }
+    }
+    /// <summary>
+    /// A peptide identified by its full (modified) sequence, with the modifications observed on it
+    /// grouped first by position and then by modification name.
+    /// </summary>
+    public class UtilPeptide
+    {
+        public string FullSequence { get; set; }
+        /// <summary> FullSequence with every bracketed modification removed; see SetBaseSequence. </summary>
+        public string BaseSequence { get; set; }
+        /// <summary> Protein used by PeptideToProteinPositions when UseParent is true. Not set by this class. </summary>
+        public UtilProtein ParentProtein { get; set; }
+        public int IndexInProtein { get; set; }
+        /// <summary> Position -> (modification name -> modification). </summary>
+        public Dictionary<int, Dictionary<string, UtilModification>> ModifiedAminoAcidPositions { get; set; }
+        public double Intensity { get; set; }
+
+        /// <summary>
+        /// Creates a peptide from its full sequence and derives the base sequence from it.
+        /// </summary>
+        /// <param name="fullSequence"> Full sequence, including bracketed modifications.</param>
+        /// <param name="mods"> Optional pre-populated modifications. A null or empty dictionary is replaced by a new empty one.</param>
+        public UtilPeptide(string fullSequence, Dictionary<int, Dictionary<string, UtilModification>> mods = null)
+        {
+            FullSequence = fullSequence;
+            ModifiedAminoAcidPositions = mods.IsNotNullOrEmpty() ? mods : new Dictionary<int, Dictionary<string, UtilModification>>();
+            SetBaseSequence();
+        }
+
+        /// <summary>
+        /// Recomputes BaseSequence by deleting every match of the modification pattern from FullSequence.
+        /// </summary>
+        /// <param name="modPattern"> Regular expression that matches one bracketed modification.</param>
+        public void SetBaseSequence(string modPattern = @"\[(.+?)\](?<!\[I+\])")
+        {
+            BaseSequence = Regex.Replace(FullSequence, modPattern, @"");
+        }
+
+        /// <summary> Not implemented. </summary>
+        /// <exception cref="NotImplementedException"> Always thrown.</exception>
+        public void AddModifications(Dictionary<int, string> mods)
+        {
+            throw new NotImplementedException();
+        }
+
+        /// <summary>
+        /// Shifts every modification position by an offset, re-keying ModifiedAminoAcidPositions in place and
+        /// updating each modification's stored position. Each call shifts again, so it should be applied once.
+        /// </summary>
+        /// <param name="offset"> Amount added to every position. Values of zero or less leave the positions untouched. Ignored when <paramref name="UseParent"/> is true.</param>
+        /// <param name="UseParent"> If true, the offset is the zero-based index of BaseSequence within ParentProtein.Sequence.</param>
+        /// <exception cref="InvalidOperationException"> <paramref name="UseParent"/> is true but the parent protein or its sequence is missing, or the base sequence does not occur in it.</exception>
+        public void PeptideToProteinPositions(int offset=0, bool UseParent=false)
+        {
+            if (UseParent)
+            {
+                string proteinSequence = ParentProtein?.Sequence;
+                if (proteinSequence == null)
+                {
+                    throw new InvalidOperationException("Cannot derive the peptide offset: the parent protein or its sequence has not been set.");
+                }
+
+                offset = proteinSequence.IndexOf(BaseSequence, StringComparison.Ordinal);
+                if (offset < 0)
+                {
+                    // A -1 offset would silently move every modification one residue to the left.
+                    throw new InvalidOperationException("Cannot derive the peptide offset: the base sequence was not found in the parent protein sequence.");
+                }
+            }
+            else if (offset <= 0)
+            {
+                return; // keep current mod indexing if not offsetting.
+            }
+
+            // Build the shifted map first so that old and new keys can overlap safely.
+            var shifted = new Dictionary<int, Dictionary<string, UtilModification>>(ModifiedAminoAcidPositions.Count);
+            foreach (var entry in ModifiedAminoAcidPositions)
+            {
+                int positionInProtein = entry.Key + offset;
+                foreach (var mod in entry.Value.Values)
+                {
+                    mod.PeptidePositionZeroIsNTerminus = positionInProtein;
+                }
+                shifted[positionInProtein] = entry.Value;
+            }
+
+            // Mutate the existing dictionary instead of swapping it, so outside references stay valid.
+            ModifiedAminoAcidPositions.Clear();
+            foreach (var entry in shifted)
+            {
+                ModifiedAminoAcidPositions[entry.Key] = entry.Value;
+            }
+        }
+    }
+
+    /// <summary>
+    /// A protein with the peptides assigned to it and, once SetProteinModsFromPeptides has run,
+    /// the modifications of those peptides merged by position.
+    /// </summary>
+    public class UtilProtein
+    {
+        public string Name { get; set; }
+        /// <summary> Protein sequence. Not set by the constructor. </summary>
+        public string Sequence { get; set; }
+        public Dictionary<string, UtilPeptide> Peptides { get; set; }
+        /// <summary> Position -> (modification name -> modification). Null until SetProteinModsFromPeptides is called. </summary>
+        public Dictionary<int, Dictionary<string, UtilModification>> ModifiedAminoAcidPositionsInProtein { get; set; }
+
+        /// <summary> Creates a protein with the given name. </summary>
+        /// <param name="name"> Protein name.</param>
+        /// <param name="peptides"> Optional peptides dictionary; a new empty one is created when null.</param>
+        public UtilProtein(string name, Dictionary<string, UtilPeptide> peptides=null)
+        {
+            Name = name;
+            Peptides = peptides ?? new Dictionary<string, UtilPeptide>();
+        }
+
+        /// <summary>
+        /// Rebuilds ModifiedAminoAcidPositionsInProtein from the current peptides. For every position and
+        /// modification name, the stored intensity is the sum over peptides of
+        /// (modification intensity / peptide intensity).
+        /// </summary>
+        public void SetProteinModsFromPeptides()
+        {
+            // for now, this method must be used AFTER peptide mod positions are offsetted to protein positions
+            ModifiedAminoAcidPositionsInProtein = new Dictionary<int, Dictionary<string, UtilModification>>();
+            foreach (var peptide in Peptides.Values)
+            {
+                foreach (var modpos in peptide.ModifiedAminoAcidPositions)
+                {
+                    if (!ModifiedAminoAcidPositionsInProtein.TryGetValue(modpos.Key, out var modsAtPosition))
+                    {
+                        modsAtPosition = new Dictionary<string, UtilModification>();
+                        ModifiedAminoAcidPositionsInProtein[modpos.Key] = modsAtPosition;
+                    }
+
+                    foreach (var mod in modpos.Value.Values)
+                    {
+                        if (!modsAtPosition.TryGetValue(mod.IdWithMotif, out var proteinMod))
+                        {
+                            proteinMod = new UtilModification(mod.IdWithMotif, modpos.Key, 0);
+                            modsAtPosition[mod.IdWithMotif] = proteinMod;
+                        }
+
+                        // A peptide with zero intensity contributes nothing; dividing by it would give
+                        // NaN or Infinity and corrupt the running sum for this position.
+                        if (peptide.Intensity != 0)
+                        {
+                            proteinMod.Intensity += mod.Intensity / peptide.Intensity; // might need to add some magic later to keep stored the mod intensity and the peptide intensity for MM output
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// <summary>
+    /// A named protein group and the proteins that belong to it.
+    /// </summary>
+    public class UtilProteinGroup
+    {
+        public string Name { get; set;}
+        /// <summary> Proteins of the group, keyed by protein name. </summary>
+        public Dictionary<string, UtilProtein> Proteins {  get; set; }
+        /// <summary> Set by PositionFrequencyAnalysis to "peptide" or "protein". </summary>
+        public string OccupancyLevel { get; set; }
+
+        /// <summary> Creates a protein group with the given name. </summary>
+        /// <param name="name"> Protein group name.</param>
+        /// <param name="proteins"> Optional proteins dictionary; a new empty one is created when null.</param>
+        public UtilProteinGroup(string name, Dictionary<string, UtilProtein> proteins = null)
+        {
+            Name = name;
+            Proteins = proteins ?? new Dictionary<string, UtilProtein>();
+        }
+    }
+    /// <summary>
+    /// Builds a protein group -> protein -> peptide -> modification hierarchy from a list of quantified peptides.
+    /// </summary>
+    public class PositionFrequencyAnalysis
+    {
+        /// <summary>
+        /// Result of the most recent ProteinGroupsOccupancyByPeptide call, keyed by protein group name.
+        /// Null until that method has been called.
+        /// </summary>
+        public Dictionary<string, UtilProteinGroup> Occupancy { get; private set; }
+
+        /// <summary>
+        /// Calculates the occupancy of post-translational modifications at the peptide level and stores it in Occupancy.
+        /// Each protein group name is split on '|' into protein names. Under every protein, peptides are keyed by base
+        /// sequence; a peptide's Intensity is the summed intensity of all inputs sharing that base sequence, and each
+        /// modification's Intensity is the summed intensity of the inputs that carry it at that position.
+        /// </summary>
+        /// <param name="peptides"> A List of Tuples whose entries are ordered as (FullSequence, BaseSequence, ProteinGroups, Intensity) for each peptide.
+        /// When BaseSequence is null or empty it is derived from FullSequence.</param>
+        /// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
+        /// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
+        /// <param name="ignoreTerminusMod"> If true, modifications at either peptide terminus are not recorded.</param>
+        /// <exception cref="ArgumentNullException"> <paramref name="peptides"/> is null.</exception>
+        public void ProteinGroupsOccupancyByPeptide(List<(string fullSeq, string baseSeq, List<string> proteinGroup, double intensity)> peptides, bool modOnNTerminus = true, bool modOnCTerminus = true, bool ignoreTerminusMod=false)
+        {
+            if (peptides == null)
+            {
+                throw new ArgumentNullException(nameof(peptides));
+            }
+
+            var proteinGroups = new Dictionary<string, UtilProteinGroup>();
+
+            // Go through the peptides given
+            foreach (var pep in peptides)
+            {
+                // in case the base sequence is null or empty and we need to get it from the full sequence
+                string baseSeq = string.IsNullOrEmpty(pep.baseSeq) ? pep.fullSeq : pep.baseSeq;
+                ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", @"\[(.+?)\](?<!\[I+\])");
+
+                if (pep.proteinGroup.Count == 0)
+                {
+                    continue; // nothing to attribute this peptide to
+                }
+
+                // The parsed modifications depend only on the peptide, so parse once rather than once per protein.
+                // Want both arguments passed here to be true if need to later filter out peptide terminal mods that are not protein terminal mods
+                Dictionary<int, List<string>> peptideMods = pep.fullSeq.ParseModifications(modOnNTerminus, modOnCTerminus, ignoreTerminusMod);
+
+                // Go through the peptide's protein groups
+                foreach (var pg in pep.proteinGroup)
+                {
+                    // If have not seen that protein group, store it
+                    if (!proteinGroups.TryGetValue(pg, out var proteinGroup))
+                    {
+                        proteinGroup = new UtilProteinGroup(pg) { OccupancyLevel = "peptide" };
+                        proteinGroups[pg] = proteinGroup;
+                    }
+
+                    // Go through the proteins in each protein group
+                    foreach (var proteinName in pg.Split('|'))
+                    {
+                        // Add the protein to the protein group's dictionary if it has not been added
+                        if (!proteinGroup.Proteins.TryGetValue(proteinName, out var protein))
+                        {
+                            protein = new UtilProtein(proteinName);
+                            proteinGroup.Proteins[proteinName] = protein;
+                        }
+
+                        // If the peptide's base sequence has not been seen, add it to the protein's dictionary
+                        if (!protein.Peptides.TryGetValue(baseSeq, out var peptide))
+                        {
+                            peptide = new UtilPeptide(pep.fullSeq);
+                            protein.Peptides[baseSeq] = peptide;
+                        }
+
+                        // Track the total intensity of all forms of this base sequence
+                        peptide.Intensity += pep.intensity;
+
+                        // Go through the modified positions found from the full sequence
+                        foreach (var modpos in peptideMods)
+                        {
+                            // If that position has not been recorded as containing a modification, add it
+                            if (!peptide.ModifiedAminoAcidPositions.TryGetValue(modpos.Key, out var modifiedPosition))
+                            {
+                                modifiedPosition = new Dictionary<string, UtilModification>();
+                                peptide.ModifiedAminoAcidPositions[modpos.Key] = modifiedPosition;
+                            }
+
+                            // Go through the modifications found at a modified amino acid index
+                            foreach (var mod in modpos.Value)
+                            {
+                                // First sighting of this modification name at this position starts at intensity 0
+                                if (!modifiedPosition.TryGetValue(mod, out var modification))
+                                {
+                                    modification = new UtilModification(mod, modpos.Key, 0);
+                                    modifiedPosition[mod] = modification;
+                                }
+                                // Increase the intensity of the modification by the intensity of the peptide
+                                modification.Intensity += pep.intensity;
+                            }
+                        }
+                    }
+                }
+            }
+            Occupancy = proteinGroups;
+        }
+
+        /// <summary> Not implemented. </summary>
+        /// <exception cref="NotImplementedException"> Always thrown.</exception>
+        public void ProteinGroupsOccupancyByProtein(Dictionary<string, string> proteinSequences) // Dictionary<accession, sequence>
+        {
+            throw new NotImplementedException();
+        }
+
+        /// <summary>
+        /// Converts the modification positions of one stored peptide from peptide positions to protein positions
+        /// and marks its protein group as being at the "protein" occupancy level.
+        /// </summary>
+        /// <param name="proteinGroupName"> Key of the protein group in Occupancy.</param>
+        /// <param name="proteinName"> Key of the protein within that group.</param>
+        /// <param name="peptide"> Base sequence key of the peptide within that protein.</param>
+        /// <param name="OneBasedStartResidue"> One-based position in the protein of the peptide's first residue.</param>
+        public void ChangePeptideToProteinOccupancyIndex(string proteinGroupName, string proteinName, string peptide, int OneBasedStartResidue)
+        {
+            // Resolve everything first so a failed lookup does not leave the group flagged as "protein".
+            UtilProteinGroup proteinGroup = Occupancy[proteinGroupName];
+            UtilPeptide target = proteinGroup.Proteins[proteinName].Peptides[peptide];
+
+            proteinGroup.OccupancyLevel = "protein";
+
+            // Peptide position p of a peptide starting at one-based residue s is protein position s + p - 1,
+            // so the additive shift is s - 1. This matches the zero-based IndexOf offset used when a peptide
+            // is located through its parent protein.
+            target.PeptideToProteinPositions(OneBasedStartResidue - 1);
+        }
+    }
+}