cherry-picking last commit of ptm stoich to clean commits after rebasing to master and matching content
smith-chem-wisc · Feb 16, 2025 · 0184816 · 0184816
1 parent 42e308f
commit 0184816
Show file tree

Hide file tree

Showing 9 changed files with 43 additions and 68 deletions.
diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs
@@ -1,5 +1,7 @@
 using Easy.Common.Extensions;
 using MathNet.Numerics.Statistics;
+using MzLibUtil;
+using Proteomics;
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -14,6 +16,7 @@ public class FlashLfqResults
         public readonly Dictionary<string, Peptide> PeptideModifiedSequences;
         public readonly Dictionary<string, ProteinGroup> ProteinGroups;
         public readonly Dictionary<SpectraFileInfo, List<ChromatographicPeak>> Peaks;
+        public Dictionary<string, MzLibUtil.UtilProteinGroup> ModInfo { get; private set; }
         private readonly HashSet<string> _peptideModifiedSequencesToQuantify;
         public string PepResultString { get; set; }
         public double MbrQValueThreshold { get; set; }

diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs
@@ -280,6 +280,9 @@ public FlashLfqResults Run()
             // do top3 protein quantification
             _results.CalculateProteinResultsMedianPolish(UseSharedPeptidesForProteinQuant);
 
+            // calculate ptm occupancy at the peptide level
+            _results.CalculatePTMOccupancy();
+
             // do Bayesian protein fold-change analysis
             if (BayesianProteinQuant)
             {

diff --git a/mzLib/FlashLFQ/Peptide.cs b/mzLib/FlashLFQ/Peptide.cs
@@ -1,4 +1,5 @@
-using System.Collections.Generic;
+using Easy.Common.Extensions;
+using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 
@@ -67,6 +68,18 @@ public void SetIntensity(SpectraFileInfo fileInfo, double intensity)
             }
         }
 
+        public double GetTotalIntensity()
+        {
+            if (Intensities.IsNotNullOrEmpty())
+            {
+                return Intensities.Sum(i => i.Value);
+            }
+            else
+            {
+                return 0;
+            }
+        }
+
         public DetectionType GetDetectionType(SpectraFileInfo fileInfo)
         {
             if (DetectionTypes.TryGetValue(fileInfo, out DetectionType detectionType))

diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs
@@ -137,7 +137,10 @@ public UtilProteinGroup(string name, Dictionary<string, UtilProtein> proteins =
         }
     }
     public class PositionFrequencyAnalysis
-    {
+    { 
+
+        public Dictionary<string, UtilProteinGroup> Occupancy { get; private set; }
+
         /// <summary>
         /// Calculates the occupancy of post-translational modifications at the peptide level. 
         /// </summary>
@@ -147,11 +150,7 @@ public class PositionFrequencyAnalysis
         /// <returns> A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity
         /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for 
         /// all of the amino acids in that peptide.</returns>
-        /// 
-
-        public Dictionary<string, UtilProteinGroup> Occupancy { get; private set; }
-
-
+        ///
         public void ProteinGroupsOccupancyByPeptide(List<(string fullSeq, string baseSeq, List<string> proteinGroup, double intensity)> peptides, bool modOnNTerminus = true, bool modOnCTerminus = true, bool ignoreTerminusMod=false)
         {
             var proteinGroups = new Dictionary<string, UtilProteinGroup>();
@@ -228,11 +227,5 @@ public void ProteinGroupsOccupancyByProtein(Dictionary<string, string> proteinSe
         {
             throw new NotImplementedException();
         }
-
-        public void ChangePeptideToProteinOccupancyIndex(string proteinGroupName, string proteinName, string peptide, int OneBasedStartResidue)
-        {
-            Occupancy[proteinGroupName].OccupancyLevel = "protein";
-            Occupancy[proteinGroupName].Proteins[proteinName].Peptides[peptide].PeptideToProteinPositions(OneBasedStartResidue);
-        }
     }
 }
diff --git a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs
@@ -4,6 +4,7 @@
 using System.Text.RegularExpressions;
 using Chemistry;
 using Omics.Fragmentation.Peptide;
+using MzLibUtil;
 
 namespace Omics.SpectrumMatch
 {
@@ -92,58 +93,15 @@ public static string RemoveParentheses(string baseSequence)
         }
 
         /// <summary>
-        /// Parses the full sequence to identify mods
+        /// Parses the full sequence to identify mods.
         /// </summary>
-        /// <param name="fullSequence"> Full sequence of the peptide in question</param>
+        /// <param name="fullSeq"> Full sequence of the peptide in question</param>
+        /// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
+        /// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
         /// <returns> Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod</returns>
-        public static Dictionary<int, List<string>> ParseModifications(string fullSeq)
+        public static Dictionary<int, List<string>> ParseModifications(string fullSeq, bool modOnNTerminus = true, bool modOnCTerminus = true)
         {
-            // use a regex to get all modifications
-            string pattern = @"\[(.+?)\]";
-            Regex regex = new(pattern);
-
-            // remove each match after adding to the dict. Otherwise, getting positions
-            // of the modifications will be rather difficult.
-            //int patternMatches = regex.Matches(fullSeq).Count;
-            Dictionary<int, List<string>> modDict = new();
-
-
-            // If there is a missed cleavage, then there will be a label on K and a Label on X modification.
-            // It'll be like [label]|[label] which complicates the positional stuff a little bit. Therefore, 
-            // RemoveSpecialCharacters will remove the "|", to ease things later on. 
-            RemoveSpecialCharacters(ref fullSeq);
-            MatchCollection matches = regex.Matches(fullSeq);
-            int captureLengthSum = 0; 
-            foreach (Match match in matches)
-            {
-                GroupCollection group = match.Groups;
-                string val = group[1].Value;
-                int startIndex = group[0].Index;
-                int captureLength = group[0].Length;
-
-                List<string> modList = new List<string>();
-                modList.Add(val);
-
-                // The position of the amino acids is tracked by the positionToAddToDict variable. It takes the 
-                // startIndex of the modification Match and removes the cumulative length of the modifications
-                // found (including the brackets). The difference will be the number of nonmodification characters, 
-                // or the number of amino acids prior to the startIndex in the sequence. 
-                int positionToAddToDict = startIndex - captureLengthSum;
-
-                // check to see if key already exist
-                // if the already key exists, update the current position with the capture length + 1.
-                // otherwise, add the modification to the dict.
-                if (modDict.ContainsKey(positionToAddToDict))
-                {
-                    modDict[positionToAddToDict].Add(val);
-                }
-                else
-                {
-                    modDict.Add(positionToAddToDict, modList);
-                }
-                captureLengthSum += captureLength;
-            }
-            return modDict;
+            return fullSeq.ParseModifications(modOnNTerminus, modOnCTerminus);
         }
 
         /// <summary>
@@ -155,9 +113,7 @@ public static Dictionary<int, List<string>> ParseModifications(string fullSeq)
         /// <returns></returns>
         public static void RemoveSpecialCharacters(ref string fullSeq, string replacement = @"", string specialCharacter = @"\|")
         {
-            // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K)
-            Regex regexSpecialChar = new(specialCharacter);
-            fullSeq = regexSpecialChar.Replace(fullSeq, replacement);
+            MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref fullSeq, replacement, specialCharacter);
         }
 
 

diff --git a/mzLib/Test/AveragingTests/TestAveragingSpectraWriteFile.cs b/mzLib/Test/AveragingTests/TestAveragingSpectraWriteFile.cs
@@ -112,8 +112,8 @@ public static void TestOutputToCustomDirectoryAndNameMzML()
         {
             // output to a different directory than the files were originally in
             Parameters.OutputType = OutputType.MzML;
-            string customDestinationDirectory = Path.Combine(OutputDirectory, "NewTestingDirectory");
-            string customDestinationDirectory2 = Path.Combine(OutputDirectory, "NewTestingDirectory2");
+            string customDestinationDirectory = Path.Combine(OutputDirectory, "NewAveragedTestingDirectory");
+            string customDestinationDirectory2 = Path.Combine(OutputDirectory, "NewAveragedTestingDirectory2");
             Directory.CreateDirectory(customDestinationDirectory);
             string customName = "AveragedSpectra";
 

diff --git a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs
@@ -188,7 +188,7 @@ public static void TestParseModification()
 
             // psm with two mods on the same amino acid
             string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK";
-            modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(fullSeq);
+            modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(fullSeq, true, true);
             Assert.That(modDict.Count == 1);
             Assert.That(modDict.ContainsKey(0));
             Assert.That(modDict[0].Count == 2);

diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs
@@ -2,6 +2,7 @@
 using Assert = NUnit.Framework.Legacy.ClassicAssert;
 using MzLibUtil;
 using Readers;
+using System.Collections.Generic;
 
 namespace Test
 {

diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs
@@ -1361,6 +1361,12 @@ public static void TestFlashLfqQoutputRealData()
             var peaks = results.Peaks.Values.ToList();
             var peptides = results.PeptideModifiedSequences.Values.ToList();
             var proteins = results.ProteinGroups.Values.ToList();
+            var modInfo = results.ModInfo;
+
+            Assert.AreEqual(6989789.488346225, peptides[0].GetTotalIntensity(), 0.0000001);
+            Assert.AreEqual(726036.539062, peptides[4].GetTotalIntensity(), 0.000001);
+            Assert.AreEqual(726036.539062, modInfo["Q7KZF4"].Proteins["Q7KZF4"].Peptides["EYGMIYLGK"].ModifiedAminoAcidPositions[4]["Common Variable:Oxidation on M"].Intensity, 0.000001);
+            Assert.AreEqual(modInfo["Q7KZF4"].Proteins["Q7KZF4"].Peptides["EYGMIYLGK"].Intensity, modInfo["Q7KZF4"].Proteins["Q7KZF4"].Peptides["EYGMIYLGK"].ModifiedAminoAcidPositions[4]["Common Variable:Oxidation on M"].Intensity, 0.000001);
 
             Assert.AreEqual(4, peaks[0].Count(m => m.IsMbrPeak == false));
             Assert.AreEqual(5, peaks[1].Count(m => m.IsMbrPeak == false));