From d471feb8a2e118a401b9054b8fd9bcfa63d78cfb Mon Sep 17 00:00:00 2001 From: towsey Date: Sat, 5 Oct 2019 16:43:43 +1000 Subject: [PATCH] Experiment with different score normalisations Issue #252 - z-score normalisation works best even though the scores are not normally distributed. --- src/AnalysisPrograms/Sandpit.cs | 2 +- .../ContentDescription.cs | 5 ++- .../ContentDescriptionTools/DataProcessing.cs | 41 ++++++++++++++++++- src/TowseyLibrary/Statistics.cs | 11 ++++- 4 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/AnalysisPrograms/Sandpit.cs b/src/AnalysisPrograms/Sandpit.cs index 48875b355..36530bc74 100644 --- a/src/AnalysisPrograms/Sandpit.cs +++ b/src/AnalysisPrograms/Sandpit.cs @@ -145,7 +145,7 @@ public static void ContentDescriptionApplyTemplates() var path = Path.Combine(@"C:\Ecoacoustics\Output\Test\Test24HourRecording", "Testing__2Maps.png"); var ldfcSpectrogram = Image.FromFile(path); var image = ContentVisualization.DrawLdfcSpectrogramWithContentScoreTracks(ldfcSpectrogram, contentPlots); - var path2 = Path.Combine(@"C:\Ecoacoustics\ContentDescription", "Testing_2Maps.CONTENTnew05.png"); + var path2 = Path.Combine(@"C:\Ecoacoustics\ContentDescription", "Testing_2Maps.CONTENTnew06.png"); image.Save(path2); Console.WriteLine("# Finished scanning recording with content description templates"); } diff --git a/src/AudioAnalysisTools/ContentDescriptionTools/ContentDescription.cs b/src/AudioAnalysisTools/ContentDescriptionTools/ContentDescription.cs index 0fff553ad..11df616d4 100644 --- a/src/AudioAnalysisTools/ContentDescriptionTools/ContentDescription.cs +++ b/src/AudioAnalysisTools/ContentDescriptionTools/ContentDescription.cs @@ -64,10 +64,11 @@ public static List ContentDescriptionOfMultipleRecordingFiles(FileInfo lis var plotDict = DataProcessing.ConvertResultsToPlots(completeListOfResults, 1440, 0); var contentPlots = DataProcessing.ConvertPlotDictionaryToPlotList(plotDict); - //contentPlots = 
DataProcessing.SubtractMeanPlusSd(contentPlots); + contentPlots = DataProcessing.SubtractMeanPlusSd(contentPlots); //the following did not work as well. - contentPlots = DataProcessing.SubtractModeAndSd(contentPlots); + //contentPlots = DataProcessing.SubtractModeAndSd(contentPlots); + //contentPlots = DataProcessing.PercentileThresholding(contentPlots, 80); return contentPlots; } diff --git a/src/AudioAnalysisTools/ContentDescriptionTools/DataProcessing.cs b/src/AudioAnalysisTools/ContentDescriptionTools/DataProcessing.cs index 3b0eba332..fcf282533 100644 --- a/src/AudioAnalysisTools/ContentDescriptionTools/DataProcessing.cs +++ b/src/AudioAnalysisTools/ContentDescriptionTools/DataProcessing.cs @@ -227,7 +227,7 @@ public static Dictionary ApplyBandPass(Dictionary templateDict, Dictionary oneMinuteIndices) { // convert the template dictionary to an array of averaged values - var dictionaryOfIndexAverages = DataProcessing.AverageIndicesInDictionary(templateDict); + var dictionaryOfIndexAverages = AverageIndicesInDictionary(templateDict); var templateVector = ConvertDictionaryToVector(dictionaryOfIndexAverages); // the score spectrum to be returned @@ -455,6 +455,45 @@ public static List SubtractModeAndSd(List plots) return opPlots; } + public static List PercentileThresholding(List plots, int percentile) + { + var opPlots = new List(); + + // for each plot, rescale its scores relative to that plot's own percentile value, then clamp them to the range 0 to 4 + foreach (Plot plot in plots) + { + var scores = plot.data; + var threshold = Statistics.GetPercentileValue(scores, percentile); + //NormalDist.AverageAndSD(scores, out double average, out double sd); + + // rescale the scores in place; NOTE(review): this appears to overwrite the plot's own data array - confirm that is intended + for (int i = 0; i < scores.Length; i++) + { + // Normalize scores relative to threshold. NOTE(review): a threshold of exactly 1 makes the divisor zero (result is NaN or infinity before clamping) - confirm scores stay below 1 + scores[i] = (scores[i] - threshold) / (1 - threshold); + if (scores[i] < 0.0) + { + scores[i] = 0.0; + } + + if (scores[i] > 4.0) + { + scores[i] = 4.0; + } + + //normalize full scale to 4 SDs. 
+ //scores[i] /= 4.0; + } + + // NOTE(review): after this rescaling the clamped range 0 to 4 is in units of (1 - threshold), not SDs above the mean. + // The plot threshold of 0.5 looks carried over from the z-score normalisation - confirm it is appropriate for this scale. + plot.threshold = 0.5; + opPlots.Add(plot); + } + + return opPlots; + } + public static List ConvertPlotDictionaryToPlotList(Dictionary dict) { var list = new List(); diff --git a/src/TowseyLibrary/Statistics.cs b/src/TowseyLibrary/Statistics.cs index 48d58d94d..a615e451a 100644 --- a/src/TowseyLibrary/Statistics.cs +++ b/src/TowseyLibrary/Statistics.cs @@ -1,4 +1,4 @@ -// +// // All code in this file and all associated files are the copyright and property of the QUT Ecoacoustics Research Group (formerly MQUTeR, and formerly QUT Bioacoustics Research Group). // @@ -21,6 +21,15 @@ public static double GetMedian(double[] v) return median; } + public static double GetPercentileValue(double[] v, int percentile) + { + Tuple tuple = DataTools.SortArray(v); // NOTE(review): the indexing below assumes Item2 holds the values sorted ascending - confirm against DataTools.SortArray + var fraction = percentile / 100.0; + var percentileBin = (int)Math.Round(v.Length * fraction); // NOTE(review): equals v.Length when percentile is 100 (or close to it), which would index past the end if Item2 has v.Length elements + double percentileValue = tuple.Item2[percentileBin]; + return percentileValue; + } + /// /// Analyses an array of events or hits, represented by a binary of matrix. /// Assumes a Poisson distribution