From a01ec6c477a42d4c6286b2d5236b25f1a29c8e5b Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Sun, 7 Jan 2024 01:54:09 +0300
Subject: [PATCH 1/3] tools/compare: when dumping json, pretty-print it

It's essentially non-human-readable otherwise. The file size is
unlikely to matter, and if it does, the output can simply be
compressed later on.
---
 tools/compare.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/compare.py b/tools/compare.py
index 3cc9e5eb4a..7572520cc0 100755
--- a/tools/compare.py
+++ b/tools/compare.py
@@ -327,7 +327,7 @@ def main():
     # Optionally, diff and output to JSON
     if args.dump_to_json is not None:
         with open(args.dump_to_json, "w") as f_json:
-            json.dump(diff_report, f_json)
+            json.dump(diff_report, f_json, indent=1)
 
 
 class TestParser(unittest.TestCase):

From 9282814d0c4c5ec314efb96f4d0390db5fcc8294 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Sun, 7 Jan 2024 02:26:51 +0300
Subject: [PATCH 2/3] tools/compare: add failing test

---
 tools/gbench/Inputs/test5_run0.json | 18 ++++++
 tools/gbench/Inputs/test5_run1.json | 18 ++++++
 tools/gbench/report.py              | 96 +++++++++++++++++++++++++++++
 3 files changed, 132 insertions(+)
 create mode 100644 tools/gbench/Inputs/test5_run0.json
 create mode 100644 tools/gbench/Inputs/test5_run1.json

diff --git a/tools/gbench/Inputs/test5_run0.json b/tools/gbench/Inputs/test5_run0.json
new file mode 100644
index 0000000000..074103b11d
--- /dev/null
+++ b/tools/gbench/Inputs/test5_run0.json
@@ -0,0 +1,18 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_ManyRepetitions",
+      "iterations": 1000,
+      "real_time": 1,
+      "cpu_time": 1000,
+      "time_unit": "s"
+    }
+  ]
+}
diff --git a/tools/gbench/Inputs/test5_run1.json b/tools/gbench/Inputs/test5_run1.json
new file mode 100644
index 0000000000..430df9f0da
--- /dev/null
+++ b/tools/gbench/Inputs/test5_run1.json
@@ -0,0 +1,18 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_ManyRepetitions",
+      "iterations": 1000,
+      "real_time": 1000,
+      "cpu_time": 1,
+      "time_unit": "s"
+    }
+  ]
+}
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index 10e6b508f0..b6f7c3bf9a 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -1490,6 +1490,102 @@ def test_json_diff_report_pretty_printing(self):
             self.assertEqual(out["name"], expected)
 
 
+class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly2(
+    unittest.TestCase
+):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+
+            testInputs = os.path.join(
+                os.path.dirname(os.path.realpath(__file__)), "Inputs"
+            )
+            testOutput1 = os.path.join(testInputs, "test5_run0.json")
+            testOutput2 = os.path.join(testInputs, "test5_run1.json")
+            with open(testOutput1, "r") as f:
+                json1 = json.load(f)
+                json1["benchmarks"] = [
+                    json1["benchmarks"][0] for i in range(1000)
+                ]
+            with open(testOutput2, "r") as f:
+                json2 = json.load(f)
+                json2["benchmarks"] = [
+                    json2["benchmarks"][0] for i in range(1000)
+                ]
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(json1, json2, utest=True)
+
+    @unittest.expectedFailure
+    def test_json_diff_report_pretty_printing(self):
+        expect_line = [
+            "BM_ManyRepetitions_pvalue",
+            "0.0000",
+            "0.0000",
+            "U",
+            "Test,",
+            "Repetitions:",
+            "1000",
+            "vs",
+            "1000",
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False
+        )
+        output_lines = output_lines_with_header[2:]
+        found = False
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(" ") if x]
+            found = expect_line == parts
+            if found:
+                break
+        self.assertTrue(found)
+
+    @unittest.expectedFailure
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                "name": "BM_ManyRepetitions",
+                "label": "",
+                "time_unit": "s",
+                "run_type": "",
+                "aggregate_name": "",
+                "utest": {
+                    "have_optimal_repetitions": True,
+                    "cpu_pvalue": 0.0,
+                    "time_pvalue": 0.0,
+                    "nr_of_repetitions": 1000,
+                    "nr_of_repetitions_other": 1000,
+                },
+            },
+            {
+                "name": "OVERALL_GEOMEAN",
+                "label": "",
+                "measurements": [
+                    {
+                        "real_time": 1.0,
+                        "cpu_time": 1000.000000000069,
+                        "real_time_other": 1000.000000000069,
+                        "cpu_time_other": 1.0,
+                        "time": 999.000000000069,
+                        "cpu": -0.9990000000000001,
+                    }
+                ],
+                "time_unit": "s",
+                "run_type": "aggregate",
+                "aggregate_name": "geomean",
+                "utest": {},
+            },
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(self.json_diff_report, expected_output):
+            self.assertEqual(out["name"], expected["name"])
+            self.assertEqual(out["time_unit"], expected["time_unit"])
+            assert_utest(self, out, expected)
+
+
 def assert_utest(unittest_instance, lhs, rhs):
     if lhs["utest"]:
         unittest_instance.assertAlmostEqual(

From 98757f880f6a36624280f2a32f8774bccc45272e Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Sun, 7 Jan 2024 01:59:58 +0300
Subject: [PATCH 3/3] tools/compare: don't actually discard valid (but zero) `pvalue`

So, this is embarrassing. For a very large number of repetitions,
we can end up with a p-value of exactly zero; since 0.0 is falsy,
the old check treated a perfectly valid result as if we had failed
to compute the p-value at all...
---
 tools/gbench/report.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index b6f7c3bf9a..7158fd1654 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -315,7 +315,7 @@ def get_difference_report(json1, json2, utest=False):
             have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
                 timings_cpu, timings_time
             )
-            if cpu_pvalue and time_pvalue:
+            if cpu_pvalue is not None and time_pvalue is not None:
                 utest_results = {
                     "have_optimal_repetitions": have_optimal_repetitions,
                     "cpu_pvalue": cpu_pvalue,
@@ -1518,7 +1518,6 @@ def load_results():
         json1, json2 = load_results()
         cls.json_diff_report = get_difference_report(json1, json2, utest=True)
 
-    @unittest.expectedFailure
    def test_json_diff_report_pretty_printing(self):
         expect_line = [
             "BM_ManyRepetitions_pvalue",
@@ -1543,7 +1542,6 @@ def test_json_diff_report_pretty_printing(self):
             break
         self.assertTrue(found)
 
-    @unittest.expectedFailure
     def test_json_diff_report(self):
         expected_output = [
             {