
Commit d9bbea2

Add functionality to export computed values into a CSV file
Move threshold files into data/ (instead of data/thresholds/) and remove the thresholds folder
1 parent a7b6143 commit d9bbea2

16 files changed: 121 additions, 10 deletions
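The CSV export added here is driven by the new --collect_csv pytest option (registered in conftest.py below), and the number of reference subjects can be capped via the new MAX_SUBJECTS environment variable. A hedged invocation sketch; the exact environment handling (REF_DIR) and test path are inferred from this diff rather than documented here:

    # assumed invocation: run the quicktest suite against reference data, cap the
    # number of subjects at 2, and collect per-comparison deltas into CSV files
    REF_DIR=/path/to/reference_subjects MAX_SUBJECTS=2 \
        python -m pytest test/quicktest --collect_csv=/path/to/csv_out

When --collect_csv is not given (default None), no CSV files are written and the tests run as before.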

test/quicktest/common.py (+35)
@@ -133,3 +133,38 @@ def __eq__(self, other):
         except BaseException as e:
             logger.exception(e)
         return within_bounds
+
+
+def write_table_file(
+    table_file: Path | None,
+    subject_id: str,
+    file: str,
+    scores: dict[int, int | float] | dict[str, int | float],
+) -> None:
+    """
+    Logs the calculated statistics (difference between test and reference) to a table file.
+
+    Parameters
+    ----------
+    table_file : Path, None
+        The file to write to, skip if None.
+    subject_id : str
+        The subject id.
+    file : str
+        The file associated with the comparison.
+    scores : dict[int | str, int | float]
+        The pairs of data associated with the comparison, e.g. index and value.
+    """
+    if not bool(table_file):
+        # no valid file passed, skip
+        return
+
+    for id, score in scores.items():
+        fmt = f'''"{{subject_id}}","{{file}}",{"{id:d}" if isinstance(id, int) else f'"{id}"'},'''
+        fmt += f"{{score:{'.6f' if isinstance(score, float) else 'd'}}}\n"
+        data = {"subject_id": subject_id, "file": file, "id": id, "score": score}
+        if not table_file.is_file():
+            with open(table_file, "w") as f:
+                f.write(",".join(data.keys()) + "\n")
+        with open(table_file, "a") as f:
+            f.write(fmt.format(**data))
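For orientation, write_table_file appends one row per score to the target CSV, creating a header from the data dict keys on first use; integer ids are written bare and float scores with six decimals. A hedged example of the resulting dice.csv (subject, file, and values are invented):

    subject_id,file,id,score
    "sub-01","aseg.mgz",17,0.998321
    "sub-01","aseg.mgz",53,0.997045

String ids, as used by the stats tests below (e.g. a measure name), are written quoted instead.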

test/quicktest/conftest.py (+13, -1)
@@ -32,10 +32,12 @@
 Folder with reference data (defined in environment variable).
 """
 reference_dir: Path = env["REF_DIR"]
+__subjects = (p for p in reference_dir.iterdir() if p.is_dir() and p.name not in ("logs", "slurm"))
+__max_subjects = int(os.environ.get("MAX_SUBJECTS", -1))
 """
 Load the test subjects from the reference path (one subject per folder).
 """
-ref_subjects: list[Path] = [p for p in reference_dir.iterdir() if p.is_dir()]
+ref_subjects: list[Path] = [p for i, p in enumerate(__subjects) if i < __max_subjects or __max_subjects < 0]

 assert len(ref_subjects) > 0, "No test subjects found!"

@@ -62,3 +64,13 @@ def ref_subject(request: pytest.FixtureRequest) -> SubjectDefinition:
 @pytest.fixture(scope="session")
 def test_subject(ref_subject: SubjectDefinition, subjects_dir: Path) -> SubjectDefinition:
     return ref_subject.with_subjects_dir(subjects_dir)
+
+def pytest_addoption(parser):
+    # the following option is for test_images and test_stats only
+    parser.addoption(
+        "--collect_csv",
+        action="store",
+        default=None,
+        type=Path,
+        help="Directory to store csv files that will collect all differences between reference and test.",
+    )
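The MAX_SUBJECTS cap above keeps at most that many reference subjects and treats the default of -1 as "no limit". A minimal self-contained sketch of that selection logic (plain strings stand in for the reference_dir paths):

    # stand-ins for reference_dir.iterdir() and int(os.environ.get("MAX_SUBJECTS", -1))
    subjects = ["subjA", "subjB", "subjC", "subjD"]
    max_subjects = 2
    picked = [s for i, s in enumerate(subjects) if i < max_subjects or max_subjects < 0]
    assert picked == ["subjA", "subjB"]  # with max_subjects = -1 all subjects would be kept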
File renamed without changes.
File renamed without changes.

test/quicktest/test_images.py (+33, -5)
@@ -10,7 +10,7 @@

 from FastSurferCNN.utils.metrics import dice_score

-from .common import SubjectDefinition, Tolerances
+from .common import SubjectDefinition, Tolerances, write_table_file

 logger = getLogger(__name__)

@@ -19,7 +19,7 @@
 @pytest.fixture(scope='module')
 def segmentation_tolerances(segmentation_image: str) -> Tolerances:

-    thresholds_file = Path(__file__).parent / f"data/thresholds/{segmentation_image}.yaml"
+    thresholds_file = Path(__file__).parent / f"data/{segmentation_image}.yaml"
     assert thresholds_file.exists(), f"The thresholds file {thresholds_file} does not exist!"
     return Tolerances(thresholds_file)

@@ -51,10 +51,11 @@ def compute_dice_score(test_data, reference_data, labels: dict[int, str]) -> tup
         Dice scores for each class.
     """
     dice_scores = {}
-    logger.debug("Dice scores:")
+    logger.debug("Dice scores (reporting non-zero):")
     for _, (label, lname) in enumerate(labels.items()):
         dice_scores[label] = dice_score(np.equal(reference_data, label), np.equal(test_data, label), validate=False)
-        logger.debug(f"Label {lname}: {dice_scores[label]:.4f}")
+        if dice_scores[label] > 0:
+            logger.debug(f"Label {lname}: {dice_scores[label]:.4f}")
     mean_dice_score = np.asarray(list(dice_scores.values())).mean()
     return mean_dice_score, dice_scores

@@ -96,6 +97,7 @@ def test_segmentation_image(
     ref_subject: SubjectDefinition,
     segmentation_image: str,
     segmentation_tolerances: Tolerances,
+    pytestconfig: pytest.Config,
 ):
     """
     Test the segmentation data by calculating and comparing dice scores.
@@ -110,6 +112,8 @@ def test_segmentation_image(
         Name of the segmentation image file.
     segmentation_tolerances: Tolerances
         Object to provide the relevant tolerances for the respective segmentation_image.
+    pytestconfig : pytest.Config
+        The session's config object.

     Raises
     ------
@@ -129,13 +133,23 @@
     # Compute the dice score
     mean_dice, dice_scores = compute_dice_score(test_data, reference_data, labels_lnames)

+    delta_dir: Path = pytestconfig.getoption("--collect_csv")
+    if delta_dir:
+        delta_dir.mkdir(parents=True, exist_ok=True)
+        write_table_file(delta_dir / "dice.csv", test_subject.name, segmentation_image, dice_scores)
+
     failed_labels = (i for i, dice in dice_scores.items() if not np.isclose(dice, 0, atol=labels_lnames_tols[i][1]))
     dice_exceeding_threshold = [f"Label {labels_lnames[lbl]}: {1-dice_scores[lbl]}" for lbl in failed_labels]
     assert [] == dice_exceeding_threshold, f"Dice scores in {segmentation_image} are not within range!"
     logger.debug("Dice scores are within range for all classes")


-def test_intensity_image(test_subject: SubjectDefinition, ref_subject: SubjectDefinition, intensity_image: str):
+def test_intensity_image(
+    test_subject: SubjectDefinition,
+    ref_subject: SubjectDefinition,
+    intensity_image: str,
+    pytestconfig: pytest.Config,
+):
     """
     Test the intensity data by calculating and comparing the mean square error.

@@ -147,6 +161,8 @@ def test_intensity_image(test_subject: SubjectDefinition, ref_subject: SubjectDe
         Definition of the reference subject.
     intensity_image : str
         Name of the image file.
+    pytestconfig : pytest.Config
+        The session's config object.

     Raises
     ------
@@ -158,6 +174,18 @@ def test_intensity_image(test_subject: SubjectDefinition, ref_subject: SubjectDe
     test_data = test_img.get_fdata()
     reference_file, reference_img = ref_subject.load_image(intensity_image)
     reference_data = reference_img.get_fdata()
+
+    delta_dir = pytestconfig.getoption("--collect_csv")
+    if delta_dir:
+        delta_dir.mkdir(parents=True, exist_ok=True)
+        # this analysis writes not only the max (tested below), but also the mean and some percentiles
+        absdelta = np.abs(test_data - reference_data)
+        scores = {p: v for p, v in zip(("median", "95th", "99th"), np.percentile(absdelta, (50, 95, 99)), strict=True)}
+        scores.update(mean=absdelta.mean(), max=np.max(absdelta))
+        reldelta = absdelta / np.maximum(np.maximum(abs(reference_data), abs(test_data)), 1e-8)
+        scores.update(rel=reldelta.mean(), relmax=np.max(reldelta))
+        write_table_file(delta_dir / "intensity.csv", test_subject.name, intensity_image, scores)
+
     # Check the image data
     np.testing.assert_allclose(test_data, reference_data, rtol=1e-4, err_msg="Image intensity data do not match!")

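The relative scores (rel, relmax) written to intensity.csv use a symmetric denominator, the larger magnitude of the two voxel values, clipped at 1e-8 to avoid division by zero. A hedged, standalone numpy sketch of that computation on toy data:

    import numpy as np

    test_data = np.array([1.0, 0.0, 2.0])
    reference_data = np.array([1.1, 0.0, 2.0])
    absdelta = np.abs(test_data - reference_data)
    # divide by the larger magnitude of reference and test value, but never by less than 1e-8
    reldelta = absdelta / np.maximum(np.maximum(np.abs(reference_data), np.abs(test_data)), 1e-8)
    print(reldelta.mean(), reldelta.max())  # these feed the "rel" and "relmax" columns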

test/quicktest/test_stats.py (+40, -4)
@@ -7,7 +7,7 @@

 from FastSurferCNN.segstats import PVStats

-from .common import ApproxAndLog, SubjectDefinition, Tolerances
+from .common import ApproxAndLog, SubjectDefinition, Tolerances, write_table_file

 logger = getLogger(__name__)

@@ -54,7 +54,7 @@ def measure_tolerances(stats_file: str) -> MeasureTolerances:
     MeasureTolerances
         The list of measures expected for this file.
     """
-    thresholds_file = Path(__file__).parent / f"data/thresholds/{stats_file}.yaml"
+    thresholds_file = Path(__file__).parent / f"data/{stats_file}.yaml"
     assert Path(thresholds_file).is_file(), f"The threshold file {thresholds_file} does not exist!"
     return MeasureTolerances(thresholds_file)

@@ -73,7 +73,7 @@ def stats_tolerances(stats_file: str) -> Tolerances:
     Tolerances
         Per-structure tolerances object
     """
-    thresholds_file = Path(__file__).parent / f"data/thresholds/{stats_file}.yaml"
+    thresholds_file = Path(__file__).parent / f"data/{stats_file}.yaml"
     assert Path(thresholds_file).is_file(), f"The threshold file {thresholds_file} does not exist!"
     return Tolerances(thresholds_file)

@@ -159,6 +159,7 @@ def test_measure_thresholds(
     ref_subject: SubjectDefinition,
     stats_file: str,
     measure_tolerances: MeasureTolerances,
+    pytestconfig: pytest.Config,
 ):
     """
     Test if the measure is within thresholds in stats_file.
@@ -173,6 +174,8 @@
         Name of the test directory.
     measure_tolerances : MeasureTolerances
         The object to provide the measure tolerances for stats_file.
+    pytestconfig : pytest.Config
+        The session's config object.

     Raises
     ------
@@ -198,6 +201,14 @@ def check_measure(measure: str) -> bool:
         actual: MeasureTuple = actual_annots[measure]
         return expected == ApproxAndLog(actual, rel=measure_tolerances.threshold(measure))

+    delta_dir: Path = pytestconfig.getoption("--collect_csv")
+    if delta_dir:
+        delta_dir.mkdir(parents=True, exist_ok=True)
+        values = [(m, expected_annots[m][2], actual_annots[m][2]) for m in expected_measures]
+        scores: dict[str, float] = {m: abs(a - b) for m, a, b in values}
+        scores.update({m + "_rel": abs(a - b) / max((abs(a), abs(b))) for m, a, b in values})
+        write_table_file(delta_dir / "stats-measure.csv", test_subject.name, stats_file, scores)
+
     failed_measures = (m for m in expected_measures if not check_measure(m))
     measures_outside_spec = [f"Measure {m}: {expected_annots[m][2]} <> {actual_annots[m][2]}" for m in failed_measures]
     assert [] == measures_outside_spec, f"Some Measures are outside of the threshold in {test_subject}: {stats_file}!"
@@ -241,6 +252,7 @@ def test_stats_table(
     ref_subject: SubjectDefinition,
     stats_file: str,
     stats_tolerances: Tolerances,
+    pytestconfig: pytest.Config,
 ):
     """
     Test if the tables are within the threshold.
@@ -255,6 +267,8 @@
         Name of the test directory.
     stats_tolerances : Tolerances
         The object to provide the tolerances for stats_file.
+    pytestconfig : pytest.Config
+        The session's config object.

     Raises
     ------
@@ -264,9 +278,14 @@
     _, _, expected_table = ref_subject.load_stats_file(stats_file)
     _, _, actual_table = test_subject.load_stats_file(stats_file)
     actual_segids = [stats["SegId"] for stats in actual_table]
+    ignored_columns = ["SegId", "StructName"]

     def filter_keys(stats: PVStats) -> dict[str, int | float]:
-        return {k: v for k, v in stats.items() if k not in ["SegId", "StructName"]}
+        return {k: v for k, v in stats.items() if k not in ignored_columns}
+
+    delta_dir: Path = pytestconfig.getoption("--collect_csv")
+    table_data = []
+    keys = []

     expected_different = []
     actual_different = []
@@ -275,10 +294,27 @@ def filter_keys(stats: PVStats) -> dict[str, int | float]:
         _expected = filter_keys(expected)
         actual = actual_table[actual_segids.index(expected_segid)]
         _actual = filter_keys(actual)
+        if delta_dir:
+            keys.append((list(_expected.keys()), list(_actual.keys())))
+            table_data.append((expected_segid, _expected, _actual))
         if not _expected == ApproxAndLog(_actual, abs=stats_tolerances.threshold(expected_segid)):
             expected_different.append(expected)
             actual_different.append(actual)

+    if delta_dir:
+        expected_keys, actual_keys = zip(*keys, strict=True)
+        assert expected_keys == actual_keys, "To compare tables, the keys of reference and test have to be identical!"
+
+        def relative(a, b, field) -> float:
+            return abs(a[field] - b[field]) / max((abs(a[field]), abs(b[field]), 1e-8))
+
+        delta_dir.mkdir(parents=True, exist_ok=True)
+
+        for field in actual_keys[0]:
+            scores: dict[str, float] = {f"{seg_id}:{field}": abs(a[field] - b[field]) for seg_id, a, b in table_data}
+            scores.update({f"{seg_id}:rel-{field}": relative(a, b, field) for seg_id, a, b in table_data})
+            write_table_file(delta_dir / "stats-table.csv", test_subject.name, stats_file, scores)
+
     assert expected_different == actual_different, f"The tables of some structures in {stats_file} are 'too' different!"

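The per-structure deltas from test_stats_table land in stats-table.csv with composite ids of the form "SegId:field" and "SegId:rel-field". Hedged example rows (structure id, stats file name, and values are illustrative only):

    subject_id,file,id,score
    "sub-01","aseg+DKT.stats","17:Volume_mm3",1.250000
    "sub-01","aseg+DKT.stats","17:rel-Volume_mm3",0.000312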
