Skip to content

Commit

Permalink
Merge pull request #96 from jjmccollum/95-excluding-fragmentary-witne…
Browse files Browse the repository at this point in the history
…sses

⚡ Added support, tests, and documentation for --fragmentary-thres…
  • Loading branch information
jjmccollum authored Jan 12, 2025
2 parents 4bc6365 + 32e6195 commit dbdaaa5
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 2 deletions.
11 changes: 11 additions & 0 deletions docs/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,17 @@ correctors of Codex Bezae follows:
Then, when you invoke any conversion command through the CLI, make sure that you include the ``--fill-correctors`` argument.

Excluding Fragmentary Witnesses
-------------------------------

Fragmentary witnesses with too many missing readings can introduce more noise than signal to a phylogenetic analysis, so it is often helpful to exclude such witnesses from the phylogenetic software inputs you generate.
You can do this using the ``--fragmentary-threshold`` command-line option.
With this option, you must specify a number between 0 and 1 that represents the proportion of extant readings that a witness must have in order to be included in the output.
For the purposes of determining whether a witness meets or falls below this threshold, that witness is considered non-extant/lacunose at a variation unit if the type of its reading in that unit is in the user-specified list of missing reading types (i.e., the argument(s) of the ``-m`` option).
This calculation is performed after the reading sequences of correctors have been filled in (if the ``--fill-correctors`` flag was specified).
A threshold specified with ``--fragmentary-threshold 0.7``, for example, means that a witness with missing readings at more than 30 percent of variation units will be excluded from the output.
By comparison, ``--fragmentary-threshold 1.0`` will exclude any witness that has even one missing reading.

Removing First-hand Siglum Suffixes and Merging Multiple Attestations
---------------------------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "teiphy"
version = "0.1.17"
version = "0.1.18"
description = "Converts TEI XML collations to NEXUS and other formats"
authors = ["Joey McCollum and Robert Turnbull"]
license = "MIT"
Expand Down
44 changes: 44 additions & 0 deletions teiphy/collation.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class Collation:
trivial_reading_types: A set of reading types (e.g., "reconstructed", "defective", "orthographic", "subreading") whose readings should be collapsed under the previous substantive reading.
missing_reading_types: A set of reading types (e.g., "lac", "overlap") whose readings should be treated as missing data.
fill_corrector_lacunae: A boolean flag indicating whether or not to fill "lacunae" in witnesses with type "corrector".
fragmentary_threshold: A float representing the proportion such that all witnesses extant at fewer than this proportion of variation units are filtered out of the collation.
witnesses: A list of Witness instances contained in this Collation.
witness_index_by_id: A dictionary mapping base witness ID strings to their int indices in the witnesses list.
variation_units: A list of VariationUnit instances contained in this Collation.
Expand All @@ -75,6 +76,7 @@ def __init__(
trivial_reading_types: List[str] = [],
missing_reading_types: List[str] = [],
fill_corrector_lacunae: bool = False,
fragmentary_threshold: float = None,
dates_file: Union[Path, str] = None,
verbose: bool = False,
):
Expand All @@ -86,13 +88,15 @@ def __init__(
trivial_reading_types: An optional set of reading types (e.g., "reconstructed", "defective", "orthographic", "subreading") whose readings should be collapsed under the previous substantive reading.
missing_reading_types: An optional set of reading types (e.g., "lac", "overlap") whose readings should be treated as missing data.
fill_corrector_lacunae: An optional flag indicating whether or not to fill "lacunae" in witnesses with type "corrector".
fragmentary_threshold: An optional float representing the proportion such that all witnesses extant at fewer than this proportion of variation units are filtered out of the collation.
dates_file: An optional path to a CSV file containing witness IDs, minimum dates, and maximum dates. If specified, then for all witnesses in the first column, any existing date ranges for them in the TEI XML collation will be ignored.
verbose: An optional flag indicating whether or not to print timing and debugging details for the user.
"""
self.manuscript_suffixes = manuscript_suffixes
self.trivial_reading_types = set(trivial_reading_types)
self.missing_reading_types = set(missing_reading_types)
self.fill_corrector_lacunae = fill_corrector_lacunae
self.fragmentary_threshold = fragmentary_threshold
self.verbose = verbose
self.witnesses = []
self.witness_index_by_id = {}
Expand Down Expand Up @@ -127,6 +131,9 @@ def __init__(
self.parse_apps(xml)
self.validate_intrinsic_relations()
self.parse_readings_by_witness()
# If a threshold of readings for fragmentary witnesses is specified, then filter the witness list using the dictionary mapping witness IDs to readings:
if self.fragmentary_threshold is not None:
self.filter_fragmentary_witnesses(xml)
t1 = time.time()
if self.verbose:
print("Total time to initialize collation: %0.4fs." % (t1 - t0))
Expand Down Expand Up @@ -626,6 +633,43 @@ def parse_readings_by_witness(self):
)
return

def filter_fragmentary_witnesses(self, xml):
    """Filters the original witness list and readings-by-witness dictionary to exclude
    witnesses whose proportions of extant passages fall below the fragmentary readings threshold.

    A witness is considered extant at a variation unit if its reading support vector for
    that unit is not all zeroes (missing reading types produce all-zero vectors upstream).

    Args:
        xml: The parsed TEI XML collation tree (currently unused here; presumably retained
            for interface consistency with the other parsing methods — TODO confirm).
    """
    if self.verbose:
        print(
            "Filtering fragmentary witnesses (extant in < %f of all variation units) out of internal witness list and dictionary of witness readings..."
            % self.fragmentary_threshold
        )
    t0 = time.time()
    fragmentary_witness_set = set()
    # Proceed for each witness in order:
    for wit in self.witnesses:
        wit_id = wit.id
        reading_support_lists = self.readings_by_witness[wit_id]
        total_reading_count = len(reading_support_lists)
        # Count the variation units at which this witness has an extant (non-missing) reading:
        extant_reading_count = sum(1 for rdg_support in reading_support_lists if sum(rdg_support) != 0)
        # Guard against division by zero when there are no variation units at all;
        # a witness with no readings is treated as having an extant proportion of 0:
        extant_proportion = (extant_reading_count / total_reading_count) if total_reading_count > 0 else 0.0
        # If the proportion of extant readings falls below the threshold, then mark this witness as fragmentary:
        if extant_proportion < self.fragmentary_threshold:
            fragmentary_witness_set.add(wit_id)
    # Then filter the witness list to exclude the fragmentary witnesses:
    self.witnesses = [wit for wit in self.witnesses if wit.id not in fragmentary_witness_set]
    # Then remove the entries for the fragmentary witnesses from the witnesses-to-readings dictionary:
    for wit_id in fragmentary_witness_set:
        del self.readings_by_witness[wit_id]
    t1 = time.time()
    if self.verbose:
        print(
            "Filtered out %d fragmentary witness(es) (%s) in %0.4fs."
            % (len(fragmentary_witness_set), str(list(fragmentary_witness_set)), t1 - t0)
        )
    return

def get_nexus_symbols(self):
"""Returns a list of one-character symbols needed to represent the states of all substantive readings in NEXUS.
Expand Down
22 changes: 21 additions & 1 deletion teiphy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def to_file(
False,
help="Use the StatesFormat=Frequency setting instead of the StatesFormat=StatesPresent setting (and thus represent all states with frequency vectors rather than symbols) in NEXUS output.",
),
fragmentary_threshold: float = typer.Option(
None,
help="Ignore all witnesses that are extant at fewer than the specified proportion of variation units. For the purposes of this calculation, a witness is considered non-extant/lacunose at a variation unit if the type of its reading in that unit is in the user-specified list of missing reading types (i.e., the argument(s) of the -m option). This calculation is performed after the reading sequences of correctors have been filled in (if the --fill-correctors flag was specified). Thus, a threshold of 0.7 means that a witness with missing readings at more than 30 percent of variation units will be excluded from the output.",
),
drop_constant: bool = typer.Option(
False,
help="If set, do not write constant sites (i.e., variation units with one substantive reading) to output.",
Expand Down Expand Up @@ -138,11 +142,27 @@ def to_file(
except Exception as err:
print(f"Error opening input file: {err}")
exit(1)
# Make sure the fragmentary_threshold input, if specified, is between 0 and 1:
if fragmentary_threshold is not None and (fragmentary_threshold < 0.0 or fragmentary_threshold > 1.0):
print(
"Error: the fragmentary variation unit proportion threshold is %f. It must be a value in [0, 1]."
% fragmentary_threshold
)
exit(1)
# Make sure the dates_file input, if specified, is a CSV file:
if dates_file is not None and dates_file.suffix.lower() != ".csv":
print("Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv.")
exit(1)
coll = Collation(xml, suffixes, trivial_reading_types, missing_reading_types, fill_correctors, dates_file, verbose)
coll = Collation(
xml,
suffixes,
trivial_reading_types,
missing_reading_types,
fill_correctors,
fragmentary_threshold,
dates_file,
verbose,
)
coll.to_file(
output,
format=format,
Expand Down
84 changes: 84 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,90 @@ def test_to_nexus_ambiguous_as_missing():
assert "{" not in text


def test_to_nexus_fragmentary_threshold():
    """A threshold of 0.5 without corrector filling should exclude both 04 and 06C2 from the NEXUS output."""
    with tempfile.TemporaryDirectory() as temp_dir:
        nexus_path = Path(temp_dir) / "test.nexus"
        cli_args = [
            "--verbose",
            "-treconstructed",
            "-tdefective",
            "-torthographic",
            "-tsubreading",
            "-mlac",
            "-moverlap",
            "-s*",
            "-sT",
            "--fragmentary-threshold",
            0.5,
            str(input_example),
            str(nexus_path),
        ]
        result = runner.invoke(app, cli_args)
        assert result.exit_code == 0
        assert nexus_path.exists()
        contents = nexus_path.read_text(encoding="utf-8")
        assert contents.startswith("#NEXUS")
        assert "04 " not in contents
        assert "06C2 " not in contents


def test_to_nexus_fragmentary_threshold_fill_correctors():
    """With --fill-correctors, corrector 06C2 gains filled readings and survives the 0.5 threshold, while 04 is still excluded."""
    with tempfile.TemporaryDirectory() as temp_dir:
        nexus_path = Path(temp_dir) / "test.nexus"
        cli_args = [
            "--verbose",
            "-treconstructed",
            "-tdefective",
            "-torthographic",
            "-tsubreading",
            "-mlac",
            "-moverlap",
            "-s*",
            "-sT",
            "--fill-correctors",
            "--fragmentary-threshold",
            0.5,
            str(input_example),
            str(nexus_path),
        ]
        result = runner.invoke(app, cli_args)
        assert result.exit_code == 0
        assert nexus_path.exists()
        contents = nexus_path.read_text(encoding="utf-8")
        assert contents.startswith("#NEXUS")
        assert "04 " not in contents
        assert "06C2 " in contents


def test_to_nexus_fragmentary_threshold_bad_threshold():
    """A threshold outside [0, 1] should cause the CLI to exit with code 1 and print an error message."""
    with tempfile.TemporaryDirectory() as temp_dir:
        nexus_path = Path(temp_dir) / "test.nexus"
        cli_args = [
            "--verbose",
            "-treconstructed",
            "-tdefective",
            "-torthographic",
            "-tsubreading",
            "-mlac",
            "-moverlap",
            "-s*",
            "-sT",
            "--fragmentary-threshold",
            1.1,
            str(input_example),
            str(nexus_path),
        ]
        result = runner.invoke(app, cli_args)
        assert result.exit_code == 1
        assert result.stdout.startswith("Error: the fragmentary variation unit proportion threshold is")


def test_to_nexus_calibrate_dates():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
Expand Down

0 comments on commit dbdaaa5

Please sign in to comment.