Skip to content

Commit

Permalink
Merge pull request #96 from jjmccollum/95-excluding-fragmentary-witne…
Browse files Browse the repository at this point in the history
…sses

⚡ Added support, tests, and documentation for --fragmentary-thres…
  • Loading branch information
jjmccollum authored Jan 12, 2025
2 parents 4bc6365 + 32e6195 commit dbdaaa5
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 2 deletions.
11 changes: 11 additions & 0 deletions docs/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,17 @@ correctors of Codex Bezae follows:
Then, when you invoke any conversion command through the CLI, make sure that you include the ``--fill-correctors`` argument.

Excluding Fragmentary Witnesses
-------------------------------

Fragmentary witnesses with too many missing readings can introduce more noise than signal to a phylogenetic analysis, so it is often helpful to exclude such witnesses from the phylogenetic software inputs you generate.
You can do this using the ``--fragmentary-threshold`` command-line option.
With this option, you must specify a number between 0 and 1 that represents the proportion of extant readings that a witness must have in order to be included in the output.
For the purposes of determining whether a witness meets or falls below this threshold, that witness is considered non-extant/lacunose at a variation unit if the type of its reading in that unit is in the user-specified list of missing reading types (i.e., the argument(s) of the ``-m`` option).
This calculation is performed after the reading sequences of correctors have been filled in (if the ``--fill-correctors`` flag was specified).
A threshold specified with ``--fragmentary-threshold 0.7``, for example, means that a witness with missing readings at more than 30 percent of variation units will be excluded from the output.
By comparison, ``--fragmentary-threshold 1.0`` will exclude any witness that has even one missing reading.

Removing First-hand Siglum Suffixes and Merging Multiple Attestations
---------------------------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "teiphy"
version = "0.1.17"
version = "0.1.18"
description = "Converts TEI XML collations to NEXUS and other formats"
authors = ["Joey McCollum and Robert Turnbull"]
license = "MIT"
Expand Down
44 changes: 44 additions & 0 deletions teiphy/collation.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class Collation:
trivial_reading_types: A set of reading types (e.g., "reconstructed", "defective", "orthographic", "subreading") whose readings should be collapsed under the previous substantive reading.
missing_reading_types: A set of reading types (e.g., "lac", "overlap") whose readings should be treated as missing data.
fill_corrector_lacunae: A boolean flag indicating whether or not to fill "lacunae" in witnesses with type "corrector".
fragmentary_threshold: A float representing the proportion such that all witnesses extant at fewer than this proportion of variation units are filtered out of the collation.
witnesses: A list of Witness instances contained in this Collation.
witness_index_by_id: A dictionary mapping base witness ID strings to their int indices in the witnesses list.
variation_units: A list of VariationUnit instances contained in this Collation.
Expand All @@ -75,6 +76,7 @@ def __init__(
trivial_reading_types: List[str] = [],
missing_reading_types: List[str] = [],
fill_corrector_lacunae: bool = False,
fragmentary_threshold: float = None,
dates_file: Union[Path, str] = None,
verbose: bool = False,
):
Expand All @@ -86,13 +88,15 @@ def __init__(
trivial_reading_types: An optional set of reading types (e.g., "reconstructed", "defective", "orthographic", "subreading") whose readings should be collapsed under the previous substantive reading.
missing_reading_types: An optional set of reading types (e.g., "lac", "overlap") whose readings should be treated as missing data.
fill_corrector_lacunae: An optional flag indicating whether or not to fill "lacunae" in witnesses with type "corrector".
fragmentary_threshold: An optional float representing the proportion such that all witnesses extant at fewer than this proportion of variation units are filtered out of the collation.
dates_file: An optional path to a CSV file containing witness IDs, minimum dates, and maximum dates. If specified, then for all witnesses in the first column, any existing date ranges for them in the TEI XML collation will be ignored.
verbose: An optional flag indicating whether or not to print timing and debugging details for the user.
"""
self.manuscript_suffixes = manuscript_suffixes
self.trivial_reading_types = set(trivial_reading_types)
self.missing_reading_types = set(missing_reading_types)
self.fill_corrector_lacunae = fill_corrector_lacunae
self.fragmentary_threshold = fragmentary_threshold
self.verbose = verbose
self.witnesses = []
self.witness_index_by_id = {}
Expand Down Expand Up @@ -127,6 +131,9 @@ def __init__(
self.parse_apps(xml)
self.validate_intrinsic_relations()
self.parse_readings_by_witness()
# If a threshold of readings for fragmentary witnesses is specified, then filter the witness list using the dictionary mapping witness IDs to readings:
if self.fragmentary_threshold is not None:
self.filter_fragmentary_witnesses(xml)
t1 = time.time()
if self.verbose:
print("Total time to initialize collation: %0.4fs." % (t1 - t0))
Expand Down Expand Up @@ -626,6 +633,43 @@ def parse_readings_by_witness(self):
)
return

def filter_fragmentary_witnesses(self, xml):
    """Filters the original witness list and readings-by-witness dictionary to exclude
    witnesses whose proportions of extant passages fall below the fragmentary readings threshold.

    A witness is considered extant at a variation unit if its reading support vector for
    that unit is not all zeroes (missing reading types produce all-zero vectors upstream).

    Args:
        xml: The parsed TEI XML collation tree (currently unused here; presumably retained
            for interface consistency with the other parsing methods — TODO confirm).
    """
    if self.verbose:
        print(
            "Filtering fragmentary witnesses (extant in < %f of all variation units) out of internal witness list and dictionary of witness readings..."
            % self.fragmentary_threshold
        )
    t0 = time.time()
    fragmentary_witness_set = set()
    # Proceed for each witness in order:
    for wit in self.witnesses:
        wit_id = wit.id
        reading_support_lists = self.readings_by_witness[wit_id]
        total_reading_count = len(reading_support_lists)
        # Count the variation units at which this witness has an extant (non-missing) reading:
        extant_reading_count = sum(1 for rdg_support in reading_support_lists if sum(rdg_support) != 0)
        # Guard against division by zero when there are no variation units at all;
        # a witness with no readings is treated as having an extant proportion of 0:
        extant_proportion = (extant_reading_count / total_reading_count) if total_reading_count > 0 else 0.0
        # If the proportion of extant readings falls below the threshold, then mark this witness as fragmentary:
        if extant_proportion < self.fragmentary_threshold:
            fragmentary_witness_set.add(wit_id)
    # Then filter the witness list to exclude the fragmentary witnesses:
    self.witnesses = [wit for wit in self.witnesses if wit.id not in fragmentary_witness_set]
    # Then remove the entries for the fragmentary witnesses from the witnesses-to-readings dictionary:
    for wit_id in fragmentary_witness_set:
        del self.readings_by_witness[wit_id]
    t1 = time.time()
    if self.verbose:
        print(
            "Filtered out %d fragmentary witness(es) (%s) in %0.4fs."
            % (len(fragmentary_witness_set), str(list(fragmentary_witness_set)), t1 - t0)
        )
    return

def get_nexus_symbols(self):
"""Returns a list of one-character symbols needed to represent the states of all substantive readings in NEXUS.
Expand Down
22 changes: 21 additions & 1 deletion teiphy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ def to_file(
False,
help="Use the StatesFormat=Frequency setting instead of the StatesFormat=StatesPresent setting (and thus represent all states with frequency vectors rather than symbols) in NEXUS output.",
),
fragmentary_threshold: float = typer.Option(
None,
help="Ignore all witnesses that are extant at fewer than the specified proportion of variation units. For the purposes of this calculation, a witness is considered non-extant/lacunose at a variation unit if the type of its reading in that unit is in the user-specified list of missing reading types (i.e., the argument(s) of the -m option). This calculation is performed after the reading sequences of correctors have been filled in (if the --fill-correctors flag was specified). Thus, a threshold of 0.7 means that a witness with missing readings at more than 30 percent of variation units will be excluded from the output.",
),
drop_constant: bool = typer.Option(
False,
help="If set, do not write constant sites (i.e., variation units with one substantive reading) to output.",
Expand Down Expand Up @@ -138,11 +142,27 @@ def to_file(
except Exception as err:
print(f"Error opening input file: {err}")
exit(1)
# Make sure the fragmentary_threshold input, if specified, is between 0 and 1:
if fragmentary_threshold is not None and (fragmentary_threshold < 0.0 or fragmentary_threshold > 1.0):
print(
"Error: the fragmentary variation unit proportion threshold is %f. It must be a value in [0, 1]."
% fragmentary_threshold
)
exit(1)
# Make sure the dates_file input, if specified, is a CSV file:
if dates_file is not None and dates_file.suffix.lower() != ".csv":
print("Error opening dates file: The dates file is not a CSV file. Make sure the dates file type is .csv.")
exit(1)
coll = Collation(xml, suffixes, trivial_reading_types, missing_reading_types, fill_correctors, dates_file, verbose)
coll = Collation(
xml,
suffixes,
trivial_reading_types,
missing_reading_types,
fill_correctors,
fragmentary_threshold,
dates_file,
verbose,
)
coll.to_file(
output,
format=format,
Expand Down
84 changes: 84 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,90 @@ def test_to_nexus_ambiguous_as_missing():
assert "{" not in text


def test_to_nexus_fragmentary_threshold():
    """A threshold of 0.5 without corrector filling should exclude both 04 and 06C2 from the NEXUS output."""
    with tempfile.TemporaryDirectory() as temp_dir:
        nexus_path = Path(temp_dir) / "test.nexus"
        cli_args = [
            "--verbose",
            "-treconstructed",
            "-tdefective",
            "-torthographic",
            "-tsubreading",
            "-mlac",
            "-moverlap",
            "-s*",
            "-sT",
            "--fragmentary-threshold",
            0.5,
            str(input_example),
            str(nexus_path),
        ]
        result = runner.invoke(app, cli_args)
        assert result.exit_code == 0
        assert nexus_path.exists()
        contents = nexus_path.read_text(encoding="utf-8")
        assert contents.startswith("#NEXUS")
        assert "04 " not in contents
        assert "06C2 " not in contents


def test_to_nexus_fragmentary_threshold_fill_correctors():
    """With --fill-correctors, corrector 06C2 gains filled readings and survives the 0.5 threshold, while 04 is still excluded."""
    with tempfile.TemporaryDirectory() as temp_dir:
        nexus_path = Path(temp_dir) / "test.nexus"
        cli_args = [
            "--verbose",
            "-treconstructed",
            "-tdefective",
            "-torthographic",
            "-tsubreading",
            "-mlac",
            "-moverlap",
            "-s*",
            "-sT",
            "--fill-correctors",
            "--fragmentary-threshold",
            0.5,
            str(input_example),
            str(nexus_path),
        ]
        result = runner.invoke(app, cli_args)
        assert result.exit_code == 0
        assert nexus_path.exists()
        contents = nexus_path.read_text(encoding="utf-8")
        assert contents.startswith("#NEXUS")
        assert "04 " not in contents
        assert "06C2 " in contents


def test_to_nexus_fragmentary_threshold_bad_threshold():
    """A threshold outside [0, 1] should cause the CLI to exit with code 1 and print an error message."""
    with tempfile.TemporaryDirectory() as temp_dir:
        nexus_path = Path(temp_dir) / "test.nexus"
        cli_args = [
            "--verbose",
            "-treconstructed",
            "-tdefective",
            "-torthographic",
            "-tsubreading",
            "-mlac",
            "-moverlap",
            "-s*",
            "-sT",
            "--fragmentary-threshold",
            1.1,
            str(input_example),
            str(nexus_path),
        ]
        result = runner.invoke(app, cli_args)
        assert result.exit_code == 1
        assert result.stdout.startswith("Error: the fragmentary variation unit proportion threshold is")


def test_to_nexus_calibrate_dates():
with tempfile.TemporaryDirectory() as tmp_dir:
output = Path(tmp_dir) / "test.nexus"
Expand Down

0 comments on commit dbdaaa5

Please sign in to comment.