From 8936c6bc4e8058393d9944cdd78e2499f914c953 Mon Sep 17 00:00:00 2001 From: jjmccollum Date: Tue, 23 Apr 2024 19:15:14 +1000 Subject: [PATCH] :zap: Added code, testing, and documentation for --ancestral-logger option --- docs/advanced.rst | 37 ++++++------ teiphy/collation.py | 20 ++++++- teiphy/main.py | 7 ++- teiphy/templates/beast_template.xml | 11 +++- tests/test_main.py | 89 +++++++++++++++++++++++++++++ 5 files changed, 140 insertions(+), 24 deletions(-) diff --git a/docs/advanced.rst b/docs/advanced.rst index 87bdf03..f761bb2 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -2,27 +2,13 @@ Advanced Usage ================== -Lacunae, retroversions, and other sources of ambiguity occasionally make -a one-to-one mapping of witnesses to readings impossible, and in some -cases, one disambiguation may be more likely than another in a -quantifiable way. Mechanisms for accommodating such situations exist in -both TEI XML and NEXUS, and for likelihood-based phylogenetic methods, -“soft decisions” about the states at the leaves and even the root of the -tree can provide useful information to the inference process. For these -reasons, ``teiphy`` ensures that these types of judgments, as well as -other rich features from TEI XML, can be respected (and, where, -necessary, preserved) in the conversion process. - - -Collations should preserve -as much detail as possible, including information on how certain types -of data can be normalized and collapsed for analysis. Since we may want -to conduct the same analysis at different levels of granularity, the -underlying collation data should be available for us to use in any case, -and only the output should reflect changes in the desired level of -detail. Likewise, as noted in the previous section, uncertainty about -witnesses’ attestations should be encoded in the collation and preserved -in the conversion of the collation. +Lacunae, retroversions, and other sources of ambiguity occasionally make a one-to-one mapping of witnesses to readings impossible, and in some cases, one disambiguation may be more likely than another in a quantifiable way. +Mechanisms for accommodating such situations exist in both TEI XML and NEXUS, and for likelihood-based phylogenetic methods, “soft decisions” about the states at the leaves and even the root of the tree can provide useful information to the inference process. +For these reasons, ``teiphy`` ensures that these types of judgments, as well as other rich features from TEI XML, can be respected (and, where, necessary, preserved) in the conversion process. + +Collations should preserve as much detail as possible, including information on how certain types of data can be normalized and collapsed for analysis. +Since we may want to conduct the same analysis at different levels of granularity, the underlying collation data should be available for us to use in any case, and only the output should reflect changes in the desired level of detail. +Likewise, as noted in the previous section, uncertainty about witnesses’ attestations should be encoded in the collation and preserved in the conversion of the collation. Analysis at Varying Levels of Detail Using Reading Types -------------------------------------------------------- @@ -570,6 +556,15 @@ summing multiple rates if more than one tag is specified for a transition. All transitions not covered by a ``relation`` element (e.g., a transition from reading 3 to reading 1, which is not covered in the example above) will be assigned the "default" rate of 1. Accordingly, if no ``listRelation`` for transcriptional change categories is specified at all, then the substitution model for a variation unit with *k* substantive readings will default to the Lewis Mk model. +Logging for Ancestral State Reconstructions +------------------------------------------- + +BEAST 2 offers support for the logging of the reconstructed states (i.e., variant readings) for each site (i.e., variation unit) at varying levels of detail. +The ``AncestralStateLogger`` class (part of the ``BEASTLabs`` package) reconstructs the state of a particular clade (which, for our purposes, is chosen to be the root of the tree) in each tree sampled during the analysis, resulting in a relatively compact output. +The ``AncestralSequenceLogger`` class (part of the ``BEAST_CLASSIC`` package) reconstructs the states of all hypothetical ancestors in each tree sampled during the analysis, which results in a more comprehensive, but also much larger output. +In writing to BEAST 2.7 XML files, ``teiphy`` can include elements for either (or neither) logger based on the ``--ancestral-logger`` argument. +The default option, ``state``, will include an ``AncestralStateLogger`` element in the XML file, while ``sequence`` will include an ``AncestralSequenceLogger`` element, and ``none`` will not include any logging elements for ancestral states. + Supported Output Formats and Options ------------------------------------ diff --git a/teiphy/collation.py b/teiphy/collation.py index cd67d5c..268a3d4 100644 --- a/teiphy/collation.py +++ b/teiphy/collation.py @@ -36,6 +36,12 @@ class ClockModel(str, Enum): local = "local" +class AncestralLogger(str, Enum): + state = "state" + sequence = ("sequence",) + none = "none" + + class TableType(str, Enum): matrix = "matrix" distance = "distance" @@ -1310,6 +1316,7 @@ def to_beast( file_addr: Union[Path, str], drop_constant: bool = False, clock_model: ClockModel = ClockModel.strict, + ancestral_logger: AncestralLogger = AncestralLogger.state, seed: int = None, ): """Writes this Collation to a file in BEAST format with the given address. @@ -1318,6 +1325,7 @@ def to_beast( file_addr: A string representing the path to an output file. drop_constant (bool, optional): An optional flag indicating whether to ignore variation units with one substantive reading. clock_model: A ClockModel option indicating which clock model to use. + ancestral_logger: An AncestralLogger option indicating which class of logger (if any) to use for ancestral states. seed: A seed for random number generation (for setting initial values of unspecified transcriptional rates). """ # Populate a list of sites that will correspond to columns of the sequence alignment: @@ -1494,6 +1502,7 @@ def to_beast( origin_span=origin_span, clock_model=clock_model.value, clock_rate_categories=2 * len(self.witnesses) - 1, + ancestral_logger=ancestral_logger.value, witnesses=witness_objects, variation_units=variation_unit_objects, intrinsic_categories=intrinsic_category_objects, @@ -2042,6 +2051,7 @@ def to_file( calibrate_dates: bool = False, mrbayes: bool = False, clock_model: ClockModel = ClockModel.strict, + ancestral_logger: AncestralLogger = AncestralLogger.state, table_type: TableType = TableType.matrix, seed: int = None, ): @@ -2084,6 +2094,8 @@ def to_file( clock_model: A ClockModel option indicating which type of clock model to use. This option is intended for inputs to MrBayes and BEAST 2. MrBayes does not presently support a local clock model, so it will default to a strict clock model if a local clock model is specified. + ancestral_logger: An AncestralLogger option indicating which class of logger (if any) to use for ancestral states. + This option is intended for inputs to BEAST 2. table_type: A TableType option indicating which type of tabular output to generate. Only applicable for tabular outputs. Default value is "matrix". @@ -2116,7 +2128,13 @@ def to_file( return self.to_fasta(file_addr, drop_constant=drop_constant) if format == format.BEAST: - return self.to_beast(file_addr, drop_constant=drop_constant, clock_model=clock_model, seed=seed) + return self.to_beast( + file_addr, + drop_constant=drop_constant, + clock_model=clock_model, + ancestral_logger=ancestral_logger, + seed=seed, + ) if format == Format.CSV: return self.to_csv( diff --git a/teiphy/main.py b/teiphy/main.py index f7b4e20..42ed29d 100644 --- a/teiphy/main.py +++ b/teiphy/main.py @@ -5,7 +5,7 @@ import typer from .format import Format -from .collation import Collation, ClockModel, TableType +from .collation import Collation, ClockModel, AncestralLogger, TableType app = typer.Typer(rich_markup_mode="rich") @@ -71,6 +71,10 @@ def to_file( ClockModel.strict, help="The clock model to use; this option is intended for inputs to MrBayes and BEAST 2. MrBayes does not presently support a local clock model, so it will default to a strict clock model if a local clock model is specified.", ), + ancestral_logger: AncestralLogger = typer.Option( + AncestralLogger.state, + help="The type of logger to use for ancestral state reconstruction data; this option is intended for inputs to BEAST 2. If \"state\", then only the reconstructed states at the root of each sampled tree will be logged. If \"sequence\", then each sampled tree's reconstructed states for all ancestors will be logged (WARNING: this will be memory-intensive!). If \"none\", then no ancestral states will be logged.", + ), table: TableType = typer.Option( TableType.matrix, help="The type of table to use for CSV/Excel output. If \"matrix\", then the table will have rows for witnesses and columns for all variant readings, with frequency values in cells (the --split-missing flag can be used with this option). If \"distance\", then the table will have rows and columns for witnesses, with the number or proportion of disagreements between each pair in the corresponding cell (the --proportion flag can be used with this option). If \"nexus\", then the table will have rows for witnesses and columns for variation units with reading IDs in cells (the --ambiguous-as-missing flag can be used with this option). If \"long\", then the table will consist of repeated rows with column entries for taxa, characters, reading indices, and reading texts.", @@ -135,6 +139,7 @@ def to_file( calibrate_dates=calibrate_dates, mrbayes=mrbayes, clock_model=clock, + ancestral_logger=ancestral_logger, table_type=table, split_missing=split_missing, seed=seed, diff --git a/teiphy/templates/beast_template.xml b/teiphy/templates/beast_template.xml index 737f871..531432a 100644 --- a/teiphy/templates/beast_template.xml +++ b/teiphy/templates/beast_template.xml @@ -292,11 +292,20 @@ - + {%- if ancestral_logger == "state" %} + {%- for vu in variation_units %} + + {%- endfor %} + + {%- elif ancestral_logger == "sequence" %} + + + {%- for vu in variation_units %} {%- endfor %} + {%- endif %} \ No newline at end of file diff --git a/tests/test_main.py b/tests/test_main.py index c3a2dd3..5bfe7b4 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -931,6 +931,95 @@ def test_to_beast_local_clock(): assert branch_rate_model.get("spec") == "RandomLocalClockModel" +def test_to_beast_state_ancestral_logger(): + parser = et.XMLParser(remove_comments=True) + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.xml" + result = runner.invoke( + app, + [ + "-treconstructed", + "-tdefective", + "-torthographic", + "-tsubreading", + "-mlac", + "-moverlap", + "-s*", + "-sT", + "--fill-correctors", + "--ancestral-logger", + "state", + str(input_example), + str(output), + ], + ) + assert result.exit_code == 0 + assert output.exists() + beast_xml = et.parse(output, parser=parser) + assert beast_xml.find("//logger[@id=\"ancestralStateLogger\"]") is not None + ancestral_log = beast_xml.find("//logger[@id=\"ancestralStateLogger\"]/log") + assert ancestral_log.get("spec") == "beastlabs.evolution.likelihood.AncestralStateLogger" + + +def test_to_beast_sequence_ancestral_logger(): + parser = et.XMLParser(remove_comments=True) + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.xml" + result = runner.invoke( + app, + [ + "-treconstructed", + "-tdefective", + "-torthographic", + "-tsubreading", + "-mlac", + "-moverlap", + "-s*", + "-sT", + "--fill-correctors", + "--ancestral-logger", + "sequence", + str(input_example), + str(output), + ], + ) + assert result.exit_code == 0 + assert output.exists() + beast_xml = et.parse(output, parser=parser) + assert beast_xml.find("//logger[@id=\"ancestralSequenceLogger\"]") is not None + ancestral_log = beast_xml.find("//logger[@id=\"ancestralSequenceLogger\"]/log") + assert ancestral_log.get("spec") == "beastclassic.evolution.likelihood.AncestralSequenceLogger" + + +def test_to_beast_sequence_no_logger(): + parser = et.XMLParser(remove_comments=True) + with tempfile.TemporaryDirectory() as tmp_dir: + output = Path(tmp_dir) / "test.xml" + result = runner.invoke( + app, + [ + "-treconstructed", + "-tdefective", + "-torthographic", + "-tsubreading", + "-mlac", + "-moverlap", + "-s*", + "-sT", + "--fill-correctors", + "--ancestral-logger", + "none", + str(input_example), + str(output), + ], + ) + assert result.exit_code == 0 + assert output.exists() + beast_xml = et.parse(output, parser=parser) + assert beast_xml.find("//logger[@id=\"ancestralStateLogger\"]") is None + assert beast_xml.find("//logger[@id=\"ancestralSequenceLogger\"]") is None + + def test_to_csv(): with tempfile.TemporaryDirectory() as tmp_dir: output = Path(tmp_dir) / "test.csv"