Merge pull request #87 from jjmccollum/85-add-support-for-time-depend…

…ent-substitution-models-with-epochsubstitutionmodel-in-beast-outputs 85 add support for time dependent substitution models with epochsubstitutionmodel in beast outputs
jjmccollum · Apr 24, 2024 · 5724b9f · 5724b9f
2 parents 0014523 + f5f98da
commit 5724b9f
Show file tree

Hide file tree

Showing 13 changed files with 1,312 additions and 255 deletions.
diff --git a/README.rst b/README.rst
@@ -152,9 +152,9 @@ Further details on the capabilities of ``teiphy``, particularly in terms of the
         title = {{Using Bayesian Phylogenetics to Infer Manuscript Transmission History}}, 
         journal = {Digital Scholarship in the Humanities},
         year = {2024}, 
-        volume = {TBD}, 
-        number = {TBD}, 
-        pages = {TBD},
+        volume = {39}, 
+        number = {1}, 
+        pages = {258--279},
         doi = {10.1093/llc/fqad089}, 
         url = {https://doi.org/10.1093/llc/fqad089}
     }

diff --git a/docs/advanced.rst b/docs/advanced.rst
@@ -556,6 +556,26 @@ summing multiple rates if more than one tag is specified for a transition.
 All transitions not covered by a ``relation`` element (e.g., a transition from reading 3 to reading 1, which is not covered in the example above) will be assigned the "default" rate of 1.
 Accordingly, if no ``listRelation`` for transcriptional change categories is specified at all, then the substitution model for a variation unit with *k* substantive readings will default to the Lewis Mk model.
 
+In many cases, certain transcriptional explanations are applicable only at certain times.
+For instance, assimilation to a popular text that arose at a later point in the tradition's history (modeled with the ``Byz`` class in the above example) would only be available as a transcriptional explanation after this point.
+Skips of the eye may be empirically more common for earlier scribes than for later ones.
+Certain paleographic confusions may only be possible for earlier scripts or later ones.
+Specific orthodox corruptions of sacred texts may have only become plausible after certain theological conflicts or developments had taken place to inspire them.
+If you wish to encode such transcriptional possibilities as time-dependent, you can do so by adding ``@notBefore`` and ``@notAfter`` attributes to the corresponding ``relation`` element:
+
+.. code:: xml
+
+    <listRelation type="transcriptional">
+        <relation active="1 2 3" passive="4" ana="#Harm"/>
+        <relation active="1" passive="2 3 4" ana="#Clar"/>
+        <relation active="2" passive="1" ana="#VisErr"/>
+        <relation active="2" passive="3" ana="#Clar #Harm"/>
+        <relation active="3" passive="4" ana="#Clar"/>
+        <relation active="1 2 4" passive="3" ana="#Byz" notBefore="500"/>
+    </listRelation>
+
+If you tag certain transcriptional ``relation`` elements in this way, ``teiphy`` will map the ``listRelation`` to an ``EpochSubstitutionModel`` instance consisting of multiple substitution models that apply at the corresponding points in time.
+
 Logging for Ancestral State Reconstructions
 -------------------------------------------
 

diff --git a/example/ubs_ephesians.xml b/example/ubs_ephesians.xml
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "teiphy"
-version = "0.1.10"
+version = "0.1.11"
 description = "Converts TEI XML collations to NEXUS and other formats"
 authors = ["Joey McCollum and Robert Turnbull"]
 license = "MIT"

diff --git a/teiphy/collation.py b/teiphy/collation.py
diff --git a/teiphy/templates/beast_template.xml b/teiphy/templates/beast_template.xml
@@ -21,12 +21,12 @@
     <birthDeathSkylineModel spec="bdsky.evolution.speciation.BirthDeathSkylineModel" id="birthDeath" tree="@tree">
         {%- if origin_span[1] %}
         {%- if origin_span[1] == origin_span[0] %}
-        <origin spec="parameter.RealParameter" id="origin" lower="{{ origin_span[0] }}" upper="{{ origin_span[1] }}" value="{{ origin_span[1] }}" estimate="false"/>
+        <origin spec="parameter.RealParameter" id="origin" lower="{{ origin_span[0] }}" upper="{{ origin_span[1] }}" value="{{ origin_span[0] }}" estimate="false"/>
         {%- else %}
-        <origin spec="parameter.RealParameter" id="origin" lower="{{ origin_span[0] }}" upper="{{ origin_span[1] }}" value="{{ origin_span[1] }}"/>
+        <origin spec="parameter.RealParameter" id="origin" lower="{{ origin_span[0] }}" upper="{{ origin_span[1] }}" value="{{ origin_span[0] }}"/>
         {%- endif %}
         {%- else %}
-        <origin spec="parameter.RealParameter" id="origin" lower="{{ origin_span[0] }}" upper="Infinity" value="1.0"/>
+        <origin spec="parameter.RealParameter" id="origin" lower="{{ origin_span[0] }}" upper="Infinity" value="{{ origin_span[0] }}"/>
         {%- endif %}
         <reproductiveNumber spec="parameter.RealParameter" id="reproductiveNumber" lower="0.0" upper="Infinity" value="2.0"/>
         <becomeUninfectiousRate spec="parameter.RealParameter" id="becomeUninfectiousRate" lower="0.0" upper="Infinity" value="1.0"/>
@@ -142,21 +142,55 @@
             <distribution spec="CompoundDistribution" id="likelihood" useThreads="true">
                 <!-- Start character distributions -->
                 {%- for vu in variation_units %}
-                <distribution spec="TreeLikelihood" id="morphTreeLikelihood.{{ loop.index }}" useAmbiguities="true" useTipLikelihoods="true" tree="@tree">
-                    <data spec="FilteredAlignment" id="filter.{{ loop.index }}" data="@alignment" filter="{{ loop.index }}">
-                        <userDataType spec="StandardData" id="morphDataType.{{ loop.index }}" nrOfStates="{{ vu.nstates }}"/>
+                {%- set vu_ind = loop.index %}
+                <distribution spec="TreeLikelihood" id="morphTreeLikelihood.{{ vu_ind }}" useAmbiguities="true" useTipLikelihoods="true" tree="@tree">
+                    <data spec="FilteredAlignment" id="filter.{{ vu_ind }}" data="@alignment" filter="{{ vu_ind }}">
+                        <userDataType spec="StandardData" id="morphDataType.{{ vu_ind }}" nrOfStates="{{ vu.nstates }}"/>
                     </data>
-                    <siteModel spec="SiteModel" id="morphSiteModel.{{ loop.index }}">
-                        <mutationRate spec="parameter.RealParameter" id="mutationRate.{{ loop.index }}" value="1.0" estimate="false"/>
-                        <shape spec="parameter.RealParameter" id="gammaShape.{{ loop.index }}" value="1.0" estimate="false"/>
-                        <substModel spec="ComplexSubstitutionModel" id="substModel.{{ loop.index }}">
+                    <siteModel spec="SiteModel" id="morphSiteModel.{{ vu_ind }}">
+                        <mutationRate spec="parameter.RealParameter" id="mutationRate.{{ vu_ind }}" value="1.0" estimate="false"/>
+                        <shape spec="parameter.RealParameter" id="gammaShape.{{ vu_ind }}" value="1.0" estimate="false"/>
+                        {%- if vu.epoch_heights_string != "" %}
+                        <substModel spec="beastlabs.evolution.substitutionmodel.EpochSubstitutionModel" id="substModel.{{ vu_ind }}">
+                            <epochDates spec="parameter.RealParameter">{{ vu.epoch_heights_string }}</epochDates>
+                            {%- for er in vu.epoch_rates %}
+                            {%- set er_ind = loop.index %}
+                            <model spec="ComplexSubstitutionModel" id="substModel.{{ vu_ind }}.{{ er_ind }}">
+                                <!-- Equilibrium frequencies -->
+                                <frequencies idref="equilibriumfreqs.{{ vu_ind }}"/>
+                                <rates spec="parameter.CompoundValuable" id="rates.{{ vu_ind }}.{{ er_ind }}">
+                                    <!-- Start transition rate matrix off-diagonal entries -->
+                                    {%- for r in er %}
+                                    {%- if r.expression %}
+                                    <var spec="RPNcalculator" expression="{{ r.expression }}">
+                                        {%- for tc in r.transcriptional_categories %}
+                                        <parameter idref="{{ tc }}_rate"/>
+                                        {%- endfor %}
+                                    </var>
+                                    {%- else %}
+                                    {%- for tc in r.transcriptional_categories %}
+                                    <var idref="{{ tc }}_rate"/>
+                                    {%- endfor %}
+                                    {%- endif %}
+                                    {%- endfor %}
+                                </rates>
+                            </model>
+                            {%- endfor %}
+                            <frequencies spec="Frequencies" id="equilibriumfreqs.{{ vu_ind }}">
+                                <frequencies spec="parameter.RealParameter" id="equilibriumfrequencies.{{ vu_ind }}" value="{{ vu.equilibrium_frequencies }}" estimate="false"/>
+                            </frequencies>
+                        </substModel>
+                        {%- else %}
+                        {%- for er in vu.epoch_rates %}
+                        {%- set er_ind = loop.index %}
+                        <substModel spec="ComplexSubstitutionModel" id="substModel.{{ vu_ind }}">
                             <!-- Equilibrium frequencies -->
-                            <frequencies spec="Frequencies" id="equilibriumfreqs.{{ loop.index }}">
-                                <frequencies spec="parameter.RealParameter" id="equilibriumfrequencies.{{ loop.index }}" value="{{ vu.equilibrium_frequencies }}" estimate="false"/>
+                            <frequencies spec="Frequencies" id="equilibriumfreqs.{{ vu_ind }}">
+                                <frequencies spec="parameter.RealParameter" id="equilibriumfrequencies.{{ vu_ind }}" value="{{ vu.equilibrium_frequencies }}" estimate="false"/>
                             </frequencies>
-                            <rates spec="parameter.CompoundValuable" id="rates.{{ loop.index }}">
+                            <rates spec="parameter.CompoundValuable" id="rates.{{ vu_ind }}">
                                 <!-- Start transition rate matrix off-diagonal entries -->
-                                {%- for r in vu.rates %}
+                                {%- for r in er %}
                                 {%- if r.expression %}
                                 <var spec="RPNcalculator" expression="{{ r.expression }}">
                                     {%- for tc in r.transcriptional_categories %}
@@ -171,10 +205,12 @@
                                 {%- endfor %}
                             </rates>
                         </substModel>
+                        {%- endfor %}
+                        {%- endif %}
                     </siteModel>
                     <!-- root frequencies -->
-                    <rootFrequencies spec="Frequencies" id="rootfreqs.{{ loop.index }}">
-                        <frequencies spec="parameter.RealParameter" id="rootfrequencies.{{ loop.index }}" value="{{ vu.root_frequencies }}" estimate="false"/>
+                    <rootFrequencies spec="Frequencies" id="rootfreqs.{{ vu_ind }}">
+                        <frequencies spec="parameter.RealParameter" id="rootfrequencies.{{ vu_ind }}" value="{{ vu.root_frequencies }}" estimate="false"/>
                     </rootFrequencies>
                     <branchRateModel idref="clock"/>
                 </distribution>

diff --git a/teiphy/variation_unit.py b/teiphy/variation_unit.py
@@ -37,7 +37,7 @@ def __init__(self, xml: et.Element, verbose: bool = False):
         self.readings = []
         # Initialize its dictionaries of intrinsic and transcriptional relations:
         self.intrinsic_relations = {}
-        self.transcriptional_relations = {}
+        self.transcriptional_relations_by_date_range = {}
         # Now parse the app element to populate these data structures:
         self.parse(xml, verbose)
         if verbose:
@@ -132,22 +132,54 @@ def parse(self, xml: et.Element, verbose: bool = False):
                             self.intrinsic_relations[pair] = intrinsic_category
                 return
             if xml.get("type") is not None and xml.get("type") == "transcriptional":
-                self.transcriptional_relations = {}
+                self.transcriptional_relations_by_date_range = {}
+                # In a first pass, gather any dates specified for transcriptional relations and sort them in a list:
+                unique_date_strings = set()
+                for child in xml:
+                    if child.get("active") is None or child.get("passive") is None or child.get("ana") is None:
+                        continue
+                    if child.get("notBefore") is not None:
+                        unique_date_strings.add(child.get("notBefore"))
+                    if child.get("notAfter") is not None:
+                        unique_date_strings.add(child.get("notAfter"))
+                threshold_dates = sorted([int(date_string) for date_string in unique_date_strings])
+                # Then add null entries corresponding to periods before and after the first and last specified dates, respectively:
+                threshold_dates = [None] + threshold_dates + [None]
+                # Then initialize the output dictionary to map each pair of consecutive dates
+                # to a dictionary of the transcriptional relations that hold between them:
+                for i in range(len(threshold_dates) - 1):
+                    self.transcriptional_relations_by_date_range[(threshold_dates[i], threshold_dates[i + 1])] = {}
+                # Then, in a second pass, populate a map from (active, passive) reading tuples to their transcriptional categories for each consecutive pairs of dates:
                 for child in xml:
                     if child.get("active") is None or child.get("passive") is None or child.get("ana") is None:
                         continue
                     from_readings = child.get("active").replace("#", "").split()
                     to_readings = child.get("passive").replace("#", "").split()
                     transcriptional_categories = child.get("ana").replace("#", "").split()
-                    # For each pair of readings, assign them the specified category
-                    for from_reading in from_readings:
-                        for to_reading in to_readings:
-                            pair = (from_reading, to_reading)
-                            if pair not in self.transcriptional_relations:
-                                self.transcriptional_relations[
+                    date_index_range = [0, len(threshold_dates) - 1]
+                    if child.get("notBefore") is not None:
+                        date_index_range[0] = threshold_dates.index(int(child.get("notBefore")))
+                    if child.get("notAfter") is not None:
+                        date_index_range[1] = threshold_dates.index(int(child.get("notAfter")))
+                    for i in range(date_index_range[0], date_index_range[1]):
+                        # For each pair of readings, assign them to the specified category or categories:
+                        for from_reading in from_readings:
+                            for to_reading in to_readings:
+                                pair = (from_reading, to_reading)
+                                if (
                                     pair
-                                ] = set()  # we only need distinct categories for each transition
-                            for transcriptional_category in transcriptional_categories:
-                                self.transcriptional_relations[pair].add(transcriptional_category)
+                                    not in self.transcriptional_relations_by_date_range[
+                                        (threshold_dates[i], threshold_dates[i + 1])
+                                    ]
+                                ):
+                                    self.transcriptional_relations_by_date_range[
+                                        (threshold_dates[i], threshold_dates[i + 1])
+                                    ][
+                                        pair
+                                    ] = set()  # we only need distinct categories for each transition
+                                for transcriptional_category in transcriptional_categories:
+                                    self.transcriptional_relations_by_date_range[
+                                        (threshold_dates[i], threshold_dates[i + 1])
+                                    ][pair].add(transcriptional_category)
                 return
             return