From 812897e16df22971664172da4ac327e171d25d1e Mon Sep 17 00:00:00 2001
From: jjmccollum <jpmccollum05834@yahoo.com>
Date: Fri, 25 Oct 2024 01:18:45 +0200
Subject: [PATCH] Fixed state symbols in STEMMA

---
 pyproject.toml          |  2 +-
 teiphy/collation.py     | 28 ++++++++++++++++++++++++++--
 tests/test_collation.py | 10 ++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5a95375..1199674 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "teiphy"
-version = "0.1.11"
+version = "0.1.12"
 description = "Converts TEI XML collations to NEXUS and other formats"
 authors = ["Joey McCollum and Robert Turnbull"]
 license = "MIT"
diff --git a/teiphy/collation.py b/teiphy/collation.py
index a7e19a9..7b5a69b 100644
--- a/teiphy/collation.py
+++ b/teiphy/collation.py
@@ -1979,6 +1979,28 @@ def to_excel(
             return df.to_excel(file_addr, index=False)
         return df.to_excel(file_addr)
 
+    def get_stemma_symbols(self):
+        """Returns the one-character state symbols needed to write this collation's substantive readings in STEMMA format.
+
+        Symbols are taken in order from the digits 0-9 followed by the lowercase letters a-n.
+        As many symbols are returned as the longest reading support vector of the first witness has entries.
+
+        Returns:
+            A list of single-character strings, which is empty if this collation has no witnesses.
+        """
+        # Without witnesses there are no reading states to label:
+        if not len(self.witnesses):
+            return []
+        # NOTE: the maximum number of symbols allowed in STEMMA format is 24 (10 digits plus 14 letters)
+        alphabet = [*string.digits, *string.ascii_lowercase[:14]]
+        # Only the first witness is inspected; its longest reading support vector fixes the symbol count:
+        first_wit_id = self.witnesses[0].id
+        support_vectors = self.readings_by_witness[first_wit_id]
+        longest = max((len(vector) for vector in support_vectors), default=0)
+        # Slicing caps the result at the alphabet size, so a longer vector still yields only 24 symbols:
+        state_symbols = alphabet[:longest]
+        return state_symbols
+
     def to_stemma(self, file_addr: Union[Path, str]):
         """Writes this Collation to a STEMMA file without an extension and a Chron file (containing low, middle, and high dates for all witnesses) without an extension.
 
@@ -2027,6 +2049,7 @@ def to_stemma(self, file_addr: Union[Path, str]):
                 indices = tuple([j, k])
                 reading_wits_by_indices[indices].append(wit.id)
         # In a third pass, write to the STEMMA file:
+        symbols = self.get_stemma_symbols()
         Path(file_addr).parent.mkdir(
             parents=True, exist_ok=True
         )  # generate all parent folders for this file that don't already exist
@@ -2075,13 +2098,14 @@ def to_stemma(self, file_addr: Union[Path, str]):
                     indices = tuple([j, k])
                     if indices not in reading_wits_by_indices:
                         break
+                    rdg_symbol = symbols[k]  # get the one-character alphanumeric code for this state
                     wits = " ".join(reading_wits_by_indices[indices])
                     # Open the variant reading support block with an angle bracket:
                     if k == 0:
-                        f.write("%d %s" % (k, wits))
+                        f.write("%s %s" % (rdg_symbol, wits))
                     # Open all subsequent variant reading support blocks with pipes on the next line:
                     else:
-                        f.write("\n\t| %d %s" % (k, wits))
+                        f.write("\n\t| %s %s" % (rdg_symbol, wits))
                     k += 1
                 f.write(" >\n")
         # In a fourth pass, write to the chron file:
diff --git a/tests/test_collation.py b/tests/test_collation.py
index 2c085ce..74dc19e 100644
--- a/tests/test_collation.py
+++ b/tests/test_collation.py
@@ -390,6 +390,16 @@ def test_get_beast_symbols_empty(self):
         beast_symbols = empty_collation.get_beast_symbols()
         self.assertEqual(beast_symbols, [])
 
+    def test_get_stemma_symbols(self):
+        # The collation under test is expected to need exactly six state symbols, starting from "0".
+        self.assertEqual(self.collation.get_stemma_symbols(), ["0", "1", "2", "3", "4", "5"])
+
+    def test_get_stemma_symbols_empty(self):
+        # NOTE: no copy is made here, so the witness list of self.collation itself is cleared.
+        collation = self.collation
+        collation.witnesses = []
+        self.assertEqual(collation.get_stemma_symbols(), [])
+
     def test_to_numpy_ignore_missing(self):
         matrix, reading_labels, witness_labels = self.collation.to_numpy(split_missing=False)
         self.assertTrue(