From 812897e16df22971664172da4ac327e171d25d1e Mon Sep 17 00:00:00 2001
From: jjmccollum
Date: Fri, 25 Oct 2024 01:18:45 +0200
Subject: [PATCH] Fixed state symbols in STEMMA

---
 pyproject.toml          |  2 +-
 teiphy/collation.py     | 28 ++++++++++++++++++++++++++--
 tests/test_collation.py | 10 ++++++++++
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5a95375..1199674 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "teiphy"
-version = "0.1.11"
+version = "0.1.12"
 description = "Converts TEI XML collations to NEXUS and other formats"
 authors = ["Joey McCollum and Robert Turnbull"]
 license = "MIT"
diff --git a/teiphy/collation.py b/teiphy/collation.py
index a7e19a9..7b5a69b 100644
--- a/teiphy/collation.py
+++ b/teiphy/collation.py
@@ -1979,6 +1979,28 @@ def to_excel(
             return df.to_excel(file_addr, index=False)
         return df.to_excel(file_addr)
 
+    def get_stemma_symbols(self):
+        """Returns a list of one-character symbols needed to represent the states of all substantive readings in STEMMA format.
+
+        The number of symbols equals the maximum number of substantive readings at any variation unit.
+
+        Returns:
+            A list of individual characters representing states in readings.
+        """
+        possible_symbols = (
+            list(string.digits) + list(string.ascii_lowercase)[:14]
+        )  # NOTE: the maximum number of symbols allowed in STEMMA format is 24
+        # The number of symbols needed is equal to the length of the longest substantive reading vector:
+        nsymbols = 0
+        # If there are no witnesses, then no symbols are needed at all:
+        if len(self.witnesses) == 0:
+            return []
+        wit_id = self.witnesses[0].id
+        for rdg_support in self.readings_by_witness[wit_id]:
+            nsymbols = max(nsymbols, len(rdg_support))
+        stemma_symbols = possible_symbols[:nsymbols]
+        return stemma_symbols
+
     def to_stemma(self, file_addr: Union[Path, str]):
         """Writes this Collation to a STEMMA file without an extension and a Chron file (containing low, middle, and high dates for all witnesses) without an extension.
 
@@ -2027,6 +2049,7 @@ def to_stemma(self, file_addr: Union[Path, str]):
                 indices = tuple([j, k])
                 reading_wits_by_indices[indices].append(wit.id)
         # In a third pass, write to the STEMMA file:
+        symbols = self.get_stemma_symbols()
         Path(file_addr).parent.mkdir(
             parents=True, exist_ok=True
         )  # generate all parent folders for this file that don't already exist
@@ -2075,13 +2098,14 @@ def to_stemma(self, file_addr: Union[Path, str]):
                     indices = tuple([j, k])
                     if indices not in reading_wits_by_indices:
                         break
+                    rdg_symbol = symbols[k]  # get the one-character alphanumeric code for this state
                     wits = " ".join(reading_wits_by_indices[indices])
                     # Open the variant reading support block with an angle bracket:
                     if k == 0:
-                        f.write("%d %s" % (k, wits))
+                        f.write("%s %s" % (rdg_symbol, wits))
                     # Open all subsequent variant reading support blocks with pipes on the next line:
                     else:
-                        f.write("\n\t| %d %s" % (k, wits))
+                        f.write("\n\t| %s %s" % (rdg_symbol, wits))
                     k += 1
                 f.write(" >\n")
         # In a fourth pass, write to the chron file:
diff --git a/tests/test_collation.py b/tests/test_collation.py
index 2c085ce..74dc19e 100644
--- a/tests/test_collation.py
+++ b/tests/test_collation.py
@@ -390,6 +390,16 @@ def test_get_beast_symbols_empty(self):
         beast_symbols = empty_collation.get_beast_symbols()
         self.assertEqual(beast_symbols, [])
 
+    def test_get_stemma_symbols(self):
+        stemma_symbols = self.collation.get_stemma_symbols()
+        self.assertEqual(stemma_symbols, ["0", "1", "2", "3", "4", "5"])
+
+    def test_get_stemma_symbols_empty(self):
+        empty_collation = self.collation
+        empty_collation.witnesses = []
+        stemma_symbols = empty_collation.get_stemma_symbols()
+        self.assertEqual(stemma_symbols, [])
+
     def test_to_numpy_ignore_missing(self):
         matrix, reading_labels, witness_labels = self.collation.to_numpy(split_missing=False)
         self.assertTrue(