Skip to content

Commit

Permalink
Fixed state symbols in STEMMA
Browse files Browse the repository at this point in the history
  • Loading branch information
jjmccollum committed Oct 24, 2024
1 parent dbb82aa commit 812897e
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "teiphy"
version = "0.1.11"
version = "0.1.12"
description = "Converts TEI XML collations to NEXUS and other formats"
authors = ["Joey McCollum and Robert Turnbull"]
license = "MIT"
Expand Down
28 changes: 26 additions & 2 deletions teiphy/collation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1979,6 +1979,28 @@ def to_excel(
return df.to_excel(file_addr, index=False)
return df.to_excel(file_addr)

def get_stemma_symbols(self):
    """Returns a list of one-character symbols needed to represent the states of all substantive readings in STEMMA format.

    The number of symbols equals the maximum number of substantive readings at any variation unit,
    measured as the length of the longest reading support vector stored for the first witness.
    Symbols are taken, in order, from the digits ``0``-``9`` followed by the lowercase letters ``a``-``n``,
    so no more than 24 symbols are ever returned.

    Returns:
        A list of individual characters representing states in readings.
        The list is empty if this Collation has no witnesses or the first witness has no reading support vectors.
    """
    # If there are no witnesses, then no symbols are needed at all:
    if not self.witnesses:
        return []
    # NOTE: the maximum number of symbols allowed in STEMMA format is 24 (10 digits + 14 lowercase letters)
    possible_symbols = list(string.digits + string.ascii_lowercase[:14])
    # The number of symbols needed is equal to the length of the longest substantive reading vector.
    # Only the first witness is inspected; presumably every witness has vectors of the same lengths (TODO confirm).
    first_wit_id = self.witnesses[0].id
    nsymbols = max(
        (len(rdg_support) for rdg_support in self.readings_by_witness[first_wit_id]),
        default=0,
    )
    return possible_symbols[:nsymbols]

def to_stemma(self, file_addr: Union[Path, str]):
"""Writes this Collation to a STEMMA file without an extension and a Chron file (containing low, middle, and high dates for all witnesses) without an extension.
Expand Down Expand Up @@ -2027,6 +2049,7 @@ def to_stemma(self, file_addr: Union[Path, str]):
indices = tuple([j, k])
reading_wits_by_indices[indices].append(wit.id)
# In a third pass, write to the STEMMA file:
symbols = self.get_stemma_symbols()
Path(file_addr).parent.mkdir(
parents=True, exist_ok=True
) # generate all parent folders for this file that don't already exist
Expand Down Expand Up @@ -2075,13 +2098,14 @@ def to_stemma(self, file_addr: Union[Path, str]):
indices = tuple([j, k])
if indices not in reading_wits_by_indices:
break
rdg_symbol = symbols[k] # get the one-character alphanumeric code for this state
wits = " ".join(reading_wits_by_indices[indices])
# Open the variant reading support block with an angle bracket:
if k == 0:
f.write("%d %s" % (k, wits))
f.write("%s %s" % (rdg_symbol, wits))
# Open all subsequent variant reading support blocks with pipes on the next line:
else:
f.write("\n\t| %d %s" % (k, wits))
f.write("\n\t| %s %s" % (rdg_symbol, wits))
k += 1
f.write(" >\n")
# In a fourth pass, write to the chron file:
Expand Down
10 changes: 10 additions & 0 deletions tests/test_collation.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,16 @@ def test_get_beast_symbols_empty(self):
beast_symbols = empty_collation.get_beast_symbols()
self.assertEqual(beast_symbols, [])

def test_get_stemma_symbols(self):
    """The fixture collation should need exactly the six STEMMA state symbols "0" through "5"."""
    expected_symbols = ["0", "1", "2", "3", "4", "5"]
    self.assertEqual(self.collation.get_stemma_symbols(), expected_symbols)

def test_get_stemma_symbols_empty(self):
    """A collation with no witnesses should need no STEMMA state symbols."""
    # Reuse the fixture collation and strip its witnesses to simulate an empty collation:
    collation_without_witnesses = self.collation
    collation_without_witnesses.witnesses = []
    self.assertEqual(collation_without_witnesses.get_stemma_symbols(), [])

def test_to_numpy_ignore_missing(self):
matrix, reading_labels, witness_labels = self.collation.to_numpy(split_missing=False)
self.assertTrue(
Expand Down

0 comments on commit 812897e

Please sign in to comment.