-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #11430 from rmitsch/chore/synch-develop
Synch develop with master
- Loading branch information
Showing
31 changed files
with
725 additions
and
193 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from ...language import Language, BaseDefaults | ||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS | ||
from .stop_words import STOP_WORDS | ||
from .lex_attrs import LEX_ATTRS | ||
|
||
|
||
class LatinDefaults(BaseDefaults):
    """Language defaults for Latin.

    Binds the Latin tokenizer exceptions, stop-word set and lexical
    attribute getters imported from the sibling modules.
    """

    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
|
||
|
||
class Latin(Language):
    """Language subclass for Latin (``lang = "la"``) using ``LatinDefaults``."""

    lang = "la"
    Defaults = LatinDefaults


__all__ = ["Latin"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from ...attrs import LIKE_NUM | ||
import re | ||
|
||
# Roman numeral matcher, cf. Goyvaerts/Levithan 2009. Case-insensitive, and
# each of C, X and I may repeat up to four times, so additive forms such as
# "IIII" are accepted alongside subtractive ones such as "IV". The lookahead
# rejects the empty string, which every other part of the pattern would allow.
roman_numerals_compile = re.compile(
    r"(?i)^(?=[MDCLXVI])M*(C[MD]|D?C{0,4})(X[CL]|L?X{0,4})(I[XV]|V?I{0,4})$"
)
|
||
_num_words = set( | ||
""" | ||
unus una unum duo duae tres tria quattuor quinque sex septem octo novem decem | ||
""".split() | ||
) | ||
|
||
_ordinal_words = set( | ||
""" | ||
primus prima primum secundus secunda secundum tertius tertia tertium | ||
""".split() | ||
) | ||
|
||
|
||
def like_num(text):
    """Return True if *text* looks like a number.

    A token counts as number-like when it is made up only of digits, matches
    the Roman numeral pattern ``roman_numerals_compile``, or (ignoring case)
    is one of the words in ``_num_words`` or ``_ordinal_words``.

    text: The token text to test.
    RETURNS (bool): Whether the text resembles a number.
    """
    if text.isdigit():
        return True
    if roman_numerals_compile.match(text):
        return True
    # Lower-case once and reuse it for both word-list lookups.
    lowered = text.lower()
    return lowered in _num_words or lowered in _ordinal_words


LEX_ATTRS = {LIKE_NUM: like_num}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Corrected Perseus list, cf. https://wiki.digitalclassicist.org/Stopwords_for_Greek_and_Latin
# Both u- and v-spellings are listed where they occur (uel/vel, uero/vero).

STOP_WORDS = set(
    """
ab ac ad adhuc aliqui aliquis an ante apud at atque aut autem
cum cur
de deinde dum
ego enim ergo es est et etiam etsi ex
fio
haud hic
iam idem igitur ille in infra inter interim ipse is ita
magis modo mox
nam ne nec necque neque nisi non nos
o ob
per possum post pro
quae quam quare qui quia quicumque quidem quilibet quis quisnam quisquam quisque quisquis quo quoniam
sed si sic sive sub sui sum super suus
tam tamen trans tu tum
ubi uel uero
vel vero
""".split()
)
Oops, something went wrong.