Skip to content

Commit

Permalink
Merge branch 'master' into feature/UD_Belarusian
Browse files: browse the repository at this point in the history
  • Loading branch information
alanakbik authored Apr 19, 2021
2 parents d45a59d + 217e3e6 commit 9dac167
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 2 deletions.
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@
from .treebanks import UD_GOTHIC
from .treebanks import UD_WOLOF
from .treebanks import UD_BELARUSIAN
from .treebanks import UD_OLD_CHURCH_SLAVONIC
from .treebanks import UD_COPTIC

# Expose all text-text datasets
from .text_text import ParallelTextCorpus
Expand Down
57 changes: 55 additions & 2 deletions flair/datasets/treebanks.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,32 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, s

super(UD_ENGLISH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

class UD_OLD_CHURCH_SLAVONIC(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Old_Church_Slavonic-PROIEL files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """Fetch the dev/test/train CoNLL-U files and initialise the corpus.

        :param base_path: Folder that contains (or will contain) the dataset
            sub-folder. A string is converted to a ``Path``. When omitted, the
            ``datasets`` folder under ``flair.cache_root`` is used.
        :param in_memory: Forwarded unchanged to ``UniversalDependenciesCorpus``.
        :param split_multiwords: Forwarded unchanged to ``UniversalDependenciesCorpus``.
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses.
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # The dataset folder is named after the lower-cased class name.
        dataset_name = self.__class__.__name__.lower()

        # Default dataset folder is the cache root.
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # Download data if necessary; the splits are requested in dev, test, train order.
        # NOTE(review): the cache directory handed to cached_path does not depend on
        # base_path - presumably it is resolved against the flair cache root; confirm
        # that a custom base_path is expected to already contain the files.
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Old_Church_Slavonic-PROIEL/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/cu_proiel-ud-{split}.conllu", Path("datasets") / dataset_name)

        super(UD_OLD_CHURCH_SLAVONIC, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)


class UD_ESTONIAN(UniversalDependenciesCorpus):
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
Expand Down Expand Up @@ -1485,7 +1511,7 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, s


class UD_BELARUSIAN(UniversalDependenciesCorpus):
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):

if type(base_path) == str:
base_path: Path = Path(base_path)
Expand All @@ -1508,4 +1534,31 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, s
f"{web_path}/be_hse-ud-train.conllu", Path("datasets") / dataset_name
)

super(UD_BELARUSIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
super(UD_BELARUSIAN, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)


class UD_COPTIC(UniversalDependenciesCorpus):
    """Universal Dependencies corpus built from the UD_Coptic-Scriptorium files."""

    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
        """Fetch the dev/test/train CoNLL-U files and initialise the corpus.

        :param base_path: Folder that contains (or will contain) the dataset
            sub-folder. A string is converted to a ``Path``. When omitted, the
            ``datasets`` folder under ``flair.cache_root`` is used.
        :param in_memory: Forwarded unchanged to ``UniversalDependenciesCorpus``.
        :param split_multiwords: Forwarded unchanged to ``UniversalDependenciesCorpus``.
        """
        # isinstance (rather than an exact type comparison) also accepts str subclasses.
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # The dataset folder is named after the lower-cased class name.
        dataset_name = self.__class__.__name__.lower()

        # Default dataset folder is the cache root.
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # Download data if necessary; the splits are requested in dev, test, train order.
        # NOTE(review): the cache directory handed to cached_path does not depend on
        # base_path - presumably it is resolved against the flair cache root; confirm
        # that a custom base_path is expected to already contain the files.
        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Coptic-Scriptorium/master"
        for split in ("dev", "test", "train"):
            cached_path(f"{web_path}/cop_scriptorium-ud-{split}.conllu", Path("datasets") / dataset_name)

        super(UD_COPTIC, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)

0 comments on commit 9dac167

Please sign in to comment.