From 0e3ef9b66192314372b5bc1e5b7f92d76fe3356f Mon Sep 17 00:00:00 2001
From: enricoboos
Date: Tue, 20 Apr 2021 12:27:25 +0200
Subject: [PATCH] added kazakh as new language for pos

---
 flair/datasets/__init__.py  |  1 +
 flair/datasets/treebanks.py | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index 1843d98193..6e459f9408 100755
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -98,6 +98,7 @@
 from .treebanks import UniversalDependenciesDataset
 from .treebanks import UD_ARMENIAN
 from .treebanks import UD_ENGLISH
+from .treebanks import UD_KAZAKH
 from .treebanks import UD_ESTONIAN
 from .treebanks import UD_GERMAN
 from .treebanks import UD_GERMAN_HDT
diff --git a/flair/datasets/treebanks.py b/flair/datasets/treebanks.py
index 4ab5eb6109..15f68120c9 100755
--- a/flair/datasets/treebanks.py
+++ b/flair/datasets/treebanks.py
@@ -251,6 +251,43 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, s
 
         super(UD_ENGLISH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
 
+
+class UD_KAZAKH(UniversalDependenciesCorpus):
+    """Universal Dependencies corpus for Kazakh, read from the UD_Kazakh-KTB treebank files."""
+
+    def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):
+        """Fetch the KTB test and train files if needed and initialise the corpus.
+
+        :param base_path: parent folder of the dataset folder; when empty or None,
+            ``Path(flair.cache_root) / "datasets"`` is used instead.
+        :param in_memory: forwarded unchanged to the parent constructor.
+        :param split_multiwords: forwarded unchanged to the parent constructor.
+        """
+        # isinstance also accepts str subclasses, unlike an exact type comparison
+        if isinstance(base_path, str):
+            base_path = Path(base_path)
+
+        # this dataset name
+        dataset_name = self.__class__.__name__.lower()
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / dataset_name
+
+        # download data if necessary
+        # NOTE(review): the download target below does not depend on base_path;
+        # confirm this is what is wanted when a custom base_path is passed.
+        web_path = "https://raw.githubusercontent.com/UniversalDependencies/UD_Kazakh-KTB/master"
+        cached_path(
+            f"{web_path}/kk_ktb-ud-test.conllu", Path("datasets") / dataset_name
+        )
+        cached_path(
+            f"{web_path}/kk_ktb-ud-train.conllu", Path("datasets") / dataset_name
+        )
+
+        super(UD_KAZAKH, self).__init__(data_folder, in_memory=in_memory, split_multiwords=split_multiwords)
+
 
 class UD_OLD_CHURCH_SLAVONIC(UniversalDependenciesCorpus):
     def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = True, split_multiwords: bool = True):