Merge pull request #2154 from FranziskaKuhls/japaneseNER
Japanese NER
alanakbik authored Mar 15, 2021
2 parents 2bd9e72 + a232eba commit 2b5d1e9
Showing 3 changed files with 81 additions and 1 deletion.
1 change: 1 addition & 0 deletions flair/datasets/__init__.py
@@ -21,6 +21,7 @@
from .sequence_labeling import EUROPARL_NER_GERMAN
from .sequence_labeling import GERMEVAL_14
from .sequence_labeling import INSPEC
from .sequence_labeling import JAPANESE_NER
from .sequence_labeling import LER_GERMAN
from .sequence_labeling import MIT_MOVIE_NER_SIMPLE
from .sequence_labeling import MIT_MOVIE_NER_COMPLEX
80 changes: 79 additions & 1 deletion flair/datasets/sequence_labeling.py
@@ -666,6 +666,84 @@ def __init__(
            **corpusargs,
        )

class JAPANESE_NER(ColumnCorpus):
    def __init__(
            self,
            base_path: Union[str, Path] = None,
            tag_to_bioes: str = "ner",
            in_memory: bool = True,
            **corpusargs,
    ):
        """
        Initialize the Hironsan/IOB2 corpus for Japanese. The first time you call this constructor, it will
        automatically download the dataset.
        :param base_path: Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override
        this to point to a different folder, but typically this should not be necessary.
        :param tag_to_bioes: NER by default.
        :param in_memory: If True, keeps the dataset in memory, giving speedups in training.
        """
        if isinstance(base_path, str):
            base_path: Path = Path(base_path)

        # column format
        columns = {0: 'text', 1: 'ner'}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data from GitHub if necessary (hironsan.txt, ja.wikipedia.conll)
        IOB2_path = "https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/"

        # download files if not present locally
        cached_path(f"{IOB2_path}hironsan.txt", data_folder / 'raw')
        cached_path(f"{IOB2_path}ja.wikipedia.conll", data_folder / 'raw')

        # we need to modify the original files by adding a blank line after the end of each sentence
        train_data_file = data_folder / 'train.txt'
        if not train_data_file.is_file():
            self.__prepare_jap_wikinews_corpus(data_folder / 'raw' / "hironsan.txt", data_folder / 'train.txt')
            self.__prepare_jap_wikipedia_corpus(data_folder / 'raw' / "ja.wikipedia.conll", data_folder / 'train.txt')

        super(JAPANESE_NER, self).__init__(
            data_folder,
            columns,
            train_file='train.txt',
            tag_to_bioes=tag_to_bioes,
            in_memory=in_memory,
            **corpusargs,
        )

    @staticmethod
    def __prepare_jap_wikipedia_corpus(file_in: Union[str, Path], file_out: Union[str, Path]):
        with open(file_in, 'r') as f:
            lines = f.readlines()
        with open(file_out, 'a') as f:
            for line in lines:
                if line[0] == "。":
                    # the ideographic full stop ends a sentence: keep it, then add a blank line
                    f.write(line)
                    f.write("\n")
                elif line[0] == "\n":
                    # skip blank lines already present in the raw file
                    continue
                else:
                    f.write(line)

    @staticmethod
    def __prepare_jap_wikinews_corpus(file_in: Union[str, Path], file_out: Union[str, Path]):
        with open(file_in, 'r') as f:
            lines = f.readlines()
        with open(file_out, 'a') as f:
            for line in lines:
                sp_line = line.split("\t")
                if sp_line[0] == "\n":
                    f.write("\n")
                else:
                    # keep only the token (first column) and the NER tag (last column)
                    f.write(sp_line[0] + "\t" + sp_line[-1])

class STACKOVERFLOW_NER(ColumnCorpus):
    def __init__(
            self,
@@ -3737,4 +3815,4 @@ def _fill_curr_comment(self, fix_flag: bool):
        except StopIteration:  # When the end of the comments.tsv file is reached
            self.curr_row = next_row
            self.stop_iter = True if not fix_flag else False
            break
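For reference, a minimal sketch (not part of the commit itself) of what the two preparation helpers above produce. The sample lines are invented for illustration, but the column handling follows the code in this diff:

```python
# ja.wikipedia.conll: one "token<TAB>tag" pair per line; the ideographic full
# stop "。" ends a sentence, so a blank separator line is appended after it.
# hironsan.txt: extra middle columns (e.g. a POS tag) are dropped, keeping
# only the first column (token) and the last column (NER tag).

wikipedia_line = "。\tO\n"             # sentence-final token in ja.wikipedia.conll
wikinews_line = "日本\t名詞\tB-LOC\n"   # token, POS (dropped), NER tag in hironsan.txt

sp_line = wikinews_line.split("\t")
merged = sp_line[0] + "\t" + sp_line[-1]

assert wikipedia_line[0] == "。"   # triggers the extra "\n" written to train.txt
assert merged == "日本\tB-LOC\n"   # the two columns that land in train.txt
```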
1 change: 1 addition & 0 deletions resources/docs/TUTORIAL_6_CORPUS.md
@@ -173,6 +173,7 @@ data the first time you call the corresponding constructor ID. The following dat
| 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER |
| 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) |
| 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches |
| 'JAPANESE_NER' | Japanese | [IOB2Corpus](https://github.com/Hironsan/IOB2Corpus) Japanese NER dataset automatically generated from Wikipedia |
| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents |
| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER |
| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER |
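With the table entry in place, the new corpus loads like any other Flair dataset. A minimal usage sketch (identifiers as introduced in this diff; `make_tag_dictionary` is the standard Flair corpus API):

```python
import flair.datasets

# the first call downloads hironsan.txt and ja.wikipedia.conll,
# builds train.txt with blank-line sentence separators, and loads it
corpus = flair.datasets.JAPANESE_NER()
print(corpus)

# inspect the NER tags found in the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
print(tag_dictionary)
```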
