Adding WNUT_2020_NER dataset support #1942

Merged 5 commits on Nov 9, 2020
Changes from 2 commits
1 change: 1 addition & 0 deletions flair/datasets/__init__.py
@@ -38,6 +38,7 @@
from .sequence_labeling import WIKINER_POLISH
from .sequence_labeling import WIKINER_RUSSIAN
from .sequence_labeling import WNUT_17
from .sequence_labeling import WNUT_2020_NER
from .sequence_labeling import MIT_RESTAURANTS
from .sequence_labeling import UP_CHINESE
from .sequence_labeling import UP_ENGLISH
68 changes: 68 additions & 0 deletions flair/datasets/sequence_labeling.py
@@ -1,6 +1,7 @@
import logging
import re
import os
import shutil
from pathlib import Path
from typing import Union, Dict, List

@@ -448,6 +449,73 @@ def __init__(
        )



class WNUT_2020_NER(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
    ):
"""
Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically
download the dataset.
:param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this
to point to a different folder but typically this should not be necessary.
:param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus.
:param in_memory: If True, keeps dataset in memory giving speedups in training.
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
"""
        if isinstance(base_path, str):
            base_path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}
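        # (ColumnCorpus expects one token and its tag per line, separated by
        # whitespace, with an empty line between sentences)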

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip"

        for sample in ["train", "test", "dev"]:

            sample_file = data_folder / (sample + ".txt")
            if not sample_file.is_file():

                zip_path = cached_path(f"{github_url}", Path("datasets") / dataset_name)

                # unzip the downloaded repo and merge the train, dev and test datasets
                unpack_file(zip_path, data_folder, "zip", False)  # unzipped folder name: WNUT_2020_NER-master

                file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/")
                filenames = os.listdir(file_path)
                with open(data_folder / (sample + ".txt"), "w") as outfile:
                    for fname in filenames:
                        with open(file_path / fname) as infile:
                            lines = infile.read()
                            outfile.write(lines[:-3])  # get rid of the 2 extra empty lines at the end of each .txt file

                shutil.rmtree(str(data_folder / "WNUT_2020_NER-master"))  # clean up when done

        super(WNUT_2020_NER, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="utf-8",
            in_memory=in_memory,
            document_separator_token=None if not document_as_sequence else "-DOCSTART-",
        )


class WIKIGOLD_NER(ColumnCorpus):
    def __init__(
        self,
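A minimal usage sketch, not part of the diff: it assumes the class above is merged and relies on flair's standard corpus API (the variable names and printed output are illustrative).

from flair.datasets import WNUT_2020_NER

# the first call downloads and merges the data; later calls reuse the cached files
corpus = WNUT_2020_NER()

print(corpus)  # train/dev/test sentence counts
print(corpus.train[0].to_tagged_string("ner"))  # first training sentence with its NER tags

# typical downstream use: build the tag dictionary for a sequence tagger
tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")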
1 change: 1 addition & 0 deletions resources/docs/TUTORIAL_6_CORPUS.md
@@ -175,6 +175,7 @@ data the first time you call the corresponding constructor ID. The following dat
| 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) |
| 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). |
| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection |
| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction |
| 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text |
| 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
| 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |