Merge pull request #1494 from flairNLP/GH-1492-transformers
GH-1492: added new BERT embeddings implementation
alanakbik authored Apr 3, 2020
2 parents 222d8a3 + 181ce16 commit e9b5c2a
Showing 7 changed files with 444 additions and 67 deletions.
94 changes: 57 additions & 37 deletions flair/datasets.py
@@ -148,8 +148,9 @@ def __init__(
test_file=None,
dev_file=None,
tokenizer: Callable[[str], List[Token]] = space_tokenizer,
max_tokens_per_doc: int = -1,
max_chars_per_doc: int = -1,
truncate_to_max_tokens: int = -1,
truncate_to_max_chars: int = -1,
filter_if_longer_than: int = -1,
in_memory: bool = False,
encoding: str = 'utf-8',
):
@@ -161,8 +162,8 @@ def __init__(
:param test_file: the name of the test file
:param dev_file: the name of the dev file, if None, dev data is sampled from train
:param use_tokenizer: If True, tokenizes the dataset, otherwise uses whitespace tokenization
:param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
:param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
:param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of Tokens
:param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars
:param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
:return: a Corpus with annotated train, dev and test data
"""
@@ -175,8 +176,9 @@ def __init__(
train_file,
label_type=label_type,
tokenizer=tokenizer,
max_tokens_per_doc=max_tokens_per_doc,
max_chars_per_doc=max_chars_per_doc,
truncate_to_max_tokens=truncate_to_max_tokens,
truncate_to_max_chars=truncate_to_max_chars,
filter_if_longer_than=filter_if_longer_than,
in_memory=in_memory,
encoding=encoding,
)
@@ -186,8 +188,9 @@ def __init__(
test_file,
label_type=label_type,
tokenizer=tokenizer,
max_tokens_per_doc=max_tokens_per_doc,
max_chars_per_doc=max_chars_per_doc,
truncate_to_max_tokens=truncate_to_max_tokens,
truncate_to_max_chars=truncate_to_max_chars,
filter_if_longer_than=filter_if_longer_than,
in_memory=in_memory,
encoding=encoding,
) if test_file is not None else None
@@ -197,8 +200,9 @@ def __init__(
dev_file,
label_type=label_type,
tokenizer=tokenizer,
max_tokens_per_doc=max_tokens_per_doc,
max_chars_per_doc=max_chars_per_doc,
truncate_to_max_tokens=truncate_to_max_tokens,
truncate_to_max_chars=truncate_to_max_chars,
filter_if_longer_than=filter_if_longer_than,
in_memory=in_memory,
encoding=encoding,
) if dev_file is not None else None
@@ -930,8 +934,9 @@ def __init__(
self,
path_to_file: Union[str, Path],
label_type: str = 'class',
max_tokens_per_doc=-1,
max_chars_per_doc=-1,
truncate_to_max_tokens=-1,
truncate_to_max_chars=-1,
filter_if_longer_than: int = -1,
tokenizer=segtok_tokenizer,
in_memory: bool = True,
encoding: str = 'utf-8',
@@ -943,9 +948,9 @@ def __init__(
If you have a multi class task, you can have as many labels as you want at the beginning of the line, e.g.,
__label__<class_name_1> __label__<class_name_2> <text>
:param path_to_file: the path to the data file
:param max_tokens_per_doc: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is.
:param truncate_to_max_tokens: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is.
:param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
:param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
:param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars
:param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
:return: list of sentences
"""
@@ -966,8 +971,9 @@ def __init__(
self.indices = []

self.total_sentence_count: int = 0
self.max_chars_per_doc = max_chars_per_doc
self.max_tokens_per_doc = max_tokens_per_doc
self.truncate_to_max_chars = truncate_to_max_chars
self.truncate_to_max_tokens = truncate_to_max_tokens
self.filter_if_longer_than = filter_if_longer_than

self.path_to_file = path_to_file

@@ -980,6 +986,11 @@ def __init__(
line = f.readline()
continue

if 0 < self.filter_if_longer_than < len(line.split(' ')):
position = f.tell()
line = f.readline()
continue

if self.in_memory:
sentence = self._parse_line_to_sentence(
line, self.label_prefix, tokenizer
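
The new filter_if_longer_than check skips a document outright, rather than truncating it, when its raw line has more space-separated tokens than the threshold. A standalone sketch of the same idiom:

    def keep_line(line: str, filter_if_longer_than: int = -1) -> bool:
        # non-positive threshold keeps everything; otherwise overly long lines are dropped
        return not (0 < filter_if_longer_than < len(line.split(' ')))

    assert keep_line("__label__POSITIVE short and sweet", filter_if_longer_than=10)
    assert not keep_line("word " * 50, filter_if_longer_than=10)
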
@@ -1012,8 +1023,8 @@ def _parse_line_to_sentence(

text = line[l_len:].strip()

if self.max_chars_per_doc > 0:
text = text[: self.max_chars_per_doc]
if self.truncate_to_max_chars > 0:
text = text[: self.truncate_to_max_chars]

if text and labels:
sentence = Sentence(text, use_tokenizer=tokenizer)
@@ -1023,9 +1034,9 @@ def _parse_line_to_sentence(

if (
sentence is not None
and 0 < self.max_tokens_per_doc < len(sentence)
and 0 < self.truncate_to_max_tokens < len(sentence)
):
sentence.tokens = sentence.tokens[: self.max_tokens_per_doc]
sentence.tokens = sentence.tokens[: self.truncate_to_max_tokens]

return sentence
return None
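
Note the order of the two truncation steps: the raw text is cut to truncate_to_max_chars before tokenization, and the resulting Sentence is then cut to truncate_to_max_tokens. A rough illustration using a plain split in place of the configured tokenizer (all values made up):

    text = "word " * 200                          # hypothetical document body
    truncate_to_max_chars, truncate_to_max_tokens = 100, 10

    if truncate_to_max_chars > 0:
        text = text[:truncate_to_max_chars]       # character-level cut on the raw string

    tokens = text.split()                         # stand-in for the real tokenizer
    if 0 < truncate_to_max_tokens < len(tokens):
        tokens = tokens[:truncate_to_max_tokens]  # then the token-level cut
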
@@ -1586,7 +1597,7 @@ def __init__(
class SENTEVAL_MR(ClassificationCorpus):
def __init__(
self,
in_memory: bool = True,
**corpusargs
):
# this dataset name
dataset_name = self.__class__.__name__.lower()
@@ -1619,7 +1630,7 @@ def __init__(
train_file.write(f"__label__NEGATIVE {line}")

super(SENTEVAL_MR, self).__init__(
data_folder, label_type='sentiment', tokenizer=segtok_tokenizer, in_memory=in_memory
data_folder, label_type='sentiment', tokenizer=segtok_tokenizer, **corpusargs
)
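
With the hard-coded in_memory flag replaced by a **corpusargs pass-through, any ClassificationCorpus option can now be forwarded from the dataset wrapper, e.g. (values are illustrative):

    from flair.datasets import SENTEVAL_MR

    corpus = SENTEVAL_MR(in_memory=True, filter_if_longer_than=2000)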


@@ -1703,13 +1714,13 @@ def __init__(
)


class SENTEVAL_SST_BINARY(CSVClassificationCorpus):
class SENTEVAL_SST_BINARY(ClassificationCorpus):
def __init__(
self,
in_memory: bool = True,
**corpusargs
):
# this dataset name
dataset_name = self.__class__.__name__.lower()
dataset_name = self.__class__.__name__.lower() + '_v2'

# default dataset folder is the cache root
data_folder = Path(flair.cache_root) / "datasets" / dataset_name
@@ -1718,17 +1729,21 @@ def __init__(
if not (data_folder / "train.txt").is_file():

# download senteval datasets if necessary and unzip
cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-train', Path("datasets") / dataset_name)
cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-test', Path("datasets") / dataset_name)
cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-dev', Path("datasets") / dataset_name)
cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-train', Path("datasets") / dataset_name / 'raw')
cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-test', Path("datasets") / dataset_name / 'raw')
cached_path('https://raw.githubusercontent.com/PrincetonML/SIF/master/data/sentiment-dev', Path("datasets") / dataset_name / 'raw')

# create train.txt file by iterating over pos and neg file
with open(data_folder / "train.txt", "a") as out_file, open(data_folder / 'raw' / "sentiment-train") as in_file:
for line in in_file:
fields = line.split('\t')
label = 'POSITIVE' if fields[1].rstrip() == '1' else 'NEGATIVE'
out_file.write(f"__label__{label} {fields[0]}\n")

super(SENTEVAL_SST_BINARY, self).__init__(
data_folder,
column_name_map={0: 'text', 1: 'label'},
tokenizer=segtok_tokenizer,
in_memory=in_memory,
delimiter='\t',
quotechar=None,
**corpusargs,
)
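
The raw SST files are tab-separated (text, then a 0/1 label); the loop above rewrites them once into the FastText format that ClassificationCorpus expects. Roughly, one line is converted like this (the example sentence is made up):

    raw_line = "a gorgeous , witty , seductive movie .\t1\n"  # hypothetical raw line: <text>\t<0|1>
    fields = raw_line.split('\t')
    label = 'POSITIVE' if fields[1].rstrip() == '1' else 'NEGATIVE'
    print(f"__label__{label} {fields[0]}")
    # -> __label__POSITIVE a gorgeous , witty , seductive movie .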


@@ -1813,12 +1828,15 @@ def __init__(


class IMDB(ClassificationCorpus):
def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False):
def __init__(self, base_path: Union[str, Path] = None, rebalance_corpus: bool = True, **corpusargs):
if type(base_path) == str:
base_path: Path = Path(base_path)

# this dataset name
dataset_name = self.__class__.__name__.lower()
dataset_name = self.__class__.__name__.lower() + '_v2'

if rebalance_corpus:
dataset_name = dataset_name + '-rebalanced'

# default dataset folder is the cache root
if not base_path:
@@ -1853,20 +1871,22 @@ def __init__(self, base_path: Union[str, Path] = None, in_memory: bool = False):
if f"{dataset}/{label}" in m.name
],
)
with open(f"{data_path}/{dataset}.txt", "at") as f_p:
with open(f"{data_path}/train-all.txt", "at") as f_p:
current_path = data_path / "aclImdb" / dataset / label
for file_name in current_path.iterdir():
if file_name.is_file() and file_name.name.endswith(
".txt"
):
if label == "pos": sentiment_label = 'POSITIVE'
if label == "neg": sentiment_label = 'NEGATIVE'
f_p.write(
f"__label__{label} "
f"__label__{sentiment_label} "
+ file_name.open("rt", encoding="utf-8").read()
+ "\n"
)

super(IMDB, self).__init__(
data_folder, tokenizer=space_tokenizer, in_memory=in_memory
data_folder, tokenizer=space_tokenizer, **corpusargs
)
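
IMDB likewise gains a **corpusargs pass-through plus a rebalance_corpus flag; judging from this hunk, the default (True) pools all reviews into a single train-all.txt under a separate '-rebalanced' cache folder so that splits can be re-drawn. A usage sketch with illustrative keyword values:

    from flair.datasets import IMDB

    corpus = IMDB(rebalance_corpus=True, in_memory=False, truncate_to_max_tokens=500)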

