Skip to content

Commit

Permalink
GH-38: Add label class for sentence labels.
Browse files Browse the repository at this point in the history
  • Loading branch information
tabergma committed Aug 14, 2018
1 parent 5e5ad14 commit 3e3194e
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 31 deletions.
44 changes: 35 additions & 9 deletions flair/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,29 @@ def load(cls, name: str):
return Dictionary.load_from_file(name)


class Label:
    """A label of a sentence, consisting of the label name (the class string)
    and a confidence value in [0.0, 1.0]."""

    def __init__(self, name: str, confidence: float = 0.0):
        # Both assignments go through the property setters below, so the
        # confidence value is validated on construction as well.
        self.name = name
        self.confidence = confidence

    @property
    def name(self) -> str:
        return self._name

    @name.setter
    def name(self, name: str):
        self._name = name

    @property
    def confidence(self) -> float:
        # BUG FIX: previously returned self._name, so reading .confidence
        # yielded the label name instead of the confidence score.
        return self._confidence

    @confidence.setter
    def confidence(self, confidence: float):
        # Accept only values in [0.0, 1.0]; anything else falls back to 0.0.
        # (Previously an out-of-range value left self._confidence unset,
        # causing an AttributeError on the next read.)
        if 0.0 <= confidence <= 1.0:
            self._confidence = confidence
        else:
            self._confidence = 0.0


class Token:
"""
This class represents one word in a tokenized sentence. Each token may have any number of tags. It may also point
Expand Down Expand Up @@ -150,13 +173,13 @@ def embedding(self):


class Sentence:
def __init__(self, text: str = None, use_tokenizer: bool = False, labels: List[str] = None):
def __init__(self, text: str = None, use_tokenizer: bool = False, labels: List[Label] = None):

super(Sentence, self).__init__()

self.tokens: List[Token] = []

self.labels: List[str] = labels
self.labels: List[Label] = labels

self._embeddings: Dict = {}

Expand All @@ -183,18 +206,21 @@ def __getitem__(self, idx: int) -> Token:
def __iter__(self):
return iter(self.tokens)

def add_label(self, label: str):
def add_label(self, label: Label):
if self.labels is None:
self.labels = [label]
else:
self.labels.append(label)

def add_labels(self, labels: List[str]):
def add_labels(self, labels: List[Label]):
if self.labels is None:
self.labels = labels
else:
self.labels.extend(labels)

def get_label_names(self) -> List[str]:
    """Return the plain string names of this sentence's labels.

    Returns an empty list when no labels are set — Sentence.__init__
    allows labels=None, and iterating None would raise TypeError.
    """
    if self.labels is None:
        return []
    return [label.name for label in self.labels]

def get_token(self, token_id: int) -> Token:
for token in self.tokens:
if token.idx == token_id:
Expand Down Expand Up @@ -357,7 +383,7 @@ def make_label_dictionary(self) -> Dictionary:
:return: dictionary of labels
"""

labels = set(self._get_all_labels())
labels = set(self._get_all_label_names())

label_dictionary: Dictionary = Dictionary(add_unk=False)
for label in labels:
Expand All @@ -384,7 +410,7 @@ def make_vocab_dictionary(self, max_tokens=-1, min_freq=1) -> Dictionary:

return vocab_dictionary

def _get_most_common_tokens(self, max_tokens, min_freq) -> List[Token]:
def _get_most_common_tokens(self, max_tokens, min_freq) -> List[str]:
tokens_and_frequencies = Counter(self._get_all_tokens())
tokens_and_frequencies = tokens_and_frequencies.most_common()

Expand All @@ -395,8 +421,8 @@ def _get_most_common_tokens(self, max_tokens, min_freq) -> List[Token]:
tokens.append(token)
return tokens

def _get_all_labels(self) -> List[str]:
return [label for sent in self.train for label in sent.labels]
def _get_all_label_names(self) -> List[str]:
return [label.name for sent in self.train for label in sent.labels]

def _get_all_tokens(self) -> List[str]:
tokens = list(map((lambda s: s.tokens), self.train))
Expand Down Expand Up @@ -452,7 +478,7 @@ def _get_classes_to_count(sentences):
classes_to_count = defaultdict(lambda: 0)
for sent in sentences:
for label in sent.labels:
classes_to_count[label] += 1
classes_to_count[label.name] += 1
return classes_to_count

def __str__(self) -> str:
Expand Down
4 changes: 2 additions & 2 deletions flair/data_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
from enum import Enum

from flair.data import Sentence, TaggedCorpus, Token
from flair.data import Sentence, TaggedCorpus, Token, Label


class NLPTask(Enum):
Expand Down Expand Up @@ -333,7 +333,7 @@ def read_text_classification_file(path_to_file):
if words[i].startswith(label_prefix):
l_len += len(words[i]) + 1
label = words[i].replace(label_prefix, "")
labels.append(label)
labels.append(Label(label))
else:
break

Expand Down
16 changes: 8 additions & 8 deletions flair/models/text_classification_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import torch.nn as nn

import flair.embeddings
from flair.data import Dictionary, Sentence
from flair.data import Dictionary, Sentence, Label
from flair.training_utils import convert_labels_to_one_hot, clear_embeddings


Expand Down Expand Up @@ -117,7 +117,7 @@ def predict(self, sentences: List[Sentence], mini_batch_size: int = 32, embeddin

return sentences

def get_labels_and_loss(self, sentences: List[Sentence]) -> (List[List[str]], float):
def get_labels_and_loss(self, sentences: List[Sentence]) -> (List[List[Label]], float):
"""
Predicts the labels of sentences and calculates the loss.
:param sentences: list of sentences
Expand All @@ -134,7 +134,7 @@ def get_labels_and_loss(self, sentences: List[Sentence]) -> (List[List[str]], fl

return pred_labels, loss

def _get_multi_label(self, label_scores) -> List[str]:
def _get_multi_label(self, label_scores) -> List[Label]:
labels = []

sigmoid = torch.nn.Sigmoid()
Expand All @@ -143,15 +143,15 @@ def _get_multi_label(self, label_scores) -> List[str]:
for idx, conf in enumerate(results):
if conf > 0.5:
label = self.label_dictionary.get_item_for_index(idx)
labels.append(label)
labels.append(Label(label, conf))

return labels

def _get_single_label(self, label_scores) -> List[str]:
def _get_single_label(self, label_scores) -> List[Label]:
conf, idx = torch.max(label_scores[0], 0)
label = self.label_dictionary.get_item_for_index(idx.item())

return [label]
return [Label(label, conf)]

def _calculate_multi_label_loss(self, label_scores, sentences: List[Sentence]) -> float:
loss_function = nn.BCELoss()
Expand All @@ -163,7 +163,7 @@ def _calculate_single_label_loss(self, label_scores, sentences: List[Sentence])
return loss_function(label_scores, self._labels_to_indices(sentences))

def _labels_to_one_hot(self, sentences: List[Sentence]):
label_list = [sentence.labels for sentence in sentences]
label_list = [sentence.get_label_names() for sentence in sentences]
one_hot = convert_labels_to_one_hot(label_list, self.label_dictionary)
one_hot = [torch.FloatTensor(l).unsqueeze(0) for l in one_hot]
one_hot = torch.cat(one_hot, 0)
Expand All @@ -173,7 +173,7 @@ def _labels_to_one_hot(self, sentences: List[Sentence]):

def _labels_to_indices(self, sentences: List[Sentence]):
indices = [
torch.LongTensor([self.label_dictionary.get_idx_for_item(label) for label in sentence.labels])
torch.LongTensor([self.label_dictionary.get_idx_for_item(label.name) for label in sentence.labels])
for sentence in sentences
]

Expand Down
4 changes: 2 additions & 2 deletions flair/trainers/text_classification_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ def evaluate(self, sentences: List[Sentence], eval_class_metrics: bool = False,

eval_loss += loss

y_true.extend([sentence.labels for sentence in batch])
y_pred.extend(labels)
y_true.extend([sentence.get_label_names() for sentence in batch])
y_pred.extend([[label.name for label in sent_labels] for sent_labels in labels])

if not embeddings_in_memory:
clear_embeddings(batch)
Expand Down
16 changes: 8 additions & 8 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from flair.data import Sentence, Token, Dictionary, TaggedCorpus
from flair.data import Sentence, Label, Token, Dictionary, TaggedCorpus


def test_get_head():
Expand Down Expand Up @@ -180,9 +180,9 @@ def test_tagged_corpus_make_vocab_dictionary():


def test_tagged_corpus_make_label_dictionary():
sentence_1 = Sentence('sentence 1', labels=['class_1'])
sentence_2 = Sentence('sentence 2', labels=['class_2'])
sentence_3 = Sentence('sentence 3', labels=['class_1'])
sentence_1 = Sentence('sentence 1', labels=[Label('class_1')])
sentence_2 = Sentence('sentence 2', labels=[Label('class_2')])
sentence_3 = Sentence('sentence 3', labels=[Label('class_1')])

corpus: TaggedCorpus = TaggedCorpus([sentence_1, sentence_2, sentence_3], [], [])

Expand All @@ -195,9 +195,9 @@ def test_tagged_corpus_make_label_dictionary():


def test_tagged_corpus_statistics():
train_sentence = Sentence('I love Berlin.', labels=['class_1'], use_tokenizer=True)
dev_sentence = Sentence('The sun is shining.', labels=['class_2'], use_tokenizer=True)
test_sentence = Sentence('Berlin is sunny.', labels=['class_1'], use_tokenizer=True)
train_sentence = Sentence('I love Berlin.', labels=[Label('class_1')], use_tokenizer=True)
dev_sentence = Sentence('The sun is shining.', labels=[Label('class_2')], use_tokenizer=True)
test_sentence = Sentence('Berlin is sunny.', labels=[Label('class_1')], use_tokenizer=True)

class_to_count_dict = TaggedCorpus._get_classes_to_count([train_sentence, dev_sentence, test_sentence])

Expand All @@ -215,7 +215,7 @@ def test_tagged_corpus_statistics():


def test_tagged_corpus_downsample():
sentence = Sentence('I love Berlin.', labels=['class_1'], use_tokenizer=True)
sentence = Sentence('I love Berlin.', labels=[Label('class_1')], use_tokenizer=True)

corpus: TaggedCorpus = TaggedCorpus(
[sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence, sentence], [], [])
Expand Down
4 changes: 2 additions & 2 deletions tests/test_text_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def test_labels_to_indices():
result = model._labels_to_indices(corpus.train)

for i in range(len(corpus.train)):
expected = label_dict.get_idx_for_item(corpus.train[i].labels[0])
expected = label_dict.get_idx_for_item(corpus.train[i].labels[0].name)
actual = result[i].item()

assert(expected == actual)
Expand All @@ -38,7 +38,7 @@ def test_labels_to_one_hot():
result = model._labels_to_one_hot(corpus.train)

for i in range(len(corpus.train)):
expected = label_dict.get_idx_for_item(corpus.train[i].labels[0])
expected = label_dict.get_idx_for_item(corpus.train[i].labels[0].name)
actual = result[i]

for idx in range(len(label_dict)):
Expand Down

0 comments on commit 3e3194e

Please sign in to comment.