Skip to content

Commit

Permalink
added GoEmotions text classification corpus
Browse files Browse the repository at this point in the history
  • Loading branch information
marcelmmm committed Oct 19, 2020
1 parent ecbf48f commit f469cfe
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 0 deletions.
1 change: 1 addition & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
from .document_classification import WASSA_FEAR
from .document_classification import WASSA_JOY
from .document_classification import WASSA_SADNESS
from .document_classification import GO_EMOTIONS

# Expose all treebanks
from .treebanks import UniversalDependenciesCorpus
Expand Down
108 changes: 108 additions & 0 deletions flair/datasets/document_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -1230,6 +1230,114 @@ def __init__(
memory_mode=memory_mode,
**corpusargs,
)


class GO_EMOTIONS(ClassificationCorpus):
    """
    GoEmotions dataset containing 58k Reddit comments labeled with 27 emotion categories (plus neutral), see
    https://github.com/google-research/google-research/tree/master/goemotions
    """

    def __init__(
            self,
            base_path: Union[str, Path] = None,
            tokenizer: Union[bool, Callable[[str], List[Token]], Tokenizer] = SegtokTokenizer(),
            memory_mode: str = 'partial',
            **corpusargs,
    ):
        """
        Downloads the GoEmotions TSV files (if necessary), converts them to FastText-style
        ``__label__<id> ... <text>`` lines and loads the result as a ClassificationCorpus.

        Parameters
        ----------
        base_path : Provide this only if you want to store the corpus in a specific folder, otherwise use default.
        tokenizer : Default is SegtokTokenizer().
        memory_mode : Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full'
            if full corpus and all embeddings fits into memory for speedups during training. Otherwise use 'partial'
            and if even this is too much for your memory, use 'disk'.
        **corpusargs : Other args for ClassificationCorpus.
        """

        # maps the numeric label ids used in the raw TSV files to readable emotion names
        label_name_map = {
            '0': 'ADMIRATION',
            '1': 'AMUSEMENT',
            '2': 'ANGER',
            '3': 'ANNOYANCE',
            '4': 'APPROVAL',
            '5': 'CARING',
            '6': 'CONFUSION',
            '7': 'CURIOSITY',
            '8': 'DESIRE',
            '9': 'DISAPPOINTMENT',
            '10': 'DISAPPROVAL',
            '11': 'DISGUST',
            '12': 'EMBARRASSMENT',
            '13': 'EXCITEMENT',
            '14': 'FEAR',
            '15': 'GRATITUDE',
            '16': 'GRIEF',
            '17': 'JOY',
            '18': 'LOVE',
            '19': 'NERVOUSNESS',
            '20': 'OPTIMISM',
            '21': 'PRIDE',
            '22': 'REALIZATION',
            '23': 'RELIEF',
            '24': 'REMORSE',
            '25': 'SADNESS',
            '26': 'SURPRISE',
            '27': 'NEUTRAL',
        }

        if isinstance(base_path, str):
            base_path = Path(base_path)

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # folder holding the converted train/test/dev txt files
        data_folder = base_path / dataset_name

        splits = ("train", "test", "dev")

        # Convert if ANY split is missing: checking only train.txt would never repair a
        # conversion that was interrupted after the first file had been written.
        if not all((data_folder / f"{split}.txt").is_file() for split in splits):

            # download datasets if necessary (cached_path stores them below the flair cache root)
            goemotions_url = "https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/"
            for split in splits:
                cached_path(f"{goemotions_url}{split}.tsv", Path("datasets") / dataset_name / 'raw')

            # create dataset directory if necessary
            os.makedirs(data_folder, exist_ok=True)

            data_path = Path(flair.cache_root) / "datasets" / dataset_name / 'raw'

            # create correctly formatted txt files
            for split in splits:
                target_file = data_folder / f"{split}.txt"
                # Write to a temporary file and rename afterwards, so that a crash mid-write
                # never leaves a truncated file that looks like a finished conversion.
                partial_file = data_folder / f"{split}.txt.tmp"

                with open(data_path / f"{split}.tsv", "r", encoding='utf-8') as tsv_file, \
                        open(partial_file, "w", encoding='utf-8') as txt_file:

                    # stream line by line instead of loading the whole file into memory
                    for line in tsv_file:
                        # drop the line terminator so it cannot leak into the last column
                        row = line.rstrip('\n').split('\t')

                        # blank lines / rows without a label column carry no training example
                        if len(row) < 2:
                            continue

                        text = row[0]
                        # multiple labels are possible
                        labels = row[1].split(',')
                        label_string = "".join(f"__label__{label} " for label in labels)
                        txt_file.write(f"{label_string}{text}\n")

                os.replace(partial_file, target_file)

        # NOTE(review): the labels are emotions, but the label type is kept as 'sentiment' so
        # that existing callers looking up this label type keep working - confirm before renaming.
        super().__init__(
            data_folder,
            label_type='sentiment',
            tokenizer=tokenizer,
            memory_mode=memory_mode,
            label_name_map=label_name_map,
            **corpusargs,
        )


class TREC_50(ClassificationCorpus):
Expand Down
1 change: 1 addition & 0 deletions resources/docs/TUTORIAL_6_CORPUS.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md).
| 'SENTEVAL_MPQA' | English | Opinion-polarity dataset of [SentEval](https://github.com/facebookresearch/SentEval) with opinion-polarity annotation |
| 'SENTEVAL_SST_BINARY' | English | Stanford sentiment treebank dataset of [SentEval](https://github.com/facebookresearch/SentEval) with sentiment annotation |
| 'SENTEVAL_SST_GRANULAR' | English | Stanford sentiment treebank dataset of [SentEval](https://github.com/facebookresearch/SentEval) with fine-grained sentiment annotation |
| 'GO_EMOTIONS' | English | [GoEmotions dataset](https://github.com/google-research/google-research/tree/master/goemotions) of Reddit comments labeled with 27 emotion categories (plus neutral) |
| 'TREC_6', 'TREC_50' | English | The [TREC](http://cogcomp.org/Data/QA/QC/) question classification dataset |


Expand Down

0 comments on commit f469cfe

Please sign in to comment.