From 9c070a187fcc5a69dd702d392df551950a7b67ee Mon Sep 17 00:00:00 2001 From: symeneses Date: Sun, 18 Apr 2021 14:04:40 +0200 Subject: [PATCH 1/3] add entity mapping --- flair/datasets/sequence_labeling.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 774be540b9..8a8cf2ea67 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -863,6 +863,16 @@ def __init__( # column format columns = {0: "word", 1: "ner", 3: "markdown"} + # entity_mapping + entity_mapping = {"Library_Function": "Function", + "Function_Name": "Function", + "Class_Name": "Class", + "Library_Class": "Class", + "Organization": "Website", + "Library_Variable": "Variable", + "Variable_Name": "Variable" + } + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -884,10 +894,8 @@ def __init__( tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="train.txt", - test_file="test.txt", - dev_file="dev.txt", - **corpusargs, + label_name_map=entity_mapping, + **corpusargs ) From f5e4b56e5f810abe461068b06ea511930b5a4650 Mon Sep 17 00:00:00 2001 From: symeneses Date: Sun, 18 Apr 2021 14:05:29 +0200 Subject: [PATCH 2/3] add data cleaning SO ner --- flair/datasets/sequence_labeling.py | 52 ++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 8a8cf2ea67..87760e3beb 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -883,14 +883,58 @@ def __init__( # download data if necessary STACKOVERFLOW_NER_path = "https://raw.githubusercontent.com/jeniyat/StackOverflowNER/master/resources/annotated_ner_data/StackOverflow/" - cached_path(f"{STACKOVERFLOW_NER_path}train.txt", Path("datasets") / dataset_name) - cached_path(f"{STACKOVERFLOW_NER_path}test.txt", Path("datasets") / 
dataset_name) - cached_path(f"{STACKOVERFLOW_NER_path}dev.txt", Path("datasets") / dataset_name) - # cached_path(f"{STACKOVERFLOW_NER_path}train_merged_labels.txt", Path("datasets") / dataset_name) # TODO: what is this? + + # data validation + disallowed_list = ["code omitted for annotation", + "omitted for annotation", + "CODE_BLOCK :", + "OP_BLOCK :", + "Question_URL :", + "Question_ID :" + ] + + files = ["train", "test", "dev"] + + for file in files: + questions = 0 + answers = 0 + sentences = 0 + max_length = 0 + words = [] + lines_sentence = [] + + cached_path(f"{STACKOVERFLOW_NER_path}{file}.txt", Path("datasets") / dataset_name) + write_file = open(data_folder/ (file + "_clean.txt"), mode="w+") + for line in open(data_folder/ (file + ".txt"), mode="r", encoding="utf-8"): + if line.startswith("Question_ID"): + questions += 1 + + if line.startswith("Answer_to_Question_ID"): + answers += 1 + + line_values = line.strip().split() + if len(line_values) < 2: + text = " ".join(w for w in words) + allowed = all([d not in text for d in disallowed_list]) + if allowed and len(text) > 0: + sentences += 1 + max_length = max(len(words), max_length) + for l in lines_sentence: + write_file.write(l) + write_file.write("\n") + words = [] + lines_sentence = [] + continue + words.append(line_values[0]) + lines_sentence.append(line) + super(STACKOVERFLOW_NER, self).__init__( data_folder, columns, + train_file="train_clean.txt", + test_file="test_clean.txt", + dev_file="dev_clean.txt", tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, From edb1ccd7f3415ffad8e6293751915fdf35af0ea0 Mon Sep 17 00:00:00 2001 From: symeneses Date: Sun, 18 Apr 2021 14:09:40 +0200 Subject: [PATCH 3/3] add summary corpus to log in SO ner --- flair/datasets/sequence_labeling.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 87760e3beb..42555696ce 100644 --- a/flair/datasets/sequence_labeling.py +++ 
b/flair/datasets/sequence_labeling.py @@ -927,6 +927,10 @@ def __init__( continue words.append(line_values[0]) lines_sentence.append(line) + log.info(f"File {file} processed:") + log.info(f"The longest sentence has {max_length} words.") + log.info(f"Questions: {questions} and Answers: {answers}") + log.info(f"Processed sentences: {sentences}.") super(STACKOVERFLOW_NER, self).__init__(