From 9c070a187fcc5a69dd702d392df551950a7b67ee Mon Sep 17 00:00:00 2001 From: symeneses Date: Sun, 18 Apr 2021 14:04:40 +0200 Subject: [PATCH 1/3] add entity mapping --- flair/datasets/sequence_labeling.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 774be540b9..8a8cf2ea67 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -863,6 +863,16 @@ def __init__( # column format columns = {0: "word", 1: "ner", 3: "markdown"} + # entity_mapping + entity_mapping = {"Library_Function": "Function", + "Function_Name": "Function", + "Class_Name": "Class", + "Library_Class": "Class", + "Organization": "Website", + "Library_Variable": "Variable", + "Variable_Name": "Variable" + } + # this dataset name dataset_name = self.__class__.__name__.lower() @@ -884,10 +894,8 @@ def __init__( tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="train.txt", - test_file="test.txt", - dev_file="dev.txt", - **corpusargs, + label_name_map=entity_mapping, + **corpusargs ) From f5e4b56e5f810abe461068b06ea511930b5a4650 Mon Sep 17 00:00:00 2001 From: symeneses Date: Sun, 18 Apr 2021 14:05:29 +0200 Subject: [PATCH 2/3] add data cleaning SO ner --- flair/datasets/sequence_labeling.py | 52 ++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 8a8cf2ea67..87760e3beb 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -883,14 +883,58 @@ def __init__( # download data if necessary STACKOVERFLOW_NER_path = "https://raw.githubusercontent.com/jeniyat/StackOverflowNER/master/resources/annotated_ner_data/StackOverflow/" - cached_path(f"{STACKOVERFLOW_NER_path}train.txt", Path("datasets") / dataset_name) - cached_path(f"{STACKOVERFLOW_NER_path}test.txt", Path("datasets") / 
dataset_name) - cached_path(f"{STACKOVERFLOW_NER_path}dev.txt", Path("datasets") / dataset_name) - # cached_path(f"{STACKOVERFLOW_NER_path}train_merged_labels.txt", Path("datasets") / dataset_name) # TODO: what is this? + + # data validation + disallowed_list = ["code omitted for annotation", + "omitted for annotation", + "CODE_BLOCK :", + "OP_BLOCK :", + "Question_URL :", + "Question_ID :" + ] + + files = ["train", "test", "dev"] + + for file in files: + questions = 0 + answers = 0 + sentences = 0 + max_length = 0 + words = [] + lines_sentence = [] + + cached_path(f"{STACKOVERFLOW_NER_path}{file}.txt", Path("datasets") / dataset_name) + write_file = open(data_folder/ (file + "_clean.txt"), mode="w+") + for line in open(data_folder/ (file + ".txt"), mode="r", encoding="utf-8"): + if line.startswith("Question_ID"): + questions += 1 + + if line.startswith("Answer_to_Question_ID"): + answers += 1 + + line_values = line.strip().split() + if len(line_values) < 2: + text = " ".join(w for w in words) + allowed = all([d not in text for d in disallowed_list]) + if allowed and len(text) > 0: + sentences += 1 + max_length = max(len(words), max_length) + for l in lines_sentence: + write_file.write(l) + write_file.write("\n") + words = [] + lines_sentence = [] + continue + words.append(line_values[0]) + lines_sentence.append(line) + super(STACKOVERFLOW_NER, self).__init__( data_folder, columns, + train_file="train_clean.txt", + test_file="test_clean.txt", + dev_file="dev_clean.txt", tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, From edb1ccd7f3415ffad8e6293751915fdf35af0ea0 Mon Sep 17 00:00:00 2001 From: symeneses Date: Sun, 18 Apr 2021 14:09:40 +0200 Subject: [PATCH 3/3] add summary corpus to log in SO ner --- flair/datasets/sequence_labeling.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 87760e3beb..42555696ce 100644 --- a/flair/datasets/sequence_labeling.py +++ 
b/flair/datasets/sequence_labeling.py @@ -927,6 +927,10 @@ def __init__( continue words.append(line_values[0]) lines_sentence.append(line) + log.info(f"File {file} processed:") + log.info(f"The longest sentence has {max_length} words.") + log.info(f"Questions: {questions} and Answers: {answers}") + log.info(f"Processed sentences: {sentences}.") super(STACKOVERFLOW_NER, self).__init__(