flairNLP · alanakbik · Nov 3, 2020 · Nov 2, 2020 · Nov 3, 2020 · Nov 3, 2020
diff --git a/flair/data.py b/flair/data.py
@@ -624,7 +624,7 @@ def _add_spans_internal(self, spans: List[Span], label_type: str, min_score):
             tag_value = tag.value
 
             # non-set tags are OUT tags
-            if tag_value == "" or tag_value == "O":
+            if tag_value == "" or tag_value == "O" or tag_value == "_":
                 tag_value = "O-"
 
             # anything that is not a BIOES tag is a SINGLE tag

diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py
@@ -567,21 +567,37 @@ def evaluate(
 
         # make "classification report"
         target_names = []
+        labels_to_report = []
+        all_labels = []
+        all_indices = []
         for i in range(len(labels)):
-            target_names.append(labels.get_item_for_index(i))
+            label = labels.get_item_for_index(i)
+            all_labels.append(label)
+            all_indices.append(i)
+            if label == '_' or label == '': continue
+            target_names.append(label)
+            labels_to_report.append(i)
+
+        # fall back to reporting over all labels if none remain after skipping '_' and ''
+        if not labels_to_report:
+            target_names = all_labels
+            labels_to_report = all_indices
+
         classification_report = metrics.classification_report(y_true, y_pred, digits=4, target_names=target_names,
-                                                              zero_division=1)
+                                                              zero_division=1, labels=labels_to_report)
 
         # get scores
-        micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro'), 4)
-        macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro'), 4)
+        micro_f_score = round(
+            metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', labels=labels_to_report), 4)
+        macro_f_score = round(
+            metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', labels=labels_to_report), 4)
         accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4)
 
         detailed_result = (
                 "\nResults:"
-                f"\n- F-score (micro) {micro_f_score}"
-                f"\n- F-score (macro) {macro_f_score}"
-                f"\n- Accuracy {accuracy_score}"
+                f"\n- F-score (micro): {micro_f_score}"
+                f"\n- F-score (macro): {macro_f_score}"
+                f"\n- Accuracy (incl. no class): {accuracy_score}"
                 '\n\nBy class:\n' + classification_report
         )
 

diff --git a/tests/test_hyperparameter.py b/tests/test_hyperparameter.py
@@ -18,7 +18,7 @@
 
 def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
-        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
+        data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
 
     # define search space

diff --git a/tests/test_sequence_tagger.py b/tests/test_sequence_tagger.py
@@ -59,7 +59,7 @@ def test_load_use_tagger_keep_embedding():
 @pytest.mark.integration
 def test_train_load_use_tagger(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
-        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
+        data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
     tag_dictionary = corpus.make_tag_dictionary("ner")
 
@@ -186,7 +186,7 @@ def test_train_load_use_tagger_flair_embeddings(results_base_path, tasks_base_pa
 @pytest.mark.integration
 def test_train_load_use_tagger_adam(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
-        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
+        data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
     tag_dictionary = corpus.make_tag_dictionary("ner")
 
@@ -229,7 +229,7 @@ def test_train_load_use_tagger_adam(results_base_path, tasks_base_path):
 @pytest.mark.integration
 def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
     corpus_1 = flair.datasets.ColumnCorpus(
-        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
+        data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
     corpus_2 = flair.datasets.GERMEVAL_14(base_path=tasks_base_path)
 
@@ -275,7 +275,7 @@ def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
 @pytest.mark.integration
 def test_train_resume_tagger(results_base_path, tasks_base_path):
     corpus_1 = flair.datasets.ColumnCorpus(
-        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
+        data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
     corpus_2 = flair.datasets.GERMEVAL_14(base_path=tasks_base_path)
 
@@ -306,7 +306,7 @@ def test_train_resume_tagger(results_base_path, tasks_base_path):
 @pytest.mark.integration
 def test_find_learning_rate(results_base_path, tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
-        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
+        data_folder=tasks_base_path / "fashion", column_format={0: "text", 3: "ner"}
     )
     tag_dictionary = corpus.make_tag_dictionary("ner")