From 0969a1465943c13d1470ae2eda4dbe38cffde8ea Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 08:28:56 -0400 Subject: [PATCH 01/12] Barebones implementation --- allennlp/common/testing/test_case.py | 14 +- allennlp/evaluation/__init__.py | 2 + allennlp/evaluation/evaluator.py | 201 ++++++++++++++++++ .../evaluation/postprocessors/__init__.py | 1 + .../postprocessors/postprocessor.py | 56 +++++ tests/commands/evaluate_test.py | 22 +- tests/evaluation/__init__.py | 0 tests/evaluation/evaluator_tests.py | 57 +++++ tests/evaluation/postprocessors/__init__.py | 0 .../postprocessors/postprocessor_test.py | 59 +++++ 10 files changed, 386 insertions(+), 26 deletions(-) create mode 100644 allennlp/evaluation/__init__.py create mode 100644 allennlp/evaluation/evaluator.py create mode 100644 allennlp/evaluation/postprocessors/__init__.py create mode 100644 allennlp/evaluation/postprocessors/postprocessor.py create mode 100644 tests/evaluation/__init__.py create mode 100644 tests/evaluation/evaluator_tests.py create mode 100644 tests/evaluation/postprocessors/__init__.py create mode 100644 tests/evaluation/postprocessors/postprocessor_test.py diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 9f466e8ee6b..05a34505079 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -2,11 +2,13 @@ import os import pathlib import shutil +import pytest import tempfile from allennlp.common.checks import log_pytorch_version_info -TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") + +# TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") class AllenNlpTestCase: @@ -21,6 +23,12 @@ class AllenNlpTestCase: TESTS_ROOT = PROJECT_ROOT / "tests" FIXTURES_ROOT = PROJECT_ROOT / "test_fixtures" + # Had issues with the old tempdir, so used this pytest fixture that is + # always ran and will always create a tmpdir. 
+ @pytest.fixture(autouse=True) + def test_dir(self, tmpdir): + self.TEST_DIR = pathlib.Path(tmpdir.mkdir('semparse')) + def setup_method(self): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.DEBUG @@ -33,9 +41,5 @@ def setup_method(self): logging.getLogger("urllib3.connectionpool").disabled = True log_pytorch_version_info() - self.TEST_DIR = pathlib.Path(TEST_DIR) - - os.makedirs(self.TEST_DIR, exist_ok=True) - def teardown_method(self): shutil.rmtree(self.TEST_DIR) diff --git a/allennlp/evaluation/__init__.py b/allennlp/evaluation/__init__.py new file mode 100644 index 00000000000..ea63688b772 --- /dev/null +++ b/allennlp/evaluation/__init__.py @@ -0,0 +1,2 @@ +from allennlp.evaluation.evaluator import Evaluator, SimpleEvaluator +from allennlp.evaluation.postprocessors.postprocessor import Postprocessor diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py new file mode 100644 index 00000000000..30e7313bc36 --- /dev/null +++ b/allennlp/evaluation/evaluator.py @@ -0,0 +1,201 @@ +""" +Evaluator class for evaluating a model with a given dataset +""" +from collections import defaultdict +from typing import Union, List, Dict, Any, Tuple, Optional +from os import PathLike +from pathlib import Path +import torch +from itertools import groupby +import logging +import json +import numpy as np + +from allennlp.common.checks import check_for_gpu, ConfigurationError +from allennlp.common.tqdm import Tqdm +from allennlp.common.util import dump_metrics, sanitize, int_to_device, END_SYMBOL, START_SYMBOL +from allennlp.data.fields import TensorField +from allennlp.nn import util as nn_util +from allennlp.common import Registrable, Params +from allennlp.models import Model +from allennlp.data import DataLoader, Vocabulary +from allennlp.evaluation.postprocessors.postprocessor import Postprocessor + +logger = logging.getLogger(__name__) + + +class Evaluator(Registrable): + """ + Evaluation class + + # Parameters + + cuda_device : `Union[int, torch.device]`, optional (default=`-1`) + The cuda device to use for this evaluation. The model is assumed to already be using this + device; this parameter is only used for moving the input data to the correct device. + """ + default_implementation = "simple" + + def __init__( + self, + batch_postprocessor: Postprocessor, + cuda_device: Union[int, torch.device] = -1 + ): + self.batch_human_serializer = batch_postprocessor + self.cuda_device = cuda_device + + def __call__( + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + output_file: Union[str, PathLike] = None, + predictions_file: Union[str, PathLike] = None + ): + """ + # Parameters + + model : `Model` + The model to evaluate + data_loader : `DataLoader` + The `DataLoader` that will iterate over the evaluation data (data loaders already contain + their data). + batch_weight_key : `str`, optional (default=`None`) + If given, this is a key in the output dictionary for each batch that specifies how to weight + the loss for that batch. If this is not given, we use a weight of 1 for every batch. + metrics_output_file : `Union[str, PathLike]`, optional (default=`None`) + Optional path to write the final metrics to. + predictions_output_file : `Union[str, PathLike]`, optional (default=`None`) + Optional path to write the predictions to. + + # Returns + + `Dict[str, Any]` + The final metrics. 
+ """ + raise NotImplementedError("__call__") + + +@Evaluator.register("simple") +class SimpleEvaluator(Evaluator): + def __init__( + self, + batch_postprocessor, + cuda_device: Union[int, torch.device] = -1 + ): + super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) + + def __call__( + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + output_file: Union[str, PathLike] = None, + predictions_file: Union[str, PathLike] = None + ): + """ + # Parameters + + model : `Model` + The model to evaluate + data_loader : `DataLoader` + The `DataLoader` that will iterate over the evaluation data (data loaders already contain + their data). + batch_weight_key : `str`, optional (default=`None`) + If given, this is a key in the output dictionary for each batch that specifies how to weight + the loss for that batch. If this is not given, we use a weight of 1 for every batch. + metrics_output_file : `Union[str, PathLike]`, optional (default=`None`) + Optional path to write the final metrics to. + predictions_output_file : `Union[str, PathLike]`, optional (default=`None`) + Optional path to write the predictions to. + + # Returns + + `Dict[str, Any]` + The final metrics. + """ + check_for_gpu(self.cuda_device) + data_loader.set_target_device(int_to_device(self.cuda_device)) + output_file = Path(output_file) if output_file is not None else None + if predictions_file is not None: + predictions_file = Path(predictions_file).open("w", encoding="utf-8") + + with torch.no_grad(): + model.eval() + + iterator = iter(data_loader) + logger.info("Iterating over dataset") + generator_tqdm = Tqdm.tqdm(iterator) + # Number of batches in instances. + batch_count = 0 + # Number of batches where the model produces a loss. + loss_count = 0 + # Cumulative weighted loss + total_loss = 0.0 + # Cumulative weight across all batches. + total_weight = 0.0 + + for batch in generator_tqdm: + batch_count += 1 + batch = nn_util.move_to_device(batch, self.cuda_device) + output_dict = model(**batch) + loss = output_dict.get("loss") + + metrics = model.get_metrics() + + if loss is not None: + loss_count += 1 + if batch_weight_key: + weight = output_dict[batch_weight_key].item() + else: + weight = 1.0 + + total_weight += weight + total_loss += loss.item() * weight + # Report the average loss so far. + metrics["loss"] = total_loss / total_weight + + description = ( + ", ".join( + [ + "%s: %.2f" % (name, value) + for name, value in metrics.items() + if not name.startswith("_") + ] + ) + + " ||" + ) + generator_tqdm.set_description(description, refresh=False) + + # TODO(gabeorlanski): Add in postprocessing the batch for token + # metrics + if predictions_file is not None: + predictions_file.write( + self.batch_human_serializer( + batch, output_dict, data_loader + ) + '\n' + ) + + if predictions_file is not None: + predictions_file.close() + + final_metrics = model.get_metrics(reset=True) + if loss_count > 0: + # Sanity check + if loss_count != batch_count: + raise RuntimeError( + "The model you are trying to evaluate only sometimes produced a loss!" 
+ ) + final_metrics["loss"] = total_loss / total_weight + + if output_file is not None: + dump_metrics(str(output_file), final_metrics, log=True) + + return final_metrics + + def _to_params(self) -> Dict[str, Any]: + return { + "type" : "simple", + "cuda_device" : self.cuda_device, + "batch_postprocessor": self.batch_human_serializer.to_params() + } diff --git a/allennlp/evaluation/postprocessors/__init__.py b/allennlp/evaluation/postprocessors/__init__.py new file mode 100644 index 00000000000..df1c98089c0 --- /dev/null +++ b/allennlp/evaluation/postprocessors/__init__.py @@ -0,0 +1 @@ +from allennlp.evaluation.postprocessors.postprocessor import SimplePostprocessor diff --git a/allennlp/evaluation/postprocessors/postprocessor.py b/allennlp/evaluation/postprocessors/postprocessor.py new file mode 100644 index 00000000000..00e0dd0b154 --- /dev/null +++ b/allennlp/evaluation/postprocessors/postprocessor.py @@ -0,0 +1,56 @@ +from collections import defaultdict +from typing import Union, List, Dict, Any, Tuple, Optional +from os import PathLike +from pathlib import Path +import torch +from itertools import groupby +import logging +import json +import numpy as np + +from allennlp.common.checks import check_for_gpu, ConfigurationError +from allennlp.common.tqdm import Tqdm +from allennlp.common.util import dump_metrics, sanitize, int_to_device, END_SYMBOL, START_SYMBOL +from allennlp.data.fields import TensorField +from allennlp.nn import util as nn_util +from allennlp.common import Registrable, Params +from allennlp.models import Model +from allennlp.data import DataLoader, Vocabulary + +logger = logging.getLogger(__name__) + + +class Postprocessor(Registrable): + """ + General Postprocessor class for turning batches into human readable data + """ + + def __call__( + self, + batch: Dict[str, TensorField], + output_dict: Dict, + data_loader: DataLoader + ) -> str: + raise NotImplementedError("__call__") + + default_implementation = "simple" + + +@Postprocessor.register("simple") +class SimplePostprocessor(Postprocessor): + def _to_params(self) -> Dict[str, Any]: + return { + "type": "simple" + } + + def __call__( + self, + batch: Dict[str, TensorField], + output_dict: Dict, + data_loader: DataLoader + ): + if batch is None: + raise ValueError("Postprocessor got a batch that is None") + if output_dict is None: + raise ValueError("Postprocessor got an output_dict that is None") + return json.dumps(sanitize({**batch, **output_dict})) diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 932831c7e81..d16521d58df 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -4,9 +4,8 @@ import torch from flaky import flaky -import pytest -from allennlp.commands.evaluate import evaluate_from_args, Evaluate, evaluate +from allennlp.commands.evaluate import evaluate_from_args, Evaluate from allennlp.common.testing import AllenNlpTestCase from allennlp.data.data_loaders import TensorDict from allennlp.models import Model @@ -43,25 +42,6 @@ def setup_method(self): subparsers = self.parser.add_subparsers(title="Commands", metavar="") Evaluate().add_subparser(subparsers) - def test_evaluate_calculates_average_loss(self): - losses = [7.0, 9.0, 8.0] - outputs = [{"loss": torch.Tensor([loss])} for loss in losses] - data_loader = DummyDataLoader(outputs) - metrics = evaluate(DummyModel(), data_loader, -1, "") - assert metrics["loss"] == pytest.approx(8.0) - - def test_evaluate_calculates_average_loss_with_weights(self): - losses = [7.0, 9.0, 8.0] - weights 
= [10, 2, 1.5] - inputs = zip(losses, weights) - outputs = [ - {"loss": torch.Tensor([loss]), "batch_weight": torch.Tensor([weight])} - for loss, weight in inputs - ] - data_loader = DummyDataLoader(outputs) - metrics = evaluate(DummyModel(), data_loader, -1, "batch_weight") - assert metrics["loss"] == pytest.approx((70 + 18 + 12) / 13.5) - @flaky def test_evaluate_from_args(self): kebab_args = [ diff --git a/tests/evaluation/__init__.py b/tests/evaluation/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/evaluation/evaluator_tests.py b/tests/evaluation/evaluator_tests.py new file mode 100644 index 00000000000..28a088a9b06 --- /dev/null +++ b/tests/evaluation/evaluator_tests.py @@ -0,0 +1,57 @@ +from typing import Iterator, List, Dict + +import torch +import pytest + +from allennlp.common.testing import AllenNlpTestCase +from allennlp.data.data_loaders import TensorDict +from allennlp.models import Model +from allennlp.evaluation import Evaluator +from allennlp.common import Params + + +class DummyDataLoader: + def __init__(self, outputs: List[TensorDict]) -> None: + super().__init__() + self._outputs = outputs + + def __iter__(self) -> Iterator[TensorDict]: + yield from self._outputs + + def __len__(self): + return len(self._outputs) + + def set_target_device(self, _): + pass + + +class DummyModel(Model): + def __init__(self) -> None: + super().__init__(None) # type: ignore + + def forward(self, **kwargs) -> Dict[str, torch.Tensor]: # type: ignore + return kwargs + + +class TestEvaluator(AllenNlpTestCase): + def setup_method(self): + self.evaluator = Evaluator.from_params(Params({"batch_postprocessor": "simple"})) + + def test_evaluate_calculates_average_loss(self): + losses = [7.0, 9.0, 8.0] + outputs = [{"loss": torch.Tensor([loss])} for loss in losses] + data_loader = DummyDataLoader(outputs) + metrics = self.evaluator(DummyModel(), data_loader, "") # type: ignore + assert metrics["loss"] == pytest.approx(8.0) + + def test_evaluate_calculates_average_loss_with_weights(self): + losses = [7.0, 9.0, 8.0] + weights = [10, 2, 1.5] + inputs = zip(losses, weights) + outputs = [ + {"loss": torch.Tensor([loss]), "batch_weight": torch.Tensor([weight])} + for loss, weight in inputs + ] + data_loader = DummyDataLoader(outputs) + metrics = self.evaluator(DummyModel(), data_loader, "batch_weight") # type: ignore + assert metrics["loss"] == pytest.approx((70 + 18 + 12) / 13.5) diff --git a/tests/evaluation/postprocessors/__init__.py b/tests/evaluation/postprocessors/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/postprocessors/postprocessor_test.py new file mode 100644 index 00000000000..5a552959544 --- /dev/null +++ b/tests/evaluation/postprocessors/postprocessor_test.py @@ -0,0 +1,59 @@ +from typing import Iterator, List, Dict +import torch +import pytest +import json + +from allennlp.common.testing import AllenNlpTestCase +from allennlp.common import Params +from allennlp.common.util import sanitize +from allennlp.data.data_loaders import TensorDict +from allennlp.evaluation import Postprocessor +from allennlp.evaluation.postprocessors import SimplePostprocessor + + +class DummyDataLoader: + def __init__(self, outputs: List[TensorDict]) -> None: + super().__init__() + self._outputs = outputs + + def __iter__(self) -> Iterator[TensorDict]: + yield from self._outputs + + def __len__(self): + return len(self._outputs) + + def set_target_device(self, _): + pass + + 
+class TestPostprocessor(AllenNlpTestCase): + def setup_method(self): + self.postprocessor = Postprocessor.from_params(Params({})) + + def test_postprocessor_default_implementation(self): + assert self.postprocessor.to_params().params == {"type": "simple"} + assert isinstance(self.postprocessor, SimplePostprocessor) + + @pytest.mark.parametrize("batch", [ + { + "Do you want ants?": "Because that's how you get ants.", + "testing" : torch.tensor([[1, 2, 3]]) + }, + {}, + None + ], ids=["TestBatch", "EmptyBatch", "None"]) + @pytest.mark.parametrize("output_dict", [ + {"You're": ["Not", [["My"]], "Supervisor"]}, + {}, + None + ], ids=["TestOutput", "EmptyOutput", "None"]) + def test_simple_postprocessor_call(self, batch, output_dict): + data_loader = DummyDataLoader([]) + if batch is None or output_dict is None: + with pytest.raises(ValueError): + self.postprocessor(batch, output_dict, data_loader) # type: ignore + return + + expected = json.dumps(sanitize({**batch, **output_dict})) + result = self.postprocessor(batch, output_dict, data_loader) # type: ignore + assert result == expected From fda9469d8c18494e2325c9d67920fd5697e777d4 Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 08:35:22 -0400 Subject: [PATCH 02/12] Better Docs + Minor Bugfix Added in fix for the SimpleEvaluator not initializing a post processor --- allennlp/evaluation/evaluator.py | 44 ++++++++++++++++------------- tests/evaluation/evaluator_tests.py | 7 +++++ 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index 30e7313bc36..d19a0c3524a 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -1,24 +1,19 @@ """ Evaluator class for evaluating a model with a given dataset """ -from collections import defaultdict -from typing import Union, List, Dict, Any, Tuple, Optional +from typing import Union, Dict, Any from os import PathLike from pathlib import Path import torch -from itertools import groupby import logging -import json -import numpy as np -from allennlp.common.checks import check_for_gpu, ConfigurationError +from allennlp.common.checks import check_for_gpu from allennlp.common.tqdm import Tqdm -from allennlp.common.util import dump_metrics, sanitize, int_to_device, END_SYMBOL, START_SYMBOL -from allennlp.data.fields import TensorField +from allennlp.common.util import dump_metrics, int_to_device from allennlp.nn import util as nn_util -from allennlp.common import Registrable, Params +from allennlp.common import Registrable from allennlp.models import Model -from allennlp.data import DataLoader, Vocabulary +from allennlp.data import DataLoader from allennlp.evaluation.postprocessors.postprocessor import Postprocessor logger = logging.getLogger(__name__) @@ -29,10 +24,15 @@ class Evaluator(Registrable): Evaluation class # Parameters - + + batch_postprocessor: `Postprocessor` + The postprocessor to use for turning both the batches and the outputs + of the model into human readable data. + cuda_device : `Union[int, torch.device]`, optional (default=`-1`) - The cuda device to use for this evaluation. The model is assumed to already be using this - device; this parameter is only used for moving the input data to the correct device. + The cuda device to use for this evaluation. The model is assumed to + already be using this device; this parameter is only used for moving + the input data to the correct device. 
""" default_implementation = "simple" @@ -41,7 +41,7 @@ def __init__( batch_postprocessor: Postprocessor, cuda_device: Union[int, torch.device] = -1 ): - self.batch_human_serializer = batch_postprocessor + self.batch_postprocessor = batch_postprocessor self.cuda_device = cuda_device def __call__( @@ -51,7 +51,7 @@ def __call__( batch_weight_key: str = None, output_file: Union[str, PathLike] = None, predictions_file: Union[str, PathLike] = None - ): + ) -> Dict[str, Any]: """ # Parameters @@ -65,8 +65,10 @@ def __call__( the loss for that batch. If this is not given, we use a weight of 1 for every batch. metrics_output_file : `Union[str, PathLike]`, optional (default=`None`) Optional path to write the final metrics to. + predictions_output_file : `Union[str, PathLike]`, optional (default=`None`) - Optional path to write the predictions to. + Optional path to write the predictions to. If passed the + postprocessor will be called and its output will be written as lines. # Returns @@ -78,9 +80,13 @@ def __call__( @Evaluator.register("simple") class SimpleEvaluator(Evaluator): + """ + Simple evaluator implementation. Uses the vanilla evaluation code. + """ + def __init__( self, - batch_postprocessor, + batch_postprocessor:Postprocessor, cuda_device: Union[int, torch.device] = -1 ): super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) @@ -171,7 +177,7 @@ def __call__( # metrics if predictions_file is not None: predictions_file.write( - self.batch_human_serializer( + self.batch_postprocessor( batch, output_dict, data_loader ) + '\n' ) @@ -197,5 +203,5 @@ def _to_params(self) -> Dict[str, Any]: return { "type" : "simple", "cuda_device" : self.cuda_device, - "batch_postprocessor": self.batch_human_serializer.to_params() + "batch_postprocessor": self.batch_postprocessor.to_params() } diff --git a/tests/evaluation/evaluator_tests.py b/tests/evaluation/evaluator_tests.py index 28a088a9b06..9c21ffce190 100644 --- a/tests/evaluation/evaluator_tests.py +++ b/tests/evaluation/evaluator_tests.py @@ -55,3 +55,10 @@ def test_evaluate_calculates_average_loss_with_weights(self): data_loader = DummyDataLoader(outputs) metrics = self.evaluator(DummyModel(), data_loader, "batch_weight") # type: ignore assert metrics["loss"] == pytest.approx((70 + 18 + 12) / 13.5) + + def test_to_params(self): + assert self.evaluator.to_params() == { + "type" : "simple", + "cuda_device" : -1, + "batch_postprocessor": {"type": "simple"} + } From f3ee8c3d7f143af227e0eccb06525d3b42409a56 Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 09:48:12 -0400 Subject: [PATCH 03/12] Basic Support added to the evaluate function for the evaluator --- allennlp/commands/evaluate.py | 154 ++++++++++++++---- allennlp/common/testing/test_case.py | 12 +- allennlp/evaluation/evaluator.py | 22 +-- tests/commands/evaluate_test.py | 126 ++++++++++---- .../postprocessors/postprocessor_test.py | 1 + 5 files changed, 225 insertions(+), 90 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index b517de979de..77924fce3ac 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -7,8 +7,9 @@ import argparse import json import logging -from typing import Any, Dict - +from pathlib import Path +from os import PathLike +from typing import Union, Dict, Any, Optional, List from copy import deepcopy from overrides import overrides @@ -18,7 +19,7 @@ from allennlp.common.util import prepare_environment from allennlp.data import DataLoader from allennlp.models.archival import 
load_archive -from allennlp.training.util import evaluate +from allennlp.evaluation import Evaluator logger = logging.getLogger(__name__) @@ -98,8 +99,8 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument action="store_true", default=False, help="if specified, we will use the instances in your new dataset to " - "extend your vocabulary. If pretrained-file was used to initialize " - "embedding layers, you may also need to pass --embedding-sources-mapping.", + "extend your vocabulary. If pretrained-file was used to initialize " + "embedding layers, you may also need to pass --embedding-sources-mapping.", ) subparser.add_argument( @@ -107,9 +108,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, default="", help="a JSON dict defining mapping from embedding module path to embedding " - "pretrained-file used during training. If not passed, and embedding needs to be " - "extended, we will try to use the original file paths used during training. If " - "they are not available we will use random vectors for embedding extension.", + "pretrained-file used during training. If not passed, and embedding needs to be " + "extended, we will try to use the original file paths used during training. If " + "they are not available we will use random vectors for embedding extension.", ) subparser.add_argument( "--file-friendly-logging", @@ -118,13 +119,52 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument help="outputs tqdm status on separate lines and slows tqdm refresh rate", ) + subparser.add_argument( + "--auto-names", + default="NONE", + help="Automatically create output names for each evaluation file.", + choices=["NONE", "METRICS", "PREDS", "ALL"] + ) + subparser.set_defaults(func=evaluate_from_args) return subparser def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: - common_logging.FILE_FRIENDLY_LOGGING = args.file_friendly_logging + return evaluate_from_archive( + archive_file=args.archive_file, + input_file=args.input_file, + output_file=args.output_file, + predictions_output_file=args.predictions_output_file, + batch_size=args.batch_size, + cmd_overrides=args.overrides, + cuda_device=args.cuda_device, + embedding_sources_mapping=args.embedding_sources_mapping, + extend_vocab=args.extend_vocab, + weights_file=args.weights_file, + file_friendly_logging=args.file_friendly_logging, + batch_weight_key=args.batch_weight_key, + auto_names=args.auto_names + ) + + +def evaluate_from_archive( + archive_file: Union[str, PathLike], + input_file: str, + output_file: Optional[str] = None, + predictions_output_file: Optional[str] = None, + batch_size: Optional[int] = None, + cmd_overrides: Union[str, Dict[str, Any]] = "", + cuda_device: int = -1, + embedding_sources_mapping: str = None, + extend_vocab: bool = False, + weights_file: str = None, + file_friendly_logging: bool = False, + batch_weight_key: str = None, + auto_names: str = "None", +): + common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging # Disable some of the more verbose logging statements logging.getLogger("allennlp.common.params").disabled = True @@ -133,77 +173,119 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: # Load from archive archive = load_archive( - args.archive_file, - weights_file=args.weights_file, - cuda_device=args.cuda_device, - overrides=args.overrides, + archive_file, + weights_file=weights_file, + cuda_device=cuda_device, + overrides=cmd_overrides, ) config = 
deepcopy(archive.config) prepare_environment(config) model = archive.model model.eval() + # Load the evaluator from the config key `Evaluator` + evaluator_params = config.pop("evaluation", {}) + evaluator_params['cuda_device'] = cuda_device + evaluator = Evaluator.from_params(evaluator_params) + # Load the evaluation data dataset_reader = archive.validation_dataset_reader # split files - evaluation_data_path_list = args.input_file.split(":") - if args.output_file is not None: - output_file_list = args.output_file.split(":") - assert len(output_file_list) == len( - evaluation_data_path_list - ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." - if args.predictions_output_file is not None: - predictions_output_file_list = args.predictions_output_file.split(":") - assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( - "The number of `predictions_output_file` paths must be equal" - + "to the number of datasets being evaluated. " - ) + evaluation_data_path_list = input_file.split(":") + + # TODO(gabeorlanski): Is it safe to always default to .outputs and .preds? + # TODO(gabeorlanski): Add in way to save to specific output directory + if output_file is not None: + if auto_names == "METRICS" or auto_names == "ALL": + logger.warning(f"Passed output_files will be ignored, auto_names is" + f" set to {auto_names}") + + # Keep the path of the parent otherwise it will write to the CWD + output_file_list = [ + p.parent.joinpath(f"{p.stem}.outputs") + for p in map(Path, evaluation_data_path_list) + ] + else: + output_file_list = output_file.split(":") + assert len(output_file_list) == len( + evaluation_data_path_list + ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." + if predictions_output_file is not None: + if auto_names == "PREDS" or auto_names == "ALL": + logger.warning(f"Passed predictions files will be ignored, auto_names is" + f" set to {auto_names}") + + # Keep the path of the parent otherwise it will write to the CWD + predictions_output_file_list = [ + p.parent.joinpath(f"{p.stem}.preds") + for p in map(Path, evaluation_data_path_list) + ] + else: + predictions_output_file_list = predictions_output_file.split(":") + assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( + "The number of `predictions_output_file` paths must be equal" + + "to the number of datasets being evaluated. " + ) # output file output_file_path = None predictions_output_file_path = None # embedding sources - if args.extend_vocab: + if extend_vocab: logger.info("Vocabulary is being extended with embedding sources.") embedding_sources = ( - json.loads(args.embedding_sources_mapping) if args.embedding_sources_mapping else {} + json.loads(embedding_sources_mapping) if embedding_sources_mapping else {} ) + all_metrics = {} for index in range(len(evaluation_data_path_list)): config = deepcopy(archive.config) evaluation_data_path = evaluation_data_path_list[index] - if args.output_file is not None: + + # Get the eval file name so we can save each metric by file name in the + # output dictionary. 
+ eval_file_name = Path(evaluation_data_path).stem + + if output_file is not None: + # noinspection PyUnboundLocalVariable output_file_path = output_file_list[index] - if args.predictions_output_file is not None: + + if predictions_output_file is not None: + # noinspection PyUnboundLocalVariable predictions_output_file_path = predictions_output_file_list[index] logger.info("Reading evaluation data from %s", evaluation_data_path) data_loader_params = config.get("validation_data_loader", None) if data_loader_params is None: data_loader_params = config.get("data_loader") - if args.batch_size: - data_loader_params["batch_size"] = args.batch_size + if batch_size: + data_loader_params["batch_size"] = batch_size data_loader = DataLoader.from_params( params=data_loader_params, reader=dataset_reader, data_path=evaluation_data_path ) - if args.extend_vocab: + if extend_vocab: logger.info("Vocabulary is being extended with test instances.") model.vocab.extend_from_instances(instances=data_loader.iter_instances()) + # noinspection PyUnboundLocalVariable model.extend_embedder_vocab(embedding_sources) data_loader.index_with(model.vocab) - metrics = evaluate( + metrics = evaluator( model, data_loader, - args.cuda_device, - args.batch_weight_key, + batch_weight_key, output_file=output_file_path, predictions_output_file=predictions_output_file_path, ) + + # Add the metric prefixed by the file it came from. + for name, value in metrics.items(): + all_metrics[f"{eval_file_name}_{name}"] = value + logger.info("Finished evaluating.") - return metrics + return all_metrics diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 05a34505079..dcfd5da217f 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -7,8 +7,7 @@ from allennlp.common.checks import log_pytorch_version_info - -# TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") +TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") class AllenNlpTestCase: @@ -23,12 +22,6 @@ class AllenNlpTestCase: TESTS_ROOT = PROJECT_ROOT / "tests" FIXTURES_ROOT = PROJECT_ROOT / "test_fixtures" - # Had issues with the old tempdir, so used this pytest fixture that is - # always ran and will always create a tmpdir. 
- @pytest.fixture(autouse=True) - def test_dir(self, tmpdir): - self.TEST_DIR = pathlib.Path(tmpdir.mkdir('semparse')) - def setup_method(self): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.DEBUG @@ -40,6 +33,9 @@ def setup_method(self): logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(logging.INFO) logging.getLogger("urllib3.connectionpool").disabled = True log_pytorch_version_info() + self.TEST_DIR = pathlib.Path(TEST_DIR) + + os.makedirs(self.TEST_DIR, exist_ok=True) def teardown_method(self): shutil.rmtree(self.TEST_DIR) diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index d19a0c3524a..fe9ee36d1f3 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -1,7 +1,7 @@ """ Evaluator class for evaluating a model with a given dataset """ -from typing import Union, Dict, Any +from typing import Union, Dict, Any, Optional from os import PathLike from pathlib import Path import torch @@ -14,7 +14,7 @@ from allennlp.common import Registrable from allennlp.models import Model from allennlp.data import DataLoader -from allennlp.evaluation.postprocessors.postprocessor import Postprocessor +from allennlp.evaluation.postprocessors.postprocessor import Postprocessor, SimplePostprocessor logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ class Evaluator(Registrable): # Parameters - batch_postprocessor: `Postprocessor` + batch_postprocessor: `Postprocessor`, optional (default=`SimplePostprocessor`) The postprocessor to use for turning both the batches and the outputs of the model into human readable data. @@ -38,10 +38,10 @@ class Evaluator(Registrable): def __init__( self, - batch_postprocessor: Postprocessor, + batch_postprocessor: Optional[Postprocessor] = None, cuda_device: Union[int, torch.device] = -1 ): - self.batch_postprocessor = batch_postprocessor + self.batch_postprocessor = batch_postprocessor or SimplePostprocessor() self.cuda_device = cuda_device def __call__( @@ -50,7 +50,7 @@ def __call__( data_loader: DataLoader, batch_weight_key: str = None, output_file: Union[str, PathLike] = None, - predictions_file: Union[str, PathLike] = None + predictions_output_file: Union[str, PathLike] = None ) -> Dict[str, Any]: """ # Parameters @@ -86,7 +86,7 @@ class SimpleEvaluator(Evaluator): def __init__( self, - batch_postprocessor:Postprocessor, + batch_postprocessor: Optional[Postprocessor] = None, cuda_device: Union[int, torch.device] = -1 ): super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) @@ -97,7 +97,7 @@ def __call__( data_loader: DataLoader, batch_weight_key: str = None, output_file: Union[str, PathLike] = None, - predictions_file: Union[str, PathLike] = None + predictions_output_file: Union[str, PathLike] = None ): """ # Parameters @@ -123,8 +123,10 @@ def __call__( check_for_gpu(self.cuda_device) data_loader.set_target_device(int_to_device(self.cuda_device)) output_file = Path(output_file) if output_file is not None else None - if predictions_file is not None: - predictions_file = Path(predictions_file).open("w", encoding="utf-8") + if predictions_output_file is not None: + predictions_file = Path(predictions_output_file).open("w", encoding="utf-8") + else: + predictions_file = None with torch.no_grad(): model.eval() diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index d16521d58df..950b56f07c9 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -1,7 +1,9 @@ 
import argparse import json +from pathlib import Path from typing import Iterator, List, Dict - +from shutil import copyfile +import pytest import torch from flaky import flaky @@ -57,12 +59,12 @@ def test_evaluate_from_args(self): args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == { - "accuracy", - "accuracy3", - "precision-overall", - "recall-overall", - "f1-measure-overall", - "loss", + "conll2003_accuracy", + "conll2003_accuracy3", + "conll2003_precision-overall", + "conll2003_recall-overall", + "conll2003_f1-measure-overall", + "conll2003_loss", } def test_output_file_evaluate_from_args(self): @@ -86,7 +88,7 @@ def test_output_file_evaluate_from_args(self): with open(output_file, "r") as file: saved_metrics = json.load(file) - assert computed_metrics == saved_metrics + assert computed_metrics == {f"conll2003_{k}": v for k, v in saved_metrics.items()} with open(predictions_output_file, "r") as file: for line in file: @@ -94,34 +96,35 @@ def test_output_file_evaluate_from_args(self): assert "tags" in prediction def test_multiple_output_files_evaluate_from_args(self): - output_file = str(self.TEST_DIR / "metrics.json") - predictions_output_file = str(self.TEST_DIR / "predictions.jsonl") - kebab_args = [ - "evaluate", - str( - self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" - ), - str(self.FIXTURES_ROOT / "data" / "conll2003.txt") - + ":" - + str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), - "--cuda-device", - "-1", - "--output-file", - output_file + ":" + output_file, - "--predictions-output-file", - predictions_output_file + ":" + predictions_output_file, - ] - args = self.parser.parse_args(kebab_args) - computed_metrics = evaluate_from_args(args) - - with open(output_file, "r") as file: - saved_metrics = json.load(file) - assert computed_metrics == saved_metrics - - with open(predictions_output_file, "r") as file: - for line in file: - prediction = json.loads(line.strip()) - assert "tags" in prediction + pytest.fail() + # output_file = str(self.TEST_DIR / "metrics.json") + # predictions_output_file = str(self.TEST_DIR / "predictions.jsonl") + # kebab_args = [ + # "evaluate", + # str( + # self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" + # ), + # str(self.FIXTURES_ROOT / "data" / "conll2003.txt") + # + ":" + # + str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), + # "--cuda-device", + # "-1", + # "--output-file", + # output_file + ":" + output_file, + # "--predictions-output-file", + # predictions_output_file + ":" + predictions_output_file, + # ] + # args = self.parser.parse_args(kebab_args) + # computed_metrics = evaluate_from_args(args) + # + # with open(output_file, "r") as file: + # saved_metrics = json.load(file) + # assert computed_metrics == {f"conll2003_{k}":v for k, v in saved_metrics.items()} + # + # with open(predictions_output_file, "r") as file: + # for line in file: + # prediction = json.loads(line.strip()) + # assert "tags" in prediction def test_evaluate_works_with_vocab_expansion(self): archive_path = str( @@ -155,3 +158,54 @@ def test_evaluate_works_with_vocab_expansion(self): ) assert metrics_1 != metrics_2 assert metrics_2 != metrics_3 + + @pytest.mark.parametrize("auto_names", ["NONE", "METRICS", "PREDS", "ALL"]) + def test_auto_names_creates_files(self, auto_names): + data_file = Path(self.FIXTURES_ROOT / "data" / "conll2003.txt") + paths = [] + out_paths = [] + pred_paths = [] + for i in range(5): + tmp_path = 
self.TEST_DIR.joinpath(f"TEST{i}.txt") + + # Need to create paths to check when they do not exist + out_paths.append(tmp_path.parent.joinpath(f"OUTPUTS{i}.json")) + pred_paths.append(tmp_path.parent.joinpath(f"PREDS{i}.txt")) + + copyfile(data_file, tmp_path) + paths.append(tmp_path) + + kebab_args = [ + "evaluate", + str( + self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" + ), + ":".join(map(str, paths)), + "--cuda-device", + "-1", + "--output-file", ":".join(map(str, out_paths)), + "--predictions-output-file", ":".join(map(str, pred_paths)), + "--auto-names", auto_names + ] + + args = self.parser.parse_args(kebab_args) + _ = evaluate_from_args(args) + + expected_input_data = data_file.read_text("utf-8") + + for i, p in enumerate(paths): + # Make sure it was not modified + assert p.read_text('utf-8') == expected_input_data + + if auto_names == "METRICS" or auto_names == "ALL": + assert not out_paths[i].exists() + assert p.parent.joinpath(f"{p.stem}.outputs").exists() + else: + assert out_paths[i].exists() + assert not p.parent.joinpath(f"{p.stem}.outputs").exists() + if auto_names == "PREDS" or auto_names == "ALL": + assert not pred_paths[i].exists() + assert p.parent.joinpath(f"{p.stem}.preds").exists() + else: + assert pred_paths[i].exists() + assert not p.parent.joinpath(f"{p.stem}.preds").exists() diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/postprocessors/postprocessor_test.py index 5a552959544..3969686af07 100644 --- a/tests/evaluation/postprocessors/postprocessor_test.py +++ b/tests/evaluation/postprocessors/postprocessor_test.py @@ -28,6 +28,7 @@ def set_target_device(self, _): class TestPostprocessor(AllenNlpTestCase): def setup_method(self): + super(TestPostprocessor, self).setup_method() self.postprocessor = Postprocessor.from_params(Params({})) def test_postprocessor_default_implementation(self): From 4fe6edd4f0e610da11ee7986f57eb8ec04fafa5b Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 09:57:00 -0400 Subject: [PATCH 04/12] Better tests for evaluate with multiple --- allennlp/commands/evaluate.py | 8 ++- allennlp/commands/train.py | 5 ++ tests/commands/evaluate_test.py | 90 ++++++++++++++++++++------------- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 77924fce3ac..356b31d0122 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -203,7 +203,7 @@ def evaluate_from_archive( # Keep the path of the parent otherwise it will write to the CWD output_file_list = [ - p.parent.joinpath(f"{p.stem}.outputs") + p.parent.joinpath(f"{p.stem}.outputs") for p in map(Path, evaluation_data_path_list) ] else: @@ -284,7 +284,11 @@ def evaluate_from_archive( # Add the metric prefixed by the file it came from. 
for name, value in metrics.items(): - all_metrics[f"{eval_file_name}_{name}"] = value + if len(evaluation_data_path_list) > 1: + key = f"{eval_file_name}_" + else: + key = "" + all_metrics[f"{key}{name}"] = value logger.info("Finished evaluating.") diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index ea55749f7bc..5da3aa3969b 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -239,6 +239,11 @@ def train_model( training_util.create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) + # Change Author: Gabe Orlanski + # Placeholder for the time being to make sure no errors are raised b/c of + # the evaluator. + params.pop("evaluation",None) + meta = Meta.new() meta.to_file(os.path.join(serialization_dir, META_NAME)) diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 950b56f07c9..df671fa6aa1 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -59,12 +59,12 @@ def test_evaluate_from_args(self): args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == { - "conll2003_accuracy", - "conll2003_accuracy3", - "conll2003_precision-overall", - "conll2003_recall-overall", - "conll2003_f1-measure-overall", - "conll2003_loss", + "accuracy", + "accuracy3", + "precision-overall", + "recall-overall", + "f1-measure-overall", + "loss", } def test_output_file_evaluate_from_args(self): @@ -88,7 +88,7 @@ def test_output_file_evaluate_from_args(self): with open(output_file, "r") as file: saved_metrics = json.load(file) - assert computed_metrics == {f"conll2003_{k}": v for k, v in saved_metrics.items()} + assert computed_metrics == saved_metrics with open(predictions_output_file, "r") as file: for line in file: @@ -96,35 +96,53 @@ def test_output_file_evaluate_from_args(self): assert "tags" in prediction def test_multiple_output_files_evaluate_from_args(self): - pytest.fail() - # output_file = str(self.TEST_DIR / "metrics.json") - # predictions_output_file = str(self.TEST_DIR / "predictions.jsonl") - # kebab_args = [ - # "evaluate", - # str( - # self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" - # ), - # str(self.FIXTURES_ROOT / "data" / "conll2003.txt") - # + ":" - # + str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), - # "--cuda-device", - # "-1", - # "--output-file", - # output_file + ":" + output_file, - # "--predictions-output-file", - # predictions_output_file + ":" + predictions_output_file, - # ] - # args = self.parser.parse_args(kebab_args) - # computed_metrics = evaluate_from_args(args) - # - # with open(output_file, "r") as file: - # saved_metrics = json.load(file) - # assert computed_metrics == {f"conll2003_{k}":v for k, v in saved_metrics.items()} - # - # with open(predictions_output_file, "r") as file: - # for line in file: - # prediction = json.loads(line.strip()) - # assert "tags" in prediction + data_file = Path(self.FIXTURES_ROOT / "data" / "conll2003.txt") + paths = [] + out_paths = [] + pred_paths = [] + for i in range(3): + tmp_path = self.TEST_DIR.joinpath(f"TEST{i}.txt") + + # Need to create paths to check when they do not exist + out_paths.append(tmp_path.parent.joinpath(f"OUTPUTS{i}.json")) + pred_paths.append(tmp_path.parent.joinpath(f"PREDS{i}.txt")) + + copyfile(data_file, tmp_path) + paths.append(tmp_path) + + kebab_args = [ + "evaluate", + str( + self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / 
"model.tar.gz" + ), + ":".join(map(str, paths)), + "--cuda-device", + "-1", + "--output-file", ":".join(map(str, out_paths)), + "--predictions-output-file", ":".join(map(str, pred_paths)) + ] + args = self.parser.parse_args(kebab_args) + computed_metrics = evaluate_from_args(args) + computed_by_file = {} + for k, v in computed_metrics.items(): + fn, *metric_name = k.split("_") + if fn not in computed_by_file: + computed_by_file[fn] = {} + computed_by_file[fn]["_".join(metric_name)] = v + + assert len(computed_by_file) == len(paths) + expected_input_data = data_file.read_text("utf-8") + + for i, p in enumerate(paths): + # Make sure it was not modified + assert p.read_text('utf-8') == expected_input_data + + assert p.stem in computed_by_file, f"paths[{i}]={p.stem}" + + assert out_paths[i].exists(), f"paths[{i}]={p.stem}" + saved_metrics = json.loads(out_paths[i].read_text('utf-8')) + assert saved_metrics == computed_by_file[p.stem], f"paths[{i}]={p.stem}" + assert pred_paths[i].exists(), f"paths[{i}]={p.stem}" def test_evaluate_works_with_vocab_expansion(self): archive_path = str( From f1a820f85e71d45c5689b19f157174b9e76cfcec Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 10:05:36 -0400 Subject: [PATCH 05/12] Added in way to pass postprocess function --- allennlp/evaluation/evaluator.py | 9 +++++- .../postprocessors/postprocessor.py | 31 ++++++++++++------- .../postprocessors/postprocessor_test.py | 21 +++++++++++-- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index fe9ee36d1f3..bbb46158e6f 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -128,6 +128,10 @@ def __call__( else: predictions_file = None + model_postprocess_function = getattr( + model, "make_output_human_readable", None + ) + with torch.no_grad(): model.eval() @@ -180,7 +184,10 @@ def __call__( if predictions_file is not None: predictions_file.write( self.batch_postprocessor( - batch, output_dict, data_loader + batch, + output_dict, + data_loader, + output_postprocess_function=model_postprocess_function ) + '\n' ) diff --git a/allennlp/evaluation/postprocessors/postprocessor.py b/allennlp/evaluation/postprocessors/postprocessor.py index 00e0dd0b154..3a9e0bfa35e 100644 --- a/allennlp/evaluation/postprocessors/postprocessor.py +++ b/allennlp/evaluation/postprocessors/postprocessor.py @@ -1,16 +1,9 @@ from collections import defaultdict -from typing import Union, List, Dict, Any, Tuple, Optional -from os import PathLike -from pathlib import Path -import torch -from itertools import groupby +from typing import Optional, Dict, Any, Callable import logging import json -import numpy as np -from allennlp.common.checks import check_for_gpu, ConfigurationError -from allennlp.common.tqdm import Tqdm -from allennlp.common.util import dump_metrics, sanitize, int_to_device, END_SYMBOL, START_SYMBOL +from allennlp.common.util import sanitize from allennlp.data.fields import TensorField from allennlp.nn import util as nn_util from allennlp.common import Registrable, Params @@ -29,7 +22,8 @@ def __call__( self, batch: Dict[str, TensorField], output_dict: Dict, - data_loader: DataLoader + data_loader: DataLoader, + output_postprocess_function: Optional[Callable] = None ) -> str: raise NotImplementedError("__call__") @@ -38,6 +32,11 @@ def __call__( @Postprocessor.register("simple") class SimplePostprocessor(Postprocessor): + """ + Very simple postprocesser. Only sanitizes the batches and outputs. 
Will use + a passed postprocess function for the outputs if it exists. + """ + def _to_params(self) -> Dict[str, Any]: return { "type": "simple" @@ -47,10 +46,18 @@ def __call__( self, batch: Dict[str, TensorField], output_dict: Dict, - data_loader: DataLoader + data_loader: DataLoader, + output_postprocess_function: Optional[Callable] = None ): if batch is None: raise ValueError("Postprocessor got a batch that is None") if output_dict is None: raise ValueError("Postprocessor got an output_dict that is None") - return json.dumps(sanitize({**batch, **output_dict})) + + postprocessed = sanitize(batch) + if output_postprocess_function is not None: + postprocessed.update(sanitize(output_postprocess_function(output_dict))) + else: + postprocessed.update(sanitize(output_dict)) + + return json.dumps(postprocessed) diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/postprocessors/postprocessor_test.py index 3969686af07..06e449e6485 100644 --- a/tests/evaluation/postprocessors/postprocessor_test.py +++ b/tests/evaluation/postprocessors/postprocessor_test.py @@ -48,13 +48,28 @@ def test_postprocessor_default_implementation(self): {}, None ], ids=["TestOutput", "EmptyOutput", "None"]) - def test_simple_postprocessor_call(self, batch, output_dict): + @pytest.mark.parametrize("postprocess_func", [ + lambda x: {k.upper(): v for k, v in x.items()}, + None + ], ids=["PassedFunction", "NoPassedFunction"]) + def test_simple_postprocessor_call(self, batch, output_dict, postprocess_func): data_loader = DummyDataLoader([]) if batch is None or output_dict is None: with pytest.raises(ValueError): self.postprocessor(batch, output_dict, data_loader) # type: ignore return - expected = json.dumps(sanitize({**batch, **output_dict})) - result = self.postprocessor(batch, output_dict, data_loader) # type: ignore + expected = json.dumps(sanitize( + { + **batch, + **(postprocess_func(output_dict) if postprocess_func else output_dict) + } + )) + + result = self.postprocessor( + batch, + output_dict, + data_loader, # type: ignore + postprocess_func + ) assert result == expected From 14fad34954eb6297ab4e5c4acb5c1c157357dfda Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 10:50:24 -0400 Subject: [PATCH 06/12] Base Evaluator done --- allennlp/commands/evaluate.py | 137 +++++++++++++----- allennlp/commands/train.py | 2 +- allennlp/common/testing/test_case.py | 1 - allennlp/evaluation/evaluator.py | 97 +++++++------ .../postprocessors/postprocessor.py | 79 +++++++--- tests/commands/evaluate_test.py | 21 ++- tests/evaluation/evaluator_tests.py | 6 +- .../postprocessors/postprocessor_test.py | 57 ++++---- 8 files changed, 262 insertions(+), 138 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 356b31d0122..3c941cedd17 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -9,7 +9,7 @@ import logging from pathlib import Path from os import PathLike -from typing import Union, Dict, Any, Optional, List +from typing import Union, Dict, Any, Optional from copy import deepcopy from overrides import overrides @@ -40,7 +40,7 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, help=( "path to the file containing the evaluation data" - ' (for mutiple files, put ":" between filenames e.g., input1.txt:input2.txt)' + ' (for multiple files, put ":" between filenames e.g., input1.txt:input2.txt)' ), ) @@ -49,7 +49,7 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> 
argparse.Argument type=str, help=( "optional path to write the metrics to as JSON" - ' (for mutiple files, put ":" between filenames e.g., output1.txt:output2.txt)' + ' (for multiple files, put ":" between filenames e.g., output1.txt:output2.txt)' ), ) @@ -58,7 +58,7 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, help=( "optional path to write the predictions to as JSON lines" - ' (for mutiple files, put ":" between filenames e.g., output1.jsonl:output2.jsonl)' + ' (for multiple files, put ":" between filenames e.g., output1.jsonl:output2.jsonl)' ), ) @@ -99,8 +99,8 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument action="store_true", default=False, help="if specified, we will use the instances in your new dataset to " - "extend your vocabulary. If pretrained-file was used to initialize " - "embedding layers, you may also need to pass --embedding-sources-mapping.", + "extend your vocabulary. If pretrained-file was used to initialize " + "embedding layers, you may also need to pass --embedding-sources-mapping.", ) subparser.add_argument( @@ -108,9 +108,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, default="", help="a JSON dict defining mapping from embedding module path to embedding " - "pretrained-file used during training. If not passed, and embedding needs to be " - "extended, we will try to use the original file paths used during training. If " - "they are not available we will use random vectors for embedding extension.", + "pretrained-file used during training. If not passed, and embedding needs to be " + "extended, we will try to use the original file paths used during training. If " + "they are not available we will use random vectors for embedding extension.", ) subparser.add_argument( "--file-friendly-logging", @@ -123,7 +123,7 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument "--auto-names", default="NONE", help="Automatically create output names for each evaluation file.", - choices=["NONE", "METRICS", "PREDS", "ALL"] + choices=["NONE", "METRICS", "PREDS", "ALL"], ) subparser.set_defaults(func=evaluate_from_args) @@ -145,25 +145,86 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: weights_file=args.weights_file, file_friendly_logging=args.file_friendly_logging, batch_weight_key=args.batch_weight_key, - auto_names=args.auto_names + auto_names=args.auto_names, ) def evaluate_from_archive( - archive_file: Union[str, PathLike], - input_file: str, - output_file: Optional[str] = None, - predictions_output_file: Optional[str] = None, - batch_size: Optional[int] = None, - cmd_overrides: Union[str, Dict[str, Any]] = "", - cuda_device: int = -1, - embedding_sources_mapping: str = None, - extend_vocab: bool = False, - weights_file: str = None, - file_friendly_logging: bool = False, - batch_weight_key: str = None, - auto_names: str = "None", -): + archive_file: Union[str, PathLike], + input_file: str, + output_file: Optional[str] = None, + predictions_output_file: Optional[str] = None, + batch_size: Optional[int] = None, + cmd_overrides: Union[str, Dict[str, Any]] = "", + cuda_device: int = -1, + embedding_sources_mapping: str = None, + extend_vocab: bool = False, + weights_file: str = None, + file_friendly_logging: bool = False, + batch_weight_key: str = None, + auto_names: str = "NONE", +) -> Dict[str, Any]: + """ + + # Parameters + + archive_file: `Union[str, PathLike]` + Path to an archived trained 
model. + + input_file: `str` + path to the file containing the evaluation data (for multiple files, + put ":" between filenames e.g., input1.txt:input2.txt) + + output_file: `str`, optional (default=`None`) + optional path to write the metrics to as JSON (for multiple files, put + ":" between filenames e.g., output1.txt:output2.txt) + + predictions_output_file: `str`, optional (default=`None`) + "optional path to write the predictions to (for multiple files, put ":" + between filenames e.g., output1.jsonl:output2.jsonl) + + batch_size: `int`, optional (default=`None`) + If non-empty, the batch size to use during evaluation. + + cmd_overrides: `str`, optional (default=`""`) + a json(net) structure used to override the experiment configuration, + e.g., '{\"iterator.batch_size\": 16}'. Nested parameters can be + specified either with nested dictionaries or with dot syntax. + + cuda_device: `int`, optional (default=`-1`) + id of GPU to use (if any) + + embedding_sources_mapping: `str`, optional (default=`None`) + a JSON dict defining mapping from embedding module path to embedding + pretrained-file used during training. If not passed, and embedding + needs to be extended, we will try to use the original file paths used + during training. If they are not available we will use random vectors + for embedding extension. + + extend_vocab: `bool`, optional (default=`False`) + if specified, we will use the instances in your new dataset to extend + your vocabulary. If pretrained-file was used to initialize embedding + layers, you may also need to pass --embedding-sources-mapping. + + weights_file:`str`, optional (default=`None`) + A path that overrides which weights file to use + + file_friendly_logging : `bool`, optional (default=`False`) + If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow + down tqdm's output to only once every 10 seconds. + + batch_weight_key: `str`, optional (default=`None`) + If non-empty, name of metric used to weight the loss on a per-batch basis. + + auto_names:`str`, optional (default=`"NONE"`) + Automatically create output names for each evaluation file. + + # Returns + + all_metrics: `Dict[str, Any]` + The metrics from every evaluation file passed. 
+ + """ common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging # Disable some of the more verbose logging statements @@ -185,7 +246,7 @@ def evaluate_from_archive( # Load the evaluator from the config key `Evaluator` evaluator_params = config.pop("evaluation", {}) - evaluator_params['cuda_device'] = cuda_device + evaluator_params["cuda_device"] = cuda_device evaluator = Evaluator.from_params(evaluator_params) # Load the evaluation data @@ -198,34 +259,34 @@ def evaluate_from_archive( # TODO(gabeorlanski): Add in way to save to specific output directory if output_file is not None: if auto_names == "METRICS" or auto_names == "ALL": - logger.warning(f"Passed output_files will be ignored, auto_names is" - f" set to {auto_names}") + logger.warning( + f"Passed output_files will be ignored, auto_names is" f" set to {auto_names}" + ) # Keep the path of the parent otherwise it will write to the CWD output_file_list = [ - p.parent.joinpath(f"{p.stem}.outputs") - for p in map(Path, evaluation_data_path_list) + p.parent.joinpath(f"{p.stem}.outputs") for p in map(Path, evaluation_data_path_list) ] else: - output_file_list = output_file.split(":") + output_file_list = output_file.split(":") # type: ignore assert len(output_file_list) == len( evaluation_data_path_list ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." if predictions_output_file is not None: if auto_names == "PREDS" or auto_names == "ALL": - logger.warning(f"Passed predictions files will be ignored, auto_names is" - f" set to {auto_names}") + logger.warning( + f"Passed predictions files will be ignored, auto_names is" f" set to {auto_names}" + ) # Keep the path of the parent otherwise it will write to the CWD predictions_output_file_list = [ - p.parent.joinpath(f"{p.stem}.preds") - for p in map(Path, evaluation_data_path_list) + p.parent.joinpath(f"{p.stem}.preds") for p in map(Path, evaluation_data_path_list) ] else: - predictions_output_file_list = predictions_output_file.split(":") + predictions_output_file_list = predictions_output_file.split(":") # type: ignore assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( - "The number of `predictions_output_file` paths must be equal" - + "to the number of datasets being evaluated. " + "The number of `predictions_output_file` paths must be equal" + + "to the number of datasets being evaluated. " ) # output file diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 5da3aa3969b..7acb54581bf 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -242,7 +242,7 @@ def train_model( # Change Author: Gabe Orlanski # Placeholder for the time being to make sure no errors are raised b/c of # the evaluator. 
- params.pop("evaluation",None) + params.pop("evaluation", None) meta = Meta.new() meta.to_file(os.path.join(serialization_dir, META_NAME)) diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index dcfd5da217f..b3c4cbd00ab 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -2,7 +2,6 @@ import os import pathlib import shutil -import pytest import tempfile from allennlp.common.checks import log_pytorch_version_info diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index bbb46158e6f..648cc5ba1bc 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -21,8 +21,8 @@ class Evaluator(Registrable): """ - Evaluation class - + Evaluation Base class + # Parameters batch_postprocessor: `Postprocessor`, optional (default=`SimplePostprocessor`) @@ -34,25 +34,28 @@ class Evaluator(Registrable): already be using this device; this parameter is only used for moving the input data to the correct device. """ + default_implementation = "simple" def __init__( - self, - batch_postprocessor: Optional[Postprocessor] = None, - cuda_device: Union[int, torch.device] = -1 + self, + batch_postprocessor: Optional[Postprocessor] = None, + cuda_device: Union[int, torch.device] = -1, ): self.batch_postprocessor = batch_postprocessor or SimplePostprocessor() self.cuda_device = cuda_device def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ) -> Dict[str, Any]: """ + Evaluate a single data source. + # Parameters model : `Model` @@ -72,8 +75,8 @@ def __call__( # Returns - `Dict[str, Any]` - The final metrics. + metrics: `Dict[str, Any]` + The metrics from evaluating the file. """ raise NotImplementedError("__call__") @@ -82,24 +85,37 @@ def __call__( class SimpleEvaluator(Evaluator): """ Simple evaluator implementation. Uses the vanilla evaluation code. + + # Parameters + + batch_postprocessor: `Postprocessor`, optional (default=`SimplePostprocessor`) + The postprocessor to use for turning both the batches and the outputs + of the model into human readable data. + + cuda_device : `Union[int, torch.device]`, optional (default=`-1`) + The cuda device to use for this evaluation. The model is assumed to + already be using this device; this parameter is only used for moving + the input data to the correct device. """ def __init__( - self, - batch_postprocessor: Optional[Postprocessor] = None, - cuda_device: Union[int, torch.device] = -1 + self, + batch_postprocessor: Optional[Postprocessor] = None, + cuda_device: Union[int, torch.device] = -1, ): super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ): """ + Evaluate a single data source. + # Parameters model : `Model` @@ -117,8 +133,8 @@ def __call__( # Returns - `Dict[str, Any]` - The final metrics. 
+ metrics: `Dict[str, Any]` + The metrics from evaluating the file. """ check_for_gpu(self.cuda_device) data_loader.set_target_device(int_to_device(self.cuda_device)) @@ -126,11 +142,9 @@ def __call__( if predictions_output_file is not None: predictions_file = Path(predictions_output_file).open("w", encoding="utf-8") else: - predictions_file = None + predictions_file = None # type: ignore - model_postprocess_function = getattr( - model, "make_output_human_readable", None - ) + model_postprocess_function = getattr(model, "make_output_human_readable", None) with torch.no_grad(): model.eval() @@ -168,14 +182,14 @@ def __call__( metrics["loss"] = total_loss / total_weight description = ( - ", ".join( - [ - "%s: %.2f" % (name, value) - for name, value in metrics.items() - if not name.startswith("_") - ] - ) - + " ||" + ", ".join( + [ + "%s: %.2f" % (name, value) + for name, value in metrics.items() + if not name.startswith("_") + ] + ) + + " ||" ) generator_tqdm.set_description(description, refresh=False) @@ -187,8 +201,9 @@ def __call__( batch, output_dict, data_loader, - output_postprocess_function=model_postprocess_function - ) + '\n' + output_postprocess_function=model_postprocess_function, + ) + + "\n" ) if predictions_file is not None: @@ -210,7 +225,7 @@ def __call__( def _to_params(self) -> Dict[str, Any]: return { - "type" : "simple", - "cuda_device" : self.cuda_device, - "batch_postprocessor": self.batch_postprocessor.to_params() + "type": "simple", + "cuda_device": self.cuda_device, + "batch_postprocessor": self.batch_postprocessor.to_params(), } diff --git a/allennlp/evaluation/postprocessors/postprocessor.py b/allennlp/evaluation/postprocessors/postprocessor.py index 3a9e0bfa35e..118cd90ee4a 100644 --- a/allennlp/evaluation/postprocessors/postprocessor.py +++ b/allennlp/evaluation/postprocessors/postprocessor.py @@ -1,14 +1,11 @@ -from collections import defaultdict from typing import Optional, Dict, Any, Callable import logging import json from allennlp.common.util import sanitize from allennlp.data.fields import TensorField -from allennlp.nn import util as nn_util -from allennlp.common import Registrable, Params -from allennlp.models import Model -from allennlp.data import DataLoader, Vocabulary +from allennlp.common import Registrable +from allennlp.data import DataLoader logger = logging.getLogger(__name__) @@ -19,12 +16,36 @@ class Postprocessor(Registrable): """ def __call__( - self, - batch: Dict[str, TensorField], - output_dict: Dict, - data_loader: DataLoader, - output_postprocess_function: Optional[Callable] = None + self, + batch: Dict[str, TensorField], + output_dict: Dict, + data_loader: DataLoader, + output_postprocess_function: Optional[Callable] = None, ) -> str: + """ + Postprocess a batch. + + # Parameters + + batch: `Dict[str, TensorField]` + The batch that was passed to the model's forward function. + + output_dict: `Dict` + The output of the model's forward function on the batch + + data_loader: `DataLoader` + The dataloader to be used. + + output_postprocess_function: `Callable`, optional (default=`None`) + If you have a function to preprocess only the outputs ( + i.e. `model.make_human_readable`), use this parameter to have it + called on the output dict. 
+ + # Returns + + postprocessed: `str` + The postprocessed batches as strings + """ raise NotImplementedError("__call__") default_implementation = "simple" @@ -38,17 +59,39 @@ class SimplePostprocessor(Postprocessor): """ def _to_params(self) -> Dict[str, Any]: - return { - "type": "simple" - } + return {"type": "simple"} def __call__( - self, - batch: Dict[str, TensorField], - output_dict: Dict, - data_loader: DataLoader, - output_postprocess_function: Optional[Callable] = None + self, + batch: Dict[str, TensorField], + output_dict: Dict, + data_loader: DataLoader, + output_postprocess_function: Optional[Callable] = None, ): + """ + Postprocess a batch. + + # Parameters + + batch: `Dict[str, TensorField]` + The batch that was passed to the model's forward function. + + output_dict: `Dict` + The output of the model's forward function on the batch + + data_loader: `DataLoader` + The dataloader to be used. + + output_postprocess_function: `Callable`, optional (default=`None`) + If you have a function to preprocess only the outputs ( + i.e. `model.make_human_readable`), use this parameter to have it + called on the output dict. + + # Returns + + postprocessed: `str` + The postprocessed batches as strings + """ if batch is None: raise ValueError("Postprocessor got a batch that is None") if output_dict is None: diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index df671fa6aa1..5ada1368b2b 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -118,8 +118,10 @@ def test_multiple_output_files_evaluate_from_args(self): ":".join(map(str, paths)), "--cuda-device", "-1", - "--output-file", ":".join(map(str, out_paths)), - "--predictions-output-file", ":".join(map(str, pred_paths)) + "--output-file", + ":".join(map(str, out_paths)), + "--predictions-output-file", + ":".join(map(str, pred_paths)), ] args = self.parser.parse_args(kebab_args) computed_metrics = evaluate_from_args(args) @@ -135,12 +137,12 @@ def test_multiple_output_files_evaluate_from_args(self): for i, p in enumerate(paths): # Make sure it was not modified - assert p.read_text('utf-8') == expected_input_data + assert p.read_text("utf-8") == expected_input_data assert p.stem in computed_by_file, f"paths[{i}]={p.stem}" assert out_paths[i].exists(), f"paths[{i}]={p.stem}" - saved_metrics = json.loads(out_paths[i].read_text('utf-8')) + saved_metrics = json.loads(out_paths[i].read_text("utf-8")) assert saved_metrics == computed_by_file[p.stem], f"paths[{i}]={p.stem}" assert pred_paths[i].exists(), f"paths[{i}]={p.stem}" @@ -201,9 +203,12 @@ def test_auto_names_creates_files(self, auto_names): ":".join(map(str, paths)), "--cuda-device", "-1", - "--output-file", ":".join(map(str, out_paths)), - "--predictions-output-file", ":".join(map(str, pred_paths)), - "--auto-names", auto_names + "--output-file", + ":".join(map(str, out_paths)), + "--predictions-output-file", + ":".join(map(str, pred_paths)), + "--auto-names", + auto_names, ] args = self.parser.parse_args(kebab_args) @@ -213,7 +218,7 @@ def test_auto_names_creates_files(self, auto_names): for i, p in enumerate(paths): # Make sure it was not modified - assert p.read_text('utf-8') == expected_input_data + assert p.read_text("utf-8") == expected_input_data if auto_names == "METRICS" or auto_names == "ALL": assert not out_paths[i].exists() diff --git a/tests/evaluation/evaluator_tests.py b/tests/evaluation/evaluator_tests.py index 9c21ffce190..57ec28e6bfe 100644 --- a/tests/evaluation/evaluator_tests.py +++ 
b/tests/evaluation/evaluator_tests.py @@ -58,7 +58,7 @@ def test_evaluate_calculates_average_loss_with_weights(self): def test_to_params(self): assert self.evaluator.to_params() == { - "type" : "simple", - "cuda_device" : -1, - "batch_postprocessor": {"type": "simple"} + "type": "simple", + "cuda_device": -1, + "batch_postprocessor": {"type": "simple"}, } diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/postprocessors/postprocessor_test.py index 06e449e6485..fc0996c03fa 100644 --- a/tests/evaluation/postprocessors/postprocessor_test.py +++ b/tests/evaluation/postprocessors/postprocessor_test.py @@ -1,4 +1,4 @@ -from typing import Iterator, List, Dict +from typing import Iterator, List import torch import pytest import json @@ -35,23 +35,28 @@ def test_postprocessor_default_implementation(self): assert self.postprocessor.to_params().params == {"type": "simple"} assert isinstance(self.postprocessor, SimplePostprocessor) - @pytest.mark.parametrize("batch", [ - { - "Do you want ants?": "Because that's how you get ants.", - "testing" : torch.tensor([[1, 2, 3]]) - }, - {}, - None - ], ids=["TestBatch", "EmptyBatch", "None"]) - @pytest.mark.parametrize("output_dict", [ - {"You're": ["Not", [["My"]], "Supervisor"]}, - {}, - None - ], ids=["TestOutput", "EmptyOutput", "None"]) - @pytest.mark.parametrize("postprocess_func", [ - lambda x: {k.upper(): v for k, v in x.items()}, - None - ], ids=["PassedFunction", "NoPassedFunction"]) + @pytest.mark.parametrize( + "batch", + [ + { + "Do you want ants?": "Because that's how you get ants.", + "testing": torch.tensor([[1, 2, 3]]), + }, + {}, + None, + ], + ids=["TestBatch", "EmptyBatch", "None"], + ) + @pytest.mark.parametrize( + "output_dict", + [{"You're": ["Not", [["My"]], "Supervisor"]}, {}, None], + ids=["TestOutput", "EmptyOutput", "None"], + ) + @pytest.mark.parametrize( + "postprocess_func", + [lambda x: {k.upper(): v for k, v in x.items()}, None], + ids=["PassedFunction", "NoPassedFunction"], + ) def test_simple_postprocessor_call(self, batch, output_dict, postprocess_func): data_loader = DummyDataLoader([]) if batch is None or output_dict is None: @@ -59,17 +64,13 @@ def test_simple_postprocessor_call(self, batch, output_dict, postprocess_func): self.postprocessor(batch, output_dict, data_loader) # type: ignore return - expected = json.dumps(sanitize( - { - **batch, - **(postprocess_func(output_dict) if postprocess_func else output_dict) - } - )) + expected = json.dumps( + sanitize( + {**batch, **(postprocess_func(output_dict) if postprocess_func else output_dict)} + ) + ) result = self.postprocessor( - batch, - output_dict, - data_loader, # type: ignore - postprocess_func + batch, output_dict, data_loader, postprocess_func # type: ignore ) assert result == expected From b901865c16c3f13929ee13e0da1c92a79d9bd7a8 Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 5 Nov 2021 21:24:45 -0400 Subject: [PATCH 07/12] Style Fixes --- allennlp/commands/evaluate.py | 4 ++-- allennlp/evaluation/evaluator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 3c941cedd17..c54be3ecead 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -268,7 +268,7 @@ def evaluate_from_archive( p.parent.joinpath(f"{p.stem}.outputs") for p in map(Path, evaluation_data_path_list) ] else: - output_file_list = output_file.split(":") # type: ignore + output_file_list = output_file.split(":") # type: ignore assert len(output_file_list) 
== len( evaluation_data_path_list ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." @@ -283,7 +283,7 @@ def evaluate_from_archive( p.parent.joinpath(f"{p.stem}.preds") for p in map(Path, evaluation_data_path_list) ] else: - predictions_output_file_list = predictions_output_file.split(":") # type: ignore + predictions_output_file_list = predictions_output_file.split(":") # type: ignore assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( "The number of `predictions_output_file` paths must be equal" + "to the number of datasets being evaluated. " diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index 648cc5ba1bc..e58736786b8 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -142,7 +142,7 @@ def __call__( if predictions_output_file is not None: predictions_file = Path(predictions_output_file).open("w", encoding="utf-8") else: - predictions_file = None # type: ignore + predictions_file = None # type: ignore model_postprocess_function = getattr(model, "make_output_human_readable", None) From b9b4f9e5ca6e978b4e12e12d50f9a5853641ce98 Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 5 Nov 2021 21:27:15 -0400 Subject: [PATCH 08/12] Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4de3f4ba397..e30453881ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +- Added an `Evaluator` class to make comparing source, target, and predictions easier. + ## [v2.8.0](https://github.com/allenai/allennlp/releases/tag/v2.8.0) - 2021-11-01 ### Added From 21d81657660df4e4c99b10dac3e0bf9d518cbedf Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 10 Dec 2021 11:44:06 -0500 Subject: [PATCH 09/12] PR Review Fixes --- allennlp/commands/evaluate.py | 72 +++++++++------- allennlp/evaluation/__init__.py | 2 +- allennlp/evaluation/evaluator.py | 84 +++++++++++-------- .../evaluation/postprocessors/__init__.py | 1 - allennlp/evaluation/serializers/__init__.py | 1 + .../serializers.py} | 30 +++---- .../__init__.py | 0 .../serializer_test.py} | 12 +-- 8 files changed, 113 insertions(+), 89 deletions(-) create mode 100644 allennlp/evaluation/serializers/__init__.py rename allennlp/evaluation/{postprocessors/postprocessor.py => serializers/serializers.py} (74%) rename tests/evaluation/{postprocessors => serializers}/__init__.py (100%) rename tests/evaluation/{postprocessors/postprocessor_test.py => serializers/serializer_test.py} (85%) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index c54be3ecead..4bdbd6e3652 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -99,8 +99,8 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument action="store_true", default=False, help="if specified, we will use the instances in your new dataset to " - "extend your vocabulary. If pretrained-file was used to initialize " - "embedding layers, you may also need to pass --embedding-sources-mapping.", + "extend your vocabulary. 
If pretrained-file was used to initialize " + "embedding layers, you may also need to pass --embedding-sources-mapping.", ) subparser.add_argument( @@ -108,9 +108,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, default="", help="a JSON dict defining mapping from embedding module path to embedding " - "pretrained-file used during training. If not passed, and embedding needs to be " - "extended, we will try to use the original file paths used during training. If " - "they are not available we will use random vectors for embedding extension.", + "pretrained-file used during training. If not passed, and embedding needs to be " + "extended, we will try to use the original file paths used during training. If " + "they are not available we will use random vectors for embedding extension.", ) subparser.add_argument( "--file-friendly-logging", @@ -122,7 +122,14 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument subparser.add_argument( "--auto-names", default="NONE", - help="Automatically create output names for each evaluation file.", + help="Automatically create output names for each evaluation file. " + "`NONE` will not automatically generate a file name for the " + "neither the metrics nor the predictions. In this case you will" + " need to pas in both `metrics_output_file` and `predictions_output_file`. " + "`METRICS` will only automatically create a file name for the" + " metrics file. `PREDS` will only automatically create a file" + " name for the predictions outputs. `ALL` will create a " + "filename for both the metrics and the predictions.", choices=["NONE", "METRICS", "PREDS", "ALL"], ) @@ -135,7 +142,7 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: return evaluate_from_archive( archive_file=args.archive_file, input_file=args.input_file, - output_file=args.output_file, + metrics_output_file=args.output_file, predictions_output_file=args.predictions_output_file, batch_size=args.batch_size, cmd_overrides=args.overrides, @@ -150,19 +157,19 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: def evaluate_from_archive( - archive_file: Union[str, PathLike], - input_file: str, - output_file: Optional[str] = None, - predictions_output_file: Optional[str] = None, - batch_size: Optional[int] = None, - cmd_overrides: Union[str, Dict[str, Any]] = "", - cuda_device: int = -1, - embedding_sources_mapping: str = None, - extend_vocab: bool = False, - weights_file: str = None, - file_friendly_logging: bool = False, - batch_weight_key: str = None, - auto_names: str = "NONE", + archive_file: Union[str, PathLike], + input_file: str, + metrics_output_file: Optional[str] = None, + predictions_output_file: Optional[str] = None, + batch_size: Optional[int] = None, + cmd_overrides: Union[str, Dict[str, Any]] = "", + cuda_device: int = -1, + embedding_sources_mapping: str = None, + extend_vocab: bool = False, + weights_file: str = None, + file_friendly_logging: bool = False, + batch_weight_key: str = None, + auto_names: str = "NONE", ) -> Dict[str, Any]: """ @@ -175,7 +182,7 @@ def evaluate_from_archive( path to the file containing the evaluation data (for multiple files, put ":" between filenames e.g., input1.txt:input2.txt) - output_file: `str`, optional (default=`None`) + metrics_output_file: `str`, optional (default=`None`) optional path to write the metrics to as JSON (for multiple files, put ":" between filenames e.g., output1.txt:output2.txt) @@ -217,7 +224,13 @@ def 
evaluate_from_archive( If non-empty, name of metric used to weight the loss on a per-batch basis. auto_names:`str`, optional (default=`"NONE"`) - Automatically create output names for each evaluation file. + Automatically create output names for each evaluation file.`NONE` will + not automatically generate a file name for the neither the metrics nor + the predictions. In this case you will need to pas in both + `metrics_output_file` and `predictions_output_file`. `METRICS` will only + automatically create a file name for the metrics file. `PREDS` will only + automatically create a file name for the predictions outputs. `ALL` + will create a filename for both the metrics and the predictions. # Returns @@ -257,7 +270,7 @@ def evaluate_from_archive( # TODO(gabeorlanski): Is it safe to always default to .outputs and .preds? # TODO(gabeorlanski): Add in way to save to specific output directory - if output_file is not None: + if metrics_output_file is not None: if auto_names == "METRICS" or auto_names == "ALL": logger.warning( f"Passed output_files will be ignored, auto_names is" f" set to {auto_names}" @@ -268,10 +281,11 @@ def evaluate_from_archive( p.parent.joinpath(f"{p.stem}.outputs") for p in map(Path, evaluation_data_path_list) ] else: - output_file_list = output_file.split(":") # type: ignore + output_file_list = metrics_output_file.split(":") # type: ignore assert len(output_file_list) == len( evaluation_data_path_list - ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." + ), "The number of `metrics_output_file` paths must be equal to the number " \ + "of datasets being evaluated." if predictions_output_file is not None: if auto_names == "PREDS" or auto_names == "ALL": logger.warning( @@ -285,8 +299,8 @@ def evaluate_from_archive( else: predictions_output_file_list = predictions_output_file.split(":") # type: ignore assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( - "The number of `predictions_output_file` paths must be equal" - + "to the number of datasets being evaluated. " + "The number of `predictions_output_file` paths must be equal" + + "to the number of datasets being evaluated. " ) # output file @@ -309,7 +323,7 @@ def evaluate_from_archive( # output dictionary. 
eval_file_name = Path(evaluation_data_path).stem - if output_file is not None: + if metrics_output_file is not None: # noinspection PyUnboundLocalVariable output_file_path = output_file_list[index] @@ -339,7 +353,7 @@ def evaluate_from_archive( model, data_loader, batch_weight_key, - output_file=output_file_path, + metrics_output_file=output_file_path, predictions_output_file=predictions_output_file_path, ) diff --git a/allennlp/evaluation/__init__.py b/allennlp/evaluation/__init__.py index ea63688b772..feb03dd961e 100644 --- a/allennlp/evaluation/__init__.py +++ b/allennlp/evaluation/__init__.py @@ -1,2 +1,2 @@ from allennlp.evaluation.evaluator import Evaluator, SimpleEvaluator -from allennlp.evaluation.postprocessors.postprocessor import Postprocessor +from allennlp.evaluation.serializers.serializers import Serializer diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index e58736786b8..d42a7745e0e 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -14,7 +14,7 @@ from allennlp.common import Registrable from allennlp.models import Model from allennlp.data import DataLoader -from allennlp.evaluation.postprocessors.postprocessor import Postprocessor, SimplePostprocessor +from allennlp.evaluation.serializers.serializers import Serializer, SimpleSerializer logger = logging.getLogger(__name__) @@ -33,25 +33,30 @@ class Evaluator(Registrable): The cuda device to use for this evaluation. The model is assumed to already be using this device; this parameter is only used for moving the input data to the correct device. + + postprocessor_fn_name: `str`, optional (default=`"make_output_human_readable"`) + Function name of the model's postprocessing function. """ default_implementation = "simple" def __init__( - self, - batch_postprocessor: Optional[Postprocessor] = None, - cuda_device: Union[int, torch.device] = -1, + self, + batch_serializer: Optional[Serializer] = None, + cuda_device: Union[int, torch.device] = -1, + postprocessor_fn_name: str = "make_output_human_readable" ): - self.batch_postprocessor = batch_postprocessor or SimplePostprocessor() + self.batch_serializer = batch_serializer or SimpleSerializer() self.cuda_device = cuda_device + self.postprocessor_fn_name = postprocessor_fn_name def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None, + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + metrics_output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ) -> Dict[str, Any]: """ Evaluate a single data source. @@ -73,6 +78,7 @@ def __call__( Optional path to write the predictions to. If passed the postprocessor will be called and its output will be written as lines. + # Returns metrics: `Dict[str, Any]` @@ -96,22 +102,26 @@ class SimpleEvaluator(Evaluator): The cuda device to use for this evaluation. The model is assumed to already be using this device; this parameter is only used for moving the input data to the correct device. + + postprocessor_fn_name: `str`, optional (default=`"make_output_human_readable"`) + Function name of the model's postprocessing function. 
""" def __init__( - self, - batch_postprocessor: Optional[Postprocessor] = None, - cuda_device: Union[int, torch.device] = -1, + self, + batch_serializer: Optional[Serializer] = None, + cuda_device: Union[int, torch.device] = -1, + postprocessor_fn_name: str = "make_output_human_readable" ): - super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) + super(SimpleEvaluator, self).__init__(batch_serializer, cuda_device, postprocessor_fn_name) def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None, + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + metrics_output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ): """ Evaluate a single data source. @@ -138,13 +148,13 @@ def __call__( """ check_for_gpu(self.cuda_device) data_loader.set_target_device(int_to_device(self.cuda_device)) - output_file = Path(output_file) if output_file is not None else None + metrics_output_file = Path(metrics_output_file) if metrics_output_file is not None else None if predictions_output_file is not None: predictions_file = Path(predictions_output_file).open("w", encoding="utf-8") else: predictions_file = None # type: ignore - model_postprocess_function = getattr(model, "make_output_human_readable", None) + model_postprocess_function = getattr(model, self.postprocessor_fn_name, None) with torch.no_grad(): model.eval() @@ -182,14 +192,14 @@ def __call__( metrics["loss"] = total_loss / total_weight description = ( - ", ".join( - [ - "%s: %.2f" % (name, value) - for name, value in metrics.items() - if not name.startswith("_") - ] - ) - + " ||" + ", ".join( + [ + "%s: %.2f" % (name, value) + for name, value in metrics.items() + if not name.startswith("_") + ] + ) + + " ||" ) generator_tqdm.set_description(description, refresh=False) @@ -197,7 +207,7 @@ def __call__( # metrics if predictions_file is not None: predictions_file.write( - self.batch_postprocessor( + self.batch_serializer( batch, output_dict, data_loader, @@ -218,14 +228,14 @@ def __call__( ) final_metrics["loss"] = total_loss / total_weight - if output_file is not None: - dump_metrics(str(output_file), final_metrics, log=True) + if metrics_output_file is not None: + dump_metrics(str(metrics_output_file), final_metrics, log=True) return final_metrics def _to_params(self) -> Dict[str, Any]: return { - "type": "simple", - "cuda_device": self.cuda_device, - "batch_postprocessor": self.batch_postprocessor.to_params(), + "type" : "simple", + "cuda_device" : self.cuda_device, + "batch_postprocessor": self.batch_serializer.to_params(), } diff --git a/allennlp/evaluation/postprocessors/__init__.py b/allennlp/evaluation/postprocessors/__init__.py index df1c98089c0..e69de29bb2d 100644 --- a/allennlp/evaluation/postprocessors/__init__.py +++ b/allennlp/evaluation/postprocessors/__init__.py @@ -1 +0,0 @@ -from allennlp.evaluation.postprocessors.postprocessor import SimplePostprocessor diff --git a/allennlp/evaluation/serializers/__init__.py b/allennlp/evaluation/serializers/__init__.py new file mode 100644 index 00000000000..a9aef4a6b22 --- /dev/null +++ b/allennlp/evaluation/serializers/__init__.py @@ -0,0 +1 @@ +from allennlp.evaluation.serializers.serializers import SimpleSerializer diff --git a/allennlp/evaluation/postprocessors/postprocessor.py b/allennlp/evaluation/serializers/serializers.py similarity index 74% rename from 
allennlp/evaluation/postprocessors/postprocessor.py rename to allennlp/evaluation/serializers/serializers.py index 118cd90ee4a..2fbcfa79f6e 100644 --- a/allennlp/evaluation/postprocessors/postprocessor.py +++ b/allennlp/evaluation/serializers/serializers.py @@ -10,9 +10,9 @@ logger = logging.getLogger(__name__) -class Postprocessor(Registrable): +class Serializer(Registrable): """ - General Postprocessor class for turning batches into human readable data + General serializer class for turning batches into human readable data """ def __call__( @@ -51,11 +51,11 @@ def __call__( default_implementation = "simple" -@Postprocessor.register("simple") -class SimplePostprocessor(Postprocessor): +@Serializer.register("simple") +class SimpleSerializer(Serializer): """ - Very simple postprocesser. Only sanitizes the batches and outputs. Will use - a passed postprocess function for the outputs if it exists. + Very simple serializer. Only sanitizes the batches and outputs. Will use + a passed serializer function for the outputs if it exists. """ def _to_params(self) -> Dict[str, Any]: @@ -69,7 +69,7 @@ def __call__( output_postprocess_function: Optional[Callable] = None, ): """ - Postprocess a batch. + Serializer a batch. # Parameters @@ -89,18 +89,18 @@ def __call__( # Returns - postprocessed: `str` - The postprocessed batches as strings + serialized: `str` + The serialized batches as strings """ if batch is None: - raise ValueError("Postprocessor got a batch that is None") + raise ValueError("Serializer got a batch that is None") if output_dict is None: - raise ValueError("Postprocessor got an output_dict that is None") + raise ValueError("Serializer got an output_dict that is None") - postprocessed = sanitize(batch) + serialized = sanitize(batch) if output_postprocess_function is not None: - postprocessed.update(sanitize(output_postprocess_function(output_dict))) + serialized.update(sanitize(output_postprocess_function(output_dict))) else: - postprocessed.update(sanitize(output_dict)) + serialized.update(sanitize(output_dict)) - return json.dumps(postprocessed) + return json.dumps(serialized) diff --git a/tests/evaluation/postprocessors/__init__.py b/tests/evaluation/serializers/__init__.py similarity index 100% rename from tests/evaluation/postprocessors/__init__.py rename to tests/evaluation/serializers/__init__.py diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/serializers/serializer_test.py similarity index 85% rename from tests/evaluation/postprocessors/postprocessor_test.py rename to tests/evaluation/serializers/serializer_test.py index fc0996c03fa..684f01c8a5b 100644 --- a/tests/evaluation/postprocessors/postprocessor_test.py +++ b/tests/evaluation/serializers/serializer_test.py @@ -7,8 +7,8 @@ from allennlp.common import Params from allennlp.common.util import sanitize from allennlp.data.data_loaders import TensorDict -from allennlp.evaluation import Postprocessor -from allennlp.evaluation.postprocessors import SimplePostprocessor +from allennlp.evaluation import Serializer +from allennlp.evaluation.serializers import SimpleSerializer class DummyDataLoader: @@ -26,14 +26,14 @@ def set_target_device(self, _): pass -class TestPostprocessor(AllenNlpTestCase): +class TestSerializer(AllenNlpTestCase): def setup_method(self): - super(TestPostprocessor, self).setup_method() - self.postprocessor = Postprocessor.from_params(Params({})) + super(TestSerializer, self).setup_method() + self.postprocessor = Serializer.from_params(Params({})) def 
test_postprocessor_default_implementation(self): assert self.postprocessor.to_params().params == {"type": "simple"} - assert isinstance(self.postprocessor, SimplePostprocessor) + assert isinstance(self.postprocessor, SimpleSerializer) @pytest.mark.parametrize( "batch", From 90983a1acb8b12c3603e6ae2d6cb15c442ba37c9 Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 10 Dec 2021 11:46:40 -0500 Subject: [PATCH 10/12] Formatting --- allennlp/commands/evaluate.py | 62 ++++++++++++++++---------------- allennlp/evaluation/evaluator.py | 60 +++++++++++++++---------------- 2 files changed, 61 insertions(+), 61 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 4bdbd6e3652..bd80fbb4aa7 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -99,8 +99,8 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument action="store_true", default=False, help="if specified, we will use the instances in your new dataset to " - "extend your vocabulary. If pretrained-file was used to initialize " - "embedding layers, you may also need to pass --embedding-sources-mapping.", + "extend your vocabulary. If pretrained-file was used to initialize " + "embedding layers, you may also need to pass --embedding-sources-mapping.", ) subparser.add_argument( @@ -108,9 +108,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, default="", help="a JSON dict defining mapping from embedding module path to embedding " - "pretrained-file used during training. If not passed, and embedding needs to be " - "extended, we will try to use the original file paths used during training. If " - "they are not available we will use random vectors for embedding extension.", + "pretrained-file used during training. If not passed, and embedding needs to be " + "extended, we will try to use the original file paths used during training. If " + "they are not available we will use random vectors for embedding extension.", ) subparser.add_argument( "--file-friendly-logging", @@ -123,13 +123,13 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument "--auto-names", default="NONE", help="Automatically create output names for each evaluation file. " - "`NONE` will not automatically generate a file name for the " - "neither the metrics nor the predictions. In this case you will" - " need to pas in both `metrics_output_file` and `predictions_output_file`. " - "`METRICS` will only automatically create a file name for the" - " metrics file. `PREDS` will only automatically create a file" - " name for the predictions outputs. `ALL` will create a " - "filename for both the metrics and the predictions.", + "`NONE` will not automatically generate a file name for the " + "neither the metrics nor the predictions. In this case you will" + " need to pas in both `metrics_output_file` and `predictions_output_file`. " + "`METRICS` will only automatically create a file name for the" + " metrics file. `PREDS` will only automatically create a file" + " name for the predictions outputs. 
`ALL` will create a " + "filename for both the metrics and the predictions.", choices=["NONE", "METRICS", "PREDS", "ALL"], ) @@ -157,19 +157,19 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: def evaluate_from_archive( - archive_file: Union[str, PathLike], - input_file: str, - metrics_output_file: Optional[str] = None, - predictions_output_file: Optional[str] = None, - batch_size: Optional[int] = None, - cmd_overrides: Union[str, Dict[str, Any]] = "", - cuda_device: int = -1, - embedding_sources_mapping: str = None, - extend_vocab: bool = False, - weights_file: str = None, - file_friendly_logging: bool = False, - batch_weight_key: str = None, - auto_names: str = "NONE", + archive_file: Union[str, PathLike], + input_file: str, + metrics_output_file: Optional[str] = None, + predictions_output_file: Optional[str] = None, + batch_size: Optional[int] = None, + cmd_overrides: Union[str, Dict[str, Any]] = "", + cuda_device: int = -1, + embedding_sources_mapping: str = None, + extend_vocab: bool = False, + weights_file: str = None, + file_friendly_logging: bool = False, + batch_weight_key: str = None, + auto_names: str = "NONE", ) -> Dict[str, Any]: """ @@ -282,10 +282,10 @@ def evaluate_from_archive( ] else: output_file_list = metrics_output_file.split(":") # type: ignore - assert len(output_file_list) == len( - evaluation_data_path_list - ), "The number of `metrics_output_file` paths must be equal to the number " \ - "of datasets being evaluated." + assert len(output_file_list) == len(evaluation_data_path_list), ( + "The number of `metrics_output_file` paths must be equal to the number " + "of datasets being evaluated." + ) if predictions_output_file is not None: if auto_names == "PREDS" or auto_names == "ALL": logger.warning( @@ -299,8 +299,8 @@ def evaluate_from_archive( else: predictions_output_file_list = predictions_output_file.split(":") # type: ignore assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( - "The number of `predictions_output_file` paths must be equal" - + "to the number of datasets being evaluated. " + "The number of `predictions_output_file` paths must be equal" + + "to the number of datasets being evaluated. " ) # output file diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index d42a7745e0e..9c7ef7b6703 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -41,22 +41,22 @@ class Evaluator(Registrable): default_implementation = "simple" def __init__( - self, - batch_serializer: Optional[Serializer] = None, - cuda_device: Union[int, torch.device] = -1, - postprocessor_fn_name: str = "make_output_human_readable" + self, + batch_serializer: Optional[Serializer] = None, + cuda_device: Union[int, torch.device] = -1, + postprocessor_fn_name: str = "make_output_human_readable", ): self.batch_serializer = batch_serializer or SimpleSerializer() self.cuda_device = cuda_device self.postprocessor_fn_name = postprocessor_fn_name def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - metrics_output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None, + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + metrics_output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ) -> Dict[str, Any]: """ Evaluate a single data source. 
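For reference, a minimal usage sketch (not part of this patch) of the constructor and `__call__` signatures reformatted above. It assumes the `"simple"` registration that the evaluator tests rely on; all other names are taken directly from this diff.

    from allennlp.common import Params
    from allennlp.evaluation import Evaluator, SimpleEvaluator

    # Direct construction with the keyword arguments introduced in this PR.
    evaluator = SimpleEvaluator(
        cuda_device=-1,
        postprocessor_fn_name="make_output_human_readable",
    )

    # Config-driven construction: with no "type" key, the registrable default
    # ("simple") is assumed to resolve to SimpleEvaluator, as the tests expect.
    evaluator = Evaluator.from_params(Params({"cuda_device": -1}))
    assert isinstance(evaluator, SimpleEvaluator)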
@@ -108,20 +108,20 @@ class SimpleEvaluator(Evaluator): """ def __init__( - self, - batch_serializer: Optional[Serializer] = None, - cuda_device: Union[int, torch.device] = -1, - postprocessor_fn_name: str = "make_output_human_readable" + self, + batch_serializer: Optional[Serializer] = None, + cuda_device: Union[int, torch.device] = -1, + postprocessor_fn_name: str = "make_output_human_readable", ): super(SimpleEvaluator, self).__init__(batch_serializer, cuda_device, postprocessor_fn_name) def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - metrics_output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None, + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + metrics_output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ): """ Evaluate a single data source. @@ -192,14 +192,14 @@ def __call__( metrics["loss"] = total_loss / total_weight description = ( - ", ".join( - [ - "%s: %.2f" % (name, value) - for name, value in metrics.items() - if not name.startswith("_") - ] - ) - + " ||" + ", ".join( + [ + "%s: %.2f" % (name, value) + for name, value in metrics.items() + if not name.startswith("_") + ] + ) + + " ||" ) generator_tqdm.set_description(description, refresh=False) @@ -235,7 +235,7 @@ def __call__( def _to_params(self) -> Dict[str, Any]: return { - "type" : "simple", - "cuda_device" : self.cuda_device, + "type": "simple", + "cuda_device": self.cuda_device, "batch_postprocessor": self.batch_serializer.to_params(), } From 1ee6546870831e095bb503b2c978da4442fb1bd5 Mon Sep 17 00:00:00 2001 From: Gabe Date: Tue, 4 Jan 2022 09:33:00 -0500 Subject: [PATCH 11/12] Fixed Flake8 and Black issues --- allennlp/commands/evaluate.py | 21 +++++++-------------- tests/commands/evaluate_test.py | 11 +++-------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 5fdef7a5533..bd5b9603d55 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -12,7 +12,6 @@ from typing import Union, Dict, Any, Optional from copy import deepcopy - from allennlp.commands.subcommand import Subcommand from allennlp.common import logging as common_logging from allennlp.common.util import prepare_environment @@ -36,28 +35,23 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument subparser.add_argument( "input_file", type=str, - help=( - "path to the file containing the evaluation data" - " (for mutiple files, put "," between filenames e.g., input1.txt,input2.txt)" - ), + help="path to the file containing the evaluation data (for mutiple " + "files, put between filenames e.g., input1.txt,input2.txt)", ) subparser.add_argument( "--output-file", type=str, - help=( - "optional path to write the metrics to as JSON" - " (for mutiple files, put "," between filenames e.g., output1.txt,output2.txt)" - ), + help="optional path to write the metrics to as JSON (for mutiple " + "files, put between filenames e.g., output1.txt,output2.txt)", ) subparser.add_argument( "--predictions-output-file", type=str, - help=( - "optional path to write the predictions to as JSON lines" - " (for mutiple files, put "," between filenames e.g., output1.jsonl,output2.jsonl)" - ), + help="optional path to write the predictions to as JSON lines " + "(for mutiple files, put between filenames e.g., " + "output1.jsonl,output2.jsonl)", ) subparser.add_argument( @@ -301,7 
+295,6 @@ def evaluate_from_archive( + "to the number of datasets being evaluated. " ) - # output file output_file_path = None predictions_output_file_path = None diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 692a20d9b2b..c454e07eaf3 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -112,18 +112,13 @@ def test_multiple_output_files_evaluate_from_args(self): kebab_args = [ "evaluate", - str( - self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" - ), - str(self.FIXTURES_ROOT / "data" / "conll2003.txt") - + "," - + str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), + ",".join(map(str, paths)), "--cuda-device", "-1", "--output-file", - output_file + "," + output_file, + ",".join(map(str, out_paths)), "--predictions-output-file", - predictions_output_file + "," + predictions_output_file, + ",".join(map(str, pred_paths)), ] args = self.parser.parse_args(kebab_args) computed_metrics = evaluate_from_args(args) From 11ca86efd54c2c2b40c09cdecc16bc1c6de99b1a Mon Sep 17 00:00:00 2001 From: Gabe Date: Tue, 4 Jan 2022 10:05:01 -0500 Subject: [PATCH 12/12] Fixed Test Issues --- allennlp/commands/evaluate.py | 4 ++-- tests/commands/evaluate_test.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index bd5b9603d55..2b9403a417f 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -36,14 +36,14 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument "input_file", type=str, help="path to the file containing the evaluation data (for mutiple " - "files, put between filenames e.g., input1.txt,input2.txt)", + "files, put between filenames e.g., input1.txt,input2.txt)", ) subparser.add_argument( "--output-file", type=str, help="optional path to write the metrics to as JSON (for mutiple " - "files, put between filenames e.g., output1.txt,output2.txt)", + "files, put between filenames e.g., output1.txt,output2.txt)", ) subparser.add_argument( diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index c454e07eaf3..eebf7753453 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -112,6 +112,9 @@ def test_multiple_output_files_evaluate_from_args(self): kebab_args = [ "evaluate", + str( + self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" + ), ",".join(map(str, paths)), "--cuda-device", "-1", @@ -197,13 +200,13 @@ def test_auto_names_creates_files(self, auto_names): str( self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" ), - ":".join(map(str, paths)), + ",".join(map(str, paths)), "--cuda-device", "-1", "--output-file", - ":".join(map(str, out_paths)), + ",".join(map(str, out_paths)), "--predictions-output-file", - ":".join(map(str, pred_paths)), + ",".join(map(str, pred_paths)), "--auto-names", auto_names, ]
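For reference, a small sketch (not part of this patch) of the file names that `--auto-names` derives for each evaluation path, mirroring the `Path.stem` logic in `evaluate_from_archive`; the input path below is only a placeholder.

    from pathlib import Path

    p = Path("data/conll2003.txt")  # placeholder evaluation file
    metrics_path = p.parent.joinpath(f"{p.stem}.outputs")  # data/conll2003.outputs
    preds_path = p.parent.joinpath(f"{p.stem}.preds")      # data/conll2003.preds
    print(metrics_path, preds_path)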