From 0969a1465943c13d1470ae2eda4dbe38cffde8ea Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 08:28:56 -0400 Subject: [PATCH 01/12] Barebones implementation --- allennlp/common/testing/test_case.py | 14 +- allennlp/evaluation/__init__.py | 2 + allennlp/evaluation/evaluator.py | 201 ++++++++++++++++++ .../evaluation/postprocessors/__init__.py | 1 + .../postprocessors/postprocessor.py | 56 +++++ tests/commands/evaluate_test.py | 22 +- tests/evaluation/__init__.py | 0 tests/evaluation/evaluator_tests.py | 57 +++++ tests/evaluation/postprocessors/__init__.py | 0 .../postprocessors/postprocessor_test.py | 59 +++++ 10 files changed, 386 insertions(+), 26 deletions(-) create mode 100644 allennlp/evaluation/__init__.py create mode 100644 allennlp/evaluation/evaluator.py create mode 100644 allennlp/evaluation/postprocessors/__init__.py create mode 100644 allennlp/evaluation/postprocessors/postprocessor.py create mode 100644 tests/evaluation/__init__.py create mode 100644 tests/evaluation/evaluator_tests.py create mode 100644 tests/evaluation/postprocessors/__init__.py create mode 100644 tests/evaluation/postprocessors/postprocessor_test.py diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 9f466e8ee6b..05a34505079 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -2,11 +2,13 @@ import os import pathlib import shutil +import pytest import tempfile from allennlp.common.checks import log_pytorch_version_info -TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") + +# TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") class AllenNlpTestCase: @@ -21,6 +23,12 @@ class AllenNlpTestCase: TESTS_ROOT = PROJECT_ROOT / "tests" FIXTURES_ROOT = PROJECT_ROOT / "test_fixtures" + # Had issues with the old tempdir, so used this pytest fixture that is + # always ran and will always create a tmpdir. 
+ @pytest.fixture(autouse=True) + def test_dir(self, tmpdir): + self.TEST_DIR = pathlib.Path(tmpdir.mkdir('semparse')) + def setup_method(self): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.DEBUG @@ -33,9 +41,5 @@ def setup_method(self): logging.getLogger("urllib3.connectionpool").disabled = True log_pytorch_version_info() - self.TEST_DIR = pathlib.Path(TEST_DIR) - - os.makedirs(self.TEST_DIR, exist_ok=True) - def teardown_method(self): shutil.rmtree(self.TEST_DIR) diff --git a/allennlp/evaluation/__init__.py b/allennlp/evaluation/__init__.py new file mode 100644 index 00000000000..ea63688b772 --- /dev/null +++ b/allennlp/evaluation/__init__.py @@ -0,0 +1,2 @@ +from allennlp.evaluation.evaluator import Evaluator, SimpleEvaluator +from allennlp.evaluation.postprocessors.postprocessor import Postprocessor diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py new file mode 100644 index 00000000000..30e7313bc36 --- /dev/null +++ b/allennlp/evaluation/evaluator.py @@ -0,0 +1,201 @@ +""" +Evaluator class for evaluating a model with a given dataset +""" +from collections import defaultdict +from typing import Union, List, Dict, Any, Tuple, Optional +from os import PathLike +from pathlib import Path +import torch +from itertools import groupby +import logging +import json +import numpy as np + +from allennlp.common.checks import check_for_gpu, ConfigurationError +from allennlp.common.tqdm import Tqdm +from allennlp.common.util import dump_metrics, sanitize, int_to_device, END_SYMBOL, START_SYMBOL +from allennlp.data.fields import TensorField +from allennlp.nn import util as nn_util +from allennlp.common import Registrable, Params +from allennlp.models import Model +from allennlp.data import DataLoader, Vocabulary +from allennlp.evaluation.postprocessors.postprocessor import Postprocessor + +logger = logging.getLogger(__name__) + + +class Evaluator(Registrable): + """ + Evaluation class + + # Parameters + + cuda_device : `Union[int, torch.device]`, optional (default=`-1`) + The cuda device to use for this evaluation. The model is assumed to already be using this + device; this parameter is only used for moving the input data to the correct device. + """ + default_implementation = "simple" + + def __init__( + self, + batch_postprocessor: Postprocessor, + cuda_device: Union[int, torch.device] = -1 + ): + self.batch_human_serializer = batch_postprocessor + self.cuda_device = cuda_device + + def __call__( + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + output_file: Union[str, PathLike] = None, + predictions_file: Union[str, PathLike] = None + ): + """ + # Parameters + + model : `Model` + The model to evaluate + data_loader : `DataLoader` + The `DataLoader` that will iterate over the evaluation data (data loaders already contain + their data). + batch_weight_key : `str`, optional (default=`None`) + If given, this is a key in the output dictionary for each batch that specifies how to weight + the loss for that batch. If this is not given, we use a weight of 1 for every batch. + metrics_output_file : `Union[str, PathLike]`, optional (default=`None`) + Optional path to write the final metrics to. + predictions_output_file : `Union[str, PathLike]`, optional (default=`None`) + Optional path to write the predictions to. + + # Returns + + `Dict[str, Any]` + The final metrics. 
+ """ + raise NotImplementedError("__call__") + + +@Evaluator.register("simple") +class SimpleEvaluator(Evaluator): + def __init__( + self, + batch_postprocessor, + cuda_device: Union[int, torch.device] = -1 + ): + super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) + + def __call__( + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + output_file: Union[str, PathLike] = None, + predictions_file: Union[str, PathLike] = None + ): + """ + # Parameters + + model : `Model` + The model to evaluate + data_loader : `DataLoader` + The `DataLoader` that will iterate over the evaluation data (data loaders already contain + their data). + batch_weight_key : `str`, optional (default=`None`) + If given, this is a key in the output dictionary for each batch that specifies how to weight + the loss for that batch. If this is not given, we use a weight of 1 for every batch. + metrics_output_file : `Union[str, PathLike]`, optional (default=`None`) + Optional path to write the final metrics to. + predictions_output_file : `Union[str, PathLike]`, optional (default=`None`) + Optional path to write the predictions to. + + # Returns + + `Dict[str, Any]` + The final metrics. + """ + check_for_gpu(self.cuda_device) + data_loader.set_target_device(int_to_device(self.cuda_device)) + output_file = Path(output_file) if output_file is not None else None + if predictions_file is not None: + predictions_file = Path(predictions_file).open("w", encoding="utf-8") + + with torch.no_grad(): + model.eval() + + iterator = iter(data_loader) + logger.info("Iterating over dataset") + generator_tqdm = Tqdm.tqdm(iterator) + # Number of batches in instances. + batch_count = 0 + # Number of batches where the model produces a loss. + loss_count = 0 + # Cumulative weighted loss + total_loss = 0.0 + # Cumulative weight across all batches. + total_weight = 0.0 + + for batch in generator_tqdm: + batch_count += 1 + batch = nn_util.move_to_device(batch, self.cuda_device) + output_dict = model(**batch) + loss = output_dict.get("loss") + + metrics = model.get_metrics() + + if loss is not None: + loss_count += 1 + if batch_weight_key: + weight = output_dict[batch_weight_key].item() + else: + weight = 1.0 + + total_weight += weight + total_loss += loss.item() * weight + # Report the average loss so far. + metrics["loss"] = total_loss / total_weight + + description = ( + ", ".join( + [ + "%s: %.2f" % (name, value) + for name, value in metrics.items() + if not name.startswith("_") + ] + ) + + " ||" + ) + generator_tqdm.set_description(description, refresh=False) + + # TODO(gabeorlanski): Add in postprocessing the batch for token + # metrics + if predictions_file is not None: + predictions_file.write( + self.batch_human_serializer( + batch, output_dict, data_loader + ) + '\n' + ) + + if predictions_file is not None: + predictions_file.close() + + final_metrics = model.get_metrics(reset=True) + if loss_count > 0: + # Sanity check + if loss_count != batch_count: + raise RuntimeError( + "The model you are trying to evaluate only sometimes produced a loss!" 
+ ) + final_metrics["loss"] = total_loss / total_weight + + if output_file is not None: + dump_metrics(str(output_file), final_metrics, log=True) + + return final_metrics + + def _to_params(self) -> Dict[str, Any]: + return { + "type" : "simple", + "cuda_device" : self.cuda_device, + "batch_postprocessor": self.batch_human_serializer.to_params() + } diff --git a/allennlp/evaluation/postprocessors/__init__.py b/allennlp/evaluation/postprocessors/__init__.py new file mode 100644 index 00000000000..df1c98089c0 --- /dev/null +++ b/allennlp/evaluation/postprocessors/__init__.py @@ -0,0 +1 @@ +from allennlp.evaluation.postprocessors.postprocessor import SimplePostprocessor diff --git a/allennlp/evaluation/postprocessors/postprocessor.py b/allennlp/evaluation/postprocessors/postprocessor.py new file mode 100644 index 00000000000..00e0dd0b154 --- /dev/null +++ b/allennlp/evaluation/postprocessors/postprocessor.py @@ -0,0 +1,56 @@ +from collections import defaultdict +from typing import Union, List, Dict, Any, Tuple, Optional +from os import PathLike +from pathlib import Path +import torch +from itertools import groupby +import logging +import json +import numpy as np + +from allennlp.common.checks import check_for_gpu, ConfigurationError +from allennlp.common.tqdm import Tqdm +from allennlp.common.util import dump_metrics, sanitize, int_to_device, END_SYMBOL, START_SYMBOL +from allennlp.data.fields import TensorField +from allennlp.nn import util as nn_util +from allennlp.common import Registrable, Params +from allennlp.models import Model +from allennlp.data import DataLoader, Vocabulary + +logger = logging.getLogger(__name__) + + +class Postprocessor(Registrable): + """ + General Postprocessor class for turning batches into human readable data + """ + + def __call__( + self, + batch: Dict[str, TensorField], + output_dict: Dict, + data_loader: DataLoader + ) -> str: + raise NotImplementedError("__call__") + + default_implementation = "simple" + + +@Postprocessor.register("simple") +class SimplePostprocessor(Postprocessor): + def _to_params(self) -> Dict[str, Any]: + return { + "type": "simple" + } + + def __call__( + self, + batch: Dict[str, TensorField], + output_dict: Dict, + data_loader: DataLoader + ): + if batch is None: + raise ValueError("Postprocessor got a batch that is None") + if output_dict is None: + raise ValueError("Postprocessor got an output_dict that is None") + return json.dumps(sanitize({**batch, **output_dict})) diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 932831c7e81..d16521d58df 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -4,9 +4,8 @@ import torch from flaky import flaky -import pytest -from allennlp.commands.evaluate import evaluate_from_args, Evaluate, evaluate +from allennlp.commands.evaluate import evaluate_from_args, Evaluate from allennlp.common.testing import AllenNlpTestCase from allennlp.data.data_loaders import TensorDict from allennlp.models import Model @@ -43,25 +42,6 @@ def setup_method(self): subparsers = self.parser.add_subparsers(title="Commands", metavar="") Evaluate().add_subparser(subparsers) - def test_evaluate_calculates_average_loss(self): - losses = [7.0, 9.0, 8.0] - outputs = [{"loss": torch.Tensor([loss])} for loss in losses] - data_loader = DummyDataLoader(outputs) - metrics = evaluate(DummyModel(), data_loader, -1, "") - assert metrics["loss"] == pytest.approx(8.0) - - def test_evaluate_calculates_average_loss_with_weights(self): - losses = [7.0, 9.0, 8.0] - weights 
= [10, 2, 1.5] - inputs = zip(losses, weights) - outputs = [ - {"loss": torch.Tensor([loss]), "batch_weight": torch.Tensor([weight])} - for loss, weight in inputs - ] - data_loader = DummyDataLoader(outputs) - metrics = evaluate(DummyModel(), data_loader, -1, "batch_weight") - assert metrics["loss"] == pytest.approx((70 + 18 + 12) / 13.5) - @flaky def test_evaluate_from_args(self): kebab_args = [ diff --git a/tests/evaluation/__init__.py b/tests/evaluation/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/evaluation/evaluator_tests.py b/tests/evaluation/evaluator_tests.py new file mode 100644 index 00000000000..28a088a9b06 --- /dev/null +++ b/tests/evaluation/evaluator_tests.py @@ -0,0 +1,57 @@ +from typing import Iterator, List, Dict + +import torch +import pytest + +from allennlp.common.testing import AllenNlpTestCase +from allennlp.data.data_loaders import TensorDict +from allennlp.models import Model +from allennlp.evaluation import Evaluator +from allennlp.common import Params + + +class DummyDataLoader: + def __init__(self, outputs: List[TensorDict]) -> None: + super().__init__() + self._outputs = outputs + + def __iter__(self) -> Iterator[TensorDict]: + yield from self._outputs + + def __len__(self): + return len(self._outputs) + + def set_target_device(self, _): + pass + + +class DummyModel(Model): + def __init__(self) -> None: + super().__init__(None) # type: ignore + + def forward(self, **kwargs) -> Dict[str, torch.Tensor]: # type: ignore + return kwargs + + +class TestEvaluator(AllenNlpTestCase): + def setup_method(self): + self.evaluator = Evaluator.from_params(Params({"batch_postprocessor": "simple"})) + + def test_evaluate_calculates_average_loss(self): + losses = [7.0, 9.0, 8.0] + outputs = [{"loss": torch.Tensor([loss])} for loss in losses] + data_loader = DummyDataLoader(outputs) + metrics = self.evaluator(DummyModel(), data_loader, "") # type: ignore + assert metrics["loss"] == pytest.approx(8.0) + + def test_evaluate_calculates_average_loss_with_weights(self): + losses = [7.0, 9.0, 8.0] + weights = [10, 2, 1.5] + inputs = zip(losses, weights) + outputs = [ + {"loss": torch.Tensor([loss]), "batch_weight": torch.Tensor([weight])} + for loss, weight in inputs + ] + data_loader = DummyDataLoader(outputs) + metrics = self.evaluator(DummyModel(), data_loader, "batch_weight") # type: ignore + assert metrics["loss"] == pytest.approx((70 + 18 + 12) / 13.5) diff --git a/tests/evaluation/postprocessors/__init__.py b/tests/evaluation/postprocessors/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/postprocessors/postprocessor_test.py new file mode 100644 index 00000000000..5a552959544 --- /dev/null +++ b/tests/evaluation/postprocessors/postprocessor_test.py @@ -0,0 +1,59 @@ +from typing import Iterator, List, Dict +import torch +import pytest +import json + +from allennlp.common.testing import AllenNlpTestCase +from allennlp.common import Params +from allennlp.common.util import sanitize +from allennlp.data.data_loaders import TensorDict +from allennlp.evaluation import Postprocessor +from allennlp.evaluation.postprocessors import SimplePostprocessor + + +class DummyDataLoader: + def __init__(self, outputs: List[TensorDict]) -> None: + super().__init__() + self._outputs = outputs + + def __iter__(self) -> Iterator[TensorDict]: + yield from self._outputs + + def __len__(self): + return len(self._outputs) + + def set_target_device(self, _): + pass + + 
+class TestPostprocessor(AllenNlpTestCase): + def setup_method(self): + self.postprocessor = Postprocessor.from_params(Params({})) + + def test_postprocessor_default_implementation(self): + assert self.postprocessor.to_params().params == {"type": "simple"} + assert isinstance(self.postprocessor, SimplePostprocessor) + + @pytest.mark.parametrize("batch", [ + { + "Do you want ants?": "Because that's how you get ants.", + "testing" : torch.tensor([[1, 2, 3]]) + }, + {}, + None + ], ids=["TestBatch", "EmptyBatch", "None"]) + @pytest.mark.parametrize("output_dict", [ + {"You're": ["Not", [["My"]], "Supervisor"]}, + {}, + None + ], ids=["TestOutput", "EmptyOutput", "None"]) + def test_simple_postprocessor_call(self, batch, output_dict): + data_loader = DummyDataLoader([]) + if batch is None or output_dict is None: + with pytest.raises(ValueError): + self.postprocessor(batch, output_dict, data_loader) # type: ignore + return + + expected = json.dumps(sanitize({**batch, **output_dict})) + result = self.postprocessor(batch, output_dict, data_loader) # type: ignore + assert result == expected From fda9469d8c18494e2325c9d67920fd5697e777d4 Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 08:35:22 -0400 Subject: [PATCH 02/12] Better Docs + Minor Bugfix Added in fix for the SimpleEvaluator not initializing a post processor --- allennlp/evaluation/evaluator.py | 44 ++++++++++++++++------------- tests/evaluation/evaluator_tests.py | 7 +++++ 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index 30e7313bc36..d19a0c3524a 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -1,24 +1,19 @@ """ Evaluator class for evaluating a model with a given dataset """ -from collections import defaultdict -from typing import Union, List, Dict, Any, Tuple, Optional +from typing import Union, Dict, Any from os import PathLike from pathlib import Path import torch -from itertools import groupby import logging -import json -import numpy as np -from allennlp.common.checks import check_for_gpu, ConfigurationError +from allennlp.common.checks import check_for_gpu from allennlp.common.tqdm import Tqdm -from allennlp.common.util import dump_metrics, sanitize, int_to_device, END_SYMBOL, START_SYMBOL -from allennlp.data.fields import TensorField +from allennlp.common.util import dump_metrics, int_to_device from allennlp.nn import util as nn_util -from allennlp.common import Registrable, Params +from allennlp.common import Registrable from allennlp.models import Model -from allennlp.data import DataLoader, Vocabulary +from allennlp.data import DataLoader from allennlp.evaluation.postprocessors.postprocessor import Postprocessor logger = logging.getLogger(__name__) @@ -29,10 +24,15 @@ class Evaluator(Registrable): Evaluation class # Parameters - + + batch_postprocessor: `Postprocessor` + The postprocessor to use for turning both the batches and the outputs + of the model into human readable data. + cuda_device : `Union[int, torch.device]`, optional (default=`-1`) - The cuda device to use for this evaluation. The model is assumed to already be using this - device; this parameter is only used for moving the input data to the correct device. + The cuda device to use for this evaluation. The model is assumed to + already be using this device; this parameter is only used for moving + the input data to the correct device. 
""" default_implementation = "simple" @@ -41,7 +41,7 @@ def __init__( batch_postprocessor: Postprocessor, cuda_device: Union[int, torch.device] = -1 ): - self.batch_human_serializer = batch_postprocessor + self.batch_postprocessor = batch_postprocessor self.cuda_device = cuda_device def __call__( @@ -51,7 +51,7 @@ def __call__( batch_weight_key: str = None, output_file: Union[str, PathLike] = None, predictions_file: Union[str, PathLike] = None - ): + ) -> Dict[str, Any]: """ # Parameters @@ -65,8 +65,10 @@ def __call__( the loss for that batch. If this is not given, we use a weight of 1 for every batch. metrics_output_file : `Union[str, PathLike]`, optional (default=`None`) Optional path to write the final metrics to. + predictions_output_file : `Union[str, PathLike]`, optional (default=`None`) - Optional path to write the predictions to. + Optional path to write the predictions to. If passed the + postprocessor will be called and its output will be written as lines. # Returns @@ -78,9 +80,13 @@ def __call__( @Evaluator.register("simple") class SimpleEvaluator(Evaluator): + """ + Simple evaluator implementation. Uses the vanilla evaluation code. + """ + def __init__( self, - batch_postprocessor, + batch_postprocessor:Postprocessor, cuda_device: Union[int, torch.device] = -1 ): super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) @@ -171,7 +177,7 @@ def __call__( # metrics if predictions_file is not None: predictions_file.write( - self.batch_human_serializer( + self.batch_postprocessor( batch, output_dict, data_loader ) + '\n' ) @@ -197,5 +203,5 @@ def _to_params(self) -> Dict[str, Any]: return { "type" : "simple", "cuda_device" : self.cuda_device, - "batch_postprocessor": self.batch_human_serializer.to_params() + "batch_postprocessor": self.batch_postprocessor.to_params() } diff --git a/tests/evaluation/evaluator_tests.py b/tests/evaluation/evaluator_tests.py index 28a088a9b06..9c21ffce190 100644 --- a/tests/evaluation/evaluator_tests.py +++ b/tests/evaluation/evaluator_tests.py @@ -55,3 +55,10 @@ def test_evaluate_calculates_average_loss_with_weights(self): data_loader = DummyDataLoader(outputs) metrics = self.evaluator(DummyModel(), data_loader, "batch_weight") # type: ignore assert metrics["loss"] == pytest.approx((70 + 18 + 12) / 13.5) + + def test_to_params(self): + assert self.evaluator.to_params() == { + "type" : "simple", + "cuda_device" : -1, + "batch_postprocessor": {"type": "simple"} + } From f3ee8c3d7f143af227e0eccb06525d3b42409a56 Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 09:48:12 -0400 Subject: [PATCH 03/12] Basic Support added to the evaluate function for the evaluator --- allennlp/commands/evaluate.py | 154 ++++++++++++++---- allennlp/common/testing/test_case.py | 12 +- allennlp/evaluation/evaluator.py | 22 +-- tests/commands/evaluate_test.py | 126 ++++++++++---- .../postprocessors/postprocessor_test.py | 1 + 5 files changed, 225 insertions(+), 90 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index b517de979de..77924fce3ac 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -7,8 +7,9 @@ import argparse import json import logging -from typing import Any, Dict - +from pathlib import Path +from os import PathLike +from typing import Union, Dict, Any, Optional, List from copy import deepcopy from overrides import overrides @@ -18,7 +19,7 @@ from allennlp.common.util import prepare_environment from allennlp.data import DataLoader from allennlp.models.archival import 
load_archive -from allennlp.training.util import evaluate +from allennlp.evaluation import Evaluator logger = logging.getLogger(__name__) @@ -98,8 +99,8 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument action="store_true", default=False, help="if specified, we will use the instances in your new dataset to " - "extend your vocabulary. If pretrained-file was used to initialize " - "embedding layers, you may also need to pass --embedding-sources-mapping.", + "extend your vocabulary. If pretrained-file was used to initialize " + "embedding layers, you may also need to pass --embedding-sources-mapping.", ) subparser.add_argument( @@ -107,9 +108,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, default="", help="a JSON dict defining mapping from embedding module path to embedding " - "pretrained-file used during training. If not passed, and embedding needs to be " - "extended, we will try to use the original file paths used during training. If " - "they are not available we will use random vectors for embedding extension.", + "pretrained-file used during training. If not passed, and embedding needs to be " + "extended, we will try to use the original file paths used during training. If " + "they are not available we will use random vectors for embedding extension.", ) subparser.add_argument( "--file-friendly-logging", @@ -118,13 +119,52 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument help="outputs tqdm status on separate lines and slows tqdm refresh rate", ) + subparser.add_argument( + "--auto-names", + default="NONE", + help="Automatically create output names for each evaluation file.", + choices=["NONE", "METRICS", "PREDS", "ALL"] + ) + subparser.set_defaults(func=evaluate_from_args) return subparser def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: - common_logging.FILE_FRIENDLY_LOGGING = args.file_friendly_logging + return evaluate_from_archive( + archive_file=args.archive_file, + input_file=args.input_file, + output_file=args.output_file, + predictions_output_file=args.predictions_output_file, + batch_size=args.batch_size, + cmd_overrides=args.overrides, + cuda_device=args.cuda_device, + embedding_sources_mapping=args.embedding_sources_mapping, + extend_vocab=args.extend_vocab, + weights_file=args.weights_file, + file_friendly_logging=args.file_friendly_logging, + batch_weight_key=args.batch_weight_key, + auto_names=args.auto_names + ) + + +def evaluate_from_archive( + archive_file: Union[str, PathLike], + input_file: str, + output_file: Optional[str] = None, + predictions_output_file: Optional[str] = None, + batch_size: Optional[int] = None, + cmd_overrides: Union[str, Dict[str, Any]] = "", + cuda_device: int = -1, + embedding_sources_mapping: str = None, + extend_vocab: bool = False, + weights_file: str = None, + file_friendly_logging: bool = False, + batch_weight_key: str = None, + auto_names: str = "None", +): + common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging # Disable some of the more verbose logging statements logging.getLogger("allennlp.common.params").disabled = True @@ -133,77 +173,119 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: # Load from archive archive = load_archive( - args.archive_file, - weights_file=args.weights_file, - cuda_device=args.cuda_device, - overrides=args.overrides, + archive_file, + weights_file=weights_file, + cuda_device=cuda_device, + overrides=cmd_overrides, ) config = 
deepcopy(archive.config) prepare_environment(config) model = archive.model model.eval() + # Load the evaluator from the config key `Evaluator` + evaluator_params = config.pop("evaluation", {}) + evaluator_params['cuda_device'] = cuda_device + evaluator = Evaluator.from_params(evaluator_params) + # Load the evaluation data dataset_reader = archive.validation_dataset_reader # split files - evaluation_data_path_list = args.input_file.split(":") - if args.output_file is not None: - output_file_list = args.output_file.split(":") - assert len(output_file_list) == len( - evaluation_data_path_list - ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." - if args.predictions_output_file is not None: - predictions_output_file_list = args.predictions_output_file.split(":") - assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( - "The number of `predictions_output_file` paths must be equal" - + "to the number of datasets being evaluated. " - ) + evaluation_data_path_list = input_file.split(":") + + # TODO(gabeorlanski): Is it safe to always default to .outputs and .preds? + # TODO(gabeorlanski): Add in way to save to specific output directory + if output_file is not None: + if auto_names == "METRICS" or auto_names == "ALL": + logger.warning(f"Passed output_files will be ignored, auto_names is" + f" set to {auto_names}") + + # Keep the path of the parent otherwise it will write to the CWD + output_file_list = [ + p.parent.joinpath(f"{p.stem}.outputs") + for p in map(Path, evaluation_data_path_list) + ] + else: + output_file_list = output_file.split(":") + assert len(output_file_list) == len( + evaluation_data_path_list + ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." + if predictions_output_file is not None: + if auto_names == "PREDS" or auto_names == "ALL": + logger.warning(f"Passed predictions files will be ignored, auto_names is" + f" set to {auto_names}") + + # Keep the path of the parent otherwise it will write to the CWD + predictions_output_file_list = [ + p.parent.joinpath(f"{p.stem}.preds") + for p in map(Path, evaluation_data_path_list) + ] + else: + predictions_output_file_list = predictions_output_file.split(":") + assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( + "The number of `predictions_output_file` paths must be equal" + + "to the number of datasets being evaluated. " + ) # output file output_file_path = None predictions_output_file_path = None # embedding sources - if args.extend_vocab: + if extend_vocab: logger.info("Vocabulary is being extended with embedding sources.") embedding_sources = ( - json.loads(args.embedding_sources_mapping) if args.embedding_sources_mapping else {} + json.loads(embedding_sources_mapping) if embedding_sources_mapping else {} ) + all_metrics = {} for index in range(len(evaluation_data_path_list)): config = deepcopy(archive.config) evaluation_data_path = evaluation_data_path_list[index] - if args.output_file is not None: + + # Get the eval file name so we can save each metric by file name in the + # output dictionary. 
+ eval_file_name = Path(evaluation_data_path).stem + + if output_file is not None: + # noinspection PyUnboundLocalVariable output_file_path = output_file_list[index] - if args.predictions_output_file is not None: + + if predictions_output_file is not None: + # noinspection PyUnboundLocalVariable predictions_output_file_path = predictions_output_file_list[index] logger.info("Reading evaluation data from %s", evaluation_data_path) data_loader_params = config.get("validation_data_loader", None) if data_loader_params is None: data_loader_params = config.get("data_loader") - if args.batch_size: - data_loader_params["batch_size"] = args.batch_size + if batch_size: + data_loader_params["batch_size"] = batch_size data_loader = DataLoader.from_params( params=data_loader_params, reader=dataset_reader, data_path=evaluation_data_path ) - if args.extend_vocab: + if extend_vocab: logger.info("Vocabulary is being extended with test instances.") model.vocab.extend_from_instances(instances=data_loader.iter_instances()) + # noinspection PyUnboundLocalVariable model.extend_embedder_vocab(embedding_sources) data_loader.index_with(model.vocab) - metrics = evaluate( + metrics = evaluator( model, data_loader, - args.cuda_device, - args.batch_weight_key, + batch_weight_key, output_file=output_file_path, predictions_output_file=predictions_output_file_path, ) + + # Add the metric prefixed by the file it came from. + for name, value in metrics.items(): + all_metrics[f"{eval_file_name}_{name}"] = value + logger.info("Finished evaluating.") - return metrics + return all_metrics diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index 05a34505079..dcfd5da217f 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -7,8 +7,7 @@ from allennlp.common.checks import log_pytorch_version_info - -# TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") +TEST_DIR = tempfile.mkdtemp(prefix="allennlp_tests") class AllenNlpTestCase: @@ -23,12 +22,6 @@ class AllenNlpTestCase: TESTS_ROOT = PROJECT_ROOT / "tests" FIXTURES_ROOT = PROJECT_ROOT / "test_fixtures" - # Had issues with the old tempdir, so used this pytest fixture that is - # always ran and will always create a tmpdir. 
- @pytest.fixture(autouse=True) - def test_dir(self, tmpdir): - self.TEST_DIR = pathlib.Path(tmpdir.mkdir('semparse')) - def setup_method(self): logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.DEBUG @@ -40,6 +33,9 @@ def setup_method(self): logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(logging.INFO) logging.getLogger("urllib3.connectionpool").disabled = True log_pytorch_version_info() + self.TEST_DIR = pathlib.Path(TEST_DIR) + + os.makedirs(self.TEST_DIR, exist_ok=True) def teardown_method(self): shutil.rmtree(self.TEST_DIR) diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index d19a0c3524a..fe9ee36d1f3 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -1,7 +1,7 @@ """ Evaluator class for evaluating a model with a given dataset """ -from typing import Union, Dict, Any +from typing import Union, Dict, Any, Optional from os import PathLike from pathlib import Path import torch @@ -14,7 +14,7 @@ from allennlp.common import Registrable from allennlp.models import Model from allennlp.data import DataLoader -from allennlp.evaluation.postprocessors.postprocessor import Postprocessor +from allennlp.evaluation.postprocessors.postprocessor import Postprocessor, SimplePostprocessor logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ class Evaluator(Registrable): # Parameters - batch_postprocessor: `Postprocessor` + batch_postprocessor: `Postprocessor`, optional (default=`SimplePostprocessor`) The postprocessor to use for turning both the batches and the outputs of the model into human readable data. @@ -38,10 +38,10 @@ class Evaluator(Registrable): def __init__( self, - batch_postprocessor: Postprocessor, + batch_postprocessor: Optional[Postprocessor] = None, cuda_device: Union[int, torch.device] = -1 ): - self.batch_postprocessor = batch_postprocessor + self.batch_postprocessor = batch_postprocessor or SimplePostprocessor() self.cuda_device = cuda_device def __call__( @@ -50,7 +50,7 @@ def __call__( data_loader: DataLoader, batch_weight_key: str = None, output_file: Union[str, PathLike] = None, - predictions_file: Union[str, PathLike] = None + predictions_output_file: Union[str, PathLike] = None ) -> Dict[str, Any]: """ # Parameters @@ -86,7 +86,7 @@ class SimpleEvaluator(Evaluator): def __init__( self, - batch_postprocessor:Postprocessor, + batch_postprocessor: Optional[Postprocessor] = None, cuda_device: Union[int, torch.device] = -1 ): super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) @@ -97,7 +97,7 @@ def __call__( data_loader: DataLoader, batch_weight_key: str = None, output_file: Union[str, PathLike] = None, - predictions_file: Union[str, PathLike] = None + predictions_output_file: Union[str, PathLike] = None ): """ # Parameters @@ -123,8 +123,10 @@ def __call__( check_for_gpu(self.cuda_device) data_loader.set_target_device(int_to_device(self.cuda_device)) output_file = Path(output_file) if output_file is not None else None - if predictions_file is not None: - predictions_file = Path(predictions_file).open("w", encoding="utf-8") + if predictions_output_file is not None: + predictions_file = Path(predictions_output_file).open("w", encoding="utf-8") + else: + predictions_file = None with torch.no_grad(): model.eval() diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index d16521d58df..950b56f07c9 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -1,7 +1,9 @@ 
import argparse import json +from pathlib import Path from typing import Iterator, List, Dict - +from shutil import copyfile +import pytest import torch from flaky import flaky @@ -57,12 +59,12 @@ def test_evaluate_from_args(self): args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == { - "accuracy", - "accuracy3", - "precision-overall", - "recall-overall", - "f1-measure-overall", - "loss", + "conll2003_accuracy", + "conll2003_accuracy3", + "conll2003_precision-overall", + "conll2003_recall-overall", + "conll2003_f1-measure-overall", + "conll2003_loss", } def test_output_file_evaluate_from_args(self): @@ -86,7 +88,7 @@ def test_output_file_evaluate_from_args(self): with open(output_file, "r") as file: saved_metrics = json.load(file) - assert computed_metrics == saved_metrics + assert computed_metrics == {f"conll2003_{k}": v for k, v in saved_metrics.items()} with open(predictions_output_file, "r") as file: for line in file: @@ -94,34 +96,35 @@ def test_output_file_evaluate_from_args(self): assert "tags" in prediction def test_multiple_output_files_evaluate_from_args(self): - output_file = str(self.TEST_DIR / "metrics.json") - predictions_output_file = str(self.TEST_DIR / "predictions.jsonl") - kebab_args = [ - "evaluate", - str( - self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" - ), - str(self.FIXTURES_ROOT / "data" / "conll2003.txt") - + ":" - + str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), - "--cuda-device", - "-1", - "--output-file", - output_file + ":" + output_file, - "--predictions-output-file", - predictions_output_file + ":" + predictions_output_file, - ] - args = self.parser.parse_args(kebab_args) - computed_metrics = evaluate_from_args(args) - - with open(output_file, "r") as file: - saved_metrics = json.load(file) - assert computed_metrics == saved_metrics - - with open(predictions_output_file, "r") as file: - for line in file: - prediction = json.loads(line.strip()) - assert "tags" in prediction + pytest.fail() + # output_file = str(self.TEST_DIR / "metrics.json") + # predictions_output_file = str(self.TEST_DIR / "predictions.jsonl") + # kebab_args = [ + # "evaluate", + # str( + # self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" + # ), + # str(self.FIXTURES_ROOT / "data" / "conll2003.txt") + # + ":" + # + str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), + # "--cuda-device", + # "-1", + # "--output-file", + # output_file + ":" + output_file, + # "--predictions-output-file", + # predictions_output_file + ":" + predictions_output_file, + # ] + # args = self.parser.parse_args(kebab_args) + # computed_metrics = evaluate_from_args(args) + # + # with open(output_file, "r") as file: + # saved_metrics = json.load(file) + # assert computed_metrics == {f"conll2003_{k}":v for k, v in saved_metrics.items()} + # + # with open(predictions_output_file, "r") as file: + # for line in file: + # prediction = json.loads(line.strip()) + # assert "tags" in prediction def test_evaluate_works_with_vocab_expansion(self): archive_path = str( @@ -155,3 +158,54 @@ def test_evaluate_works_with_vocab_expansion(self): ) assert metrics_1 != metrics_2 assert metrics_2 != metrics_3 + + @pytest.mark.parametrize("auto_names", ["NONE", "METRICS", "PREDS", "ALL"]) + def test_auto_names_creates_files(self, auto_names): + data_file = Path(self.FIXTURES_ROOT / "data" / "conll2003.txt") + paths = [] + out_paths = [] + pred_paths = [] + for i in range(5): + tmp_path = 
self.TEST_DIR.joinpath(f"TEST{i}.txt") + + # Need to create paths to check when they do not exist + out_paths.append(tmp_path.parent.joinpath(f"OUTPUTS{i}.json")) + pred_paths.append(tmp_path.parent.joinpath(f"PREDS{i}.txt")) + + copyfile(data_file, tmp_path) + paths.append(tmp_path) + + kebab_args = [ + "evaluate", + str( + self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" + ), + ":".join(map(str, paths)), + "--cuda-device", + "-1", + "--output-file", ":".join(map(str, out_paths)), + "--predictions-output-file", ":".join(map(str, pred_paths)), + "--auto-names", auto_names + ] + + args = self.parser.parse_args(kebab_args) + _ = evaluate_from_args(args) + + expected_input_data = data_file.read_text("utf-8") + + for i, p in enumerate(paths): + # Make sure it was not modified + assert p.read_text('utf-8') == expected_input_data + + if auto_names == "METRICS" or auto_names == "ALL": + assert not out_paths[i].exists() + assert p.parent.joinpath(f"{p.stem}.outputs").exists() + else: + assert out_paths[i].exists() + assert not p.parent.joinpath(f"{p.stem}.outputs").exists() + if auto_names == "PREDS" or auto_names == "ALL": + assert not pred_paths[i].exists() + assert p.parent.joinpath(f"{p.stem}.preds").exists() + else: + assert pred_paths[i].exists() + assert not p.parent.joinpath(f"{p.stem}.preds").exists() diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/postprocessors/postprocessor_test.py index 5a552959544..3969686af07 100644 --- a/tests/evaluation/postprocessors/postprocessor_test.py +++ b/tests/evaluation/postprocessors/postprocessor_test.py @@ -28,6 +28,7 @@ def set_target_device(self, _): class TestPostprocessor(AllenNlpTestCase): def setup_method(self): + super(TestPostprocessor, self).setup_method() self.postprocessor = Postprocessor.from_params(Params({})) def test_postprocessor_default_implementation(self): From 4fe6edd4f0e610da11ee7986f57eb8ec04fafa5b Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 09:57:00 -0400 Subject: [PATCH 04/12] Better tests for evaluate with multiple --- allennlp/commands/evaluate.py | 8 ++- allennlp/commands/train.py | 5 ++ tests/commands/evaluate_test.py | 90 ++++++++++++++++++++------------- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 77924fce3ac..356b31d0122 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -203,7 +203,7 @@ def evaluate_from_archive( # Keep the path of the parent otherwise it will write to the CWD output_file_list = [ - p.parent.joinpath(f"{p.stem}.outputs") + p.parent.joinpath(f"{p.stem}.outputs") for p in map(Path, evaluation_data_path_list) ] else: @@ -284,7 +284,11 @@ def evaluate_from_archive( # Add the metric prefixed by the file it came from. 
for name, value in metrics.items(): - all_metrics[f"{eval_file_name}_{name}"] = value + if len(evaluation_data_path_list) > 1: + key = f"{eval_file_name}_" + else: + key = "" + all_metrics[f"{key}{name}"] = value logger.info("Finished evaluating.") diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index ea55749f7bc..5da3aa3969b 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -239,6 +239,11 @@ def train_model( training_util.create_serialization_dir(params, serialization_dir, recover, force) params.to_file(os.path.join(serialization_dir, CONFIG_NAME)) + # Change Author: Gabe Orlanski + # Placeholder for the time being to make sure no errors are raised b/c of + # the evaluator. + params.pop("evaluation",None) + meta = Meta.new() meta.to_file(os.path.join(serialization_dir, META_NAME)) diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 950b56f07c9..df671fa6aa1 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -59,12 +59,12 @@ def test_evaluate_from_args(self): args = self.parser.parse_args(kebab_args) metrics = evaluate_from_args(args) assert metrics.keys() == { - "conll2003_accuracy", - "conll2003_accuracy3", - "conll2003_precision-overall", - "conll2003_recall-overall", - "conll2003_f1-measure-overall", - "conll2003_loss", + "accuracy", + "accuracy3", + "precision-overall", + "recall-overall", + "f1-measure-overall", + "loss", } def test_output_file_evaluate_from_args(self): @@ -88,7 +88,7 @@ def test_output_file_evaluate_from_args(self): with open(output_file, "r") as file: saved_metrics = json.load(file) - assert computed_metrics == {f"conll2003_{k}": v for k, v in saved_metrics.items()} + assert computed_metrics == saved_metrics with open(predictions_output_file, "r") as file: for line in file: @@ -96,35 +96,53 @@ def test_output_file_evaluate_from_args(self): assert "tags" in prediction def test_multiple_output_files_evaluate_from_args(self): - pytest.fail() - # output_file = str(self.TEST_DIR / "metrics.json") - # predictions_output_file = str(self.TEST_DIR / "predictions.jsonl") - # kebab_args = [ - # "evaluate", - # str( - # self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" - # ), - # str(self.FIXTURES_ROOT / "data" / "conll2003.txt") - # + ":" - # + str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), - # "--cuda-device", - # "-1", - # "--output-file", - # output_file + ":" + output_file, - # "--predictions-output-file", - # predictions_output_file + ":" + predictions_output_file, - # ] - # args = self.parser.parse_args(kebab_args) - # computed_metrics = evaluate_from_args(args) - # - # with open(output_file, "r") as file: - # saved_metrics = json.load(file) - # assert computed_metrics == {f"conll2003_{k}":v for k, v in saved_metrics.items()} - # - # with open(predictions_output_file, "r") as file: - # for line in file: - # prediction = json.loads(line.strip()) - # assert "tags" in prediction + data_file = Path(self.FIXTURES_ROOT / "data" / "conll2003.txt") + paths = [] + out_paths = [] + pred_paths = [] + for i in range(3): + tmp_path = self.TEST_DIR.joinpath(f"TEST{i}.txt") + + # Need to create paths to check when they do not exist + out_paths.append(tmp_path.parent.joinpath(f"OUTPUTS{i}.json")) + pred_paths.append(tmp_path.parent.joinpath(f"PREDS{i}.txt")) + + copyfile(data_file, tmp_path) + paths.append(tmp_path) + + kebab_args = [ + "evaluate", + str( + self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / 
"model.tar.gz" + ), + ":".join(map(str, paths)), + "--cuda-device", + "-1", + "--output-file", ":".join(map(str, out_paths)), + "--predictions-output-file", ":".join(map(str, pred_paths)) + ] + args = self.parser.parse_args(kebab_args) + computed_metrics = evaluate_from_args(args) + computed_by_file = {} + for k, v in computed_metrics.items(): + fn, *metric_name = k.split("_") + if fn not in computed_by_file: + computed_by_file[fn] = {} + computed_by_file[fn]["_".join(metric_name)] = v + + assert len(computed_by_file) == len(paths) + expected_input_data = data_file.read_text("utf-8") + + for i, p in enumerate(paths): + # Make sure it was not modified + assert p.read_text('utf-8') == expected_input_data + + assert p.stem in computed_by_file, f"paths[{i}]={p.stem}" + + assert out_paths[i].exists(), f"paths[{i}]={p.stem}" + saved_metrics = json.loads(out_paths[i].read_text('utf-8')) + assert saved_metrics == computed_by_file[p.stem], f"paths[{i}]={p.stem}" + assert pred_paths[i].exists(), f"paths[{i}]={p.stem}" def test_evaluate_works_with_vocab_expansion(self): archive_path = str( From f1a820f85e71d45c5689b19f157174b9e76cfcec Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 10:05:36 -0400 Subject: [PATCH 05/12] Added in way to pass postprocess function --- allennlp/evaluation/evaluator.py | 9 +++++- .../postprocessors/postprocessor.py | 31 ++++++++++++------- .../postprocessors/postprocessor_test.py | 21 +++++++++++-- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index fe9ee36d1f3..bbb46158e6f 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -128,6 +128,10 @@ def __call__( else: predictions_file = None + model_postprocess_function = getattr( + model, "make_output_human_readable", None + ) + with torch.no_grad(): model.eval() @@ -180,7 +184,10 @@ def __call__( if predictions_file is not None: predictions_file.write( self.batch_postprocessor( - batch, output_dict, data_loader + batch, + output_dict, + data_loader, + output_postprocess_function=model_postprocess_function ) + '\n' ) diff --git a/allennlp/evaluation/postprocessors/postprocessor.py b/allennlp/evaluation/postprocessors/postprocessor.py index 00e0dd0b154..3a9e0bfa35e 100644 --- a/allennlp/evaluation/postprocessors/postprocessor.py +++ b/allennlp/evaluation/postprocessors/postprocessor.py @@ -1,16 +1,9 @@ from collections import defaultdict -from typing import Union, List, Dict, Any, Tuple, Optional -from os import PathLike -from pathlib import Path -import torch -from itertools import groupby +from typing import Optional, Dict, Any, Callable import logging import json -import numpy as np -from allennlp.common.checks import check_for_gpu, ConfigurationError -from allennlp.common.tqdm import Tqdm -from allennlp.common.util import dump_metrics, sanitize, int_to_device, END_SYMBOL, START_SYMBOL +from allennlp.common.util import sanitize from allennlp.data.fields import TensorField from allennlp.nn import util as nn_util from allennlp.common import Registrable, Params @@ -29,7 +22,8 @@ def __call__( self, batch: Dict[str, TensorField], output_dict: Dict, - data_loader: DataLoader + data_loader: DataLoader, + output_postprocess_function: Optional[Callable] = None ) -> str: raise NotImplementedError("__call__") @@ -38,6 +32,11 @@ def __call__( @Postprocessor.register("simple") class SimplePostprocessor(Postprocessor): + """ + Very simple postprocesser. Only sanitizes the batches and outputs. 
Will use + a passed postprocess function for the outputs if it exists. + """ + def _to_params(self) -> Dict[str, Any]: return { "type": "simple" @@ -47,10 +46,18 @@ def __call__( self, batch: Dict[str, TensorField], output_dict: Dict, - data_loader: DataLoader + data_loader: DataLoader, + output_postprocess_function: Optional[Callable] = None ): if batch is None: raise ValueError("Postprocessor got a batch that is None") if output_dict is None: raise ValueError("Postprocessor got an output_dict that is None") - return json.dumps(sanitize({**batch, **output_dict})) + + postprocessed = sanitize(batch) + if output_postprocess_function is not None: + postprocessed.update(sanitize(output_postprocess_function(output_dict))) + else: + postprocessed.update(sanitize(output_dict)) + + return json.dumps(postprocessed) diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/postprocessors/postprocessor_test.py index 3969686af07..06e449e6485 100644 --- a/tests/evaluation/postprocessors/postprocessor_test.py +++ b/tests/evaluation/postprocessors/postprocessor_test.py @@ -48,13 +48,28 @@ def test_postprocessor_default_implementation(self): {}, None ], ids=["TestOutput", "EmptyOutput", "None"]) - def test_simple_postprocessor_call(self, batch, output_dict): + @pytest.mark.parametrize("postprocess_func", [ + lambda x: {k.upper(): v for k, v in x.items()}, + None + ], ids=["PassedFunction", "NoPassedFunction"]) + def test_simple_postprocessor_call(self, batch, output_dict, postprocess_func): data_loader = DummyDataLoader([]) if batch is None or output_dict is None: with pytest.raises(ValueError): self.postprocessor(batch, output_dict, data_loader) # type: ignore return - expected = json.dumps(sanitize({**batch, **output_dict})) - result = self.postprocessor(batch, output_dict, data_loader) # type: ignore + expected = json.dumps(sanitize( + { + **batch, + **(postprocess_func(output_dict) if postprocess_func else output_dict) + } + )) + + result = self.postprocessor( + batch, + output_dict, + data_loader, # type: ignore + postprocess_func + ) assert result == expected From 14fad34954eb6297ab4e5c4acb5c1c157357dfda Mon Sep 17 00:00:00 2001 From: Gabe Date: Sat, 23 Oct 2021 10:50:24 -0400 Subject: [PATCH 06/12] Base Evaluator done --- allennlp/commands/evaluate.py | 137 +++++++++++++----- allennlp/commands/train.py | 2 +- allennlp/common/testing/test_case.py | 1 - allennlp/evaluation/evaluator.py | 97 +++++++------ .../postprocessors/postprocessor.py | 79 +++++++--- tests/commands/evaluate_test.py | 21 ++- tests/evaluation/evaluator_tests.py | 6 +- .../postprocessors/postprocessor_test.py | 57 ++++---- 8 files changed, 262 insertions(+), 138 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 356b31d0122..3c941cedd17 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -9,7 +9,7 @@ import logging from pathlib import Path from os import PathLike -from typing import Union, Dict, Any, Optional, List +from typing import Union, Dict, Any, Optional from copy import deepcopy from overrides import overrides @@ -40,7 +40,7 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, help=( "path to the file containing the evaluation data" - ' (for mutiple files, put ":" between filenames e.g., input1.txt:input2.txt)' + ' (for multiple files, put ":" between filenames e.g., input1.txt:input2.txt)' ), ) @@ -49,7 +49,7 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> 
argparse.Argument type=str, help=( "optional path to write the metrics to as JSON" - ' (for mutiple files, put ":" between filenames e.g., output1.txt:output2.txt)' + ' (for multiple files, put ":" between filenames e.g., output1.txt:output2.txt)' ), ) @@ -58,7 +58,7 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, help=( "optional path to write the predictions to as JSON lines" - ' (for mutiple files, put ":" between filenames e.g., output1.jsonl:output2.jsonl)' + ' (for multiple files, put ":" between filenames e.g., output1.jsonl:output2.jsonl)' ), ) @@ -99,8 +99,8 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument action="store_true", default=False, help="if specified, we will use the instances in your new dataset to " - "extend your vocabulary. If pretrained-file was used to initialize " - "embedding layers, you may also need to pass --embedding-sources-mapping.", + "extend your vocabulary. If pretrained-file was used to initialize " + "embedding layers, you may also need to pass --embedding-sources-mapping.", ) subparser.add_argument( @@ -108,9 +108,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, default="", help="a JSON dict defining mapping from embedding module path to embedding " - "pretrained-file used during training. If not passed, and embedding needs to be " - "extended, we will try to use the original file paths used during training. If " - "they are not available we will use random vectors for embedding extension.", + "pretrained-file used during training. If not passed, and embedding needs to be " + "extended, we will try to use the original file paths used during training. If " + "they are not available we will use random vectors for embedding extension.", ) subparser.add_argument( "--file-friendly-logging", @@ -123,7 +123,7 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument "--auto-names", default="NONE", help="Automatically create output names for each evaluation file.", - choices=["NONE", "METRICS", "PREDS", "ALL"] + choices=["NONE", "METRICS", "PREDS", "ALL"], ) subparser.set_defaults(func=evaluate_from_args) @@ -145,25 +145,86 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: weights_file=args.weights_file, file_friendly_logging=args.file_friendly_logging, batch_weight_key=args.batch_weight_key, - auto_names=args.auto_names + auto_names=args.auto_names, ) def evaluate_from_archive( - archive_file: Union[str, PathLike], - input_file: str, - output_file: Optional[str] = None, - predictions_output_file: Optional[str] = None, - batch_size: Optional[int] = None, - cmd_overrides: Union[str, Dict[str, Any]] = "", - cuda_device: int = -1, - embedding_sources_mapping: str = None, - extend_vocab: bool = False, - weights_file: str = None, - file_friendly_logging: bool = False, - batch_weight_key: str = None, - auto_names: str = "None", -): + archive_file: Union[str, PathLike], + input_file: str, + output_file: Optional[str] = None, + predictions_output_file: Optional[str] = None, + batch_size: Optional[int] = None, + cmd_overrides: Union[str, Dict[str, Any]] = "", + cuda_device: int = -1, + embedding_sources_mapping: str = None, + extend_vocab: bool = False, + weights_file: str = None, + file_friendly_logging: bool = False, + batch_weight_key: str = None, + auto_names: str = "NONE", +) -> Dict[str, Any]: + """ + + # Parameters + + archive_file: `Union[str, PathLike]` + Path to an archived trained 
model. + + input_file: `str` + path to the file containing the evaluation data (for multiple files, + put ":" between filenames e.g., input1.txt:input2.txt) + + output_file: `str`, optional (default=`None`) + optional path to write the metrics to as JSON (for multiple files, put + ":" between filenames e.g., output1.txt:output2.txt) + + predictions_output_file: `str`, optional (default=`None`) + "optional path to write the predictions to (for multiple files, put ":" + between filenames e.g., output1.jsonl:output2.jsonl) + + batch_size: `int`, optional (default=`None`) + If non-empty, the batch size to use during evaluation. + + cmd_overrides: `str`, optional (default=`""`) + a json(net) structure used to override the experiment configuration, + e.g., '{\"iterator.batch_size\": 16}'. Nested parameters can be + specified either with nested dictionaries or with dot syntax. + + cuda_device: `int`, optional (default=`-1`) + id of GPU to use (if any) + + embedding_sources_mapping: `str`, optional (default=`None`) + a JSON dict defining mapping from embedding module path to embedding + pretrained-file used during training. If not passed, and embedding + needs to be extended, we will try to use the original file paths used + during training. If they are not available we will use random vectors + for embedding extension. + + extend_vocab: `bool`, optional (default=`False`) + if specified, we will use the instances in your new dataset to extend + your vocabulary. If pretrained-file was used to initialize embedding + layers, you may also need to pass --embedding-sources-mapping. + + weights_file:`str`, optional (default=`None`) + A path that overrides which weights file to use + + file_friendly_logging : `bool`, optional (default=`False`) + If `True`, we add newlines to tqdm output, even on an interactive terminal, and we slow + down tqdm's output to only once every 10 seconds. + + batch_weight_key: `str`, optional (default=`None`) + If non-empty, name of metric used to weight the loss on a per-batch basis. + + auto_names:`str`, optional (default=`"NONE"`) + Automatically create output names for each evaluation file. + + # Returns + + all_metrics: `Dict[str, Any]` + The metrics from every evaluation file passed. 
+ + """ common_logging.FILE_FRIENDLY_LOGGING = file_friendly_logging # Disable some of the more verbose logging statements @@ -185,7 +246,7 @@ def evaluate_from_archive( # Load the evaluator from the config key `Evaluator` evaluator_params = config.pop("evaluation", {}) - evaluator_params['cuda_device'] = cuda_device + evaluator_params["cuda_device"] = cuda_device evaluator = Evaluator.from_params(evaluator_params) # Load the evaluation data @@ -198,34 +259,34 @@ def evaluate_from_archive( # TODO(gabeorlanski): Add in way to save to specific output directory if output_file is not None: if auto_names == "METRICS" or auto_names == "ALL": - logger.warning(f"Passed output_files will be ignored, auto_names is" - f" set to {auto_names}") + logger.warning( + f"Passed output_files will be ignored, auto_names is" f" set to {auto_names}" + ) # Keep the path of the parent otherwise it will write to the CWD output_file_list = [ - p.parent.joinpath(f"{p.stem}.outputs") - for p in map(Path, evaluation_data_path_list) + p.parent.joinpath(f"{p.stem}.outputs") for p in map(Path, evaluation_data_path_list) ] else: - output_file_list = output_file.split(":") + output_file_list = output_file.split(":") # type: ignore assert len(output_file_list) == len( evaluation_data_path_list ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." if predictions_output_file is not None: if auto_names == "PREDS" or auto_names == "ALL": - logger.warning(f"Passed predictions files will be ignored, auto_names is" - f" set to {auto_names}") + logger.warning( + f"Passed predictions files will be ignored, auto_names is" f" set to {auto_names}" + ) # Keep the path of the parent otherwise it will write to the CWD predictions_output_file_list = [ - p.parent.joinpath(f"{p.stem}.preds") - for p in map(Path, evaluation_data_path_list) + p.parent.joinpath(f"{p.stem}.preds") for p in map(Path, evaluation_data_path_list) ] else: - predictions_output_file_list = predictions_output_file.split(":") + predictions_output_file_list = predictions_output_file.split(":") # type: ignore assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( - "The number of `predictions_output_file` paths must be equal" - + "to the number of datasets being evaluated. " + "The number of `predictions_output_file` paths must be equal" + + "to the number of datasets being evaluated. " ) # output file diff --git a/allennlp/commands/train.py b/allennlp/commands/train.py index 5da3aa3969b..7acb54581bf 100644 --- a/allennlp/commands/train.py +++ b/allennlp/commands/train.py @@ -242,7 +242,7 @@ def train_model( # Change Author: Gabe Orlanski # Placeholder for the time being to make sure no errors are raised b/c of # the evaluator. 
- params.pop("evaluation",None) + params.pop("evaluation", None) meta = Meta.new() meta.to_file(os.path.join(serialization_dir, META_NAME)) diff --git a/allennlp/common/testing/test_case.py b/allennlp/common/testing/test_case.py index dcfd5da217f..b3c4cbd00ab 100644 --- a/allennlp/common/testing/test_case.py +++ b/allennlp/common/testing/test_case.py @@ -2,7 +2,6 @@ import os import pathlib import shutil -import pytest import tempfile from allennlp.common.checks import log_pytorch_version_info diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index bbb46158e6f..648cc5ba1bc 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -21,8 +21,8 @@ class Evaluator(Registrable): """ - Evaluation class - + Evaluation Base class + # Parameters batch_postprocessor: `Postprocessor`, optional (default=`SimplePostprocessor`) @@ -34,25 +34,28 @@ class Evaluator(Registrable): already be using this device; this parameter is only used for moving the input data to the correct device. """ + default_implementation = "simple" def __init__( - self, - batch_postprocessor: Optional[Postprocessor] = None, - cuda_device: Union[int, torch.device] = -1 + self, + batch_postprocessor: Optional[Postprocessor] = None, + cuda_device: Union[int, torch.device] = -1, ): self.batch_postprocessor = batch_postprocessor or SimplePostprocessor() self.cuda_device = cuda_device def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ) -> Dict[str, Any]: """ + Evaluate a single data source. + # Parameters model : `Model` @@ -72,8 +75,8 @@ def __call__( # Returns - `Dict[str, Any]` - The final metrics. + metrics: `Dict[str, Any]` + The metrics from evaluating the file. """ raise NotImplementedError("__call__") @@ -82,24 +85,37 @@ def __call__( class SimpleEvaluator(Evaluator): """ Simple evaluator implementation. Uses the vanilla evaluation code. + + # Parameters + + batch_postprocessor: `Postprocessor`, optional (default=`SimplePostprocessor`) + The postprocessor to use for turning both the batches and the outputs + of the model into human readable data. + + cuda_device : `Union[int, torch.device]`, optional (default=`-1`) + The cuda device to use for this evaluation. The model is assumed to + already be using this device; this parameter is only used for moving + the input data to the correct device. """ def __init__( - self, - batch_postprocessor: Optional[Postprocessor] = None, - cuda_device: Union[int, torch.device] = -1 + self, + batch_postprocessor: Optional[Postprocessor] = None, + cuda_device: Union[int, torch.device] = -1, ): super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ): """ + Evaluate a single data source. + # Parameters model : `Model` @@ -117,8 +133,8 @@ def __call__( # Returns - `Dict[str, Any]` - The final metrics. 
+ metrics: `Dict[str, Any]` + The metrics from evaluating the file. """ check_for_gpu(self.cuda_device) data_loader.set_target_device(int_to_device(self.cuda_device)) @@ -126,11 +142,9 @@ def __call__( if predictions_output_file is not None: predictions_file = Path(predictions_output_file).open("w", encoding="utf-8") else: - predictions_file = None + predictions_file = None # type: ignore - model_postprocess_function = getattr( - model, "make_output_human_readable", None - ) + model_postprocess_function = getattr(model, "make_output_human_readable", None) with torch.no_grad(): model.eval() @@ -168,14 +182,14 @@ def __call__( metrics["loss"] = total_loss / total_weight description = ( - ", ".join( - [ - "%s: %.2f" % (name, value) - for name, value in metrics.items() - if not name.startswith("_") - ] - ) - + " ||" + ", ".join( + [ + "%s: %.2f" % (name, value) + for name, value in metrics.items() + if not name.startswith("_") + ] + ) + + " ||" ) generator_tqdm.set_description(description, refresh=False) @@ -187,8 +201,9 @@ def __call__( batch, output_dict, data_loader, - output_postprocess_function=model_postprocess_function - ) + '\n' + output_postprocess_function=model_postprocess_function, + ) + + "\n" ) if predictions_file is not None: @@ -210,7 +225,7 @@ def __call__( def _to_params(self) -> Dict[str, Any]: return { - "type" : "simple", - "cuda_device" : self.cuda_device, - "batch_postprocessor": self.batch_postprocessor.to_params() + "type": "simple", + "cuda_device": self.cuda_device, + "batch_postprocessor": self.batch_postprocessor.to_params(), } diff --git a/allennlp/evaluation/postprocessors/postprocessor.py b/allennlp/evaluation/postprocessors/postprocessor.py index 3a9e0bfa35e..118cd90ee4a 100644 --- a/allennlp/evaluation/postprocessors/postprocessor.py +++ b/allennlp/evaluation/postprocessors/postprocessor.py @@ -1,14 +1,11 @@ -from collections import defaultdict from typing import Optional, Dict, Any, Callable import logging import json from allennlp.common.util import sanitize from allennlp.data.fields import TensorField -from allennlp.nn import util as nn_util -from allennlp.common import Registrable, Params -from allennlp.models import Model -from allennlp.data import DataLoader, Vocabulary +from allennlp.common import Registrable +from allennlp.data import DataLoader logger = logging.getLogger(__name__) @@ -19,12 +16,36 @@ class Postprocessor(Registrable): """ def __call__( - self, - batch: Dict[str, TensorField], - output_dict: Dict, - data_loader: DataLoader, - output_postprocess_function: Optional[Callable] = None + self, + batch: Dict[str, TensorField], + output_dict: Dict, + data_loader: DataLoader, + output_postprocess_function: Optional[Callable] = None, ) -> str: + """ + Postprocess a batch. + + # Parameters + + batch: `Dict[str, TensorField]` + The batch that was passed to the model's forward function. + + output_dict: `Dict` + The output of the model's forward function on the batch + + data_loader: `DataLoader` + The dataloader to be used. + + output_postprocess_function: `Callable`, optional (default=`None`) + If you have a function to preprocess only the outputs ( + i.e. `model.make_human_readable`), use this parameter to have it + called on the output dict. 
+ + # Returns + + postprocessed: `str` + The postprocessed batches as strings + """ raise NotImplementedError("__call__") default_implementation = "simple" @@ -38,17 +59,39 @@ class SimplePostprocessor(Postprocessor): """ def _to_params(self) -> Dict[str, Any]: - return { - "type": "simple" - } + return {"type": "simple"} def __call__( - self, - batch: Dict[str, TensorField], - output_dict: Dict, - data_loader: DataLoader, - output_postprocess_function: Optional[Callable] = None + self, + batch: Dict[str, TensorField], + output_dict: Dict, + data_loader: DataLoader, + output_postprocess_function: Optional[Callable] = None, ): + """ + Postprocess a batch. + + # Parameters + + batch: `Dict[str, TensorField]` + The batch that was passed to the model's forward function. + + output_dict: `Dict` + The output of the model's forward function on the batch + + data_loader: `DataLoader` + The dataloader to be used. + + output_postprocess_function: `Callable`, optional (default=`None`) + If you have a function to preprocess only the outputs ( + i.e. `model.make_human_readable`), use this parameter to have it + called on the output dict. + + # Returns + + postprocessed: `str` + The postprocessed batches as strings + """ if batch is None: raise ValueError("Postprocessor got a batch that is None") if output_dict is None: diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index df671fa6aa1..5ada1368b2b 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -118,8 +118,10 @@ def test_multiple_output_files_evaluate_from_args(self): ":".join(map(str, paths)), "--cuda-device", "-1", - "--output-file", ":".join(map(str, out_paths)), - "--predictions-output-file", ":".join(map(str, pred_paths)) + "--output-file", + ":".join(map(str, out_paths)), + "--predictions-output-file", + ":".join(map(str, pred_paths)), ] args = self.parser.parse_args(kebab_args) computed_metrics = evaluate_from_args(args) @@ -135,12 +137,12 @@ def test_multiple_output_files_evaluate_from_args(self): for i, p in enumerate(paths): # Make sure it was not modified - assert p.read_text('utf-8') == expected_input_data + assert p.read_text("utf-8") == expected_input_data assert p.stem in computed_by_file, f"paths[{i}]={p.stem}" assert out_paths[i].exists(), f"paths[{i}]={p.stem}" - saved_metrics = json.loads(out_paths[i].read_text('utf-8')) + saved_metrics = json.loads(out_paths[i].read_text("utf-8")) assert saved_metrics == computed_by_file[p.stem], f"paths[{i}]={p.stem}" assert pred_paths[i].exists(), f"paths[{i}]={p.stem}" @@ -201,9 +203,12 @@ def test_auto_names_creates_files(self, auto_names): ":".join(map(str, paths)), "--cuda-device", "-1", - "--output-file", ":".join(map(str, out_paths)), - "--predictions-output-file", ":".join(map(str, pred_paths)), - "--auto-names", auto_names + "--output-file", + ":".join(map(str, out_paths)), + "--predictions-output-file", + ":".join(map(str, pred_paths)), + "--auto-names", + auto_names, ] args = self.parser.parse_args(kebab_args) @@ -213,7 +218,7 @@ def test_auto_names_creates_files(self, auto_names): for i, p in enumerate(paths): # Make sure it was not modified - assert p.read_text('utf-8') == expected_input_data + assert p.read_text("utf-8") == expected_input_data if auto_names == "METRICS" or auto_names == "ALL": assert not out_paths[i].exists() diff --git a/tests/evaluation/evaluator_tests.py b/tests/evaluation/evaluator_tests.py index 9c21ffce190..57ec28e6bfe 100644 --- a/tests/evaluation/evaluator_tests.py +++ 
b/tests/evaluation/evaluator_tests.py @@ -58,7 +58,7 @@ def test_evaluate_calculates_average_loss_with_weights(self): def test_to_params(self): assert self.evaluator.to_params() == { - "type" : "simple", - "cuda_device" : -1, - "batch_postprocessor": {"type": "simple"} + "type": "simple", + "cuda_device": -1, + "batch_postprocessor": {"type": "simple"}, } diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/postprocessors/postprocessor_test.py index 06e449e6485..fc0996c03fa 100644 --- a/tests/evaluation/postprocessors/postprocessor_test.py +++ b/tests/evaluation/postprocessors/postprocessor_test.py @@ -1,4 +1,4 @@ -from typing import Iterator, List, Dict +from typing import Iterator, List import torch import pytest import json @@ -35,23 +35,28 @@ def test_postprocessor_default_implementation(self): assert self.postprocessor.to_params().params == {"type": "simple"} assert isinstance(self.postprocessor, SimplePostprocessor) - @pytest.mark.parametrize("batch", [ - { - "Do you want ants?": "Because that's how you get ants.", - "testing" : torch.tensor([[1, 2, 3]]) - }, - {}, - None - ], ids=["TestBatch", "EmptyBatch", "None"]) - @pytest.mark.parametrize("output_dict", [ - {"You're": ["Not", [["My"]], "Supervisor"]}, - {}, - None - ], ids=["TestOutput", "EmptyOutput", "None"]) - @pytest.mark.parametrize("postprocess_func", [ - lambda x: {k.upper(): v for k, v in x.items()}, - None - ], ids=["PassedFunction", "NoPassedFunction"]) + @pytest.mark.parametrize( + "batch", + [ + { + "Do you want ants?": "Because that's how you get ants.", + "testing": torch.tensor([[1, 2, 3]]), + }, + {}, + None, + ], + ids=["TestBatch", "EmptyBatch", "None"], + ) + @pytest.mark.parametrize( + "output_dict", + [{"You're": ["Not", [["My"]], "Supervisor"]}, {}, None], + ids=["TestOutput", "EmptyOutput", "None"], + ) + @pytest.mark.parametrize( + "postprocess_func", + [lambda x: {k.upper(): v for k, v in x.items()}, None], + ids=["PassedFunction", "NoPassedFunction"], + ) def test_simple_postprocessor_call(self, batch, output_dict, postprocess_func): data_loader = DummyDataLoader([]) if batch is None or output_dict is None: @@ -59,17 +64,13 @@ def test_simple_postprocessor_call(self, batch, output_dict, postprocess_func): self.postprocessor(batch, output_dict, data_loader) # type: ignore return - expected = json.dumps(sanitize( - { - **batch, - **(postprocess_func(output_dict) if postprocess_func else output_dict) - } - )) + expected = json.dumps( + sanitize( + {**batch, **(postprocess_func(output_dict) if postprocess_func else output_dict)} + ) + ) result = self.postprocessor( - batch, - output_dict, - data_loader, # type: ignore - postprocess_func + batch, output_dict, data_loader, postprocess_func # type: ignore ) assert result == expected From b901865c16c3f13929ee13e0da1c92a79d9bd7a8 Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 5 Nov 2021 21:24:45 -0400 Subject: [PATCH 07/12] Style Fixes --- allennlp/commands/evaluate.py | 4 ++-- allennlp/evaluation/evaluator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 3c941cedd17..c54be3ecead 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -268,7 +268,7 @@ def evaluate_from_archive( p.parent.joinpath(f"{p.stem}.outputs") for p in map(Path, evaluation_data_path_list) ] else: - output_file_list = output_file.split(":") # type: ignore + output_file_list = output_file.split(":") # type: ignore assert len(output_file_list) 
== len( evaluation_data_path_list ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." @@ -283,7 +283,7 @@ def evaluate_from_archive( p.parent.joinpath(f"{p.stem}.preds") for p in map(Path, evaluation_data_path_list) ] else: - predictions_output_file_list = predictions_output_file.split(":") # type: ignore + predictions_output_file_list = predictions_output_file.split(":") # type: ignore assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( "The number of `predictions_output_file` paths must be equal" + "to the number of datasets being evaluated. " diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index 648cc5ba1bc..e58736786b8 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -142,7 +142,7 @@ def __call__( if predictions_output_file is not None: predictions_file = Path(predictions_output_file).open("w", encoding="utf-8") else: - predictions_file = None # type: ignore + predictions_file = None # type: ignore model_postprocess_function = getattr(model, "make_output_human_readable", None) From b9b4f9e5ca6e978b4e12e12d50f9a5853641ce98 Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 5 Nov 2021 21:27:15 -0400 Subject: [PATCH 08/12] Changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4de3f4ba397..e30453881ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +- Added an `Evaluator` class to make comparing source, target, and predictions easier. + ## [v2.8.0](https://github.com/allenai/allennlp/releases/tag/v2.8.0) - 2021-11-01 ### Added From 21d81657660df4e4c99b10dac3e0bf9d518cbedf Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 10 Dec 2021 11:44:06 -0500 Subject: [PATCH 09/12] PR Review Fixes --- allennlp/commands/evaluate.py | 72 +++++++++------- allennlp/evaluation/__init__.py | 2 +- allennlp/evaluation/evaluator.py | 84 +++++++++++-------- .../evaluation/postprocessors/__init__.py | 1 - allennlp/evaluation/serializers/__init__.py | 1 + .../serializers.py} | 30 +++---- .../__init__.py | 0 .../serializer_test.py} | 12 +-- 8 files changed, 113 insertions(+), 89 deletions(-) create mode 100644 allennlp/evaluation/serializers/__init__.py rename allennlp/evaluation/{postprocessors/postprocessor.py => serializers/serializers.py} (74%) rename tests/evaluation/{postprocessors => serializers}/__init__.py (100%) rename tests/evaluation/{postprocessors/postprocessor_test.py => serializers/serializer_test.py} (85%) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index c54be3ecead..4bdbd6e3652 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -99,8 +99,8 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument action="store_true", default=False, help="if specified, we will use the instances in your new dataset to " - "extend your vocabulary. If pretrained-file was used to initialize " - "embedding layers, you may also need to pass --embedding-sources-mapping.", + "extend your vocabulary. 
If pretrained-file was used to initialize " + "embedding layers, you may also need to pass --embedding-sources-mapping.", ) subparser.add_argument( @@ -108,9 +108,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, default="", help="a JSON dict defining mapping from embedding module path to embedding " - "pretrained-file used during training. If not passed, and embedding needs to be " - "extended, we will try to use the original file paths used during training. If " - "they are not available we will use random vectors for embedding extension.", + "pretrained-file used during training. If not passed, and embedding needs to be " + "extended, we will try to use the original file paths used during training. If " + "they are not available we will use random vectors for embedding extension.", ) subparser.add_argument( "--file-friendly-logging", @@ -122,7 +122,14 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument subparser.add_argument( "--auto-names", default="NONE", - help="Automatically create output names for each evaluation file.", + help="Automatically create output names for each evaluation file. " + "`NONE` will not automatically generate a file name for the " + "neither the metrics nor the predictions. In this case you will" + " need to pas in both `metrics_output_file` and `predictions_output_file`. " + "`METRICS` will only automatically create a file name for the" + " metrics file. `PREDS` will only automatically create a file" + " name for the predictions outputs. `ALL` will create a " + "filename for both the metrics and the predictions.", choices=["NONE", "METRICS", "PREDS", "ALL"], ) @@ -135,7 +142,7 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: return evaluate_from_archive( archive_file=args.archive_file, input_file=args.input_file, - output_file=args.output_file, + metrics_output_file=args.output_file, predictions_output_file=args.predictions_output_file, batch_size=args.batch_size, cmd_overrides=args.overrides, @@ -150,19 +157,19 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: def evaluate_from_archive( - archive_file: Union[str, PathLike], - input_file: str, - output_file: Optional[str] = None, - predictions_output_file: Optional[str] = None, - batch_size: Optional[int] = None, - cmd_overrides: Union[str, Dict[str, Any]] = "", - cuda_device: int = -1, - embedding_sources_mapping: str = None, - extend_vocab: bool = False, - weights_file: str = None, - file_friendly_logging: bool = False, - batch_weight_key: str = None, - auto_names: str = "NONE", + archive_file: Union[str, PathLike], + input_file: str, + metrics_output_file: Optional[str] = None, + predictions_output_file: Optional[str] = None, + batch_size: Optional[int] = None, + cmd_overrides: Union[str, Dict[str, Any]] = "", + cuda_device: int = -1, + embedding_sources_mapping: str = None, + extend_vocab: bool = False, + weights_file: str = None, + file_friendly_logging: bool = False, + batch_weight_key: str = None, + auto_names: str = "NONE", ) -> Dict[str, Any]: """ @@ -175,7 +182,7 @@ def evaluate_from_archive( path to the file containing the evaluation data (for multiple files, put ":" between filenames e.g., input1.txt:input2.txt) - output_file: `str`, optional (default=`None`) + metrics_output_file: `str`, optional (default=`None`) optional path to write the metrics to as JSON (for multiple files, put ":" between filenames e.g., output1.txt:output2.txt) @@ -217,7 +224,13 @@ def 
evaluate_from_archive( If non-empty, name of metric used to weight the loss on a per-batch basis. auto_names:`str`, optional (default=`"NONE"`) - Automatically create output names for each evaluation file. + Automatically create output names for each evaluation file.`NONE` will + not automatically generate a file name for the neither the metrics nor + the predictions. In this case you will need to pas in both + `metrics_output_file` and `predictions_output_file`. `METRICS` will only + automatically create a file name for the metrics file. `PREDS` will only + automatically create a file name for the predictions outputs. `ALL` + will create a filename for both the metrics and the predictions. # Returns @@ -257,7 +270,7 @@ def evaluate_from_archive( # TODO(gabeorlanski): Is it safe to always default to .outputs and .preds? # TODO(gabeorlanski): Add in way to save to specific output directory - if output_file is not None: + if metrics_output_file is not None: if auto_names == "METRICS" or auto_names == "ALL": logger.warning( f"Passed output_files will be ignored, auto_names is" f" set to {auto_names}" @@ -268,10 +281,11 @@ def evaluate_from_archive( p.parent.joinpath(f"{p.stem}.outputs") for p in map(Path, evaluation_data_path_list) ] else: - output_file_list = output_file.split(":") # type: ignore + output_file_list = metrics_output_file.split(":") # type: ignore assert len(output_file_list) == len( evaluation_data_path_list - ), "The number of `output_file` paths must be equal to the number of datasets being evaluated." + ), "The number of `metrics_output_file` paths must be equal to the number " \ + "of datasets being evaluated." if predictions_output_file is not None: if auto_names == "PREDS" or auto_names == "ALL": logger.warning( @@ -285,8 +299,8 @@ def evaluate_from_archive( else: predictions_output_file_list = predictions_output_file.split(":") # type: ignore assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( - "The number of `predictions_output_file` paths must be equal" - + "to the number of datasets being evaluated. " + "The number of `predictions_output_file` paths must be equal" + + "to the number of datasets being evaluated. " ) # output file @@ -309,7 +323,7 @@ def evaluate_from_archive( # output dictionary. 
eval_file_name = Path(evaluation_data_path).stem - if output_file is not None: + if metrics_output_file is not None: # noinspection PyUnboundLocalVariable output_file_path = output_file_list[index] @@ -339,7 +353,7 @@ def evaluate_from_archive( model, data_loader, batch_weight_key, - output_file=output_file_path, + metrics_output_file=output_file_path, predictions_output_file=predictions_output_file_path, ) diff --git a/allennlp/evaluation/__init__.py b/allennlp/evaluation/__init__.py index ea63688b772..feb03dd961e 100644 --- a/allennlp/evaluation/__init__.py +++ b/allennlp/evaluation/__init__.py @@ -1,2 +1,2 @@ from allennlp.evaluation.evaluator import Evaluator, SimpleEvaluator -from allennlp.evaluation.postprocessors.postprocessor import Postprocessor +from allennlp.evaluation.serializers.serializers import Serializer diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index e58736786b8..d42a7745e0e 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -14,7 +14,7 @@ from allennlp.common import Registrable from allennlp.models import Model from allennlp.data import DataLoader -from allennlp.evaluation.postprocessors.postprocessor import Postprocessor, SimplePostprocessor +from allennlp.evaluation.serializers.serializers import Serializer, SimpleSerializer logger = logging.getLogger(__name__) @@ -33,25 +33,30 @@ class Evaluator(Registrable): The cuda device to use for this evaluation. The model is assumed to already be using this device; this parameter is only used for moving the input data to the correct device. + + postprocessor_fn_name: `str`, optional (default=`"make_output_human_readable"`) + Function name of the model's postprocessing function. """ default_implementation = "simple" def __init__( - self, - batch_postprocessor: Optional[Postprocessor] = None, - cuda_device: Union[int, torch.device] = -1, + self, + batch_serializer: Optional[Serializer] = None, + cuda_device: Union[int, torch.device] = -1, + postprocessor_fn_name: str = "make_output_human_readable" ): - self.batch_postprocessor = batch_postprocessor or SimplePostprocessor() + self.batch_serializer = batch_serializer or SimpleSerializer() self.cuda_device = cuda_device + self.postprocessor_fn_name = postprocessor_fn_name def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None, + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + metrics_output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ) -> Dict[str, Any]: """ Evaluate a single data source. @@ -73,6 +78,7 @@ def __call__( Optional path to write the predictions to. If passed the postprocessor will be called and its output will be written as lines. + # Returns metrics: `Dict[str, Any]` @@ -96,22 +102,26 @@ class SimpleEvaluator(Evaluator): The cuda device to use for this evaluation. The model is assumed to already be using this device; this parameter is only used for moving the input data to the correct device. + + postprocessor_fn_name: `str`, optional (default=`"make_output_human_readable"`) + Function name of the model's postprocessing function. 
""" def __init__( - self, - batch_postprocessor: Optional[Postprocessor] = None, - cuda_device: Union[int, torch.device] = -1, + self, + batch_serializer: Optional[Serializer] = None, + cuda_device: Union[int, torch.device] = -1, + postprocessor_fn_name: str = "make_output_human_readable" ): - super(SimpleEvaluator, self).__init__(batch_postprocessor, cuda_device) + super(SimpleEvaluator, self).__init__(batch_serializer, cuda_device, postprocessor_fn_name) def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None, + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + metrics_output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ): """ Evaluate a single data source. @@ -138,13 +148,13 @@ def __call__( """ check_for_gpu(self.cuda_device) data_loader.set_target_device(int_to_device(self.cuda_device)) - output_file = Path(output_file) if output_file is not None else None + metrics_output_file = Path(metrics_output_file) if metrics_output_file is not None else None if predictions_output_file is not None: predictions_file = Path(predictions_output_file).open("w", encoding="utf-8") else: predictions_file = None # type: ignore - model_postprocess_function = getattr(model, "make_output_human_readable", None) + model_postprocess_function = getattr(model, self.postprocessor_fn_name, None) with torch.no_grad(): model.eval() @@ -182,14 +192,14 @@ def __call__( metrics["loss"] = total_loss / total_weight description = ( - ", ".join( - [ - "%s: %.2f" % (name, value) - for name, value in metrics.items() - if not name.startswith("_") - ] - ) - + " ||" + ", ".join( + [ + "%s: %.2f" % (name, value) + for name, value in metrics.items() + if not name.startswith("_") + ] + ) + + " ||" ) generator_tqdm.set_description(description, refresh=False) @@ -197,7 +207,7 @@ def __call__( # metrics if predictions_file is not None: predictions_file.write( - self.batch_postprocessor( + self.batch_serializer( batch, output_dict, data_loader, @@ -218,14 +228,14 @@ def __call__( ) final_metrics["loss"] = total_loss / total_weight - if output_file is not None: - dump_metrics(str(output_file), final_metrics, log=True) + if metrics_output_file is not None: + dump_metrics(str(metrics_output_file), final_metrics, log=True) return final_metrics def _to_params(self) -> Dict[str, Any]: return { - "type": "simple", - "cuda_device": self.cuda_device, - "batch_postprocessor": self.batch_postprocessor.to_params(), + "type" : "simple", + "cuda_device" : self.cuda_device, + "batch_postprocessor": self.batch_serializer.to_params(), } diff --git a/allennlp/evaluation/postprocessors/__init__.py b/allennlp/evaluation/postprocessors/__init__.py index df1c98089c0..e69de29bb2d 100644 --- a/allennlp/evaluation/postprocessors/__init__.py +++ b/allennlp/evaluation/postprocessors/__init__.py @@ -1 +0,0 @@ -from allennlp.evaluation.postprocessors.postprocessor import SimplePostprocessor diff --git a/allennlp/evaluation/serializers/__init__.py b/allennlp/evaluation/serializers/__init__.py new file mode 100644 index 00000000000..a9aef4a6b22 --- /dev/null +++ b/allennlp/evaluation/serializers/__init__.py @@ -0,0 +1 @@ +from allennlp.evaluation.serializers.serializers import SimpleSerializer diff --git a/allennlp/evaluation/postprocessors/postprocessor.py b/allennlp/evaluation/serializers/serializers.py similarity index 74% rename from 
allennlp/evaluation/postprocessors/postprocessor.py rename to allennlp/evaluation/serializers/serializers.py index 118cd90ee4a..2fbcfa79f6e 100644 --- a/allennlp/evaluation/postprocessors/postprocessor.py +++ b/allennlp/evaluation/serializers/serializers.py @@ -10,9 +10,9 @@ logger = logging.getLogger(__name__) -class Postprocessor(Registrable): +class Serializer(Registrable): """ - General Postprocessor class for turning batches into human readable data + General serializer class for turning batches into human readable data """ def __call__( @@ -51,11 +51,11 @@ def __call__( default_implementation = "simple" -@Postprocessor.register("simple") -class SimplePostprocessor(Postprocessor): +@Serializer.register("simple") +class SimpleSerializer(Serializer): """ - Very simple postprocesser. Only sanitizes the batches and outputs. Will use - a passed postprocess function for the outputs if it exists. + Very simple serializer. Only sanitizes the batches and outputs. Will use + a passed serializer function for the outputs if it exists. """ def _to_params(self) -> Dict[str, Any]: @@ -69,7 +69,7 @@ def __call__( output_postprocess_function: Optional[Callable] = None, ): """ - Postprocess a batch. + Serializer a batch. # Parameters @@ -89,18 +89,18 @@ def __call__( # Returns - postprocessed: `str` - The postprocessed batches as strings + serialized: `str` + The serialized batches as strings """ if batch is None: - raise ValueError("Postprocessor got a batch that is None") + raise ValueError("Serializer got a batch that is None") if output_dict is None: - raise ValueError("Postprocessor got an output_dict that is None") + raise ValueError("Serializer got an output_dict that is None") - postprocessed = sanitize(batch) + serialized = sanitize(batch) if output_postprocess_function is not None: - postprocessed.update(sanitize(output_postprocess_function(output_dict))) + serialized.update(sanitize(output_postprocess_function(output_dict))) else: - postprocessed.update(sanitize(output_dict)) + serialized.update(sanitize(output_dict)) - return json.dumps(postprocessed) + return json.dumps(serialized) diff --git a/tests/evaluation/postprocessors/__init__.py b/tests/evaluation/serializers/__init__.py similarity index 100% rename from tests/evaluation/postprocessors/__init__.py rename to tests/evaluation/serializers/__init__.py diff --git a/tests/evaluation/postprocessors/postprocessor_test.py b/tests/evaluation/serializers/serializer_test.py similarity index 85% rename from tests/evaluation/postprocessors/postprocessor_test.py rename to tests/evaluation/serializers/serializer_test.py index fc0996c03fa..684f01c8a5b 100644 --- a/tests/evaluation/postprocessors/postprocessor_test.py +++ b/tests/evaluation/serializers/serializer_test.py @@ -7,8 +7,8 @@ from allennlp.common import Params from allennlp.common.util import sanitize from allennlp.data.data_loaders import TensorDict -from allennlp.evaluation import Postprocessor -from allennlp.evaluation.postprocessors import SimplePostprocessor +from allennlp.evaluation import Serializer +from allennlp.evaluation.serializers import SimpleSerializer class DummyDataLoader: @@ -26,14 +26,14 @@ def set_target_device(self, _): pass -class TestPostprocessor(AllenNlpTestCase): +class TestSerializer(AllenNlpTestCase): def setup_method(self): - super(TestPostprocessor, self).setup_method() - self.postprocessor = Postprocessor.from_params(Params({})) + super(TestSerializer, self).setup_method() + self.postprocessor = Serializer.from_params(Params({})) def 
test_postprocessor_default_implementation(self): assert self.postprocessor.to_params().params == {"type": "simple"} - assert isinstance(self.postprocessor, SimplePostprocessor) + assert isinstance(self.postprocessor, SimpleSerializer) @pytest.mark.parametrize( "batch", From 90983a1acb8b12c3603e6ae2d6cb15c442ba37c9 Mon Sep 17 00:00:00 2001 From: Gabe Date: Fri, 10 Dec 2021 11:46:40 -0500 Subject: [PATCH 10/12] Formatting --- allennlp/commands/evaluate.py | 62 ++++++++++++++++---------------- allennlp/evaluation/evaluator.py | 60 +++++++++++++++---------------- 2 files changed, 61 insertions(+), 61 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 4bdbd6e3652..bd80fbb4aa7 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -99,8 +99,8 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument action="store_true", default=False, help="if specified, we will use the instances in your new dataset to " - "extend your vocabulary. If pretrained-file was used to initialize " - "embedding layers, you may also need to pass --embedding-sources-mapping.", + "extend your vocabulary. If pretrained-file was used to initialize " + "embedding layers, you may also need to pass --embedding-sources-mapping.", ) subparser.add_argument( @@ -108,9 +108,9 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument type=str, default="", help="a JSON dict defining mapping from embedding module path to embedding " - "pretrained-file used during training. If not passed, and embedding needs to be " - "extended, we will try to use the original file paths used during training. If " - "they are not available we will use random vectors for embedding extension.", + "pretrained-file used during training. If not passed, and embedding needs to be " + "extended, we will try to use the original file paths used during training. If " + "they are not available we will use random vectors for embedding extension.", ) subparser.add_argument( "--file-friendly-logging", @@ -123,13 +123,13 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument "--auto-names", default="NONE", help="Automatically create output names for each evaluation file. " - "`NONE` will not automatically generate a file name for the " - "neither the metrics nor the predictions. In this case you will" - " need to pas in both `metrics_output_file` and `predictions_output_file`. " - "`METRICS` will only automatically create a file name for the" - " metrics file. `PREDS` will only automatically create a file" - " name for the predictions outputs. `ALL` will create a " - "filename for both the metrics and the predictions.", + "`NONE` will not automatically generate a file name for the " + "neither the metrics nor the predictions. In this case you will" + " need to pas in both `metrics_output_file` and `predictions_output_file`. " + "`METRICS` will only automatically create a file name for the" + " metrics file. `PREDS` will only automatically create a file" + " name for the predictions outputs. 
`ALL` will create a " + "filename for both the metrics and the predictions.", choices=["NONE", "METRICS", "PREDS", "ALL"], ) @@ -157,19 +157,19 @@ def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]: def evaluate_from_archive( - archive_file: Union[str, PathLike], - input_file: str, - metrics_output_file: Optional[str] = None, - predictions_output_file: Optional[str] = None, - batch_size: Optional[int] = None, - cmd_overrides: Union[str, Dict[str, Any]] = "", - cuda_device: int = -1, - embedding_sources_mapping: str = None, - extend_vocab: bool = False, - weights_file: str = None, - file_friendly_logging: bool = False, - batch_weight_key: str = None, - auto_names: str = "NONE", + archive_file: Union[str, PathLike], + input_file: str, + metrics_output_file: Optional[str] = None, + predictions_output_file: Optional[str] = None, + batch_size: Optional[int] = None, + cmd_overrides: Union[str, Dict[str, Any]] = "", + cuda_device: int = -1, + embedding_sources_mapping: str = None, + extend_vocab: bool = False, + weights_file: str = None, + file_friendly_logging: bool = False, + batch_weight_key: str = None, + auto_names: str = "NONE", ) -> Dict[str, Any]: """ @@ -282,10 +282,10 @@ def evaluate_from_archive( ] else: output_file_list = metrics_output_file.split(":") # type: ignore - assert len(output_file_list) == len( - evaluation_data_path_list - ), "The number of `metrics_output_file` paths must be equal to the number " \ - "of datasets being evaluated." + assert len(output_file_list) == len(evaluation_data_path_list), ( + "The number of `metrics_output_file` paths must be equal to the number " + "of datasets being evaluated." + ) if predictions_output_file is not None: if auto_names == "PREDS" or auto_names == "ALL": logger.warning( @@ -299,8 +299,8 @@ def evaluate_from_archive( else: predictions_output_file_list = predictions_output_file.split(":") # type: ignore assert len(predictions_output_file_list) == len(evaluation_data_path_list), ( - "The number of `predictions_output_file` paths must be equal" - + "to the number of datasets being evaluated. " + "The number of `predictions_output_file` paths must be equal" + + "to the number of datasets being evaluated. " ) # output file diff --git a/allennlp/evaluation/evaluator.py b/allennlp/evaluation/evaluator.py index d42a7745e0e..9c7ef7b6703 100644 --- a/allennlp/evaluation/evaluator.py +++ b/allennlp/evaluation/evaluator.py @@ -41,22 +41,22 @@ class Evaluator(Registrable): default_implementation = "simple" def __init__( - self, - batch_serializer: Optional[Serializer] = None, - cuda_device: Union[int, torch.device] = -1, - postprocessor_fn_name: str = "make_output_human_readable" + self, + batch_serializer: Optional[Serializer] = None, + cuda_device: Union[int, torch.device] = -1, + postprocessor_fn_name: str = "make_output_human_readable", ): self.batch_serializer = batch_serializer or SimpleSerializer() self.cuda_device = cuda_device self.postprocessor_fn_name = postprocessor_fn_name def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - metrics_output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None, + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + metrics_output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ) -> Dict[str, Any]: """ Evaluate a single data source. 
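For reference, a minimal usage sketch (not part of this patch) of the constructor and `__call__` signatures reformatted above. It assumes the `"simple"` registration that the evaluator tests rely on; all other names are taken directly from this diff.

    from allennlp.common import Params
    from allennlp.evaluation import Evaluator, SimpleEvaluator

    # Direct construction with the keyword arguments introduced in this PR.
    evaluator = SimpleEvaluator(
        cuda_device=-1,
        postprocessor_fn_name="make_output_human_readable",
    )

    # Config-driven construction: with no "type" key, the registrable default
    # ("simple") is assumed to resolve to SimpleEvaluator, as the tests expect.
    evaluator = Evaluator.from_params(Params({"cuda_device": -1}))
    assert isinstance(evaluator, SimpleEvaluator)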
@@ -108,20 +108,20 @@ class SimpleEvaluator(Evaluator): """ def __init__( - self, - batch_serializer: Optional[Serializer] = None, - cuda_device: Union[int, torch.device] = -1, - postprocessor_fn_name: str = "make_output_human_readable" + self, + batch_serializer: Optional[Serializer] = None, + cuda_device: Union[int, torch.device] = -1, + postprocessor_fn_name: str = "make_output_human_readable", ): super(SimpleEvaluator, self).__init__(batch_serializer, cuda_device, postprocessor_fn_name) def __call__( - self, - model: Model, - data_loader: DataLoader, - batch_weight_key: str = None, - metrics_output_file: Union[str, PathLike] = None, - predictions_output_file: Union[str, PathLike] = None, + self, + model: Model, + data_loader: DataLoader, + batch_weight_key: str = None, + metrics_output_file: Union[str, PathLike] = None, + predictions_output_file: Union[str, PathLike] = None, ): """ Evaluate a single data source. @@ -192,14 +192,14 @@ def __call__( metrics["loss"] = total_loss / total_weight description = ( - ", ".join( - [ - "%s: %.2f" % (name, value) - for name, value in metrics.items() - if not name.startswith("_") - ] - ) - + " ||" + ", ".join( + [ + "%s: %.2f" % (name, value) + for name, value in metrics.items() + if not name.startswith("_") + ] + ) + + " ||" ) generator_tqdm.set_description(description, refresh=False) @@ -235,7 +235,7 @@ def __call__( def _to_params(self) -> Dict[str, Any]: return { - "type" : "simple", - "cuda_device" : self.cuda_device, + "type": "simple", + "cuda_device": self.cuda_device, "batch_postprocessor": self.batch_serializer.to_params(), } From 1ee6546870831e095bb503b2c978da4442fb1bd5 Mon Sep 17 00:00:00 2001 From: Gabe Date: Tue, 4 Jan 2022 09:33:00 -0500 Subject: [PATCH 11/12] Fixed Flake8 and Black issues --- allennlp/commands/evaluate.py | 21 +++++++-------------- tests/commands/evaluate_test.py | 11 +++-------- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index 5fdef7a5533..bd5b9603d55 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -12,7 +12,6 @@ from typing import Union, Dict, Any, Optional from copy import deepcopy - from allennlp.commands.subcommand import Subcommand from allennlp.common import logging as common_logging from allennlp.common.util import prepare_environment @@ -36,28 +35,23 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument subparser.add_argument( "input_file", type=str, - help=( - "path to the file containing the evaluation data" - " (for mutiple files, put "," between filenames e.g., input1.txt,input2.txt)" - ), + help="path to the file containing the evaluation data (for mutiple " + "files, put between filenames e.g., input1.txt,input2.txt)", ) subparser.add_argument( "--output-file", type=str, - help=( - "optional path to write the metrics to as JSON" - " (for mutiple files, put "," between filenames e.g., output1.txt,output2.txt)" - ), + help="optional path to write the metrics to as JSON (for mutiple " + "files, put between filenames e.g., output1.txt,output2.txt)", ) subparser.add_argument( "--predictions-output-file", type=str, - help=( - "optional path to write the predictions to as JSON lines" - " (for mutiple files, put "," between filenames e.g., output1.jsonl,output2.jsonl)" - ), + help="optional path to write the predictions to as JSON lines " + "(for mutiple files, put between filenames e.g., " + "output1.jsonl,output2.jsonl)", ) subparser.add_argument( @@ -301,7 
+295,6 @@ def evaluate_from_archive( + "to the number of datasets being evaluated. " ) - # output file output_file_path = None predictions_output_file_path = None diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index 692a20d9b2b..c454e07eaf3 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -112,18 +112,13 @@ def test_multiple_output_files_evaluate_from_args(self): kebab_args = [ "evaluate", - str( - self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" - ), - str(self.FIXTURES_ROOT / "data" / "conll2003.txt") - + "," - + str(self.FIXTURES_ROOT / "data" / "conll2003.txt"), + ",".join(map(str, paths)), "--cuda-device", "-1", "--output-file", - output_file + "," + output_file, + ",".join(map(str, out_paths)), "--predictions-output-file", - predictions_output_file + "," + predictions_output_file, + ",".join(map(str, pred_paths)), ] args = self.parser.parse_args(kebab_args) computed_metrics = evaluate_from_args(args) From 11ca86efd54c2c2b40c09cdecc16bc1c6de99b1a Mon Sep 17 00:00:00 2001 From: Gabe Date: Tue, 4 Jan 2022 10:05:01 -0500 Subject: [PATCH 12/12] Fixed Test Issues --- allennlp/commands/evaluate.py | 4 ++-- tests/commands/evaluate_test.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/allennlp/commands/evaluate.py b/allennlp/commands/evaluate.py index bd5b9603d55..2b9403a417f 100644 --- a/allennlp/commands/evaluate.py +++ b/allennlp/commands/evaluate.py @@ -36,14 +36,14 @@ def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.Argument "input_file", type=str, help="path to the file containing the evaluation data (for mutiple " - "files, put between filenames e.g., input1.txt,input2.txt)", + "files, put between filenames e.g., input1.txt,input2.txt)", ) subparser.add_argument( "--output-file", type=str, help="optional path to write the metrics to as JSON (for mutiple " - "files, put between filenames e.g., output1.txt,output2.txt)", + "files, put between filenames e.g., output1.txt,output2.txt)", ) subparser.add_argument( diff --git a/tests/commands/evaluate_test.py b/tests/commands/evaluate_test.py index c454e07eaf3..eebf7753453 100644 --- a/tests/commands/evaluate_test.py +++ b/tests/commands/evaluate_test.py @@ -112,6 +112,9 @@ def test_multiple_output_files_evaluate_from_args(self): kebab_args = [ "evaluate", + str( + self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" + ), ",".join(map(str, paths)), "--cuda-device", "-1", @@ -197,13 +200,13 @@ def test_auto_names_creates_files(self, auto_names): str( self.FIXTURES_ROOT / "simple_tagger_with_span_f1" / "serialization" / "model.tar.gz" ), - ":".join(map(str, paths)), + ",".join(map(str, paths)), "--cuda-device", "-1", "--output-file", - ":".join(map(str, out_paths)), + ",".join(map(str, out_paths)), "--predictions-output-file", - ":".join(map(str, pred_paths)), + ",".join(map(str, pred_paths)), "--auto-names", auto_names, ]
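For reference, a small sketch (not part of this patch) of the file names that `--auto-names` derives for each evaluation path, mirroring the `Path.stem` logic in `evaluate_from_archive`; the input path below is only a placeholder.

    from pathlib import Path

    p = Path("data/conll2003.txt")  # placeholder evaluation file
    metrics_path = p.parent.joinpath(f"{p.stem}.outputs")  # data/conll2003.outputs
    preds_path = p.parent.joinpath(f"{p.stem}.preds")      # data/conll2003.preds
    print(metrics_path, preds_path)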