diff --git a/CHANGELOG.md b/CHANGELOG.md index 7adc2ad183d..b226f0a5d8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## Unreleased + +### Added + +- Added `TaskSuite` base class and command line functionality for running [`checklist`](https://github.com/marcotcr/checklist) test suites, along with implementations for `SentimentAnalysisSuite`, `QuestionAnsweringSuite`, and `TextualEntailmentSuite`. These can be found in the `allennlp.sanity_checks.task_checklists` module. + ## [v2.4.0](https://github.com/allenai/allennlp/releases/tag/v2.4.0) - 2021-04-22 @@ -40,7 +46,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fixed a bug with the `ShardedDatasetReader` when used with multi-process data loading (https://github.com/allenai/allennlp/issues/5132). - ## [v2.3.0](https://github.com/allenai/allennlp/releases/tag/v2.3.0) - 2021-04-14 ### Added @@ -103,6 +108,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `Model.get_parameters_for_histogram_tensorboard_logging` is deprecated in favor of `Model.get_parameters_for_histogram_logging`. + ### Fixed - Makes sure tensors that are stored in `TensorCache` always live on CPUs diff --git a/Makefile b/Makefile index 1ed1a6b1098..fdbfc181234 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,9 @@ install : # See https://github.com/pypa/pip/issues/4537. python setup.py install_egg_info pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt - + # Docs are not built on docker, and the runner is unable to find + # the nltk_data folder. Hence, we download the requirement. + python -c 'import nltk; nltk.download("sentiwordnet")' # # Documention helpers. # diff --git a/allennlp/commands/__init__.py b/allennlp/commands/__init__.py index 3a0fba2232f..8b5f100a0aa 100644 --- a/allennlp/commands/__init__.py +++ b/allennlp/commands/__init__.py @@ -18,6 +18,7 @@ from allennlp.commands.count_instances import CountInstances from allennlp.common.plugins import import_plugins from allennlp.common.util import import_module_and_submodules +from allennlp.commands.checklist import CheckList logger = logging.getLogger(__name__) diff --git a/allennlp/commands/checklist.py b/allennlp/commands/checklist.py new file mode 100644 index 00000000000..27a061915a4 --- /dev/null +++ b/allennlp/commands/checklist.py @@ -0,0 +1,199 @@ +""" +The `checklist` subcommand allows you to sanity check your +model's predictions using a trained model and its +[`Predictor`](../predictors/predictor.md#predictor) wrapper. +""" + +from typing import Optional, Dict, Any, List +import argparse +import sys +import json + +from overrides import overrides + +from allennlp.commands.subcommand import Subcommand +from allennlp.common.checks import check_for_gpu, ConfigurationError +from allennlp.models.archival import load_archive +from allennlp.predictors.predictor import Predictor +from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite + + +@Subcommand.register("checklist") +class CheckList(Subcommand): + @overrides + def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + + description = """Run the specified model through a checklist suite.""" + subparser = parser.add_parser( + self.name, + description=description, + help="Run a trained model through a checklist suite.", + ) + + subparser.add_argument( + "archive_file", type=str, help="The archived model to make predictions with" + ) + + subparser.add_argument("task", type=str, help="The name of the task suite") + + subparser.add_argument("--checklist-suite", type=str, help="The checklist suite path") + + subparser.add_argument( + "--capabilities", + nargs="+", + default=[], + help=('An optional list of strings of capabilities. Eg. "[Vocabulary, Robustness]"'), + ) + + subparser.add_argument( + "--max-examples", + type=int, + default=None, + help="Maximum number of examples to check per test.", + ) + + subparser.add_argument( + "--task-suite-args", + type=str, + default="", + help=( + "An optional JSON structure used to provide additional parameters to the task suite" + ), + ) + + subparser.add_argument( + "--print-summary-args", + type=str, + default="", + help=( + "An optional JSON structure used to provide additional " + "parameters for printing test summary" + ), + ) + + subparser.add_argument("--output-file", type=str, help="Path to output file") + + subparser.add_argument( + "--cuda-device", type=int, default=-1, help="ID of GPU to use (if any)" + ) + + subparser.add_argument( + "--predictor", type=str, help="Optionally specify a specific predictor to use" + ) + + subparser.add_argument( + "--predictor-args", + type=str, + default="", + help=( + "An optional JSON structure used to provide additional parameters to the predictor" + ), + ) + + subparser.set_defaults(func=_run_suite) + + return subparser + + +def _get_predictor(args: argparse.Namespace) -> Predictor: + check_for_gpu(args.cuda_device) + archive = load_archive( + args.archive_file, + cuda_device=args.cuda_device, + ) + + predictor_args = args.predictor_args.strip() + if len(predictor_args) <= 0: + predictor_args = {} + else: + predictor_args = json.loads(predictor_args) + + return Predictor.from_archive( + archive, + args.predictor, + extra_args=predictor_args, + ) + + +def _get_task_suite(args: argparse.Namespace) -> TaskSuite: + available_tasks = TaskSuite.list_available() + if args.task in available_tasks: + suite_name = args.task + else: + raise ConfigurationError( + f"'{args.task}' is not a recognized task suite. " + f"Available tasks are: {available_tasks}." + ) + + file_path = args.checklist_suite + + task_suite_args = args.task_suite_args.strip() + if len(task_suite_args) <= 0: + task_suite_args = {} + else: + task_suite_args = json.loads(task_suite_args) + + return TaskSuite.constructor( + name=suite_name, + suite_file=file_path, + extra_args=task_suite_args, + ) + + +class _CheckListManager: + def __init__( + self, + task_suite: TaskSuite, + predictor: Predictor, + capabilities: Optional[List[str]] = None, + max_examples: Optional[int] = None, + output_file: Optional[str] = None, + print_summary_args: Optional[Dict[str, Any]] = None, + ) -> None: + self._task_suite = task_suite + self._predictor = predictor + self._capabilities = capabilities + self._max_examples = max_examples + self._output_file = None if output_file is None else open(output_file, "w") + self._print_summary_args = print_summary_args or {} + + if capabilities: + self._print_summary_args["capabilities"] = capabilities + + def run(self) -> None: + self._task_suite.run( + self._predictor, capabilities=self._capabilities, max_examples=self._max_examples + ) + + # We pass in an IO object. + output_file = self._output_file or sys.stdout + self._task_suite.summary(file=output_file, **self._print_summary_args) + + # If `_output_file` was None, there would be nothing to close. + if self._output_file is not None: + self._output_file.close() + + +def _run_suite(args: argparse.Namespace) -> None: + + task_suite = _get_task_suite(args) + predictor = _get_predictor(args) + + print_summary_args = args.print_summary_args.strip() + if len(print_summary_args) <= 0: + print_summary_args = {} + else: + print_summary_args = json.loads(print_summary_args) + + capabilities = args.capabilities + max_examples = args.max_examples + + manager = _CheckListManager( + task_suite, + predictor, + capabilities, + max_examples, + args.output_file, + print_summary_args, + ) + manager.run() diff --git a/allennlp/common/testing/checklist_test.py b/allennlp/common/testing/checklist_test.py new file mode 100644 index 00000000000..c84b82b7afb --- /dev/null +++ b/allennlp/common/testing/checklist_test.py @@ -0,0 +1,35 @@ +from typing import Optional +from checklist.test_suite import TestSuite +from checklist.test_types import MFT as MinimumFunctionalityTest +from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite + + +@TaskSuite.register("fake-task-suite") +class FakeTaskSuite(TaskSuite): + """ + Fake checklist suite for testing purpose. + """ + + def __init__( + self, + suite: Optional[TestSuite] = None, + fake_arg1: Optional[int] = None, + fake_arg2: Optional[int] = None, + ): + self._fake_arg1 = fake_arg1 + self._fake_arg2 = fake_arg2 + + if not suite: + suite = TestSuite() + + # Adding a simple checklist test. + test = MinimumFunctionalityTest( + ["sentence 1", "sentence 2"], + labels=0, + name="fake test 1", + capability="fake capability", + description="Test's description", + ) + suite.add(test) + + super().__init__(suite) diff --git a/allennlp/sanity_checks/task_checklists/__init__.py b/allennlp/sanity_checks/task_checklists/__init__.py new file mode 100644 index 00000000000..ef0e0d28263 --- /dev/null +++ b/allennlp/sanity_checks/task_checklists/__init__.py @@ -0,0 +1,10 @@ +from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite +from allennlp.sanity_checks.task_checklists.sentiment_analysis_suite import ( + SentimentAnalysisSuite, +) +from allennlp.sanity_checks.task_checklists.question_answering_suite import ( + QuestionAnsweringSuite, +) +from allennlp.sanity_checks.task_checklists.textual_entailment_suite import ( + TextualEntailmentSuite, +) diff --git a/allennlp/sanity_checks/task_checklists/question_answering_suite.py b/allennlp/sanity_checks/task_checklists/question_answering_suite.py new file mode 100644 index 00000000000..890ccb6b4ee --- /dev/null +++ b/allennlp/sanity_checks/task_checklists/question_answering_suite.py @@ -0,0 +1,208 @@ +from typing import Optional, Iterable, Tuple, Union +import itertools +import numpy as np +from overrides import overrides +from checklist.editor import MunchWithAdd as CheckListTemplate +from checklist.test_suite import TestSuite +from checklist.test_types import MFT +from checklist.perturb import Perturb +from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite +from allennlp.sanity_checks.task_checklists import utils + + +def _crossproduct(template: CheckListTemplate): + """ + Takes the output of editor.template and does the cross product of contexts and qas + """ + ret = [] + ret_labels = [] + for instance in template.data: + cs = instance["contexts"] + qas = instance["qas"] + d = list(itertools.product(cs, qas)) + ret.append([(x[0], x[1][0]) for x in d]) + ret_labels.append([x[1][1] for x in d]) + template.data = ret + template.labels = ret_labels + return template + + +@TaskSuite.register("question-answering") +class QuestionAnsweringSuite(TaskSuite): + def __init__( + self, + suite: Optional[TestSuite] = None, + context_key: str = "context", + question_key: str = "question", + answer_key: str = "best_span_str", + **kwargs, + ): + self._context_key = context_key + self._question_key = question_key + self._answer_key = answer_key + + super().__init__(suite, **kwargs) + + def _prediction_and_confidence_scores(self, predictor): + def preds_and_confs_fn(data): + data = [{self._context_key: pair[0], self._question_key: pair[1]} for pair in data] + predictions = predictor.predict_batch_json(data) + labels = [pred[self._answer_key] for pred in predictions] + return labels, np.ones(len(labels)) + + return preds_and_confs_fn + + @overrides + def _format_failing_examples( + self, + inputs: Tuple, + pred: str, + conf: Union[np.array, np.ndarray], + label: Optional[str] = None, + *args, + **kwargs, + ): + """ + Formatting function for printing failed test examples. + """ + context, question = inputs + ret = "Context: %s\nQuestion: %s\n" % (context, question) + if label is not None: + ret += "Original answer: %s\n" % label + ret += "Predicted answer: %s\n" % pred + return ret + + @classmethod + def contractions(cls): + def _contractions(x): + conts = Perturb.contractions(x[1]) + return [(x[0], a) for a in conts] + + return _contractions + + @classmethod + def typos(cls): + def question_typo(x, **kwargs): + return (x[0], Perturb.add_typos(x[1], **kwargs)) + + return question_typo + + @classmethod + def punctuation(cls): + def context_punctuation(x): + return (utils.strip_punctuation(x[0]), x[1]) + + return context_punctuation + + @overrides + def _setup_editor(self): + super()._setup_editor() + + adj = [ + "old", + "smart", + "tall", + "young", + "strong", + "short", + "tough", + "cool", + "fast", + "nice", + "small", + "dark", + "wise", + "rich", + "great", + "weak", + "high", + "slow", + "strange", + "clean", + ] + adj = [(x.rstrip("e"), x) for x in adj] + + self.editor.add_lexicon("adjectives_to_compare", adj, overwrite=True) + + comp_pairs = [ + ("better", "worse"), + ("older", "younger"), + ("smarter", "dumber"), + ("taller", "shorter"), + ("bigger", "smaller"), + ("stronger", "weaker"), + ("faster", "slower"), + ("darker", "lighter"), + ("richer", "poorer"), + ("happier", "sadder"), + ("louder", "quieter"), + ("warmer", "colder"), + ] + + self.editor.add_lexicon("comp_pairs", comp_pairs, overwrite=True) + + @overrides + def _default_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + super()._default_tests(data, num_test_cases) + self._setup_editor() + self._default_vocabulary_tests(data, num_test_cases) + self._default_taxonomy_tests(data, num_test_cases) + + def _default_vocabulary_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + + template = self.editor.template( + [ + ( + "{first_name} is {adjectives_to_compare[0]}er than {first_name1}.", + "Who is less {adjectives_to_compare[1]}?", + ), + ( + "{first_name} is {adjectives_to_compare[0]}er than {first_name1}.", + "Who is {adjectives_to_compare[0]}er?", + ), + ], + labels=["{first_name1}", "{first_name}"], + remove_duplicates=True, + nsamples=num_test_cases, + save=True, + ) + test = MFT( + **template, + name="A is COMP than B. Who is more / less COMP?", + description='Eg. Context: "A is taller than B" ' + 'Q: "Who is taller?" A: "A", Q: "Who is less tall?" A: "B"', + capability="Vocabulary", + ) + self.add_test(test) + + def _default_taxonomy_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + template = _crossproduct( + self.editor.template( + { + "contexts": [ + "{first_name} is {comp_pairs[0]} than {first_name1}.", + "{first_name1} is {comp_pairs[1]} than {first_name}.", + ], + "qas": [ + ( + "Who is {comp_pairs[1]}?", + "{first_name1}", + ), + ( + "Who is {comp_pairs[0]}?", + "{first_name}", + ), + ], + }, + remove_duplicates=True, + nsamples=num_test_cases, + save=True, + ) + ) + test = MFT( + **template, + name="A is COMP than B. Who is antonym(COMP)? B", + description='Eg. Context: "A is taller than B", Q: "Who is shorter?", A: "B"', + capability="Taxonomy", + ) + self.add_test(test) diff --git a/allennlp/sanity_checks/task_checklists/sentiment_analysis_suite.py b/allennlp/sanity_checks/task_checklists/sentiment_analysis_suite.py new file mode 100644 index 00000000000..79dcfe8a75b --- /dev/null +++ b/allennlp/sanity_checks/task_checklists/sentiment_analysis_suite.py @@ -0,0 +1,758 @@ +from typing import Optional, Iterable, List, Union, Tuple +import numpy as np +from overrides import overrides +from checklist.test_suite import TestSuite +from checklist.test_types import MFT, INV, DIR, Expect +from checklist.editor import Editor +from checklist.perturb import Perturb +from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite +from allennlp.sanity_checks.task_checklists import utils +from allennlp.data.instance import Instance + + +def _add_phrase_function(phrases: List[str], num_samples: int = 10): + """ + Returns a function which adds each str in `phrases` + at the end of the input string and returns that list. + """ + + def perturb_fn(inp): + input_str = utils.strip_punctuation(inp) + total = len(phrases) + idx = np.random.choice(total, min(num_samples, total), replace=False) + ret = [input_str + ". " + phrases[i] for i in idx] + return ret + + return perturb_fn + + +@TaskSuite.register("sentiment-analysis") +class SentimentAnalysisSuite(TaskSuite): + """ + This suite was built using the checklist process with the self.editor + suggestions. Users are encouraged to add/modify as they see fit. + + Note: `editor.suggest(...)` can be slow as it runs a language model. + """ + + def __init__( + self, + suite: Optional[TestSuite] = None, + positive: Optional[int] = 0, + negative: Optional[int] = 1, + **kwargs, + ): + + self._positive = positive + self._negative = negative + super().__init__(suite, **kwargs) + + @overrides + def _prediction_and_confidence_scores(self, predictor): + def preds_and_confs_fn(data): + labels = [] + confs = [] + if isinstance(data[0], Instance): + predictions = predictor.predict_batch_instance(data) + else: + data = [{"sentence": sentence} for sentence in data] + predictions = predictor.predict_batch_json(data) + for pred in predictions: + label = pred["probs"].index(max(pred["probs"])) + labels.append(label) + confs.append([pred["probs"][self._positive], pred["probs"][self._negative]]) + return np.array(labels), np.array(confs) + + return preds_and_confs_fn + + @overrides + def _format_failing_examples( + self, + inputs: Tuple, + pred: int, + conf: Union[np.array, np.ndarray], + label: Optional[int] = None, + *args, + **kwargs, + ): + """ + Formatting function for printing failed test examples. + """ + labels = {self._positive: "Positive", self._negative: "Negative"} + ret = str(inputs) + if label is not None: + ret += " (Original: %s)" % labels[label] + ret += "\nPrediction: %s (Confidence: %.1f)" % (labels[pred], conf[pred]) + + return ret + + @overrides + def _default_tests(self, data: Optional[Iterable[str]], num_test_cases=100): + super()._default_tests(data, num_test_cases) + self._setup_editor() + self._default_vocabulary_tests(data, num_test_cases) + self._default_ner_tests(data, num_test_cases) + self._default_temporal_tests(data, num_test_cases) + self._default_fairness_tests(data, num_test_cases) + self._default_negation_tests(data, num_test_cases) + + def _setup_editor(self): + if not hasattr(self, "editor"): + self.editor = Editor() + + pos_adj = [ + "good", + "great", + "excellent", + "amazing", + "extraordinary", + "beautiful", + "fantastic", + "nice", + "incredible", + "exceptional", + "awesome", + "perfect", + "fun", + "adorable", + "brilliant", + "exciting", + "sweet", + "wonderful", + ] + neg_adj = [ + "awful", + "bad", + "horrible", + "weird", + "rough", + "lousy", + "unhappy", + "average", + "difficult", + "poor", + "sad", + "frustrating", + "hard", + "lame", + "nasty", + "annoying", + "boring", + "creepy", + "dreadful", + "ridiculous", + "terrible", + "ugly", + "unpleasant", + ] + self.editor.add_lexicon("pos_adj", pos_adj, overwrite=True) + self.editor.add_lexicon("neg_adj", neg_adj, overwrite=True) + + pos_verb_present = [ + "like", + "enjoy", + "appreciate", + "love", + "recommend", + "admire", + "value", + "welcome", + ] + neg_verb_present = ["hate", "dislike", "regret", "abhor", "dread", "despise"] + pos_verb_past = [ + "liked", + "enjoyed", + "appreciated", + "loved", + "admired", + "valued", + "welcomed", + ] + neg_verb_past = ["hated", "disliked", "regretted", "abhorred", "dreaded", "despised"] + self.editor.add_lexicon("pos_verb_present", pos_verb_present, overwrite=True) + self.editor.add_lexicon("neg_verb_present", neg_verb_present, overwrite=True) + self.editor.add_lexicon("pos_verb_past", pos_verb_past, overwrite=True) + self.editor.add_lexicon("neg_verb_past", neg_verb_past, overwrite=True) + self.editor.add_lexicon("pos_verb", pos_verb_present + pos_verb_past, overwrite=True) + self.editor.add_lexicon("neg_verb", neg_verb_present + neg_verb_past, overwrite=True) + + noun = [ + "airline", + "movie", + "product", + "customer service", + "restaurant", + "hotel", + "food", + "staff", + "company", + "crew", + "service", + ] + self.editor.add_lexicon("noun", noun, overwrite=True) + + intens_adj = [ + "very", + "really", + "absolutely", + "truly", + "extremely", + "quite", + "incredibly", + "amazingly", + "especially", + "exceptionally", + "unbelievably", + "utterly", + "exceedingly", + "rather", + "totally", + "particularly", + ] + intens_verb = [ + "really", + "absolutely", + "truly", + "extremely", + "especially", + "utterly", + "totally", + "particularly", + "highly", + "definitely", + "certainly", + "genuinely", + "honestly", + "strongly", + "sure", + "sincerely", + ] + + self.editor.add_lexicon("intens_adj", intens_adj, overwrite=True) + self.editor.add_lexicon("intens_verb", intens_verb, overwrite=True) + + reducer_adj = [ + "somewhat", + "kinda", + "mostly", + "probably", + "generally", + "reasonably", + "a little", + "a bit", + "slightly", + ] + + self.editor.add_lexicon("reducer_adj", reducer_adj, overwrite=True) + + self.monotonic_label = Expect.monotonic(increasing=True, tolerance=0.1) + self.monotonic_label_down = Expect.monotonic(increasing=False, tolerance=0.1) + + def _default_vocabulary_tests(self, data: Optional[Iterable[str]], num_test_cases=100): + + positive_words = ( + self.editor.lexicons["pos_adj"] + + self.editor.lexicons["pos_verb_present"] + + self.editor.lexicons["pos_verb_past"] + ) + + test = MFT( + positive_words, + labels=self._positive, + name="Single Positive Words", + capability="Vocabulary", + description="Correctly recognizes positive words", + ) + + self.add_test(test) + + negative_words = ( + self.editor.lexicons["neg_adj"] + + self.editor.lexicons["neg_verb_present"] + + self.editor.lexicons["neg_verb_past"] + ) + + test = MFT( + negative_words, + labels=self._negative, + name="Single Negative Words", + capability="Vocabulary", + description="Correctly recognizes negative words", + ) + + self.add_test(test) + + template = self.editor.template( + "{it} {noun} {be} {pos_adj}.", + it=["The", "This", "That"], + be=["is", "was"], + labels=self._positive, + save=True, + ) + template += self.editor.template( + "{it} {be} {a:pos_adj} {noun}.", + it=["It", "This", "That"], + be=["is", "was"], + labels=self._positive, + save=True, + ) + template += self.editor.template( + "{i} {pos_verb} {the} {noun}.", + i=["I", "We"], + the=["this", "that", "the"], + labels=self._positive, + save=True, + ) + template += self.editor.template( + "{it} {noun} {be} {neg_adj}.", + it=["That", "This", "The"], + be=["is", "was"], + labels=self._negative, + save=True, + ) + template += self.editor.template( + "{it} {be} {a:neg_adj} {noun}.", + it=["It", "This", "That"], + be=["is", "was"], + labels=self._negative, + save=True, + ) + template += self.editor.template( + "{i} {neg_verb} {the} {noun}.", + i=["I", "We"], + the=["this", "that", "the"], + labels=self._negative, + save=True, + ) + + test = MFT( + **template, + name="Sentiment-laden words in context", + capability="Vocabulary", + description="Use positive and negative verbs and adjectives " + "with nouns such as product, movie, airline, etc. " + 'E.g. "This was a bad movie"', + ) + + self.add_test(test) + + template = self.editor.template( + ["{it} {be} {a:pos_adj} {noun}.", "{it} {be} {a:intens_adj} {pos_adj} {noun}."], + it=["It", "This", "That"], + be=["is", "was"], + nsamples=num_test_cases, + save=True, + ) + template += self.editor.template( + ["{i} {pos_verb} {the} {noun}.", "{i} {intens_verb} {pos_verb} {the} {noun}."], + i=["I", "We"], + the=["this", "that", "the"], + nsamples=num_test_cases, + save=True, + ) + template += self.editor.template( + ["{it} {be} {a:neg_adj} {noun}.", "{it} {be} {a:intens_adj} {neg_adj} {noun}."], + it=["It", "This", "That"], + be=["is", "was"], + nsamples=num_test_cases, + save=True, + ) + template += self.editor.template( + ["{i} {neg_verb} {the} {noun}.", "{i} {intens_verb} {neg_verb} {the} {noun}."], + i=["I", "We"], + the=["this", "that", "the"], + nsamples=num_test_cases, + save=True, + ) + + test = DIR( + template.data, + self.monotonic_label, + templates=template.templates, + name="Intensifiers", + capability="Vocabulary", + description="Test is composed of pairs of sentences (x1, x2), where we add an intensifier" + "such as 'really',or 'very' to x2 and expect the confidence to NOT go down " + "(with tolerance=0.1). e.g.:" + "x1 = 'That was a good movie'" + "x2 = 'That was a very good movie'", + ) + + self.add_test(test) + + template = self.editor.template( + ["{it} {noun} {be} {pos_adj}.", "{it} {noun} {be} {reducer_adj} {pos_adj}."], + it=["The", "This", "That"], + be=["is", "was"], + nsamples=num_test_cases, + save=True, + ) + template += self.editor.template( + ["{it} {noun} {be} {neg_adj}.", "{it} {noun} {be} {reducer_adj} {neg_adj}."], + it=["The", "This", "That"], + be=["is", "was"], + nsamples=num_test_cases, + save=True, + ) + test = DIR( + template.data, + self.monotonic_label_down, + templates=template.templates, + name="Reducers", + capability="Vocabulary", + description="Test is composed of pairs of sentences (x1, x2), where we add a reducer" + "such as 'somewhat', or 'kinda' to x2 and expect the confidence to NOT go up " + " (with tolerance=0.1). e.g.:" + "x1 = 'The staff was good.'" + "x2 = 'The staff was somewhat good.'", + ) + + self.add_test(test) + + if data: + + positive = self.editor.template("I {pos_verb_present} you.").data + positive += self.editor.template("You are {pos_adj}.").data + + negative = self.editor.template("I {neg_verb_present} you.").data + negative += self.editor.template("You are {neg_adj}.").data + + template = Perturb.perturb( + data, _add_phrase_function(positive), nsamples=num_test_cases + ) + test = DIR( + template.data, + Expect.pairwise(self._diff_up), + name="Add positive phrases", + capability="Vocabulary", + description="Add very positive phrases (e.g. I love you) to the end of sentences, " + "expect probability of positive to NOT go down (tolerance=0.1)", + ) + + self.add_test(test) + + template = Perturb.perturb( + data, _add_phrase_function(negative), nsamples=num_test_cases + ) + test = DIR( + template.data, + Expect.pairwise(self._diff_down), + name="Add negative phrases", + capability="Vocabulary", + description="Add very negative phrases (e.g. I hate you) to the end of sentences, " + "expect probability of positive to NOT go up (tolerance=0.1)", + ) + + self.add_test(test) + + def _default_robustness_tests(self, data: Optional[Iterable[str]], num_test_cases=100): + + template = Perturb.perturb(data, utils.add_random_strings, nsamples=num_test_cases) + test = INV( + template.data, + name="Add random urls and handles", + capability="Robustness", + description="Add randomly generated urls and handles to the start or end of sentence", + ) + + self.add_test(test) + + def _default_ner_tests(self, data: Optional[Iterable[str]], num_test_cases=100): + if data: + template = Perturb.perturb( + data, utils.spacy_wrap(Perturb.change_names, ner=True), nsamples=num_test_cases + ) + test = INV( + template.data, + name="Change names", + capability="NER", + description="Replace names with other common names", + ) + self.add_test(test) + + template = Perturb.perturb( + data, utils.spacy_wrap(Perturb.change_location, ner=True), nsamples=num_test_cases + ) + test = INV( + template.data, + name="Change locations", + capability="NER", + description="Replace city or country names with other cities or countries", + ) + self.add_test(test) + + template = Perturb.perturb( + data, utils.spacy_wrap(Perturb.change_number, ner=True), nsamples=num_test_cases + ) + test = INV( + template.data, + name="Change numbers", + capability="NER", + description="Replace integers with random integers within a 20% radius of the original", + ) + self.add_test(test) + + def _default_temporal_tests(self, data: Optional[Iterable[str]], num_test_cases=100): + self._setup_editor() + + change = ["but", "even though", "although", ""] + template = self.editor.template( + [ + "I used to think this {noun} was {neg_adj}, {change} now I think it is {pos_adj}.", + "I think this {noun} is {pos_adj}, {change} I used to think it was {neg_adj}.", + "In the past I thought this {noun} was {neg_adj}, {change} now I think it is {pos_adj}.", + "I think this {noun} is {pos_adj}, {change} in the past I thought it was {neg_adj}.", + ], + change=change, + unroll=True, + nsamples=num_test_cases, + save=True, + labels=self._positive, + ) + template += self.editor.template( + [ + "I used to {neg_verb_present} this {noun}, {change} now I {pos_verb_present} it.", + "I {pos_verb_present} this {noun}, {change} I used to {neg_verb_present} it.", + "In the past I would {neg_verb_present} this {noun}, {change} now I {pos_verb} it.", + "I {pos_verb_present} this {noun}, {change} in the past I would {neg_verb_present} it.", + ], + change=change, + unroll=True, + nsamples=num_test_cases, + save=True, + labels=self._positive, + ) + + template += self.editor.template( + [ + "I used to think this {noun} was {pos_adj}, {change} now I think it is {neg_adj}.", + "I think this {noun} is {neg_adj}, {change} I used to think it was {pos_adj}.", + "In the past I thought this {noun} was {pos_adj}, {change} now I think it is {neg_adj}.", + "I think this {noun} is {neg_adj}, {change} in the past I thought it was {pos_adj}.", + ], + change=change, + unroll=True, + nsamples=num_test_cases, + save=True, + labels=self._negative, + ) + template += self.editor.template( + [ + "I used to {pos_verb_present} this {noun}, {change} now I {neg_verb_present} it.", + "I {neg_verb_present} this {noun}, {change} I used to {pos_verb_present} it.", + "In the past I would {pos_verb_present} this {noun}, {change} now I {neg_verb_present} it.", + "I {neg_verb_present} this {noun}, {change} in the past I would {pos_verb_present} it.", + ], + change=change, + unroll=True, + nsamples=num_test_cases, + save=True, + labels=self._negative, + ) + test = MFT( + **template, + name="Used to, but now", + capability="Temporal", + description="Have two conflicing statements, one about the past and " + "one about the present." + "Expect the present to carry the sentiment. Examples:" + "I used to love this airline, now I hate it -> should be negative" + "I love this airline, although I used to hate it -> should be positive", + ) + + self.add_test(test) + + adjectives = self.editor.lexicons["pos_adj"] + self.editor.lexicons["neg_adj"] + verbs = self.editor.lexicons["pos_verb_present"] + self.editor.lexicons["neg_verb_present"] + + template = self.editor.template( + ["{it} {be} {a:adj} {noun}.", "I used to think {it} {be} {a:adj} {noun}."], + it=["it", "this", "that"], + be=["is", "was"], + adj=adjectives, + save=True, + nsamples=num_test_cases, + ) + template += self.editor.template( + ["{i} {verb} {the} {noun}.", "{i} used to {verb} {the} {noun}."], + i=["I", "We"], + the=["this", "that", "the"], + verb=verbs, + save=True, + nsamples=num_test_cases, + ) + test = DIR( + template.data, + self.monotonic_label_down, + templates=template.templates, + name="'Used to' should reduce", + capability="Temporal", + description="A model should not be more confident on 'I used to think X' " + "when compared to 'X', e.g. 'I used to love this restaurant' " + "should have less confidence than 'I love this restaurant'", + ) + + self.add_test(test) + + def _default_fairness_tests(self, data: Optional[Iterable[str]], num_test_cases=100): + protected = { + "race": ["a black", "a hispanic", "a white", "an asian"], # add more here. + "sexuality": self.editor.template("{a:sexual_adj}").data, + "religion": self.editor.template("{a:religion_adj}").data, + "nationality": self.editor.template("{a:nationality}").data, + } + + for p, vals in protected.items(): + template = self.editor.template( + ["{male} is %s {mask}." % r for r in vals], + return_maps=False, + nsamples=num_test_cases, + save=True, + ) + template += self.editor.template( + ["{female} is %s {mask}." % r for r in vals], + return_maps=False, + nsamples=num_test_cases, + save=True, + ) + test = INV( + template.data, + threshold=0.1, + templates=template.templates, + name="Protected: %s" % p, + capability="Fairness", + description="Prediction should be the same for various adjectives within a protected class", + ) + + self.add_test(test) + + def _default_negation_tests(self, data: Optional[Iterable[str]], num_test_cases=100): + template = self.editor.template( + "{it} {noun} {nt} {pos_adj}.", + it=["This", "That", "The"], + nt=["is not", "isn't"], + save=True, + nsamples=num_test_cases, + ) + template += self.editor.template( + "{it} {benot} {a:pos_adj} {noun}.", + it=["It", "This", "That"], + benot=["is not", "isn't", "was not", "wasn't"], + save=True, + nsamples=num_test_cases, + ) + neg = ["I can't say I", "I don't", "I would never say I", "I don't think I", "I didn't"] + template += self.editor.template( + "{neg} {pos_verb_present} {the} {noun}.", + neg=neg, + the=["this", "that", "the"], + save=True, + nsamples=num_test_cases, + ) + template += self.editor.template( + "No one {pos_verb_present}s {the} {noun}.", + neg=neg, + the=["this", "that", "the"], + save=True, + nsamples=num_test_cases, + ) + test = MFT( + template.data, + labels=self._negative, + templates=template.templates, + name="Simple negations: negative", + capability="Negation", + description="Very simple negations of positive statements", + ) + + self.add_test(test) + + template = self.editor.template( + "I thought {it} {noun} would be {pos_adj}, but it {neg}.", + neg=["was not", "wasn't"], + it=["this", "that", "the"], + nt=["is not", "isn't"], + save=True, + nsamples=num_test_cases, + ) + template += self.editor.template( + "I thought I would {pos_verb_present} {the} {noun}, but I {neg}.", + neg=["did not", "didn't"], + the=["this", "that", "the"], + save=True, + nsamples=num_test_cases, + ) + test = MFT( + template.data, + labels=self._negative, + templates=template.templates, + name="Simple negations: I thought x was positive, but it was not", + capability="Negation", + description="", + ) + self.add_test(test) + + def _positive_change(self, orig_conf: np.ndarray, conf: np.ndarray) -> float: + """ + Returns the change in the confidence scores. + """ + return ( + orig_conf[self._negative] + - conf[self._negative] + + conf[self._positive] + - orig_conf[self._positive] + ) + + def _diff_up( + self, + orig_pred: int, + pred: int, + orig_conf: np.ndarray, + conf: np.ndarray, + labels: Optional[int] = None, + meta: Optional[List] = None, + ) -> Union[bool, float]: + """ + These arguments are expected by `checklist.expect.Expect.pairwise` function. + We only use `orig_conf` and `conf` in this case. + + `orig_conf` is the confidence score of the first example in a test's input data pair. + + A `bool` output indicates whether the test passed the expectation (always + `True` in this function's case). + + A `float` output indicates the magnitude of the failure. + """ + tolerance = 0.1 + change = self._positive_change(orig_conf, conf) + if change + tolerance >= 0: + return True + else: + return change + tolerance + + def _diff_down( + self, + orig_pred: int, + pred: int, + orig_conf: np.ndarray, + conf: np.ndarray, + labels: Optional[int] = None, + meta: Optional[List] = None, + ) -> Union[bool, float]: + """ + These arguments are expected by `checklist.expect.Expect.pairwise` function. + We only use `orig_conf` and `conf` in this case. + + `orig_conf` is the confidence score of the first example in a test's input data pair. + + A `bool` output indicates whether the test passed the expectation (always + `True` in this function's case). + + A `float` output indicates the magnitude of the failure. + """ + tolerance = 0.1 + change = self._positive_change(orig_conf, conf) + if change - tolerance <= 0: + return True + else: + return -(change - tolerance) diff --git a/allennlp/sanity_checks/task_checklists/task_suite.py b/allennlp/sanity_checks/task_checklists/task_suite.py new file mode 100644 index 00000000000..85b05902fdb --- /dev/null +++ b/allennlp/sanity_checks/task_checklists/task_suite.py @@ -0,0 +1,416 @@ +import sys +import logging +from typing import Type, Optional, Dict, Any, Callable, List, Iterable, Union, TextIO, Tuple + +import numpy as np +from checklist.test_suite import TestSuite +from checklist.editor import Editor +from checklist.test_types import MFT, INV, DIR +from checklist.perturb import Perturb +from allennlp.common.registrable import Registrable +from allennlp.common.file_utils import cached_path +from allennlp.predictors.predictor import Predictor +from allennlp.sanity_checks.task_checklists import utils + +logger = logging.getLogger(__name__) + + +class TaskSuite(Registrable): + """ + Base class for various task test suites. + + This is a wrapper class around the CheckList toolkit introduced + in the paper + [Beyond Accuracy: Behavioral Testing of NLP models with CheckList (Ribeiro et al)] + (https://api.semanticscholar.org/CorpusID:218551201). + + Task suites are intended to be used as a form of behavioral testing + for NLP models to check for robustness across several general linguistic + capabilities; eg. Vocabulary, SRL, Negation, etc. + + An example of the entire checklist process can be found at: + [https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/] + (https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/). + + A task suite should contain tests that check general capabilities, including + but not limited to: + + * Vocabulary + POS : Important words/word types for the task + * Taxonomy : Synonyms/antonyms, etc. + * Robustness : To typos, irrelevant changes, etc. + * NER : Appropriately understanding named entities. + * Temporal : Understanding the order of events. + * Negation + * Coreference + * Semantic Role Labeling : Understanding roles such as agents and objects. + * Logic : Ability to handle symmetry, consistency, and conjunctions. + * Fairness + + + # Parameters + + suite: `checklist.test_suite.TestSuite`, optional (default = `None`) + Pass in an existing test suite. + + add_default_tests: `bool` (default = `False`) + Whether to add default checklist tests for the task. + + data: `List[Any]`, optional (default = `None`) + If the data is provided, and `add_default_tests` is `True`, + tests that perturb the data are also added. + + For instance, if the task is sentiment analysis, and the a + list of sentences is passed, it will add tests that check + a model's robustness to typos, etc. + """ + + _capabilities: List[str] = [ + "Vocabulary", + "Taxonomy", + "Robustness", + "NER", + "Fairness", + "Temporal", + "Negation", + "Coref", + "SRL", + "Logic", + ] + + def __init__( + self, + suite: Optional[TestSuite] = None, + add_default_tests: bool = True, + data: Optional[List[Any]] = None, + **kwargs, + ): + self.suite = suite or TestSuite() + + if add_default_tests: + self._default_tests(data, **kwargs) + + def _prediction_and_confidence_scores(self, predictor: Predictor) -> Callable: + """ + This makes certain assumptions about the task predictor + input and output expectations. This should return a function + that takes the data as input, passes it to the predictor, + and returns predictions and confidences. + """ + return NotImplementedError + + def describe(self): + """ + Gives a description of the test suite. This is intended as a utility for + examining the test suite. + """ + self._summary(overview_only=True) + + def summary( + self, capabilities: Optional[List[str]] = None, file: TextIO = sys.stdout, **kwargs + ): + """ + Prints a summary of the test results. + + # Parameters + + capabilities : `List[str]`, optional (default = `None`) + If not None, will only show tests with these capabilities. + **kwargs : `type` + Will be passed as arguments to each test.summary() + """ + old_stdout = sys.stdout + try: + sys.stdout = file + self._summary(capabilities=capabilities, **kwargs) + finally: + sys.stdout = old_stdout + + def _summary( + self, overview_only: bool = False, capabilities: Optional[List[str]] = None, **kwargs + ): + """ + Internal function for description and summary. + """ + + # The capabilities are sorted such that if the capability does not exist + # in the list of pre-defined `_capabilities`, then it is put at the end. + # `100` is selected as an arbitrary large number; we do not expect the + # number of capabilities to be higher. + def cap_order(x): + return self._capabilities.index(x) if x in self._capabilities else 100 + + capabilities = capabilities or sorted( + set([x["capability"] for x in self.suite.info.values()]), key=cap_order + ) + print( + "\n\nThis suite contains {} tests across {} capabilities.".format( + len(self.suite.tests), len(capabilities) + ) + ) + print() + for capability in capabilities: + tests = [ + name for name, test in self.suite.info.items() if test["capability"] == capability + ] + num_tests = len(tests) + if num_tests > 0: + print(f'\nCapability: "{capability}" ({num_tests} tests)\n') + for test in tests: + description = self.suite.info[test]["description"] + num_test_cases = len(self.suite.tests[test].data) + about_test = f"* Name: {test} ({num_test_cases} test cases)" + if description: + about_test += f"\n{description}" + print(about_test) + + if not overview_only: + if "format_example_fn" not in kwargs: + kwargs["format_example_fn"] = self.suite.info[test].get( + "format_example_fn", self._format_failing_examples + ) + if "print_fn" not in kwargs: + kwargs["print_fn"] = self.suite.info[test].get( + "print_fn", self.suite.print_fn + ) + print() + self.suite.tests[test].summary(**kwargs) + print() + + def _format_failing_examples( + self, + inputs: Tuple[Any], + pred: Any, + conf: Union[np.array, np.ndarray], + *args, + **kwargs, + ): + """ + Formatting function for printing failed test examples. + """ + if conf.shape[0] <= 4: + confs = " ".join(["%.1f" % c for c in conf]) + ret = "%s %s" % (confs, str(inputs)) + else: + conf = conf[pred] + ret = "%s (%.1f) %s" % (pred, conf, str(inputs)) + return ret + + def run( + self, + predictor: Predictor, + capabilities: Optional[List[str]] = None, + max_examples: Optional[int] = None, + ): + """ + Runs the predictor on the test suite data. + + # Parameters + + predictor : `Predictor` + The predictor object. + capabilities : `List[str]`, optional (default = `None`) + If not None, will only run tests with these capabilities. + max_examples : `int`, optional (default = `None`) + Maximum number of examples to run. If None, all examples will be run. + """ + preds_and_confs_fn = self._prediction_and_confidence_scores(predictor) + if preds_and_confs_fn is NotImplementedError: + raise NotImplementedError( + "The `_prediction_and_confidence_scores` function needs " + "to be implemented for the class `{}`".format(self.__class__) + ) + if not capabilities: + self.suite.run(preds_and_confs_fn, overwrite=True, n=max_examples) + else: + for _, test in self.suite.tests.items(): + if test.capability in capabilities: + test.run(preds_and_confs_fn, verbose=True, overwrite=True, n=max_examples) + + @classmethod + def constructor( + cls, + name: Optional[str] = None, + suite_file: Optional[str] = None, + extra_args: Optional[Dict[str, Any]] = None, + ) -> "TaskSuite": + suite_class: Type[TaskSuite] = ( + TaskSuite.by_name(name) if name is not None else cls # type: ignore + ) + + if extra_args is None: + extra_args = {} + + if suite_file is not None: + return suite_class(TestSuite.from_file(cached_path(suite_file)), **extra_args) + return suite_class(**extra_args) + + def save_suite(self, suite_file: str): + """ + Saves the suite to a file. + """ + self.suite.save(suite_file) + + def _default_tests(self, data: Optional[Iterable], num_test_cases: int = 100): + """ + Derived TaskSuite classes can add any task-specific tests here. + """ + if data: + + # Robustness + + self._punctuation_test(data, num_test_cases) + self._typo_test(data, num_test_cases) + self._contraction_test(data, num_test_cases) + + @classmethod + def contractions(cls) -> Callable: + """ + This returns a function which adds/removes contractions in relevant + `str` inputs of a task's inputs. For instance, "isn't" will be + changed to "is not", and "will not" will be changed to "won't". + + Expected arguments for this function: `(example, **args, **kwargs)` + where the `example` is an instance of some task. It can be of any + type. + + For example, for a sentiment analysis task, it will be a + a `str` (the sentence for which we want to predict the sentiment). + For a textual entailment task, it can be a tuple or a Dict, etc. + + Expected output of this function is a list of instances for the task, + of the same type as `example`. + """ + return Perturb.contractions + + @classmethod + def typos(cls) -> Callable: + """ + This returns a function which adds simple typos in relevant + `str` inputs of a task's inputs. + + Expected arguments for this function: `(example, **args, **kwargs)` + where the `example` is an instance of some task. It can be of any + type. + + For example, for a sentiment analysis task, it will be a + a `str` (the sentence for which we want to predict the sentiment). + For a textual entailment task, it can be a tuple or a Dict, etc. + + Expected output of this function is a list of instances for the task, + of the same type as `example`. + """ + return Perturb.add_typos + + @classmethod + def punctuation(cls) -> Callable: + """ + This returns a function which adds/removes punctuations in relevant + `str` inputs of a task's inputs. For instance, "isn't" will be + changed to "is not", and "will not" will be changed to "won't". + + Expected arguments for this function: `(example, **args, **kwargs)` + where the `example` is an instance of some task. It can be of any + type. + + For example, for a sentiment analysis task, it will be a + a `str` (the sentence for which we want to predict the sentiment). + For a textual entailment task, it can be a tuple or a Dict, etc. + + Expected output of this function is a list of instances for the task, + of the same type as `example`. + """ + return utils.toggle_punctuation + + def _punctuation_test(self, data: Iterable, num_test_cases: int): + """ + Checks if the model is invariant to presence/absence of punctuation. + """ + template = Perturb.perturb(data, self.punctuation(), nsamples=num_test_cases) + test = INV( + template.data, + name="Punctuation", + description="Strip punctuation and / or add '.'", + capability="Robustness", + ) + self.add_test(test) + + def _typo_test(self, data: Iterable, num_test_cases: int): + """ + Checks if the model is robust enough to be invariant to simple typos. + """ + template = Perturb.perturb(data, self.typos(), nsamples=num_test_cases, typos=1) + test = INV( + template.data, + name="Typos", + capability="Robustness", + description="Add one typo to input by swapping two adjacent characters", + ) + + self.add_test(test) + + template = Perturb.perturb(data, self.typos(), nsamples=num_test_cases, typos=2) + test = INV( + template.data, + name="2 Typos", + capability="Robustness", + description="Add two typos to input by swapping two adjacent characters twice", + ) + self.add_test(test) + + def _contraction_test(self, data: Iterable, num_test_cases: int): + """ + Checks if the model is invariant to contractions and expansions + (eg. What is <-> What's). + """ + template = Perturb.perturb(data, self.contractions(), nsamples=num_test_cases) + test = INV( + template.data, + name="Contractions", + capability="Robustness", + description="Contract or expand contractions, e.g. What is <-> What's", + ) + self.add_test(test) + + def _setup_editor(self): + """ + Sets up a `checklist.editor.Editor` object, to be used for adding + default tests to the suite. + """ + if not hasattr(self, "editor"): + self.editor = Editor() + + def add_test(self, test: Union[MFT, INV, DIR]): + """ + Adds a fully specified checklist test to the suite. + The tests can be of the following types: + + * MFT: A minimum functionality test. It checks if the predicted output + matches the expected output. + For example, for a sentiment analysis task, a simple MFT can check + if the model always predicts a positive sentiment for very + positive words. + The test's data contains the input and the expected output. + + * INV: An invariance test. It checks if the predicted output is invariant + to some change in the input. + For example, for a sentiment analysis task, an INV test can check + if the prediction stays consistent if simple typos are added. + The test's data contains the pairs (input, modified input). + + * DIR: A directional expectation test. It checks if the predicted output + changes in some specific way in response to the change in input. + For example, for a sentiment analysis task, a DIR test can check if + adding a reducer (eg. "good" -> "somewhat good") causes the + prediction's positive confidence score to decrease (or at least not + increase). + The test's data contains single inputs or pairs (input, modified input). + + Please refer to [the paper](https://api.semanticscholar.org/CorpusID:218551201) + for more details and examples. + + Note: `test` needs to be fully specified; with name, capability and description. + """ + if test.data: # test data should contain at least one example. + self.suite.add(test) + else: + logger.warning("'{}' was not added, as it contains no examples.".format(test.name)) diff --git a/allennlp/sanity_checks/task_checklists/textual_entailment_suite.py b/allennlp/sanity_checks/task_checklists/textual_entailment_suite.py new file mode 100644 index 00000000000..566324b440f --- /dev/null +++ b/allennlp/sanity_checks/task_checklists/textual_entailment_suite.py @@ -0,0 +1,453 @@ +from typing import Optional, Tuple, Iterable, Callable, Union +import itertools +import numpy as np +from overrides import overrides +from checklist.test_suite import TestSuite +from checklist.test_types import MFT +from checklist.perturb import Perturb +from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite +from allennlp.sanity_checks.task_checklists import utils + + +def _wrap_apply_to_each(perturb_fn: Callable, both: bool = False, *args, **kwargs): + """ + Wraps the perturb function so that it is applied to + both elements in the (premise, hypothesis) tuple. + """ + + def new_fn(pair, *args, **kwargs): + premise, hypothesis = pair + ret = [] + fn_premise = perturb_fn(premise, *args, **kwargs) + fn_hypothesis = perturb_fn(hypothesis, *args, **kwargs) + if type(fn_premise) != list: + fn_premise = [fn_premise] + if type(fn_hypothesis) != list: + fn_hypothesis = [fn_hypothesis] + ret.extend([(x, str(hypothesis)) for x in fn_premise]) + ret.extend([(str(premise), x) for x in fn_hypothesis]) + if both: + ret.extend([(x, x2) for x, x2 in itertools.product(fn_premise, fn_hypothesis)]) + + # The perturb function can return empty strings, if no relevant perturbations + # can be applied. Eg. if the sentence is "This is a good movie", a perturbation + # which toggles contractions will have no effect. + return [x for x in ret if x[0] and x[1]] + + return new_fn + + +@TaskSuite.register("textual-entailment") +class TextualEntailmentSuite(TaskSuite): + def __init__( + self, + suite: Optional[TestSuite] = None, + entails: int = 0, + contradicts: int = 1, + neutral: int = 2, + premise: str = "premise", + hypothesis: str = "hypothesis", + probs_key: str = "probs", + **kwargs, + ): + + self._entails = entails + self._contradicts = contradicts + self._neutral = neutral + + self._premise = premise + self._hypothesis = hypothesis + + self._probs_key = probs_key + + super().__init__(suite, **kwargs) + + def _prediction_and_confidence_scores(self, predictor): + def preds_and_confs_fn(data): + labels = [] + confs = [] + + data = [{self._premise: pair[0], self._hypothesis: pair[1]} for pair in data] + predictions = predictor.predict_batch_json(data) + for pred in predictions: + label = np.argmax(pred[self._probs_key]) + labels.append(label) + confs.append(pred[self._probs_key]) + return np.array(labels), np.array(confs) + + return preds_and_confs_fn + + @overrides + def _format_failing_examples( + self, + inputs: Tuple, + pred: int, + conf: Union[np.array, np.ndarray], + label: Optional[int] = None, + *args, + **kwargs, + ): + """ + Formatting function for printing failed test examples. + """ + labels = { + self._entails: "Entails", + self._contradicts: "Contradicts", + self._neutral: "Neutral", + } + ret = "Premise: %s\nHypothesis: %s" % (inputs[0], inputs[1]) + if label is not None: + ret += "\nOriginal: %s" % labels[label] + ret += "\nPrediction: Entails (%.1f), Contradicts (%.1f), Neutral (%.1f)" % ( + conf[self._entails], + conf[self._contradicts], + conf[self._neutral], + ) + + return ret + + @classmethod + def contractions(cls): + return _wrap_apply_to_each(Perturb.contractions, both=True) + + @classmethod + def typos(cls): + return _wrap_apply_to_each(Perturb.add_typos, both=False) + + @classmethod + def punctuation(cls): + return _wrap_apply_to_each(utils.toggle_punctuation, both=False) + + @overrides + def _setup_editor(self): + super()._setup_editor() + + antonyms = [ + ("progressive", "conservative"), + ("positive", "negative"), + ("defensive", "offensive"), + ("rude", "polite"), + ("optimistic", "pessimistic"), + ("stupid", "smart"), + ("negative", "positive"), + ("unhappy", "happy"), + ("active", "passive"), + ("impatient", "patient"), + ("powerless", "powerful"), + ("visible", "invisible"), + ("fat", "thin"), + ("bad", "good"), + ("cautious", "brave"), + ("hopeful", "hopeless"), + ("insecure", "secure"), + ("humble", "proud"), + ("passive", "active"), + ("dependent", "independent"), + ("pessimistic", "optimistic"), + ("irresponsible", "responsible"), + ("courageous", "fearful"), + ] + + self.editor.add_lexicon("antonyms", antonyms, overwrite=True) + + comp = [ + "smarter", + "better", + "worse", + "brighter", + "bigger", + "louder", + "longer", + "larger", + "smaller", + "warmer", + "colder", + "thicker", + "lighter", + "heavier", + ] + + self.editor.add_lexicon("compare", comp, overwrite=True) + + nouns = [ + "humans", + "cats", + "dogs", + "people", + "mice", + "pigs", + "birds", + "sheep", + "cows", + "rats", + "chickens", + "fish", + "bears", + "elephants", + "rabbits", + "lions", + "monkeys", + "snakes", + "bees", + "spiders", + "bats", + "puppies", + "dolphins", + "babies", + "kittens", + "children", + "frogs", + "ants", + "butterflies", + "insects", + "turtles", + "trees", + "ducks", + "whales", + "robots", + "animals", + "bugs", + "kids", + "crabs", + "carrots", + "dragons", + "mosquitoes", + "cars", + "sharks", + "dinosaurs", + "horses", + "tigers", + ] + self.editor.add_lexicon("nouns", nouns, overwrite=True) + + professions = [ + "journalist", + "historian", + "secretary", + "nurse", + "waitress", + "accountant", + "engineer", + "attorney", + "artist", + "editor", + "architect", + "model", + "interpreter", + "analyst", + "actor", + "actress", + "assistant", + "intern", + "economist", + "organizer", + "author", + "investigator", + "agent", + "administrator", + "executive", + "educator", + "investor", + "DJ", + "entrepreneur", + "auditor", + "advisor", + "instructor", + "activist", + "consultant", + "apprentice", + "reporter", + "expert", + "psychologist", + "examiner", + "painter", + "manager", + "contractor", + "therapist", + "programmer", + "musician", + "producer", + "associate", + "intermediary", + "designer", + "cook", + "salesperson", + "dentist", + "attorney", + "detective", + "banker", + "researcher", + "cop", + "driver", + "counselor", + "clerk", + "professor", + "tutor", + "coach", + "chemist", + "scientist", + "veterinarian", + "firefighter", + "baker", + "psychiatrist", + "prosecutor", + "director", + "technician", + ] + self.editor.add_lexicon("professions", professions, overwrite=True) + + @overrides + def _default_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + super()._default_tests(data, num_test_cases) + self._setup_editor() + self._default_vocabulary_tests(data, num_test_cases) + self._default_ner_tests(data, num_test_cases) + self._default_temporal_tests(data, num_test_cases) + self._default_logic_tests(data, num_test_cases) + self._default_negation_tests(data, num_test_cases) + + def _default_vocabulary_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + + template = self.editor.template( + ( + "{first_name1} is more {antonyms[0]} than {first_name2}", + "{first_name2} is more {antonyms[1]} than {first_name1}", + ), + remove_duplicates=True, + nsamples=num_test_cases, + ) + + test = MFT( + **template, + labels=self._entails, + name='"A is more COMP than B" entails "B is more antonym(COMP) than A"', + capability="Vocabulary", + description="Eg. A is more active than B implies that B is more passive than A", + ) + + self.add_test(test) + + def _default_logic_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + template = self.editor.template( + ("{nouns1} are {compare} than {nouns2}", "{nouns2} are {compare} than {nouns1}"), + nsamples=num_test_cases, + remove_duplicates=True, + ) + + test = MFT( + **template, + labels=self._contradicts, + name='"A is COMP than B" contradicts "B is COMP than A"', + capability="Logic", + description='Eg. "A is better than B" contradicts "B is better than A"', + ) + + self.add_test(test) + + if data: + template = Perturb.perturb( + data, lambda x: (x[0], x[0]), nsamples=num_test_cases, keep_original=False + ) + template += Perturb.perturb( + data, lambda x: (x[1], x[1]), nsamples=num_test_cases, keep_original=False + ) + + test = MFT( + **template, + labels=self._entails, + name="A entails A (premise == hypothesis)", + capability="Logic", + description="If premise and hypothesis are the same, then premise entails the hypothesis", + ) + + self.add_test(test) + + def _default_negation_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + + template = self.editor.template( + ( + "{first_name1} is {compare} than {first_name2}", + "{first_name1} is not {compare} than {first_name2}", + ), + nsamples=num_test_cases, + remove_duplicates=True, + ) + + test = MFT( + **template, + labels=self._contradicts, + name='"A is COMP than B" contradicts "A is not COMP than B"', + capability="Negation", + description="Eg. A is better than B contradicts A is not better than C", + ) + + self.add_test(test) + + def _default_ner_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + template = self.editor.template( + ( + "{first_name1} is {compare} than {first_name2}", + "{first_name1} is {compare} than {first_name3}", + ), + nsamples=num_test_cases, + remove_duplicates=True, + ) + + test = MFT( + **template, + labels=self._neutral, + name='"A is COMP than B" gives no information about "A is COMP than C"', + capability="NER", + description='Eg. "A is better than B" gives no information about "A is better than C"', + ) + + self.add_test(test) + + def _default_temporal_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100): + template = self.editor.template( + ( + "{first_name} works as {a:professions}", + "{first_name} used to work as a {professions}", + ), + nsamples=num_test_cases, + remove_duplicates=True, + ) + + template += self.editor.template( + ( + "{first_name} {last_name} is {a:professions}", + "{first_name} {last_name} was {a:professions}", + ), + nsamples=num_test_cases, + remove_duplicates=True, + ) + + test = MFT( + **template, + labels=self._neutral, + name='"A works as P" gives no information about "A used to work as P"', + capability="Temporal", + description='Eg. "A is a writer" gives no information about "A was a writer"', + ) + + self.add_test(test) + + template = self.editor.template( + ( + "{first_name} was {a:professions1} before they were {a:professions2}", + "{first_name} was {a:professions1} after they were {a:professions2}", + ), + nsamples=num_test_cases, + remove_duplicates=True, + ) + + test = MFT( + **template, + labels=self._contradicts, + name="Before != After", + capability="Temporal", + description='Eg. "A was a writer before they were a journalist" ' + 'contradicts "A was a writer after they were a journalist"', + ) + + self.add_test(test) diff --git a/allennlp/sanity_checks/task_checklists/utils.py b/allennlp/sanity_checks/task_checklists/utils.py new file mode 100644 index 00000000000..22ad9deedf1 --- /dev/null +++ b/allennlp/sanity_checks/task_checklists/utils.py @@ -0,0 +1,98 @@ +import string +from typing import Dict, Callable, List, Union +import numpy as np +import spacy + + +def spacy_wrap(fn: Callable, language: str = "en_core_web_sm", **kwargs) -> Callable: + """ + Wrap the function so that it runs the input text data + through a spacy model before the function call. + """ + from allennlp.common.util import get_spacy_model + + def new_fn(data: Union[spacy.tokens.doc.Doc, Dict, str]): + if not isinstance(data, spacy.tokens.doc.Doc): + model = get_spacy_model(language, **kwargs) + if isinstance(data, Dict): + for key, val in data.items(): + if isinstance(val, str): + data[key] = model(val) + elif isinstance(data, tuple): + data = tuple(model(tup) if isinstance(tup, str) else tup for tup in data) + elif isinstance(data, str): + data = model(data) + else: + pass + return fn(data) + + return new_fn + + +def strip_punctuation(data: Union[str, spacy.tokens.doc.Doc]) -> str: + """ + Removes all punctuation from `data`. + """ + if isinstance(data, str): + return data.rstrip(string.punctuation) + elif isinstance(data, spacy.tokens.doc.Doc): + while len(data) and data[-1].is_punct: + data = data[:-1] + else: + # Can log a warning here, but it may get noisy. + pass + return str(data) + + +def toggle_punctuation(data: str) -> List[str]: + """ + If `data` contains any punctuation, it is removed. + Otherwise, a `.` is added to the string. + Returns a list of strings. + + Eg. + `data` = "This was great!" + Returns ["This was great", "This was great."] + + `data` = "The movie was good" + Returns ["The movie was good."] + """ + s = strip_punctuation(data) + ret = [] + if s != data: + ret.append(s) + if s + "." != data: + ret.append(s + ".") + return ret + + +def random_string(n: int) -> str: + """ + Returns a random alphanumeric string of length `n`. + """ + return "".join(np.random.choice([x for x in string.ascii_letters + string.digits], n)) + + +def random_url(n: int = 6) -> str: + """ + Returns a random url of length `n`. + """ + return "https://t.co/%s" % random_string(n) + + +def random_handle(n: int = 6) -> str: + """ + Returns a random handle of length `n`. Eg. "@randomstr23` + """ + return "@%s" % random_string(n) + + +def add_random_strings(data: str) -> List[str]: + """ + Adds random strings to the start and end of the string `data`. + Returns a list of strings. + """ + urls_and_handles = [random_url(n=6) for _ in range(5)] + [random_handle() for _ in range(5)] + rets = ["%s %s" % (x, data) for x in urls_and_handles] + rets += ["%s %s" % (data, x) for x in urls_and_handles] + return rets diff --git a/setup.py b/setup.py index 22d600c6806..886c40d2482 100644 --- a/setup.py +++ b/setup.py @@ -71,6 +71,7 @@ "filelock>=3.0,<3.1", "lmdb", "more-itertools", + "checklist==0.0.10", "wandb>=0.10.0,<0.11.0", "huggingface_hub>=0.0.8", ], diff --git a/test_fixtures/task_suites/fake_suite.tar.gz b/test_fixtures/task_suites/fake_suite.tar.gz new file mode 100644 index 00000000000..f2a2525a647 Binary files /dev/null and b/test_fixtures/task_suites/fake_suite.tar.gz differ diff --git a/tests/commands/checklist_test.py b/tests/commands/checklist_test.py new file mode 100644 index 00000000000..f566ceb0408 --- /dev/null +++ b/tests/commands/checklist_test.py @@ -0,0 +1,53 @@ +import argparse +import sys + +from allennlp.commands import main +from allennlp.commands.checklist import CheckList +from allennlp.common.testing import AllenNlpTestCase + + +class TestCheckList(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + self.archive_file = ( + self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz" + ) + self.task = "sentiment-analysis" + + def test_add_checklist_subparser(self): + parser = argparse.ArgumentParser(description="Testing") + subparsers = parser.add_subparsers(title="Commands", metavar="") + CheckList().add_subparser(subparsers) + + kebab_args = [ + "checklist", # command + "/path/to/archive", # archive + "task-suite-name", + "--checklist-suite", + "/path/to/checklist/pkl", + "--output-file", + "/dev/null", + "--cuda-device", + "0", + ] + + args = parser.parse_args(kebab_args) + + assert args.func.__name__ == "_run_suite" + assert args.archive_file == "/path/to/archive" + assert args.task == "task-suite-name" + assert args.output_file == "/dev/null" + assert args.cuda_device == 0 + + def test_works_with_known_model(self): + + sys.argv = [ + "__main__.py", # executable + "checklist", # command + str(self.archive_file), + str(self.task), + "--task-suite-args", + '{"positive": 1, "negative": 0}', + ] + + main() diff --git a/tests/data/dataset_readers/sharded_dataset_reader_test.py b/tests/data/dataset_readers/sharded_dataset_reader_test.py index b0943046ded..94840bde56a 100644 --- a/tests/data/dataset_readers/sharded_dataset_reader_test.py +++ b/tests/data/dataset_readers/sharded_dataset_reader_test.py @@ -54,7 +54,7 @@ def setup_method(self) -> None: def read_and_check_instances(self, filepath: str, num_workers: int = 0): data_loader = MultiProcessDataLoader( - self.reader, filepath, num_workers=num_workers, batch_size=1 + self.reader, filepath, num_workers=num_workers, batch_size=1, start_method="spawn" ) all_instances = [] for instance in data_loader.iter_instances(): diff --git a/tests/modules/transformer/self_attention_test.py b/tests/modules/transformer/self_attention_test.py index b8a4d37d8fb..e29ae44cf9e 100644 --- a/tests/modules/transformer/self_attention_test.py +++ b/tests/modules/transformer/self_attention_test.py @@ -81,6 +81,7 @@ def test_can_construct_from_params(self): assert self.self_attention.dropout.p == self.params_dict["dropout"] + @pytest.mark.skip("Takes up too much memory") @pytest.mark.parametrize("module_name, hf_module", get_modules(PARAMS_DICT).items()) def test_forward_against_huggingface_output(self, module_name, hf_module): hidden_states = torch.randn(2, 3, 6) @@ -101,6 +102,7 @@ def test_forward_against_huggingface_output(self, module_name, hf_module): assert torch.allclose(output[0], hf_output[0]) + @pytest.mark.skip("Takes up too much memory") @pytest.mark.parametrize( "pretrained_name", [ diff --git a/tests/modules/transformer/transformer_stack_test.py b/tests/modules/transformer/transformer_stack_test.py index f9383960822..0481a407937 100644 --- a/tests/modules/transformer/transformer_stack_test.py +++ b/tests/modules/transformer/transformer_stack_test.py @@ -169,6 +169,7 @@ def test_loading_partial_pretrained_weights(self): mapping, ) + @pytest.mark.skip("Takes up too much memory") @pytest.mark.parametrize("module_name, hf_module", get_modules(PARAMS_DICT).items()) def test_forward_against_huggingface_outputs(self, module_name, hf_module): hidden_states = torch.randn(2, 3, 6) diff --git a/tests/sanity_checks/task_checklists/__init__.py b/tests/sanity_checks/task_checklists/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/sanity_checks/task_checklists/sentiment_analysis_suite_test.py b/tests/sanity_checks/task_checklists/sentiment_analysis_suite_test.py new file mode 100644 index 00000000000..5f4f329b578 --- /dev/null +++ b/tests/sanity_checks/task_checklists/sentiment_analysis_suite_test.py @@ -0,0 +1,25 @@ +from allennlp.sanity_checks.task_checklists.sentiment_analysis_suite import SentimentAnalysisSuite +from allennlp.common.testing import AllenNlpTestCase +from allennlp.models.archival import load_archive +from allennlp.predictors import Predictor + + +class TestSentimentAnalysisSuite(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + archive = load_archive( + self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz" + ) + self.predictor = Predictor.from_archive(archive) + + def test_run(self): + data = [ + "This is really good", + "This was terrible", + "This was not good", + "John Smith acted very well.", + "Seattle was very gloomy.", + "I have visited the place for 3 years; great food!", + ] + suite = SentimentAnalysisSuite(add_default_tests=True, data=data) + suite.run(self.predictor, max_examples=10) diff --git a/tests/sanity_checks/task_checklists/task_suite_test.py b/tests/sanity_checks/task_checklists/task_suite_test.py new file mode 100644 index 00000000000..84623511f77 --- /dev/null +++ b/tests/sanity_checks/task_checklists/task_suite_test.py @@ -0,0 +1,62 @@ +import pytest +from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite +from allennlp.common.testing import AllenNlpTestCase +from allennlp.common.checks import ConfigurationError +from allennlp.models.archival import load_archive +from allennlp.predictors import Predictor +from allennlp.common.testing.checklist_test import FakeTaskSuite # noqa: F401 + + +class TestTaskSuite(AllenNlpTestCase): + def setup_method(self): + super().setup_method() + archive = load_archive( + self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz" + ) + self.predictor = Predictor.from_archive(archive) + + def test_load_from_suite_file(self): + suite_file = str(self.FIXTURES_ROOT / "task_suites" / "fake_suite.tar.gz") + + task_suite = TaskSuite.constructor(suite_file=suite_file) + + assert len(task_suite.suite.tests) == 1 + + def test_load_by_name(self): + + task_suite = TaskSuite.constructor(name="fake-task-suite") + + assert task_suite._fake_arg1 is None + assert task_suite._fake_arg2 is None + + assert len(task_suite.suite.tests) == 1 + + with pytest.raises(ConfigurationError): + TaskSuite.constructor(name="suite-that-does-not-exist") + + def test_load_with_extra_args(self): + extra_args = {"fake_arg1": "some label"} + task_suite = TaskSuite.constructor(name="fake-task-suite", extra_args=extra_args) + assert task_suite._fake_arg1 == "some label" + + def test_prediction_and_confidence_scores_function_needs_implementation(self): + + task_suite = TaskSuite.constructor(name="fake-task-suite") + + with pytest.raises(NotImplementedError): + task_suite.run(self.predictor) + + def test_add_default_tests(self): + + # We include "isn't" so that the contractions test is also added. + data = ["This isn't real data"] + task_suite = TaskSuite(add_default_tests=True, data=data) + assert "Typos" in task_suite.suite.tests + assert "2 Typos" in task_suite.suite.tests + assert "Contractions" in task_suite.suite.tests + + data = ["This is data with no contractions."] + task_suite = TaskSuite(add_default_tests=True, data=data) + assert "Typos" in task_suite.suite.tests + assert "2 Typos" in task_suite.suite.tests + assert "Contractions" not in task_suite.suite.tests diff --git a/tests/sanity_checks/task_checklists/utils_test.py b/tests/sanity_checks/task_checklists/utils_test.py new file mode 100644 index 00000000000..ce6e17eb902 --- /dev/null +++ b/tests/sanity_checks/task_checklists/utils_test.py @@ -0,0 +1,12 @@ +from allennlp.sanity_checks.task_checklists import utils +from allennlp.common.testing import AllenNlpTestCase + + +class TestUtils(AllenNlpTestCase): + def test_punctuations(self): + perturbed = utils.toggle_punctuation("This has a period.") + + assert perturbed[0] == "This has a period" + + perturbed = utils.toggle_punctuation("This does not have a period") + assert perturbed[0] == "This does not have a period."