diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7adc2ad183d..b226f0a5d8f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## Unreleased
+
+### Added
+
+- Added `TaskSuite` base class and command line functionality for running [`checklist`](https://github.com/marcotcr/checklist) test suites, along with implementations for `SentimentAnalysisSuite`, `QuestionAnsweringSuite`, and `TextualEntailmentSuite`. These can be found in the `allennlp.sanity_checks.task_checklists` module.
+
 
 ## [v2.4.0](https://github.com/allenai/allennlp/releases/tag/v2.4.0) - 2021-04-22
 
@@ -40,7 +46,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Fixed a bug with the `ShardedDatasetReader` when used with multi-process data loading (https://github.com/allenai/allennlp/issues/5132).
 
-
 ## [v2.3.0](https://github.com/allenai/allennlp/releases/tag/v2.3.0) - 2021-04-14
 
 ### Added
@@ -103,6 +108,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `Model.get_parameters_for_histogram_tensorboard_logging` is deprecated in favor of
   `Model.get_parameters_for_histogram_logging`.
 
+
 ### Fixed
 
 - Makes sure tensors that are stored in `TensorCache` always live on CPUs
diff --git a/Makefile b/Makefile
index 1ed1a6b1098..fdbfc181234 100644
--- a/Makefile
+++ b/Makefile
@@ -86,7 +86,9 @@ install :
 	# See https://github.com/pypa/pip/issues/4537.
 	python setup.py install_egg_info
 	pip install --upgrade --upgrade-strategy eager -e . -r dev-requirements.txt
-
+	# Docs are not built on docker, and the runner is unable to find
+	# the nltk_data folder. Hence, we download the requirement.
+	python -c 'import nltk; nltk.download("sentiwordnet")'
 #
 # Documention helpers.
 #
diff --git a/allennlp/commands/__init__.py b/allennlp/commands/__init__.py
index 3a0fba2232f..8b5f100a0aa 100644
--- a/allennlp/commands/__init__.py
+++ b/allennlp/commands/__init__.py
@@ -18,6 +18,7 @@
 from allennlp.commands.count_instances import CountInstances
 from allennlp.common.plugins import import_plugins
 from allennlp.common.util import import_module_and_submodules
+from allennlp.commands.checklist import CheckList
 
 logger = logging.getLogger(__name__)
 
diff --git a/allennlp/commands/checklist.py b/allennlp/commands/checklist.py
new file mode 100644
index 00000000000..27a061915a4
--- /dev/null
+++ b/allennlp/commands/checklist.py
@@ -0,0 +1,199 @@
+"""
+The `checklist` subcommand allows you to sanity check your
+model's predictions using a trained model and its
+[`Predictor`](../predictors/predictor.md#predictor) wrapper.
+"""
+
+from typing import Optional, Dict, Any, List
+import argparse
+import sys
+import json
+
+from overrides import overrides
+
+from allennlp.commands.subcommand import Subcommand
+from allennlp.common.checks import check_for_gpu, ConfigurationError
+from allennlp.models.archival import load_archive
+from allennlp.predictors.predictor import Predictor
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+
+
+@Subcommand.register("checklist")
+class CheckList(Subcommand):
+    @overrides
+    def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
+
+        description = """Run the specified model through a checklist suite."""
+        subparser = parser.add_parser(
+            self.name,
+            description=description,
+            help="Run a trained model through a checklist suite.",
+        )
+
+        subparser.add_argument(
+            "archive_file", type=str, help="The archived model to make predictions with"
+        )
+
+        subparser.add_argument("task", type=str, help="The name of the task suite")
+
+        subparser.add_argument("--checklist-suite", type=str, help="The checklist suite path")
+
+        subparser.add_argument(
+            "--capabilities",
+            nargs="+",
+            default=[],
+            help=('An optional list of strings of capabilities. Eg. "[Vocabulary, Robustness]"'),
+        )
+
+        subparser.add_argument(
+            "--max-examples",
+            type=int,
+            default=None,
+            help="Maximum number of examples to check per test.",
+        )
+
+        subparser.add_argument(
+            "--task-suite-args",
+            type=str,
+            default="",
+            help=(
+                "An optional JSON structure used to provide additional parameters to the task suite"
+            ),
+        )
+
+        subparser.add_argument(
+            "--print-summary-args",
+            type=str,
+            default="",
+            help=(
+                "An optional JSON structure used to provide additional "
+                "parameters for printing test summary"
+            ),
+        )
+
+        subparser.add_argument("--output-file", type=str, help="Path to output file")
+
+        subparser.add_argument(
+            "--cuda-device", type=int, default=-1, help="ID of GPU to use (if any)"
+        )
+
+        subparser.add_argument(
+            "--predictor", type=str, help="Optionally specify a specific predictor to use"
+        )
+
+        subparser.add_argument(
+            "--predictor-args",
+            type=str,
+            default="",
+            help=(
+                "An optional JSON structure used to provide additional parameters to the predictor"
+            ),
+        )
+
+        subparser.set_defaults(func=_run_suite)
+
+        return subparser
+
+
+def _get_predictor(args: argparse.Namespace) -> Predictor:
+    check_for_gpu(args.cuda_device)
+    archive = load_archive(
+        args.archive_file,
+        cuda_device=args.cuda_device,
+    )
+
+    predictor_args = args.predictor_args.strip()
+    if len(predictor_args) <= 0:
+        predictor_args = {}
+    else:
+        predictor_args = json.loads(predictor_args)
+
+    return Predictor.from_archive(
+        archive,
+        args.predictor,
+        extra_args=predictor_args,
+    )
+
+
+def _get_task_suite(args: argparse.Namespace) -> TaskSuite:
+    available_tasks = TaskSuite.list_available()
+    if args.task in available_tasks:
+        suite_name = args.task
+    else:
+        raise ConfigurationError(
+            f"'{args.task}' is not a recognized task suite. "
+            f"Available tasks are: {available_tasks}."
+        )
+
+    file_path = args.checklist_suite
+
+    task_suite_args = args.task_suite_args.strip()
+    if len(task_suite_args) <= 0:
+        task_suite_args = {}
+    else:
+        task_suite_args = json.loads(task_suite_args)
+
+    return TaskSuite.constructor(
+        name=suite_name,
+        suite_file=file_path,
+        extra_args=task_suite_args,
+    )
+
+
+class _CheckListManager:
+    def __init__(
+        self,
+        task_suite: TaskSuite,
+        predictor: Predictor,
+        capabilities: Optional[List[str]] = None,
+        max_examples: Optional[int] = None,
+        output_file: Optional[str] = None,
+        print_summary_args: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self._task_suite = task_suite
+        self._predictor = predictor
+        self._capabilities = capabilities
+        self._max_examples = max_examples
+        self._output_file = None if output_file is None else open(output_file, "w")
+        self._print_summary_args = print_summary_args or {}
+
+        if capabilities:
+            self._print_summary_args["capabilities"] = capabilities
+
+    def run(self) -> None:
+        self._task_suite.run(
+            self._predictor, capabilities=self._capabilities, max_examples=self._max_examples
+        )
+
+        # We pass in an IO object.
+        output_file = self._output_file or sys.stdout
+        self._task_suite.summary(file=output_file, **self._print_summary_args)
+
+        # If `_output_file` was None, there would be nothing to close.
+        if self._output_file is not None:
+            self._output_file.close()
+
+
+def _run_suite(args: argparse.Namespace) -> None:
+
+    task_suite = _get_task_suite(args)
+    predictor = _get_predictor(args)
+
+    print_summary_args = args.print_summary_args.strip()
+    if len(print_summary_args) <= 0:
+        print_summary_args = {}
+    else:
+        print_summary_args = json.loads(print_summary_args)
+
+    capabilities = args.capabilities
+    max_examples = args.max_examples
+
+    manager = _CheckListManager(
+        task_suite,
+        predictor,
+        capabilities,
+        max_examples,
+        args.output_file,
+        print_summary_args,
+    )
+    manager.run()
diff --git a/allennlp/common/testing/checklist_test.py b/allennlp/common/testing/checklist_test.py
new file mode 100644
index 00000000000..c84b82b7afb
--- /dev/null
+++ b/allennlp/common/testing/checklist_test.py
@@ -0,0 +1,35 @@
+from typing import Optional
+from checklist.test_suite import TestSuite
+from checklist.test_types import MFT as MinimumFunctionalityTest
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+
+
+@TaskSuite.register("fake-task-suite")
+class FakeTaskSuite(TaskSuite):
+    """
+    Fake checklist suite for testing purpose.
+    """
+
+    def __init__(
+        self,
+        suite: Optional[TestSuite] = None,
+        fake_arg1: Optional[int] = None,
+        fake_arg2: Optional[int] = None,
+    ):
+        self._fake_arg1 = fake_arg1
+        self._fake_arg2 = fake_arg2
+
+        if not suite:
+            suite = TestSuite()
+
+        # Adding a simple checklist test.
+        test = MinimumFunctionalityTest(
+            ["sentence 1", "sentence 2"],
+            labels=0,
+            name="fake test 1",
+            capability="fake capability",
+            description="Test's description",
+        )
+        suite.add(test)
+
+        super().__init__(suite)
diff --git a/allennlp/sanity_checks/task_checklists/__init__.py b/allennlp/sanity_checks/task_checklists/__init__.py
new file mode 100644
index 00000000000..ef0e0d28263
--- /dev/null
+++ b/allennlp/sanity_checks/task_checklists/__init__.py
@@ -0,0 +1,10 @@
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+from allennlp.sanity_checks.task_checklists.sentiment_analysis_suite import (
+    SentimentAnalysisSuite,
+)
+from allennlp.sanity_checks.task_checklists.question_answering_suite import (
+    QuestionAnsweringSuite,
+)
+from allennlp.sanity_checks.task_checklists.textual_entailment_suite import (
+    TextualEntailmentSuite,
+)
diff --git a/allennlp/sanity_checks/task_checklists/question_answering_suite.py b/allennlp/sanity_checks/task_checklists/question_answering_suite.py
new file mode 100644
index 00000000000..890ccb6b4ee
--- /dev/null
+++ b/allennlp/sanity_checks/task_checklists/question_answering_suite.py
@@ -0,0 +1,208 @@
+from typing import Optional, Iterable, Tuple, Union
+import itertools
+import numpy as np
+from overrides import overrides
+from checklist.editor import MunchWithAdd as CheckListTemplate
+from checklist.test_suite import TestSuite
+from checklist.test_types import MFT
+from checklist.perturb import Perturb
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+from allennlp.sanity_checks.task_checklists import utils
+
+
+def _crossproduct(template: CheckListTemplate):
+    """
+    Takes the output of editor.template and does the cross product of contexts and qas
+    """
+    ret = []
+    ret_labels = []
+    for instance in template.data:
+        cs = instance["contexts"]
+        qas = instance["qas"]
+        d = list(itertools.product(cs, qas))
+        ret.append([(x[0], x[1][0]) for x in d])
+        ret_labels.append([x[1][1] for x in d])
+    template.data = ret
+    template.labels = ret_labels
+    return template
+
+
+@TaskSuite.register("question-answering")
+class QuestionAnsweringSuite(TaskSuite):
+    def __init__(
+        self,
+        suite: Optional[TestSuite] = None,
+        context_key: str = "context",
+        question_key: str = "question",
+        answer_key: str = "best_span_str",
+        **kwargs,
+    ):
+        self._context_key = context_key
+        self._question_key = question_key
+        self._answer_key = answer_key
+
+        super().__init__(suite, **kwargs)
+
+    def _prediction_and_confidence_scores(self, predictor):
+        def preds_and_confs_fn(data):
+            data = [{self._context_key: pair[0], self._question_key: pair[1]} for pair in data]
+            predictions = predictor.predict_batch_json(data)
+            labels = [pred[self._answer_key] for pred in predictions]
+            return labels, np.ones(len(labels))
+
+        return preds_and_confs_fn
+
+    @overrides
+    def _format_failing_examples(
+        self,
+        inputs: Tuple,
+        pred: str,
+        conf: Union[np.array, np.ndarray],
+        label: Optional[str] = None,
+        *args,
+        **kwargs,
+    ):
+        """
+        Formatting function for printing failed test examples.
+        """
+        context, question = inputs
+        ret = "Context: %s\nQuestion: %s\n" % (context, question)
+        if label is not None:
+            ret += "Original answer: %s\n" % label
+        ret += "Predicted answer: %s\n" % pred
+        return ret
+
+    @classmethod
+    def contractions(cls):
+        def _contractions(x):
+            conts = Perturb.contractions(x[1])
+            return [(x[0], a) for a in conts]
+
+        return _contractions
+
+    @classmethod
+    def typos(cls):
+        def question_typo(x, **kwargs):
+            return (x[0], Perturb.add_typos(x[1], **kwargs))
+
+        return question_typo
+
+    @classmethod
+    def punctuation(cls):
+        def context_punctuation(x):
+            return (utils.strip_punctuation(x[0]), x[1])
+
+        return context_punctuation
+
+    @overrides
+    def _setup_editor(self):
+        super()._setup_editor()
+
+        adj = [
+            "old",
+            "smart",
+            "tall",
+            "young",
+            "strong",
+            "short",
+            "tough",
+            "cool",
+            "fast",
+            "nice",
+            "small",
+            "dark",
+            "wise",
+            "rich",
+            "great",
+            "weak",
+            "high",
+            "slow",
+            "strange",
+            "clean",
+        ]
+        adj = [(x.rstrip("e"), x) for x in adj]
+
+        self.editor.add_lexicon("adjectives_to_compare", adj, overwrite=True)
+
+        comp_pairs = [
+            ("better", "worse"),
+            ("older", "younger"),
+            ("smarter", "dumber"),
+            ("taller", "shorter"),
+            ("bigger", "smaller"),
+            ("stronger", "weaker"),
+            ("faster", "slower"),
+            ("darker", "lighter"),
+            ("richer", "poorer"),
+            ("happier", "sadder"),
+            ("louder", "quieter"),
+            ("warmer", "colder"),
+        ]
+
+        self.editor.add_lexicon("comp_pairs", comp_pairs, overwrite=True)
+
+    @overrides
+    def _default_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+        super()._default_tests(data, num_test_cases)
+        self._setup_editor()
+        self._default_vocabulary_tests(data, num_test_cases)
+        self._default_taxonomy_tests(data, num_test_cases)
+
+    def _default_vocabulary_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+
+        template = self.editor.template(
+            [
+                (
+                    "{first_name} is {adjectives_to_compare[0]}er than {first_name1}.",
+                    "Who is less {adjectives_to_compare[1]}?",
+                ),
+                (
+                    "{first_name} is {adjectives_to_compare[0]}er than {first_name1}.",
+                    "Who is {adjectives_to_compare[0]}er?",
+                ),
+            ],
+            labels=["{first_name1}", "{first_name}"],
+            remove_duplicates=True,
+            nsamples=num_test_cases,
+            save=True,
+        )
+        test = MFT(
+            **template,
+            name="A is COMP than B. Who is more / less COMP?",
+            description='Eg. Context: "A is taller than B" '
+            'Q: "Who is taller?" A: "A", Q: "Who is less tall?" A: "B"',
+            capability="Vocabulary",
+        )
+        self.add_test(test)
+
+    def _default_taxonomy_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+        template = _crossproduct(
+            self.editor.template(
+                {
+                    "contexts": [
+                        "{first_name} is {comp_pairs[0]} than {first_name1}.",
+                        "{first_name1} is {comp_pairs[1]} than {first_name}.",
+                    ],
+                    "qas": [
+                        (
+                            "Who is {comp_pairs[1]}?",
+                            "{first_name1}",
+                        ),
+                        (
+                            "Who is {comp_pairs[0]}?",
+                            "{first_name}",
+                        ),
+                    ],
+                },
+                remove_duplicates=True,
+                nsamples=num_test_cases,
+                save=True,
+            )
+        )
+        test = MFT(
+            **template,
+            name="A is COMP than B. Who is antonym(COMP)? B",
+            description='Eg. Context: "A is taller than B", Q: "Who is shorter?", A: "B"',
+            capability="Taxonomy",
+        )
+        self.add_test(test)
diff --git a/allennlp/sanity_checks/task_checklists/sentiment_analysis_suite.py b/allennlp/sanity_checks/task_checklists/sentiment_analysis_suite.py
new file mode 100644
index 00000000000..79dcfe8a75b
--- /dev/null
+++ b/allennlp/sanity_checks/task_checklists/sentiment_analysis_suite.py
@@ -0,0 +1,758 @@
+from typing import Optional, Iterable, List, Union, Tuple
+import numpy as np
+from overrides import overrides
+from checklist.test_suite import TestSuite
+from checklist.test_types import MFT, INV, DIR, Expect
+from checklist.editor import Editor
+from checklist.perturb import Perturb
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+from allennlp.sanity_checks.task_checklists import utils
+from allennlp.data.instance import Instance
+
+
+def _add_phrase_function(phrases: List[str], num_samples: int = 10):
+    """
+    Returns a function which adds each str in `phrases`
+    at the end of the input string and returns that list.
+    """
+
+    def perturb_fn(inp):
+        input_str = utils.strip_punctuation(inp)
+        total = len(phrases)
+        idx = np.random.choice(total, min(num_samples, total), replace=False)
+        ret = [input_str + ". " + phrases[i] for i in idx]
+        return ret
+
+    return perturb_fn
+
+
+@TaskSuite.register("sentiment-analysis")
+class SentimentAnalysisSuite(TaskSuite):
+    """
+    This suite was built using the checklist process with the self.editor
+    suggestions. Users are encouraged to add/modify as they see fit.
+
+    Note: `editor.suggest(...)` can be slow as it runs a language model.
+    """
+
+    def __init__(
+        self,
+        suite: Optional[TestSuite] = None,
+        positive: Optional[int] = 0,
+        negative: Optional[int] = 1,
+        **kwargs,
+    ):
+
+        self._positive = positive
+        self._negative = negative
+        super().__init__(suite, **kwargs)
+
+    @overrides
+    def _prediction_and_confidence_scores(self, predictor):
+        def preds_and_confs_fn(data):
+            labels = []
+            confs = []
+            if isinstance(data[0], Instance):
+                predictions = predictor.predict_batch_instance(data)
+            else:
+                data = [{"sentence": sentence} for sentence in data]
+                predictions = predictor.predict_batch_json(data)
+            for pred in predictions:
+                label = pred["probs"].index(max(pred["probs"]))
+                labels.append(label)
+                confs.append([pred["probs"][self._positive], pred["probs"][self._negative]])
+            return np.array(labels), np.array(confs)
+
+        return preds_and_confs_fn
+
+    @overrides
+    def _format_failing_examples(
+        self,
+        inputs: Tuple,
+        pred: int,
+        conf: Union[np.array, np.ndarray],
+        label: Optional[int] = None,
+        *args,
+        **kwargs,
+    ):
+        """
+        Formatting function for printing failed test examples.
+        """
+        labels = {self._positive: "Positive", self._negative: "Negative"}
+        ret = str(inputs)
+        if label is not None:
+            ret += " (Original: %s)" % labels[label]
+        ret += "\nPrediction: %s (Confidence: %.1f)" % (labels[pred], conf[pred])
+
+        return ret
+
+    @overrides
+    def _default_tests(self, data: Optional[Iterable[str]], num_test_cases=100):
+        super()._default_tests(data, num_test_cases)
+        self._setup_editor()
+        self._default_vocabulary_tests(data, num_test_cases)
+        self._default_ner_tests(data, num_test_cases)
+        self._default_temporal_tests(data, num_test_cases)
+        self._default_fairness_tests(data, num_test_cases)
+        self._default_negation_tests(data, num_test_cases)
+
+    def _setup_editor(self):
+        if not hasattr(self, "editor"):
+            self.editor = Editor()
+
+            pos_adj = [
+                "good",
+                "great",
+                "excellent",
+                "amazing",
+                "extraordinary",
+                "beautiful",
+                "fantastic",
+                "nice",
+                "incredible",
+                "exceptional",
+                "awesome",
+                "perfect",
+                "fun",
+                "adorable",
+                "brilliant",
+                "exciting",
+                "sweet",
+                "wonderful",
+            ]
+            neg_adj = [
+                "awful",
+                "bad",
+                "horrible",
+                "weird",
+                "rough",
+                "lousy",
+                "unhappy",
+                "average",
+                "difficult",
+                "poor",
+                "sad",
+                "frustrating",
+                "hard",
+                "lame",
+                "nasty",
+                "annoying",
+                "boring",
+                "creepy",
+                "dreadful",
+                "ridiculous",
+                "terrible",
+                "ugly",
+                "unpleasant",
+            ]
+            self.editor.add_lexicon("pos_adj", pos_adj, overwrite=True)
+            self.editor.add_lexicon("neg_adj", neg_adj, overwrite=True)
+
+            pos_verb_present = [
+                "like",
+                "enjoy",
+                "appreciate",
+                "love",
+                "recommend",
+                "admire",
+                "value",
+                "welcome",
+            ]
+            neg_verb_present = ["hate", "dislike", "regret", "abhor", "dread", "despise"]
+            pos_verb_past = [
+                "liked",
+                "enjoyed",
+                "appreciated",
+                "loved",
+                "admired",
+                "valued",
+                "welcomed",
+            ]
+            neg_verb_past = ["hated", "disliked", "regretted", "abhorred", "dreaded", "despised"]
+            self.editor.add_lexicon("pos_verb_present", pos_verb_present, overwrite=True)
+            self.editor.add_lexicon("neg_verb_present", neg_verb_present, overwrite=True)
+            self.editor.add_lexicon("pos_verb_past", pos_verb_past, overwrite=True)
+            self.editor.add_lexicon("neg_verb_past", neg_verb_past, overwrite=True)
+            self.editor.add_lexicon("pos_verb", pos_verb_present + pos_verb_past, overwrite=True)
+            self.editor.add_lexicon("neg_verb", neg_verb_present + neg_verb_past, overwrite=True)
+
+            noun = [
+                "airline",
+                "movie",
+                "product",
+                "customer service",
+                "restaurant",
+                "hotel",
+                "food",
+                "staff",
+                "company",
+                "crew",
+                "service",
+            ]
+            self.editor.add_lexicon("noun", noun, overwrite=True)
+
+            intens_adj = [
+                "very",
+                "really",
+                "absolutely",
+                "truly",
+                "extremely",
+                "quite",
+                "incredibly",
+                "amazingly",
+                "especially",
+                "exceptionally",
+                "unbelievably",
+                "utterly",
+                "exceedingly",
+                "rather",
+                "totally",
+                "particularly",
+            ]
+            intens_verb = [
+                "really",
+                "absolutely",
+                "truly",
+                "extremely",
+                "especially",
+                "utterly",
+                "totally",
+                "particularly",
+                "highly",
+                "definitely",
+                "certainly",
+                "genuinely",
+                "honestly",
+                "strongly",
+                "sure",
+                "sincerely",
+            ]
+
+            self.editor.add_lexicon("intens_adj", intens_adj, overwrite=True)
+            self.editor.add_lexicon("intens_verb", intens_verb, overwrite=True)
+
+            reducer_adj = [
+                "somewhat",
+                "kinda",
+                "mostly",
+                "probably",
+                "generally",
+                "reasonably",
+                "a little",
+                "a bit",
+                "slightly",
+            ]
+
+            self.editor.add_lexicon("reducer_adj", reducer_adj, overwrite=True)
+
+            self.monotonic_label = Expect.monotonic(increasing=True, tolerance=0.1)
+            self.monotonic_label_down = Expect.monotonic(increasing=False, tolerance=0.1)
+
+    def _default_vocabulary_tests(self, data: Optional[Iterable[str]], num_test_cases=100):
+
+        positive_words = (
+            self.editor.lexicons["pos_adj"]
+            + self.editor.lexicons["pos_verb_present"]
+            + self.editor.lexicons["pos_verb_past"]
+        )
+
+        test = MFT(
+            positive_words,
+            labels=self._positive,
+            name="Single Positive Words",
+            capability="Vocabulary",
+            description="Correctly recognizes positive words",
+        )
+
+        self.add_test(test)
+
+        negative_words = (
+            self.editor.lexicons["neg_adj"]
+            + self.editor.lexicons["neg_verb_present"]
+            + self.editor.lexicons["neg_verb_past"]
+        )
+
+        test = MFT(
+            negative_words,
+            labels=self._negative,
+            name="Single Negative Words",
+            capability="Vocabulary",
+            description="Correctly recognizes negative words",
+        )
+
+        self.add_test(test)
+
+        template = self.editor.template(
+            "{it} {noun} {be} {pos_adj}.",
+            it=["The", "This", "That"],
+            be=["is", "was"],
+            labels=self._positive,
+            save=True,
+        )
+        template += self.editor.template(
+            "{it} {be} {a:pos_adj} {noun}.",
+            it=["It", "This", "That"],
+            be=["is", "was"],
+            labels=self._positive,
+            save=True,
+        )
+        template += self.editor.template(
+            "{i} {pos_verb} {the} {noun}.",
+            i=["I", "We"],
+            the=["this", "that", "the"],
+            labels=self._positive,
+            save=True,
+        )
+        template += self.editor.template(
+            "{it} {noun} {be} {neg_adj}.",
+            it=["That", "This", "The"],
+            be=["is", "was"],
+            labels=self._negative,
+            save=True,
+        )
+        template += self.editor.template(
+            "{it} {be} {a:neg_adj} {noun}.",
+            it=["It", "This", "That"],
+            be=["is", "was"],
+            labels=self._negative,
+            save=True,
+        )
+        template += self.editor.template(
+            "{i} {neg_verb} {the} {noun}.",
+            i=["I", "We"],
+            the=["this", "that", "the"],
+            labels=self._negative,
+            save=True,
+        )
+
+        test = MFT(
+            **template,
+            name="Sentiment-laden words in context",
+            capability="Vocabulary",
+            description="Use positive and negative verbs and adjectives "
+            "with nouns such as product, movie, airline, etc. "
+            'E.g. "This was a bad movie"',
+        )
+
+        self.add_test(test)
+
+        template = self.editor.template(
+            ["{it} {be} {a:pos_adj} {noun}.", "{it} {be} {a:intens_adj} {pos_adj} {noun}."],
+            it=["It", "This", "That"],
+            be=["is", "was"],
+            nsamples=num_test_cases,
+            save=True,
+        )
+        template += self.editor.template(
+            ["{i} {pos_verb} {the} {noun}.", "{i} {intens_verb} {pos_verb} {the} {noun}."],
+            i=["I", "We"],
+            the=["this", "that", "the"],
+            nsamples=num_test_cases,
+            save=True,
+        )
+        template += self.editor.template(
+            ["{it} {be} {a:neg_adj} {noun}.", "{it} {be} {a:intens_adj} {neg_adj} {noun}."],
+            it=["It", "This", "That"],
+            be=["is", "was"],
+            nsamples=num_test_cases,
+            save=True,
+        )
+        template += self.editor.template(
+            ["{i} {neg_verb} {the} {noun}.", "{i} {intens_verb} {neg_verb} {the} {noun}."],
+            i=["I", "We"],
+            the=["this", "that", "the"],
+            nsamples=num_test_cases,
+            save=True,
+        )
+
+        test = DIR(
+            template.data,
+            self.monotonic_label,
+            templates=template.templates,
+            name="Intensifiers",
+            capability="Vocabulary",
+            description="Test is composed of pairs of sentences (x1, x2), where we add an intensifier"
+            "such as 'really',or 'very' to x2 and expect the confidence to NOT go down "
+            "(with tolerance=0.1). e.g.:"
+            "x1 = 'That was a good movie'"
+            "x2 = 'That was a very good movie'",
+        )
+
+        self.add_test(test)
+
+        template = self.editor.template(
+            ["{it} {noun} {be} {pos_adj}.", "{it} {noun} {be} {reducer_adj} {pos_adj}."],
+            it=["The", "This", "That"],
+            be=["is", "was"],
+            nsamples=num_test_cases,
+            save=True,
+        )
+        template += self.editor.template(
+            ["{it} {noun} {be} {neg_adj}.", "{it} {noun} {be} {reducer_adj} {neg_adj}."],
+            it=["The", "This", "That"],
+            be=["is", "was"],
+            nsamples=num_test_cases,
+            save=True,
+        )
+        test = DIR(
+            template.data,
+            self.monotonic_label_down,
+            templates=template.templates,
+            name="Reducers",
+            capability="Vocabulary",
+            description="Test is composed of pairs of sentences (x1, x2), where we add a reducer"
+            "such as 'somewhat', or 'kinda' to x2 and expect the confidence to NOT go up "
+            " (with tolerance=0.1). e.g.:"
+            "x1 = 'The staff was good.'"
+            "x2 = 'The staff was somewhat good.'",
+        )
+
+        self.add_test(test)
+
+        if data:
+
+            positive = self.editor.template("I {pos_verb_present} you.").data
+            positive += self.editor.template("You are {pos_adj}.").data
+
+            negative = self.editor.template("I {neg_verb_present} you.").data
+            negative += self.editor.template("You are {neg_adj}.").data
+
+            template = Perturb.perturb(
+                data, _add_phrase_function(positive), nsamples=num_test_cases
+            )
+            test = DIR(
+                template.data,
+                Expect.pairwise(self._diff_up),
+                name="Add positive phrases",
+                capability="Vocabulary",
+                description="Add very positive phrases (e.g. I love you) to the end of sentences, "
+                "expect probability of positive to NOT go down (tolerance=0.1)",
+            )
+
+            self.add_test(test)
+
+            template = Perturb.perturb(
+                data, _add_phrase_function(negative), nsamples=num_test_cases
+            )
+            test = DIR(
+                template.data,
+                Expect.pairwise(self._diff_down),
+                name="Add negative phrases",
+                capability="Vocabulary",
+                description="Add very negative phrases (e.g. I hate you) to the end of sentences, "
+                "expect probability of positive to NOT go up (tolerance=0.1)",
+            )
+
+            self.add_test(test)
+
+    def _default_robustness_tests(self, data: Optional[Iterable[str]], num_test_cases=100):
+
+        template = Perturb.perturb(data, utils.add_random_strings, nsamples=num_test_cases)
+        test = INV(
+            template.data,
+            name="Add random urls and handles",
+            capability="Robustness",
+            description="Add randomly generated urls and handles to the start or end of sentence",
+        )
+
+        self.add_test(test)
+
+    def _default_ner_tests(self, data: Optional[Iterable[str]], num_test_cases=100):
+        if data:
+            template = Perturb.perturb(
+                data, utils.spacy_wrap(Perturb.change_names, ner=True), nsamples=num_test_cases
+            )
+            test = INV(
+                template.data,
+                name="Change names",
+                capability="NER",
+                description="Replace names with other common names",
+            )
+            self.add_test(test)
+
+            template = Perturb.perturb(
+                data, utils.spacy_wrap(Perturb.change_location, ner=True), nsamples=num_test_cases
+            )
+            test = INV(
+                template.data,
+                name="Change locations",
+                capability="NER",
+                description="Replace city or country names with other cities or countries",
+            )
+            self.add_test(test)
+
+            template = Perturb.perturb(
+                data, utils.spacy_wrap(Perturb.change_number, ner=True), nsamples=num_test_cases
+            )
+            test = INV(
+                template.data,
+                name="Change numbers",
+                capability="NER",
+                description="Replace integers with random integers within a 20% radius of the original",
+            )
+            self.add_test(test)
+
+    def _default_temporal_tests(self, data: Optional[Iterable[str]], num_test_cases=100):
+        self._setup_editor()
+
+        change = ["but", "even though", "although", ""]
+        template = self.editor.template(
+            [
+                "I used to think this {noun} was {neg_adj}, {change} now I think it is {pos_adj}.",
+                "I think this {noun} is {pos_adj}, {change} I used to think it was {neg_adj}.",
+                "In the past I thought this {noun} was {neg_adj}, {change} now I think it is {pos_adj}.",
+                "I think this {noun} is {pos_adj}, {change} in the past I thought it was {neg_adj}.",
+            ],
+            change=change,
+            unroll=True,
+            nsamples=num_test_cases,
+            save=True,
+            labels=self._positive,
+        )
+        template += self.editor.template(
+            [
+                "I used to {neg_verb_present} this {noun}, {change} now I {pos_verb_present} it.",
+                "I {pos_verb_present} this {noun}, {change} I used to {neg_verb_present} it.",
+                "In the past I would {neg_verb_present} this {noun}, {change} now I {pos_verb} it.",
+                "I {pos_verb_present} this {noun}, {change} in the past I would {neg_verb_present} it.",
+            ],
+            change=change,
+            unroll=True,
+            nsamples=num_test_cases,
+            save=True,
+            labels=self._positive,
+        )
+
+        template += self.editor.template(
+            [
+                "I used to think this {noun} was {pos_adj}, {change} now I think it is {neg_adj}.",
+                "I think this {noun} is {neg_adj}, {change} I used to think it was {pos_adj}.",
+                "In the past I thought this {noun} was {pos_adj}, {change} now I think it is {neg_adj}.",
+                "I think this {noun} is {neg_adj}, {change} in the past I thought it was {pos_adj}.",
+            ],
+            change=change,
+            unroll=True,
+            nsamples=num_test_cases,
+            save=True,
+            labels=self._negative,
+        )
+        template += self.editor.template(
+            [
+                "I used to {pos_verb_present} this {noun}, {change} now I {neg_verb_present} it.",
+                "I {neg_verb_present} this {noun}, {change} I used to {pos_verb_present} it.",
+                "In the past I would {pos_verb_present} this {noun}, {change} now I {neg_verb_present} it.",
+                "I {neg_verb_present} this {noun}, {change} in the past I would {pos_verb_present} it.",
+            ],
+            change=change,
+            unroll=True,
+            nsamples=num_test_cases,
+            save=True,
+            labels=self._negative,
+        )
+        test = MFT(
+            **template,
+            name="Used to, but now",
+            capability="Temporal",
+            description="Have two conflicing statements, one about the past and "
+            "one about the present."
+            "Expect the present to carry the sentiment. Examples:"
+            "I used to love this airline, now I hate it -> should be negative"
+            "I love this airline, although I used to hate it -> should be positive",
+        )
+
+        self.add_test(test)
+
+        adjectives = self.editor.lexicons["pos_adj"] + self.editor.lexicons["neg_adj"]
+        verbs = self.editor.lexicons["pos_verb_present"] + self.editor.lexicons["neg_verb_present"]
+
+        template = self.editor.template(
+            ["{it} {be} {a:adj} {noun}.", "I used to think {it} {be} {a:adj} {noun}."],
+            it=["it", "this", "that"],
+            be=["is", "was"],
+            adj=adjectives,
+            save=True,
+            nsamples=num_test_cases,
+        )
+        template += self.editor.template(
+            ["{i} {verb} {the} {noun}.", "{i} used to {verb} {the} {noun}."],
+            i=["I", "We"],
+            the=["this", "that", "the"],
+            verb=verbs,
+            save=True,
+            nsamples=num_test_cases,
+        )
+        test = DIR(
+            template.data,
+            self.monotonic_label_down,
+            templates=template.templates,
+            name="'Used to' should reduce",
+            capability="Temporal",
+            description="A model should not be more confident on 'I used to think X' "
+            "when compared to 'X', e.g. 'I used to love this restaurant' "
+            "should have less confidence than 'I love this restaurant'",
+        )
+
+        self.add_test(test)
+
+    def _default_fairness_tests(self, data: Optional[Iterable[str]], num_test_cases=100):
+        protected = {
+            "race": ["a black", "a hispanic", "a white", "an asian"],  # add more here.
+            "sexuality": self.editor.template("{a:sexual_adj}").data,
+            "religion": self.editor.template("{a:religion_adj}").data,
+            "nationality": self.editor.template("{a:nationality}").data,
+        }
+
+        for p, vals in protected.items():
+            template = self.editor.template(
+                ["{male} is %s {mask}." % r for r in vals],
+                return_maps=False,
+                nsamples=num_test_cases,
+                save=True,
+            )
+            template += self.editor.template(
+                ["{female} is %s {mask}." % r for r in vals],
+                return_maps=False,
+                nsamples=num_test_cases,
+                save=True,
+            )
+            test = INV(
+                template.data,
+                threshold=0.1,
+                templates=template.templates,
+                name="Protected: %s" % p,
+                capability="Fairness",
+                description="Prediction should be the same for various adjectives within a protected class",
+            )
+
+            self.add_test(test)
+
+    def _default_negation_tests(self, data: Optional[Iterable[str]], num_test_cases=100):
+        template = self.editor.template(
+            "{it} {noun} {nt} {pos_adj}.",
+            it=["This", "That", "The"],
+            nt=["is not", "isn't"],
+            save=True,
+            nsamples=num_test_cases,
+        )
+        template += self.editor.template(
+            "{it} {benot} {a:pos_adj} {noun}.",
+            it=["It", "This", "That"],
+            benot=["is not", "isn't", "was not", "wasn't"],
+            save=True,
+            nsamples=num_test_cases,
+        )
+        neg = ["I can't say I", "I don't", "I would never say I", "I don't think I", "I didn't"]
+        template += self.editor.template(
+            "{neg} {pos_verb_present} {the} {noun}.",
+            neg=neg,
+            the=["this", "that", "the"],
+            save=True,
+            nsamples=num_test_cases,
+        )
+        template += self.editor.template(
+            "No one {pos_verb_present}s {the} {noun}.",
+            neg=neg,
+            the=["this", "that", "the"],
+            save=True,
+            nsamples=num_test_cases,
+        )
+        test = MFT(
+            template.data,
+            labels=self._negative,
+            templates=template.templates,
+            name="Simple negations: negative",
+            capability="Negation",
+            description="Very simple negations of positive statements",
+        )
+
+        self.add_test(test)
+
+        template = self.editor.template(
+            "I thought {it} {noun} would be {pos_adj}, but it {neg}.",
+            neg=["was not", "wasn't"],
+            it=["this", "that", "the"],
+            nt=["is not", "isn't"],
+            save=True,
+            nsamples=num_test_cases,
+        )
+        template += self.editor.template(
+            "I thought I would {pos_verb_present} {the} {noun}, but I {neg}.",
+            neg=["did not", "didn't"],
+            the=["this", "that", "the"],
+            save=True,
+            nsamples=num_test_cases,
+        )
+        test = MFT(
+            template.data,
+            labels=self._negative,
+            templates=template.templates,
+            name="Simple negations: I thought x was positive, but it was not",
+            capability="Negation",
+            description="",
+        )
+        self.add_test(test)
+
+    def _positive_change(self, orig_conf: np.ndarray, conf: np.ndarray) -> float:
+        """
+        Returns the change in the confidence scores.
+        """
+        return (
+            orig_conf[self._negative]
+            - conf[self._negative]
+            + conf[self._positive]
+            - orig_conf[self._positive]
+        )
+
+    def _diff_up(
+        self,
+        orig_pred: int,
+        pred: int,
+        orig_conf: np.ndarray,
+        conf: np.ndarray,
+        labels: Optional[int] = None,
+        meta: Optional[List] = None,
+    ) -> Union[bool, float]:
+        """
+        These arguments are expected by `checklist.expect.Expect.pairwise` function.
+        We only use `orig_conf` and `conf` in this case.
+
+        `orig_conf` is the confidence score of the first example in a test's input data pair.
+
+        A `bool` output indicates whether the test passed the expectation (always
+        `True` in this function's case).
+
+        A `float` output indicates the magnitude of the failure.
+        """
+        tolerance = 0.1
+        change = self._positive_change(orig_conf, conf)
+        if change + tolerance >= 0:
+            return True
+        else:
+            return change + tolerance
+
+    def _diff_down(
+        self,
+        orig_pred: int,
+        pred: int,
+        orig_conf: np.ndarray,
+        conf: np.ndarray,
+        labels: Optional[int] = None,
+        meta: Optional[List] = None,
+    ) -> Union[bool, float]:
+        """
+        These arguments are expected by `checklist.expect.Expect.pairwise` function.
+        We only use `orig_conf` and `conf` in this case.
+
+        `orig_conf` is the confidence score of the first example in a test's input data pair.
+
+        A `bool` output indicates whether the test passed the expectation (always
+        `True` in this function's case).
+
+        A `float` output indicates the magnitude of the failure.
+        """
+        tolerance = 0.1
+        change = self._positive_change(orig_conf, conf)
+        if change - tolerance <= 0:
+            return True
+        else:
+            return -(change - tolerance)
diff --git a/allennlp/sanity_checks/task_checklists/task_suite.py b/allennlp/sanity_checks/task_checklists/task_suite.py
new file mode 100644
index 00000000000..85b05902fdb
--- /dev/null
+++ b/allennlp/sanity_checks/task_checklists/task_suite.py
@@ -0,0 +1,416 @@
+import sys
+import logging
+from typing import Type, Optional, Dict, Any, Callable, List, Iterable, Union, TextIO, Tuple
+
+import numpy as np
+from checklist.test_suite import TestSuite
+from checklist.editor import Editor
+from checklist.test_types import MFT, INV, DIR
+from checklist.perturb import Perturb
+from allennlp.common.registrable import Registrable
+from allennlp.common.file_utils import cached_path
+from allennlp.predictors.predictor import Predictor
+from allennlp.sanity_checks.task_checklists import utils
+
+logger = logging.getLogger(__name__)
+
+
+class TaskSuite(Registrable):
+    """
+    Base class for various task test suites.
+
+    This is a wrapper class around the CheckList toolkit introduced
+    in the paper
+    [Beyond Accuracy: Behavioral Testing of NLP models with CheckList (Ribeiro et al)]
+    (https://api.semanticscholar.org/CorpusID:218551201).
+
+    Task suites are intended to be used as a form of behavioral testing
+    for NLP models to check for robustness across several general linguistic
+    capabilities; eg. Vocabulary, SRL, Negation, etc.
+
+    An example of the entire checklist process can be found at:
+    [https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/]
+    (https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/).
+
+    A task suite should contain tests that check general capabilities, including
+    but not limited to:
+
+    * Vocabulary + POS : Important words/word types for the task
+    * Taxonomy : Synonyms/antonyms, etc.
+    * Robustness : To typos, irrelevant changes, etc.
+    * NER : Appropriately understanding named entities.
+    * Temporal : Understanding the order of events.
+    * Negation
+    * Coreference
+    * Semantic Role Labeling : Understanding roles such as agents and objects.
+    * Logic : Ability to handle symmetry, consistency, and conjunctions.
+    * Fairness
+
+
+    # Parameters
+
+    suite: `checklist.test_suite.TestSuite`, optional (default = `None`)
+        Pass in an existing test suite.
+
+    add_default_tests: `bool` (default = `False`)
+        Whether to add default checklist tests for the task.
+
+    data: `List[Any]`, optional (default = `None`)
+        If the data is provided, and `add_default_tests` is `True`,
+        tests that perturb the data are also added.
+
+        For instance, if the task is sentiment analysis, and the a
+        list of sentences is passed, it will add tests that check
+        a model's robustness to typos, etc.
+    """
+
+    _capabilities: List[str] = [
+        "Vocabulary",
+        "Taxonomy",
+        "Robustness",
+        "NER",
+        "Fairness",
+        "Temporal",
+        "Negation",
+        "Coref",
+        "SRL",
+        "Logic",
+    ]
+
+    def __init__(
+        self,
+        suite: Optional[TestSuite] = None,
+        add_default_tests: bool = True,
+        data: Optional[List[Any]] = None,
+        **kwargs,
+    ):
+        self.suite = suite or TestSuite()
+
+        if add_default_tests:
+            self._default_tests(data, **kwargs)
+
+    def _prediction_and_confidence_scores(self, predictor: Predictor) -> Callable:
+        """
+        This makes certain assumptions about the task predictor
+        input and output expectations. This should return a function
+        that takes the data as input, passes it to the predictor,
+        and returns predictions and confidences.
+        """
+        return NotImplementedError
+
+    def describe(self):
+        """
+        Gives a description of the test suite. This is intended as a utility for
+        examining the test suite.
+        """
+        self._summary(overview_only=True)
+
+    def summary(
+        self, capabilities: Optional[List[str]] = None, file: TextIO = sys.stdout, **kwargs
+    ):
+        """
+        Prints a summary of the test results.
+
+        # Parameters
+
+        capabilities : `List[str]`, optional (default = `None`)
+            If not None, will only show tests with these capabilities.
+        **kwargs : `type`
+            Will be passed as arguments to each test.summary()
+        """
+        old_stdout = sys.stdout
+        try:
+            sys.stdout = file
+            self._summary(capabilities=capabilities, **kwargs)
+        finally:
+            sys.stdout = old_stdout
+
+    def _summary(
+        self, overview_only: bool = False, capabilities: Optional[List[str]] = None, **kwargs
+    ):
+        """
+        Internal function for description and summary.
+        """
+
+        # The capabilities are sorted such that if the capability does not exist
+        # in the list of pre-defined `_capabilities`, then it is put at the end.
+        # `100` is selected as an arbitrary large number; we do not expect the
+        # number of capabilities to be higher.
+        def cap_order(x):
+            return self._capabilities.index(x) if x in self._capabilities else 100
+
+        capabilities = capabilities or sorted(
+            set([x["capability"] for x in self.suite.info.values()]), key=cap_order
+        )
+        print(
+            "\n\nThis suite contains {} tests across {} capabilities.".format(
+                len(self.suite.tests), len(capabilities)
+            )
+        )
+        print()
+        for capability in capabilities:
+            tests = [
+                name for name, test in self.suite.info.items() if test["capability"] == capability
+            ]
+            num_tests = len(tests)
+            if num_tests > 0:
+                print(f'\nCapability: "{capability}" ({num_tests} tests)\n')
+                for test in tests:
+                    description = self.suite.info[test]["description"]
+                    num_test_cases = len(self.suite.tests[test].data)
+                    about_test = f"* Name: {test} ({num_test_cases} test cases)"
+                    if description:
+                        about_test += f"\n{description}"
+                    print(about_test)
+
+                    if not overview_only:
+                        if "format_example_fn" not in kwargs:
+                            kwargs["format_example_fn"] = self.suite.info[test].get(
+                                "format_example_fn", self._format_failing_examples
+                            )
+                        if "print_fn" not in kwargs:
+                            kwargs["print_fn"] = self.suite.info[test].get(
+                                "print_fn", self.suite.print_fn
+                            )
+                        print()
+                        self.suite.tests[test].summary(**kwargs)
+                        print()
+
+    def _format_failing_examples(
+        self,
+        inputs: Tuple[Any],
+        pred: Any,
+        conf: Union[np.array, np.ndarray],
+        *args,
+        **kwargs,
+    ):
+        """
+        Formatting function for printing failed test examples.
+        """
+        if conf.shape[0] <= 4:
+            confs = " ".join(["%.1f" % c for c in conf])
+            ret = "%s %s" % (confs, str(inputs))
+        else:
+            conf = conf[pred]
+            ret = "%s (%.1f) %s" % (pred, conf, str(inputs))
+        return ret
+
+    def run(
+        self,
+        predictor: Predictor,
+        capabilities: Optional[List[str]] = None,
+        max_examples: Optional[int] = None,
+    ):
+        """
+        Runs the predictor on the test suite data.
+
+        # Parameters
+
+        predictor : `Predictor`
+            The predictor object.
+        capabilities : `List[str]`, optional (default = `None`)
+            If not None, will only run tests with these capabilities.
+        max_examples : `int`, optional (default = `None`)
+            Maximum number of examples to run. If None, all examples will be run.
+        """
+        preds_and_confs_fn = self._prediction_and_confidence_scores(predictor)
+        if preds_and_confs_fn is NotImplementedError:
+            raise NotImplementedError(
+                "The `_prediction_and_confidence_scores` function needs "
+                "to be implemented for the class `{}`".format(self.__class__)
+            )
+        if not capabilities:
+            self.suite.run(preds_and_confs_fn, overwrite=True, n=max_examples)
+        else:
+            for _, test in self.suite.tests.items():
+                if test.capability in capabilities:
+                    test.run(preds_and_confs_fn, verbose=True, overwrite=True, n=max_examples)
+
+    @classmethod
+    def constructor(
+        cls,
+        name: Optional[str] = None,
+        suite_file: Optional[str] = None,
+        extra_args: Optional[Dict[str, Any]] = None,
+    ) -> "TaskSuite":
+        suite_class: Type[TaskSuite] = (
+            TaskSuite.by_name(name) if name is not None else cls  # type: ignore
+        )
+
+        if extra_args is None:
+            extra_args = {}
+
+        if suite_file is not None:
+            return suite_class(TestSuite.from_file(cached_path(suite_file)), **extra_args)
+        return suite_class(**extra_args)
+
+    def save_suite(self, suite_file: str):
+        """
+        Saves the suite to a file.
+        """
+        self.suite.save(suite_file)
+
+    def _default_tests(self, data: Optional[Iterable], num_test_cases: int = 100):
+        """
+        Derived TaskSuite classes can add any task-specific tests here.
+        """
+        if data:
+
+            # Robustness
+
+            self._punctuation_test(data, num_test_cases)
+            self._typo_test(data, num_test_cases)
+            self._contraction_test(data, num_test_cases)
+
+    @classmethod
+    def contractions(cls) -> Callable:
+        """
+        This returns a function which adds/removes contractions in relevant
+        `str` inputs of a task's inputs. For instance, "isn't" will be
+        changed to "is not", and "will not" will be changed to "won't".
+
+        Expected arguments for this function: `(example, **args, **kwargs)`
+        where the `example` is an instance of some task. It can be of any
+        type.
+
+        For example, for a sentiment analysis task, it will be a
+        a `str` (the sentence for which we want to predict the sentiment).
+        For a textual entailment task, it can be a tuple or a Dict, etc.
+
+        Expected output of this function is a list of instances for the task,
+        of the same type as `example`.
+        """
+        return Perturb.contractions
+
+    @classmethod
+    def typos(cls) -> Callable:
+        """
+        This returns a function which adds simple typos in relevant
+        `str` inputs of a task's inputs.
+
+        Expected arguments for this function: `(example, **args, **kwargs)`
+        where the `example` is an instance of some task. It can be of any
+        type.
+
+        For example, for a sentiment analysis task, it will be a
+        a `str` (the sentence for which we want to predict the sentiment).
+        For a textual entailment task, it can be a tuple or a Dict, etc.
+
+        Expected output of this function is a list of instances for the task,
+        of the same type as `example`.
+        """
+        return Perturb.add_typos
+
+    @classmethod
+    def punctuation(cls) -> Callable:
+        """
+        This returns a function which adds/removes punctuations in relevant
+        `str` inputs of a task's inputs. For instance, "isn't" will be
+        changed to "is not", and "will not" will be changed to "won't".
+
+        Expected arguments for this function: `(example, **args, **kwargs)`
+        where the `example` is an instance of some task. It can be of any
+        type.
+
+        For example, for a sentiment analysis task, it will be a
+        a `str` (the sentence for which we want to predict the sentiment).
+        For a textual entailment task, it can be a tuple or a Dict, etc.
+
+        Expected output of this function is a list of instances for the task,
+        of the same type as `example`.
+        """
+        return utils.toggle_punctuation
+
+    def _punctuation_test(self, data: Iterable, num_test_cases: int):
+        """
+        Checks if the model is invariant to presence/absence of punctuation.
+        """
+        template = Perturb.perturb(data, self.punctuation(), nsamples=num_test_cases)
+        test = INV(
+            template.data,
+            name="Punctuation",
+            description="Strip punctuation and / or add '.'",
+            capability="Robustness",
+        )
+        self.add_test(test)
+
+    def _typo_test(self, data: Iterable, num_test_cases: int):
+        """
+        Checks if the model is robust enough to be invariant to simple typos.
+        """
+        template = Perturb.perturb(data, self.typos(), nsamples=num_test_cases, typos=1)
+        test = INV(
+            template.data,
+            name="Typos",
+            capability="Robustness",
+            description="Add one typo to input by swapping two adjacent characters",
+        )
+
+        self.add_test(test)
+
+        template = Perturb.perturb(data, self.typos(), nsamples=num_test_cases, typos=2)
+        test = INV(
+            template.data,
+            name="2 Typos",
+            capability="Robustness",
+            description="Add two typos to input by swapping two adjacent characters twice",
+        )
+        self.add_test(test)
+
+    def _contraction_test(self, data: Iterable, num_test_cases: int):
+        """
+        Checks if the model is invariant to contractions and expansions
+        (eg. What is <-> What's).
+        """
+        template = Perturb.perturb(data, self.contractions(), nsamples=num_test_cases)
+        test = INV(
+            template.data,
+            name="Contractions",
+            capability="Robustness",
+            description="Contract or expand contractions, e.g. What is <-> What's",
+        )
+        self.add_test(test)
+
+    def _setup_editor(self):
+        """
+        Sets up a `checklist.editor.Editor` object, to be used for adding
+        default tests to the suite.
+        """
+        if not hasattr(self, "editor"):
+            self.editor = Editor()
+
+    def add_test(self, test: Union[MFT, INV, DIR]):
+        """
+        Adds a fully specified checklist test to the suite.
+        The tests can be of the following types:
+
+        * MFT: A minimum functionality test. It checks if the predicted output
+               matches the expected output.
+               For example, for a sentiment analysis task, a simple MFT can check
+               if the model always predicts a positive sentiment for very
+               positive words.
+               The test's data contains the input and the expected output.
+
+        * INV: An invariance test. It checks if the predicted output is invariant
+               to some change in the input.
+               For example, for a sentiment analysis task, an INV test can check
+               if the prediction stays consistent if simple typos are added.
+               The test's data contains the pairs (input, modified input).
+
+        * DIR: A directional expectation test. It checks if the predicted output
+               changes in some specific way in response to the change in input.
+               For example, for a sentiment analysis task, a DIR test can check if
+               adding a reducer (eg. "good" -> "somewhat good") causes the
+               prediction's positive confidence score to decrease (or at least not
+               increase).
+               The test's data contains single inputs or pairs (input, modified input).
+
+        Please refer to [the paper](https://api.semanticscholar.org/CorpusID:218551201)
+        for more details and examples.
+
+        Note: `test` needs to be fully specified; with name, capability and description.
+        """
+        if test.data:  # test data should contain at least one example.
+            self.suite.add(test)
+        else:
+            logger.warning("'{}' was not added, as it contains no examples.".format(test.name))
diff --git a/allennlp/sanity_checks/task_checklists/textual_entailment_suite.py b/allennlp/sanity_checks/task_checklists/textual_entailment_suite.py
new file mode 100644
index 00000000000..566324b440f
--- /dev/null
+++ b/allennlp/sanity_checks/task_checklists/textual_entailment_suite.py
@@ -0,0 +1,453 @@
+from typing import Optional, Tuple, Iterable, Callable, Union
+import itertools
+import numpy as np
+from overrides import overrides
+from checklist.test_suite import TestSuite
+from checklist.test_types import MFT
+from checklist.perturb import Perturb
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+from allennlp.sanity_checks.task_checklists import utils
+
+
+def _wrap_apply_to_each(perturb_fn: Callable, both: bool = False, *args, **kwargs):
+    """
+    Wraps the perturb function so that it is applied to
+    both elements in the (premise, hypothesis) tuple.
+    """
+
+    def new_fn(pair, *args, **kwargs):
+        premise, hypothesis = pair
+        ret = []
+        fn_premise = perturb_fn(premise, *args, **kwargs)
+        fn_hypothesis = perturb_fn(hypothesis, *args, **kwargs)
+        if type(fn_premise) != list:
+            fn_premise = [fn_premise]
+        if type(fn_hypothesis) != list:
+            fn_hypothesis = [fn_hypothesis]
+        ret.extend([(x, str(hypothesis)) for x in fn_premise])
+        ret.extend([(str(premise), x) for x in fn_hypothesis])
+        if both:
+            ret.extend([(x, x2) for x, x2 in itertools.product(fn_premise, fn_hypothesis)])
+
+        # The perturb function can return empty strings, if no relevant perturbations
+        # can be applied. Eg. if the sentence is "This is a good movie", a perturbation
+        # which toggles contractions will have no effect.
+        return [x for x in ret if x[0] and x[1]]
+
+    return new_fn
+
+
+@TaskSuite.register("textual-entailment")
+class TextualEntailmentSuite(TaskSuite):
+    def __init__(
+        self,
+        suite: Optional[TestSuite] = None,
+        entails: int = 0,
+        contradicts: int = 1,
+        neutral: int = 2,
+        premise: str = "premise",
+        hypothesis: str = "hypothesis",
+        probs_key: str = "probs",
+        **kwargs,
+    ):
+
+        self._entails = entails
+        self._contradicts = contradicts
+        self._neutral = neutral
+
+        self._premise = premise
+        self._hypothesis = hypothesis
+
+        self._probs_key = probs_key
+
+        super().__init__(suite, **kwargs)
+
+    def _prediction_and_confidence_scores(self, predictor):
+        def preds_and_confs_fn(data):
+            labels = []
+            confs = []
+
+            data = [{self._premise: pair[0], self._hypothesis: pair[1]} for pair in data]
+            predictions = predictor.predict_batch_json(data)
+            for pred in predictions:
+                label = np.argmax(pred[self._probs_key])
+                labels.append(label)
+                confs.append(pred[self._probs_key])
+            return np.array(labels), np.array(confs)
+
+        return preds_and_confs_fn
+
+    @overrides
+    def _format_failing_examples(
+        self,
+        inputs: Tuple,
+        pred: int,
+        conf: Union[np.array, np.ndarray],
+        label: Optional[int] = None,
+        *args,
+        **kwargs,
+    ):
+        """
+        Formatting function for printing failed test examples.
+        """
+        labels = {
+            self._entails: "Entails",
+            self._contradicts: "Contradicts",
+            self._neutral: "Neutral",
+        }
+        ret = "Premise: %s\nHypothesis: %s" % (inputs[0], inputs[1])
+        if label is not None:
+            ret += "\nOriginal: %s" % labels[label]
+        ret += "\nPrediction: Entails (%.1f), Contradicts (%.1f), Neutral (%.1f)" % (
+            conf[self._entails],
+            conf[self._contradicts],
+            conf[self._neutral],
+        )
+
+        return ret
+
+    @classmethod
+    def contractions(cls):
+        return _wrap_apply_to_each(Perturb.contractions, both=True)
+
+    @classmethod
+    def typos(cls):
+        return _wrap_apply_to_each(Perturb.add_typos, both=False)
+
+    @classmethod
+    def punctuation(cls):
+        return _wrap_apply_to_each(utils.toggle_punctuation, both=False)
+
+    @overrides
+    def _setup_editor(self):
+        super()._setup_editor()
+
+        antonyms = [
+            ("progressive", "conservative"),
+            ("positive", "negative"),
+            ("defensive", "offensive"),
+            ("rude", "polite"),
+            ("optimistic", "pessimistic"),
+            ("stupid", "smart"),
+            ("negative", "positive"),
+            ("unhappy", "happy"),
+            ("active", "passive"),
+            ("impatient", "patient"),
+            ("powerless", "powerful"),
+            ("visible", "invisible"),
+            ("fat", "thin"),
+            ("bad", "good"),
+            ("cautious", "brave"),
+            ("hopeful", "hopeless"),
+            ("insecure", "secure"),
+            ("humble", "proud"),
+            ("passive", "active"),
+            ("dependent", "independent"),
+            ("pessimistic", "optimistic"),
+            ("irresponsible", "responsible"),
+            ("courageous", "fearful"),
+        ]
+
+        self.editor.add_lexicon("antonyms", antonyms, overwrite=True)
+
+        comp = [
+            "smarter",
+            "better",
+            "worse",
+            "brighter",
+            "bigger",
+            "louder",
+            "longer",
+            "larger",
+            "smaller",
+            "warmer",
+            "colder",
+            "thicker",
+            "lighter",
+            "heavier",
+        ]
+
+        self.editor.add_lexicon("compare", comp, overwrite=True)
+
+        nouns = [
+            "humans",
+            "cats",
+            "dogs",
+            "people",
+            "mice",
+            "pigs",
+            "birds",
+            "sheep",
+            "cows",
+            "rats",
+            "chickens",
+            "fish",
+            "bears",
+            "elephants",
+            "rabbits",
+            "lions",
+            "monkeys",
+            "snakes",
+            "bees",
+            "spiders",
+            "bats",
+            "puppies",
+            "dolphins",
+            "babies",
+            "kittens",
+            "children",
+            "frogs",
+            "ants",
+            "butterflies",
+            "insects",
+            "turtles",
+            "trees",
+            "ducks",
+            "whales",
+            "robots",
+            "animals",
+            "bugs",
+            "kids",
+            "crabs",
+            "carrots",
+            "dragons",
+            "mosquitoes",
+            "cars",
+            "sharks",
+            "dinosaurs",
+            "horses",
+            "tigers",
+        ]
+        self.editor.add_lexicon("nouns", nouns, overwrite=True)
+
+        professions = [
+            "journalist",
+            "historian",
+            "secretary",
+            "nurse",
+            "waitress",
+            "accountant",
+            "engineer",
+            "attorney",
+            "artist",
+            "editor",
+            "architect",
+            "model",
+            "interpreter",
+            "analyst",
+            "actor",
+            "actress",
+            "assistant",
+            "intern",
+            "economist",
+            "organizer",
+            "author",
+            "investigator",
+            "agent",
+            "administrator",
+            "executive",
+            "educator",
+            "investor",
+            "DJ",
+            "entrepreneur",
+            "auditor",
+            "advisor",
+            "instructor",
+            "activist",
+            "consultant",
+            "apprentice",
+            "reporter",
+            "expert",
+            "psychologist",
+            "examiner",
+            "painter",
+            "manager",
+            "contractor",
+            "therapist",
+            "programmer",
+            "musician",
+            "producer",
+            "associate",
+            "intermediary",
+            "designer",
+            "cook",
+            "salesperson",
+            "dentist",
+            "attorney",
+            "detective",
+            "banker",
+            "researcher",
+            "cop",
+            "driver",
+            "counselor",
+            "clerk",
+            "professor",
+            "tutor",
+            "coach",
+            "chemist",
+            "scientist",
+            "veterinarian",
+            "firefighter",
+            "baker",
+            "psychiatrist",
+            "prosecutor",
+            "director",
+            "technician",
+        ]
+        self.editor.add_lexicon("professions", professions, overwrite=True)
+
+    @overrides
+    def _default_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+        super()._default_tests(data, num_test_cases)
+        self._setup_editor()
+        self._default_vocabulary_tests(data, num_test_cases)
+        self._default_ner_tests(data, num_test_cases)
+        self._default_temporal_tests(data, num_test_cases)
+        self._default_logic_tests(data, num_test_cases)
+        self._default_negation_tests(data, num_test_cases)
+
+    def _default_vocabulary_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+
+        template = self.editor.template(
+            (
+                "{first_name1} is more {antonyms[0]} than {first_name2}",
+                "{first_name2} is more {antonyms[1]} than {first_name1}",
+            ),
+            remove_duplicates=True,
+            nsamples=num_test_cases,
+        )
+
+        test = MFT(
+            **template,
+            labels=self._entails,
+            name='"A is more COMP than B" entails "B is more antonym(COMP) than A"',
+            capability="Vocabulary",
+            description="Eg. A is more active than B implies that B is more passive than A",
+        )
+
+        self.add_test(test)
+
+    def _default_logic_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+        template = self.editor.template(
+            ("{nouns1} are {compare} than {nouns2}", "{nouns2} are {compare} than {nouns1}"),
+            nsamples=num_test_cases,
+            remove_duplicates=True,
+        )
+
+        test = MFT(
+            **template,
+            labels=self._contradicts,
+            name='"A is COMP than B" contradicts "B is COMP than A"',
+            capability="Logic",
+            description='Eg. "A is better than B" contradicts "B is better than A"',
+        )
+
+        self.add_test(test)
+
+        if data:
+            template = Perturb.perturb(
+                data, lambda x: (x[0], x[0]), nsamples=num_test_cases, keep_original=False
+            )
+            template += Perturb.perturb(
+                data, lambda x: (x[1], x[1]), nsamples=num_test_cases, keep_original=False
+            )
+
+            test = MFT(
+                **template,
+                labels=self._entails,
+                name="A entails A (premise == hypothesis)",
+                capability="Logic",
+                description="If premise and hypothesis are the same, then premise entails the hypothesis",
+            )
+
+            self.add_test(test)
+
+    def _default_negation_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+
+        template = self.editor.template(
+            (
+                "{first_name1} is {compare} than {first_name2}",
+                "{first_name1} is not {compare} than {first_name2}",
+            ),
+            nsamples=num_test_cases,
+            remove_duplicates=True,
+        )
+
+        test = MFT(
+            **template,
+            labels=self._contradicts,
+            name='"A is COMP than B" contradicts "A is not COMP than B"',
+            capability="Negation",
+            description="Eg. A is better than B contradicts A is not better than C",
+        )
+
+        self.add_test(test)
+
+    def _default_ner_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+        template = self.editor.template(
+            (
+                "{first_name1} is {compare} than {first_name2}",
+                "{first_name1} is {compare} than {first_name3}",
+            ),
+            nsamples=num_test_cases,
+            remove_duplicates=True,
+        )
+
+        test = MFT(
+            **template,
+            labels=self._neutral,
+            name='"A is COMP than B" gives no information about "A is COMP than C"',
+            capability="NER",
+            description='Eg. "A is better than B" gives no information about "A is better than C"',
+        )
+
+        self.add_test(test)
+
+    def _default_temporal_tests(self, data: Optional[Iterable[Tuple]], num_test_cases=100):
+        template = self.editor.template(
+            (
+                "{first_name} works as {a:professions}",
+                "{first_name} used to work as a {professions}",
+            ),
+            nsamples=num_test_cases,
+            remove_duplicates=True,
+        )
+
+        template += self.editor.template(
+            (
+                "{first_name} {last_name} is {a:professions}",
+                "{first_name} {last_name} was {a:professions}",
+            ),
+            nsamples=num_test_cases,
+            remove_duplicates=True,
+        )
+
+        test = MFT(
+            **template,
+            labels=self._neutral,
+            name='"A works as P" gives no information about "A used to work as P"',
+            capability="Temporal",
+            description='Eg. "A is a writer" gives no information about "A was a writer"',
+        )
+
+        self.add_test(test)
+
+        template = self.editor.template(
+            (
+                "{first_name} was {a:professions1} before they were {a:professions2}",
+                "{first_name} was {a:professions1} after they were {a:professions2}",
+            ),
+            nsamples=num_test_cases,
+            remove_duplicates=True,
+        )
+
+        test = MFT(
+            **template,
+            labels=self._contradicts,
+            name="Before != After",
+            capability="Temporal",
+            description='Eg. "A was a writer before they were a journalist" '
+            'contradicts "A was a writer after they were a journalist"',
+        )
+
+        self.add_test(test)
diff --git a/allennlp/sanity_checks/task_checklists/utils.py b/allennlp/sanity_checks/task_checklists/utils.py
new file mode 100644
index 00000000000..22ad9deedf1
--- /dev/null
+++ b/allennlp/sanity_checks/task_checklists/utils.py
@@ -0,0 +1,98 @@
+import string
+from typing import Dict, Callable, List, Union
+import numpy as np
+import spacy
+
+
+def spacy_wrap(fn: Callable, language: str = "en_core_web_sm", **kwargs) -> Callable:
+    """
+    Wrap the function so that it runs the input text data
+    through a spacy model before the function call.
+    """
+    from allennlp.common.util import get_spacy_model
+
+    def new_fn(data: Union[spacy.tokens.doc.Doc, Dict, str]):
+        if not isinstance(data, spacy.tokens.doc.Doc):
+            model = get_spacy_model(language, **kwargs)
+            if isinstance(data, Dict):
+                for key, val in data.items():
+                    if isinstance(val, str):
+                        data[key] = model(val)
+            elif isinstance(data, tuple):
+                data = tuple(model(tup) if isinstance(tup, str) else tup for tup in data)
+            elif isinstance(data, str):
+                data = model(data)
+            else:
+                pass
+        return fn(data)
+
+    return new_fn
+
+
+def strip_punctuation(data: Union[str, spacy.tokens.doc.Doc]) -> str:
+    """
+    Removes all punctuation from `data`.
+    """
+    if isinstance(data, str):
+        return data.rstrip(string.punctuation)
+    elif isinstance(data, spacy.tokens.doc.Doc):
+        while len(data) and data[-1].is_punct:
+            data = data[:-1]
+    else:
+        # Can log a warning here, but it may get noisy.
+        pass
+    return str(data)
+
+
+def toggle_punctuation(data: str) -> List[str]:
+    """
+    If `data` contains any punctuation, it is removed.
+    Otherwise, a `.` is added to the string.
+    Returns a list of strings.
+
+    Eg.
+    `data` = "This was great!"
+    Returns ["This was great", "This was great."]
+
+    `data` = "The movie was good"
+    Returns ["The movie was good."]
+    """
+    s = strip_punctuation(data)
+    ret = []
+    if s != data:
+        ret.append(s)
+    if s + "." != data:
+        ret.append(s + ".")
+    return ret
+
+
+def random_string(n: int) -> str:
+    """
+    Returns a random alphanumeric string of length `n`.
+    """
+    return "".join(np.random.choice([x for x in string.ascii_letters + string.digits], n))
+
+
+def random_url(n: int = 6) -> str:
+    """
+    Returns a random url of length `n`.
+    """
+    return "https://t.co/%s" % random_string(n)
+
+
+def random_handle(n: int = 6) -> str:
+    """
+    Returns a random handle of length `n`. Eg. "@randomstr23`
+    """
+    return "@%s" % random_string(n)
+
+
+def add_random_strings(data: str) -> List[str]:
+    """
+    Adds random strings to the start and end of the string `data`.
+    Returns a list of strings.
+    """
+    urls_and_handles = [random_url(n=6) for _ in range(5)] + [random_handle() for _ in range(5)]
+    rets = ["%s %s" % (x, data) for x in urls_and_handles]
+    rets += ["%s %s" % (data, x) for x in urls_and_handles]
+    return rets
diff --git a/setup.py b/setup.py
index 22d600c6806..886c40d2482 100644
--- a/setup.py
+++ b/setup.py
@@ -71,6 +71,7 @@
         "filelock>=3.0,<3.1",
         "lmdb",
         "more-itertools",
+        "checklist==0.0.10",
         "wandb>=0.10.0,<0.11.0",
         "huggingface_hub>=0.0.8",
     ],
diff --git a/test_fixtures/task_suites/fake_suite.tar.gz b/test_fixtures/task_suites/fake_suite.tar.gz
new file mode 100644
index 00000000000..f2a2525a647
Binary files /dev/null and b/test_fixtures/task_suites/fake_suite.tar.gz differ
diff --git a/tests/commands/checklist_test.py b/tests/commands/checklist_test.py
new file mode 100644
index 00000000000..f566ceb0408
--- /dev/null
+++ b/tests/commands/checklist_test.py
@@ -0,0 +1,53 @@
+import argparse
+import sys
+
+from allennlp.commands import main
+from allennlp.commands.checklist import CheckList
+from allennlp.common.testing import AllenNlpTestCase
+
+
+class TestCheckList(AllenNlpTestCase):
+    def setup_method(self):
+        super().setup_method()
+        self.archive_file = (
+            self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz"
+        )
+        self.task = "sentiment-analysis"
+
+    def test_add_checklist_subparser(self):
+        parser = argparse.ArgumentParser(description="Testing")
+        subparsers = parser.add_subparsers(title="Commands", metavar="")
+        CheckList().add_subparser(subparsers)
+
+        kebab_args = [
+            "checklist",  # command
+            "/path/to/archive",  # archive
+            "task-suite-name",
+            "--checklist-suite",
+            "/path/to/checklist/pkl",
+            "--output-file",
+            "/dev/null",
+            "--cuda-device",
+            "0",
+        ]
+
+        args = parser.parse_args(kebab_args)
+
+        assert args.func.__name__ == "_run_suite"
+        assert args.archive_file == "/path/to/archive"
+        assert args.task == "task-suite-name"
+        assert args.output_file == "/dev/null"
+        assert args.cuda_device == 0
+
+    def test_works_with_known_model(self):
+
+        sys.argv = [
+            "__main__.py",  # executable
+            "checklist",  # command
+            str(self.archive_file),
+            str(self.task),
+            "--task-suite-args",
+            '{"positive": 1, "negative": 0}',
+        ]
+
+        main()
diff --git a/tests/data/dataset_readers/sharded_dataset_reader_test.py b/tests/data/dataset_readers/sharded_dataset_reader_test.py
index b0943046ded..94840bde56a 100644
--- a/tests/data/dataset_readers/sharded_dataset_reader_test.py
+++ b/tests/data/dataset_readers/sharded_dataset_reader_test.py
@@ -54,7 +54,7 @@ def setup_method(self) -> None:
 
     def read_and_check_instances(self, filepath: str, num_workers: int = 0):
         data_loader = MultiProcessDataLoader(
-            self.reader, filepath, num_workers=num_workers, batch_size=1
+            self.reader, filepath, num_workers=num_workers, batch_size=1, start_method="spawn"
         )
         all_instances = []
         for instance in data_loader.iter_instances():
diff --git a/tests/modules/transformer/self_attention_test.py b/tests/modules/transformer/self_attention_test.py
index b8a4d37d8fb..e29ae44cf9e 100644
--- a/tests/modules/transformer/self_attention_test.py
+++ b/tests/modules/transformer/self_attention_test.py
@@ -81,6 +81,7 @@ def test_can_construct_from_params(self):
 
         assert self.self_attention.dropout.p == self.params_dict["dropout"]
 
+    @pytest.mark.skip("Takes up too much memory")
     @pytest.mark.parametrize("module_name, hf_module", get_modules(PARAMS_DICT).items())
     def test_forward_against_huggingface_output(self, module_name, hf_module):
         hidden_states = torch.randn(2, 3, 6)
@@ -101,6 +102,7 @@ def test_forward_against_huggingface_output(self, module_name, hf_module):
 
         assert torch.allclose(output[0], hf_output[0])
 
+    @pytest.mark.skip("Takes up too much memory")
     @pytest.mark.parametrize(
         "pretrained_name",
         [
diff --git a/tests/modules/transformer/transformer_stack_test.py b/tests/modules/transformer/transformer_stack_test.py
index f9383960822..0481a407937 100644
--- a/tests/modules/transformer/transformer_stack_test.py
+++ b/tests/modules/transformer/transformer_stack_test.py
@@ -169,6 +169,7 @@ def test_loading_partial_pretrained_weights(self):
             mapping,
         )
 
+    @pytest.mark.skip("Takes up too much memory")
     @pytest.mark.parametrize("module_name, hf_module", get_modules(PARAMS_DICT).items())
     def test_forward_against_huggingface_outputs(self, module_name, hf_module):
         hidden_states = torch.randn(2, 3, 6)
diff --git a/tests/sanity_checks/task_checklists/__init__.py b/tests/sanity_checks/task_checklists/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/sanity_checks/task_checklists/sentiment_analysis_suite_test.py b/tests/sanity_checks/task_checklists/sentiment_analysis_suite_test.py
new file mode 100644
index 00000000000..5f4f329b578
--- /dev/null
+++ b/tests/sanity_checks/task_checklists/sentiment_analysis_suite_test.py
@@ -0,0 +1,25 @@
+from allennlp.sanity_checks.task_checklists.sentiment_analysis_suite import SentimentAnalysisSuite
+from allennlp.common.testing import AllenNlpTestCase
+from allennlp.models.archival import load_archive
+from allennlp.predictors import Predictor
+
+
+class TestSentimentAnalysisSuite(AllenNlpTestCase):
+    def setup_method(self):
+        super().setup_method()
+        archive = load_archive(
+            self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz"
+        )
+        self.predictor = Predictor.from_archive(archive)
+
+    def test_run(self):
+        data = [
+            "This is really good",
+            "This was terrible",
+            "This was not good",
+            "John Smith acted very well.",
+            "Seattle was very gloomy.",
+            "I have visited the place for 3 years; great food!",
+        ]
+        suite = SentimentAnalysisSuite(add_default_tests=True, data=data)
+        suite.run(self.predictor, max_examples=10)
diff --git a/tests/sanity_checks/task_checklists/task_suite_test.py b/tests/sanity_checks/task_checklists/task_suite_test.py
new file mode 100644
index 00000000000..84623511f77
--- /dev/null
+++ b/tests/sanity_checks/task_checklists/task_suite_test.py
@@ -0,0 +1,62 @@
+import pytest
+from allennlp.sanity_checks.task_checklists.task_suite import TaskSuite
+from allennlp.common.testing import AllenNlpTestCase
+from allennlp.common.checks import ConfigurationError
+from allennlp.models.archival import load_archive
+from allennlp.predictors import Predictor
+from allennlp.common.testing.checklist_test import FakeTaskSuite  # noqa: F401
+
+
+class TestTaskSuite(AllenNlpTestCase):
+    def setup_method(self):
+        super().setup_method()
+        archive = load_archive(
+            self.FIXTURES_ROOT / "basic_classifier" / "serialization" / "model.tar.gz"
+        )
+        self.predictor = Predictor.from_archive(archive)
+
+    def test_load_from_suite_file(self):
+        suite_file = str(self.FIXTURES_ROOT / "task_suites" / "fake_suite.tar.gz")
+
+        task_suite = TaskSuite.constructor(suite_file=suite_file)
+
+        assert len(task_suite.suite.tests) == 1
+
+    def test_load_by_name(self):
+
+        task_suite = TaskSuite.constructor(name="fake-task-suite")
+
+        assert task_suite._fake_arg1 is None
+        assert task_suite._fake_arg2 is None
+
+        assert len(task_suite.suite.tests) == 1
+
+        with pytest.raises(ConfigurationError):
+            TaskSuite.constructor(name="suite-that-does-not-exist")
+
+    def test_load_with_extra_args(self):
+        extra_args = {"fake_arg1": "some label"}
+        task_suite = TaskSuite.constructor(name="fake-task-suite", extra_args=extra_args)
+        assert task_suite._fake_arg1 == "some label"
+
+    def test_prediction_and_confidence_scores_function_needs_implementation(self):
+
+        task_suite = TaskSuite.constructor(name="fake-task-suite")
+
+        with pytest.raises(NotImplementedError):
+            task_suite.run(self.predictor)
+
+    def test_add_default_tests(self):
+
+        # We include "isn't" so that the contractions test is also added.
+        data = ["This isn't real data"]
+        task_suite = TaskSuite(add_default_tests=True, data=data)
+        assert "Typos" in task_suite.suite.tests
+        assert "2 Typos" in task_suite.suite.tests
+        assert "Contractions" in task_suite.suite.tests
+
+        data = ["This is data with no contractions."]
+        task_suite = TaskSuite(add_default_tests=True, data=data)
+        assert "Typos" in task_suite.suite.tests
+        assert "2 Typos" in task_suite.suite.tests
+        assert "Contractions" not in task_suite.suite.tests
diff --git a/tests/sanity_checks/task_checklists/utils_test.py b/tests/sanity_checks/task_checklists/utils_test.py
new file mode 100644
index 00000000000..ce6e17eb902
--- /dev/null
+++ b/tests/sanity_checks/task_checklists/utils_test.py
@@ -0,0 +1,12 @@
+from allennlp.sanity_checks.task_checklists import utils
+from allennlp.common.testing import AllenNlpTestCase
+
+
+class TestUtils(AllenNlpTestCase):
+    def test_punctuations(self):
+        perturbed = utils.toggle_punctuation("This has a period.")
+
+        assert perturbed[0] == "This has a period"
+
+        perturbed = utils.toggle_punctuation("This does not have a period")
+        assert perturbed[0] == "This does not have a period."