diff --git a/lit_nlp/examples/datasets/classification.py b/lit_nlp/examples/datasets/classification.py
index 0a022f1c..0d1e1df3 100644
--- a/lit_nlp/examples/datasets/classification.py
+++ b/lit_nlp/examples/datasets/classification.py
@@ -1,10 +1,8 @@
 """Text classification datasets, including single- and two-sentence tasks."""
 from typing import Optional
 
-from absl import logging
 from lit_nlp.api import dataset as lit_dataset
 from lit_nlp.api import types as lit_types
-import pandas as pd
 import tensorflow_datasets as tfds
 
 
@@ -17,134 +15,6 @@ def load_tfds(*args, **kw):
       tfds.as_numpy(tfds.load(*args, download=True, try_gcs=True, **kw)))
 
 
-class MNLIDataFromTSV(lit_dataset.Dataset):
-  """MultiNLI dataset, from TSV.
-
-  Compared to the TFDS version, this includes:
-  - label2 field for binary labels, with same schema as HANS
-  - genre labels, for stratified analysis
-
-  The downside is that you need to download the data from
-  https://gluebenchmark.com/tasks, and provide a path to the .tsv file.
-  """
-
-  LABELS3 = ["entailment", "neutral", "contradiction"]
-  LABELS2 = ["non-entailment", "entailment"]
-
-  def binarize_label(self, label):
-    return "entailment" if label == "entailment" else "non-entailment"
-
-  def __init__(self, path: str):
-    self._examples = self.load_datapoints(path)
-
-  def load_datapoints(self, path: str):
-    with open(path) as fd:
-      df = pd.read_csv(fd, sep="\t")
-    # pylint: disable=g-complex-comprehension
-    return [{
-        "premise": row["sentence1"],
-        "hypothesis": row["sentence2"],
-        "label": row["gold_label"],
-        "label2": self.binarize_label(row["gold_label"]),
-        "genre": row["genre"],
-    } for _, row in df.iterrows()]
-    # pylint: enable=g-complex-comprehension
-
-  def load(self, path: str):
-    datapoints = self.load_datapoints(path)
-    return lit_dataset.Dataset(base=self, examples=datapoints)
-
-  def save(self, examples: list[lit_types.IndexedInput], path: str):
-    example_data = [ex["data"] for ex in examples]
-    df = pd.DataFrame(example_data).rename(columns={
-        "premise": "sentence1",
-        "hypothesis": "sentence2",
-        "label": "gold_label",
-    })
-    with open(path, "w") as fd:
-      df.to_csv(fd, sep="\t")
-
-  def spec(self) -> lit_types.Spec:
-    """Should match MnliModel's input_spec()."""
-    return {
-        "premise": lit_types.TextSegment(),
-        "hypothesis": lit_types.TextSegment(),
-        # 'label' for 3-way NLI labels, 'label2' for binarized.
-        "label": lit_types.CategoryLabel(vocab=self.LABELS3),
-        "label2": lit_types.CategoryLabel(vocab=self.LABELS2),
-        "genre": lit_types.CategoryLabel(),
-    }
-
-
-class XNLIData(lit_dataset.Dataset):
-  """Cross-lingual NLI; see https://cims.nyu.edu/~sbowman/xnli/."""
-
-  LABELS = ["entailment", "neutral", "contradiction"]
-
-  def _process_example(self, ex, languages: list[str]):
-    # Hypothesis is stored as parallel arrays, so make a map.
-    hyp_map = {
-        lang.decode("utf-8"): hyp.decode("utf-8") for lang, hyp in zip(
-            ex["hypothesis"]["language"], ex["hypothesis"]["translation"])
-    }
-    for lang in languages:
-      if lang not in hyp_map:
-        logging.warning("Missing hypothesis (lang=%s) for premise '%s'", lang,
-                        ex["premise"]["lang"].decode("utf-8"))
-        continue
-      yield {
-          "premise": ex["premise"][lang].decode("utf-8"),
-          "hypothesis": hyp_map[lang],
-          "label": self.LABELS[ex["label"]],
-          "language": lang,
-      }
-
-  def __init__(self, split: str, languages=("en", "es", "hi", "zh")):
-    self._examples = []
-    for ex in load_tfds("xnli", split=split):
-      # Each TFDS example contains all the translations; we unpack to individual
-      # (premise, hypothesis) pairs that are compatible with a standard NLI
-      # model.
-      self._examples.extend(self._process_example(ex, languages))
-
-  def spec(self):
-    return {
-        "premise": lit_types.TextSegment(),
-        "hypothesis": lit_types.TextSegment(),
-        "label": lit_types.CategoryLabel(vocab=self.LABELS),
-        "language": lit_types.CategoryLabel(),
-    }
-
-
-class HansNLIData(lit_dataset.Dataset):
-  """HANS NLI challenge set (https://arxiv.org/abs/1902.01007); 30k examples."""
-
-  LABELS = ["non-entailment", "entailment"]
-
-  def __init__(self, path: str):
-    with open(path) as fd:
-      df = pd.read_csv(fd, sep="\t", header=0)
-    # pylint: disable=g-complex-comprehension
-    self._examples = [{
-        "premise": row["sentence1"],
-        "hypothesis": row["sentence2"],
-        "label2": row["gold_label"],
-        "heuristic": row["heuristic"],
-        "template": row["template"],
-    } for _, row in df.iterrows()]
-    # pylint: enable=g-complex-comprehension
-
-  def spec(self) -> lit_types.Spec:
-    return {
-        "premise": lit_types.TextSegment(),
-        "hypothesis": lit_types.TextSegment(),
-        # 'label2' for 2-way NLI labels
-        "label2": lit_types.CategoryLabel(vocab=self.LABELS),
-        "heuristic": lit_types.CategoryLabel(),
-        "template": lit_types.CategoryLabel(),
-    }
-
-
 class IMDBData(lit_dataset.Dataset):
   """IMDB reviews dataset; see http://ai.stanford.edu/~amaas/data/sentiment/."""
 
diff --git a/lit_nlp/examples/xnli_demo.py b/lit_nlp/examples/xnli_demo.py
deleted file mode 100644
index 4f209e39..00000000
--- a/lit_nlp/examples/xnli_demo.py
+++ /dev/null
@@ -1,103 +0,0 @@
-r"""Example demo for multilingual NLI on the XNLI eval set.
-
-To run locally with our trained model:
-  python -m lit_nlp.examples.xnli_demo --port=5432
-
-Then navigate to localhost:5432 to access the demo UI.
-
-To train a model for this task, use tools/glue_trainer.py or your favorite
-trainer script to fine-tune a multilingual encoder, such as
-bert-base-multilingual-cased, on the mnli task.
-
-Note: the LIT UI can handle around 10k examples comfortably, depending on your
-hardware. The monolingual (english) eval sets for MNLI are about 9.8k each,
-while each language for XNLI is about 2.5k examples, so we recommend using the
---languages flag to load only the languages you're interested in.
-""" - -from collections.abc import Sequence -import sys -from typing import Optional - -from absl import app -from absl import flags -from absl import logging - -from lit_nlp import dev_server -from lit_nlp import server_flags -from lit_nlp.examples.datasets import classification -from lit_nlp.examples.datasets import glue -from lit_nlp.examples.models import glue_models -from lit_nlp.lib import file_cache - -# NOTE: additional flags defined in server_flags.py - -FLAGS = flags.FLAGS - -FLAGS.set_default("development_demo", True) - -_LANGUAGES = flags.DEFINE_list( - "languages", ["en", "es", "hi", "zh"], - "Languages to load from XNLI. Available languages: " - "ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,zh,vi" -) - -_MODEL_PATH = flags.DEFINE_string( - "model_path", - "https://storage.googleapis.com/what-if-tool-resources/lit-models/mbert_mnli.tar.gz", - ( - "Path to fine-tuned model files. Expects model to be in standard " - "transformers format, e.g. as saved by model.save_pretrained() and " - "tokenizer.save_pretrained()." - ), -) - -_MAX_EXAMPLES = flags.DEFINE_integer( - "max_examples", None, "Maximum number of examples to load into LIT. " - "Note: MNLI eval set is 10k examples, so will take a while to run and may " - "be slow on older machines. Set --max_examples=200 for a quick start.") - - -def get_wsgi_app() -> Optional[dev_server.LitServerType]: - """Returns a LitApp instance for consumption by gunicorn.""" - FLAGS.set_default("server_type", "external") - FLAGS.set_default("demo_mode", True) - # Parse flags without calling app.run(main), to avoid conflict with - # gunicorn command line flags. - unused = flags.FLAGS(sys.argv, known_only=True) - if unused: - logging.info("xnli_demo:get_wsgi_app() called with unused args: %s", unused) - return main([]) - - -def main(argv: Sequence[str]) -> Optional[dev_server.LitServerType]: - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") - - # Normally path is a directory; if it's an archive file, download and - # extract to the transformers cache. - model_path = _MODEL_PATH.value - if model_path.endswith(".tar.gz"): - model_path = file_cache.cached_path( - model_path, extract_compressed_file=True) - - models = {"nli": glue_models.MNLIModel(model_path, inference_batch_size=16)} - datasets = { - "xnli": classification.XNLIData("validation", _LANGUAGES.value), - "mnli_dev": glue.MNLIData("validation_matched"), - "mnli_dev_mm": glue.MNLIData("validation_mismatched"), - } - - # Truncate datasets if --max_examples is set. - for name in datasets: - logging.info("Dataset: '%s' with %d examples", name, len(datasets[name])) - datasets[name] = datasets[name].slice[:_MAX_EXAMPLES.value] - logging.info(" truncated to %d examples", len(datasets[name])) - - # Start the LIT server. See server_flags.py for server options. - lit_demo = dev_server.Server(models, datasets, **server_flags.get_flags()) - return lit_demo.serve() - - -if __name__ == "__main__": - app.run(main) diff --git a/website/sphinx_src/demos.md b/website/sphinx_src/demos.md index c919790d..336d2b43 100644 --- a/website/sphinx_src/demos.md +++ b/website/sphinx_src/demos.md @@ -43,19 +43,6 @@ https://pair-code.github.io/lit/demos/. 
 Tip: check out a case study for this demo on the public LIT website:
 https://pair-code.github.io/lit/tutorials/sentiment
 
-### Multilingual (XNLI)
-
-**Code:** [examples/xnli_demo.py](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/xnli_demo.py)
-
-*   [XNLI](https://cims.nyu.edu/~sbowman/xnli/) dataset translates a subset of
-    MultiNLI into 14 different languages.
-*   Specify `--languages=en,jp,hi,...` flag to select which languages to load.
-*   NLI as a three-way classification task with two-segment input (premise,
-    hypothesis).
-*   Fine-tuned multilingual BERT model.
-*   Salience methods work with non-whitespace-delimited text, by using the
-    model's wordpiece tokenization.
-
 --------------------------------------------------------------------------------
 
 ## Regression / Scoring
diff --git a/website/sphinx_src/faq.md b/website/sphinx_src/faq.md
index 9ff5b3d5..fc70e29c 100644
--- a/website/sphinx_src/faq.md
+++ b/website/sphinx_src/faq.md
@@ -34,9 +34,6 @@ All strings in LIT are unicode and most components use
 model-provided tokenization if available, so in most cases non-English
 languages and non-Latin scripts should work without any modifications. For
 examples, see:
 
-*   [XNLI demo](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/xnli_demo.py) -
-    cross-lingual NLI, with up to 15 languages supported via a multilingual BERT
-    model.
 *   [T5 demo](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/t5_demo.py) -
     includes WMT data for machine translation
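
For anyone who depended on the removed XNLIData class: the underlying data is
still available from TFDS. Below is a minimal sketch of the same loading
logic, reconstructed from the deleted code above. The helper name load_xnli is
hypothetical (it is not a LIT or TFDS API), and the "xnli" feature layout it
assumes (premise keyed by language code, hypothesis stored as parallel
language/translation arrays) should be verified against the installed
tensorflow_datasets version.

    import tensorflow_datasets as tfds

    LABELS = ["entailment", "neutral", "contradiction"]

    def load_xnli(split="validation", languages=("en", "es", "hi", "zh")):
      """Yields one (premise, hypothesis) example per requested language."""
      for ex in tfds.as_numpy(tfds.load("xnli", split=split, download=True,
                                        try_gcs=True)):
        # Hypotheses are stored as parallel arrays; build language -> text map.
        hyp_map = {
            lang.decode("utf-8"): hyp.decode("utf-8")
            for lang, hyp in zip(ex["hypothesis"]["language"],
                                 ex["hypothesis"]["translation"])
        }
        for lang in languages:
          if lang not in hyp_map:
            continue  # Skip languages missing from this example.
          yield {
              "premise": ex["premise"][lang].decode("utf-8"),
              "hypothesis": hyp_map[lang],
              "label": LABELS[ex["label"]],
              "language": lang,
          }

Per the removed demo docstring, each XNLI language is roughly 2.5k validation
examples, so list(load_xnli()) with the default four languages materializes
about 10k dicts.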