Remove the xnli demos from the LIT examples.
PiperOrigin-RevId: 640597959
llcourage authored and LIT team committed Jun 5, 2024
1 parent c2fb41b commit dd196e9
Showing 4 changed files with 0 additions and 249 deletions.
130 changes: 0 additions & 130 deletions lit_nlp/examples/datasets/classification.py
@@ -1,10 +1,8 @@
"""Text classification datasets, including single- and two-sentence tasks."""
from typing import Optional

from absl import logging
from lit_nlp.api import dataset as lit_dataset
from lit_nlp.api import types as lit_types
import pandas as pd
import tensorflow_datasets as tfds


@@ -17,134 +15,6 @@ def load_tfds(*args, **kw):
      tfds.as_numpy(tfds.load(*args, download=True, try_gcs=True, **kw)))


class MNLIDataFromTSV(lit_dataset.Dataset):
  """MultiNLI dataset, from TSV.

  Compared to the TFDS version, this includes:
  - label2 field for binary labels, with same schema as HANS
  - genre labels, for stratified analysis

  The downside is that you need to download the data from
  https://gluebenchmark.com/tasks, and provide a path to the .tsv file.
  """

  LABELS3 = ["entailment", "neutral", "contradiction"]
  LABELS2 = ["non-entailment", "entailment"]

  def binarize_label(self, label):
    return "entailment" if label == "entailment" else "non-entailment"

  def __init__(self, path: str):
    self._examples = self.load_datapoints(path)

  def load_datapoints(self, path: str):
    with open(path) as fd:
      df = pd.read_csv(fd, sep="\t")
    # pylint: disable=g-complex-comprehension
    return [{
        "premise": row["sentence1"],
        "hypothesis": row["sentence2"],
        "label": row["gold_label"],
        "label2": self.binarize_label(row["gold_label"]),
        "genre": row["genre"],
    } for _, row in df.iterrows()]
    # pylint: enable=g-complex-comprehension

  def load(self, path: str):
    datapoints = self.load_datapoints(path)
    return lit_dataset.Dataset(base=self, examples=datapoints)

  def save(self, examples: list[lit_types.IndexedInput], path: str):
    example_data = [ex["data"] for ex in examples]
    df = pd.DataFrame(example_data).rename(columns={
        "premise": "sentence1",
        "hypothesis": "sentence2",
        "label": "gold_label",
    })
    with open(path, "w") as fd:
      df.to_csv(fd, sep="\t")

  def spec(self) -> lit_types.Spec:
    """Should match MnliModel's input_spec()."""
    return {
        "premise": lit_types.TextSegment(),
        "hypothesis": lit_types.TextSegment(),
        # 'label' for 3-way NLI labels, 'label2' for binarized.
        "label": lit_types.CategoryLabel(vocab=self.LABELS3),
        "label2": lit_types.CategoryLabel(vocab=self.LABELS2),
        "genre": lit_types.CategoryLabel(),
    }
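# Usage sketch (illustrative; not part of the original file). Assumes a
# GLUE-format MultiNLI .tsv downloaded from https://gluebenchmark.com/tasks;
# the path below is hypothetical.
#
#   mnli = MNLIDataFromTSV("/path/to/MNLI/dev_matched.tsv")
#   print(len(mnli.examples))
#   print(mnli.examples[0]["label"], mnli.examples[0]["label2"])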


class XNLIData(lit_dataset.Dataset):
  """Cross-lingual NLI; see https://cims.nyu.edu/~sbowman/xnli/."""

  LABELS = ["entailment", "neutral", "contradiction"]

  def _process_example(self, ex, languages: list[str]):
    # Hypothesis is stored as parallel arrays, so make a map.
    hyp_map = {
        lang.decode("utf-8"): hyp.decode("utf-8") for lang, hyp in zip(
            ex["hypothesis"]["language"], ex["hypothesis"]["translation"])
    }
    for lang in languages:
      if lang not in hyp_map:
        logging.warning("Missing hypothesis (lang=%s) for premise '%s'", lang,
                        ex["premise"][lang].decode("utf-8"))
        continue
      yield {
          "premise": ex["premise"][lang].decode("utf-8"),
          "hypothesis": hyp_map[lang],
          "label": self.LABELS[ex["label"]],
          "language": lang,
      }

  def __init__(self, split: str, languages=("en", "es", "hi", "zh")):
    self._examples = []
    for ex in load_tfds("xnli", split=split):
      # Each TFDS example contains all the translations; we unpack to
      # individual (premise, hypothesis) pairs that are compatible with a
      # standard NLI model.
      self._examples.extend(self._process_example(ex, languages))

  def spec(self):
    return {
        "premise": lit_types.TextSegment(),
        "hypothesis": lit_types.TextSegment(),
        "label": lit_types.CategoryLabel(vocab=self.LABELS),
        "language": lit_types.CategoryLabel(),
    }
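# Usage sketch (illustrative; not part of the original file). Loads the XNLI
# validation split from TFDS for two of the default languages:
#
#   xnli = XNLIData(split="validation", languages=("en", "hi"))
#   ex = xnli.examples[0]
#   print(ex["language"], ex["premise"], "->", ex["label"])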


class HansNLIData(lit_dataset.Dataset):
  """HANS NLI challenge set (https://arxiv.org/abs/1902.01007); 30k examples."""

  LABELS = ["non-entailment", "entailment"]

  def __init__(self, path: str):
    with open(path) as fd:
      df = pd.read_csv(fd, sep="\t", header=0)
    # pylint: disable=g-complex-comprehension
    self._examples = [{
        "premise": row["sentence1"],
        "hypothesis": row["sentence2"],
        "label2": row["gold_label"],
        "heuristic": row["heuristic"],
        "template": row["template"],
    } for _, row in df.iterrows()]
    # pylint: enable=g-complex-comprehension

  def spec(self) -> lit_types.Spec:
    return {
        "premise": lit_types.TextSegment(),
        "hypothesis": lit_types.TextSegment(),
        # 'label2' for 2-way NLI labels
        "label2": lit_types.CategoryLabel(vocab=self.LABELS),
        "heuristic": lit_types.CategoryLabel(),
        "template": lit_types.CategoryLabel(),
    }
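# Usage sketch (illustrative; not part of the original file). HANS ships as a
# tab-separated evaluation file; the filename below is hypothetical.
#
#   hans = HansNLIData("/path/to/heuristics_evaluation_set.txt")
#   ex = hans.examples[0]
#   print(ex["heuristic"], ex["template"], ex["label2"])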


class IMDBData(lit_dataset.Dataset):
"""IMDB reviews dataset; see http://ai.stanford.edu/~amaas/data/sentiment/."""

103 changes: 0 additions & 103 deletions lit_nlp/examples/xnli_demo.py

This file was deleted.

13 changes: 0 additions & 13 deletions website/sphinx_src/demos.md
@@ -43,19 +43,6 @@ https://pair-code.github.io/lit/demos/.
Tip: check out a case study for this demo on the public LIT website:
https://pair-code.github.io/lit/tutorials/sentiment

### Multilingual (XNLI) <!-- DO NOT REMOVE {#xnli .demo-header} -->

**Code:** [examples/xnli_demo.py](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/xnli_demo.py)

*   The [XNLI](https://cims.nyu.edu/~sbowman/xnli/) dataset translates a subset
    of MultiNLI into 14 different languages.
*   Specify the `--languages=en,es,hi,...` flag to select which languages to
    load (see the example invocation below).
* NLI as a three-way classification task with two-segment input (premise,
hypothesis).
* Fine-tuned multilingual BERT model.
* Salience methods work with non-whitespace-delimited text, by using the
model's wordpiece tokenization.
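
For reference, a sketch of how this demo was typically launched before its
removal (the module path matches the deleted file; the language list and port
value are illustrative):

```sh
python -m lit_nlp.examples.xnli_demo --languages=en,es,hi --port=5432
```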

--------------------------------------------------------------------------------

## Regression / Scoring <!-- DO NOT REMOVE {#regression-scoring .demo-section-header} -->
3 changes: 0 additions & 3 deletions website/sphinx_src/faq.md
@@ -34,9 +34,6 @@ All strings in LIT are unicode and most components use model-provided
tokenization if available, so in most cases non-English languages and non-Latin
scripts should work without any modifications. For examples, see:

* [XNLI demo](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/xnli_demo.py) -
cross-lingual NLI, with up to 15 languages supported via a multilingual BERT
model.
* [T5 demo](https://github.com/PAIR-code/lit/blob/main/lit_nlp/examples/t5_demo.py) -
includes WMT data for machine translation

