From 7437f09ba608bfce34b10f7aa5ff9098923228ca Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Fri, 4 Mar 2022 20:34:34 +0300 Subject: [PATCH 1/5] WIP: Adding TER. Multi pred & multi ref has not been completed yet. --- jury/metrics/ter/__init__.py | 1 + jury/metrics/ter/ter.py | 8 + .../ter/ter_for_language_generation.py | 208 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 jury/metrics/ter/__init__.py create mode 100644 jury/metrics/ter/ter.py create mode 100644 jury/metrics/ter/ter_for_language_generation.py diff --git a/jury/metrics/ter/__init__.py b/jury/metrics/ter/__init__.py new file mode 100644 index 0000000..e03f17b --- /dev/null +++ b/jury/metrics/ter/__init__.py @@ -0,0 +1 @@ +from jury.metrics.ter.ter import TER diff --git a/jury/metrics/ter/ter.py b/jury/metrics/ter/ter.py new file mode 100644 index 0000000..24f4be9 --- /dev/null +++ b/jury/metrics/ter/ter.py @@ -0,0 +1,8 @@ +from jury.metrics._core import MetricAlias +from jury.metrics.ter.ter_for_language_generation import TERForLanguageGeneration + +__main_class__ = "TER" + + +class TER(MetricAlias): + _SUBCLASS = TERForLanguageGeneration diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py new file mode 100644 index 0000000..70f12f2 --- /dev/null +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -0,0 +1,208 @@ +# coding=utf-8 +# Copyright 2020 Open Business Software Solutions, The HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Word Error Ratio (WER) metric. The part of this file is adapted from HuggingFace's +datasets package implementation of CER metric. 
See +https://github.com/huggingface/datasets/blob/master/metrics/wer/wer.py +""" +import warnings +from typing import Callable, List, Tuple, Union + +import datasets + +from jury.metrics import LanguageGenerationInstance, MetricForLanguageGeneration +from jury.metrics._core.utils import PackagePlaceholder, requirement_message + +# `import sacrebleu as scb` placeholder +scb = PackagePlaceholder(version="2.0.0") + + +_CITATION = """\ +@inproceedings{snover-etal-2006-study, + title = "A Study of Translation Edit Rate with Targeted Human Annotation", + author = "Snover, Matthew and + Dorr, Bonnie and + Schwartz, Rich and + Micciulla, Linnea and + Makhoul, John", + booktitle = "Proceedings of the 7th Conference of the Association for Machine Translation in the Americas: Technical Papers", + month = aug # " 8-12", + year = "2006", + address = "Cambridge, Massachusetts, USA", + publisher = "Association for Machine Translation in the Americas", + url = "https://aclanthology.org/2006.amta-papers.25", + pages = "223--231", +} +@inproceedings{post-2018-call, + title = "A Call for Clarity in Reporting {BLEU} Scores", + author = "Post, Matt", + booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", + month = oct, + year = "2018", + address = "Belgium, Brussels", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W18-6319", + pages = "186--191", +} +""" + +_DESCRIPTION = """\ +TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a +hypothesis requires to match a reference translation. We use the implementation that is already present in sacrebleu +(https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the TERCOM implementation, which can be found +here: https://github.com/jhclark/tercom. +The implementation here is slightly different from sacrebleu in terms of the required input format. The length of +the references and hypotheses lists need to be the same, so you may need to transpose your references compared to +sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534 +See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information. +""" + +_KWARGS_DESCRIPTION = """ +Produces TER scores alongside the number of edits and reference length. +Args: + predictions: The system stream (a sequence of segments). + references: A list of one or more reference streams (each a sequence of segments). + normalized: Whether to apply basic tokenization to sentences. + no_punct: Whether to remove punctuations from sentences. + asian_support: Whether to support Asian character processing. + case_sensitive: Whether to disable lowercasing. +Returns: + 'score': TER score (num_edits / sum_ref_lengths * 100), + 'num_edits': The cumulative number of edits, + 'ref_length': The cumulative average reference length. 
+Examples: + >>> predictions = ["hello there general kenobi", "foo bar foobar"] + >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] + >>> ter = jury.load_metric("ter") + >>> results = ter.compute(predictions=predictions, references=references) + >>> print(results) + {'score': 0.0, 'num_edits': 0, 'ref_length': 6.5} +""" + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class TERForLanguageGeneration(MetricForLanguageGeneration): + def _info(self): + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + homepage="http://www.cs.umd.edu/~snover/tercom/", + inputs_description=_KWARGS_DESCRIPTION, + features=self._default_features, + codebase_urls=["https://github.com/mjpost/sacreBLEU#ter"], + reference_urls=[ + "https://github.com/jhclark/tercom", + ], + ) + + def _download_and_prepare(self, dl_manager): + global scb + global TERScorer + + try: + import sacrebleu as scb + from sacrebleu import TER as TERScorer + except ModuleNotFoundError: + raise ModuleNotFoundError(requirement_message(path="WER", package_name="jiwer")) + else: + super(TERForLanguageGeneration, self)._download_and_prepare(dl_manager) + + def _compute_ter_score(self, predictions: LanguageGenerationInstance, references: LanguageGenerationInstance, **kwargs): + sb_ter = TERScorer(**kwargs) + output = sb_ter.corpus_score(predictions, references) + return {"score": output.score, "num_edits": output.num_edits, "ref_length": output.ref_length} + + def _compute_single_pred_single_ref( + self, + predictions: LanguageGenerationInstance, + references: LanguageGenerationInstance, + reduce_fn: Callable = None, + normalized: bool = False, + no_punct: bool = False, + asian_support: bool = False, + case_sensitive: bool = False, + ): + transformed_references = [[r] for r in references] + return self._compute_ter_score( + predictions=predictions, + references=transformed_references, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive + ) + + def _compute_single_pred_multi_ref( + self, + predictions: LanguageGenerationInstance, + references: LanguageGenerationInstance, + reduce_fn: Callable = None, + normalized: bool = False, + no_punct: bool = False, + asian_support: bool = False, + case_sensitive: bool = False, + ): + references_per_prediction = len(references[0]) + if any(len(refs) != references_per_prediction for refs in references): + raise ValueError("Sacrebleu requires the same number of references for each prediction") + transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] + # SacreBleu inherently supports multiple references. 
+ return self._compute_ter_score( + predictions=predictions, + references=transformed_references, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive + ) + + def _compute_multi_pred_multi_ref( + self, + predictions: LanguageGenerationInstance, + references: LanguageGenerationInstance, + reduce_fn: Callable = None, + normalized: bool = False, + no_punct: bool = False, + asian_support: bool = False, + case_sensitive: bool = False, + ): + scores = [] + for preds, refs in zip(predictions, references): + pred_scores = [] + for pred in preds: + score = self._compute_single_pred_multi_ref( + predictions=[pred], + references=[refs], + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive + ) + pred_scores.append(score["score"]) + pred_score = reduce_fn(pred_scores) + scores.append(pred_score) + return {"score": sum(scores) / len(scores)} + + +if __name__ == "__main__": + import json + + predictions = [["hello there general kenobi", "hi there"], ["foo bar foobar"]] + # references = ["hello there general kenobi", "foo bar foobar"] + references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] + + ter = TERForLanguageGeneration() + res = ter.compute(predictions=predictions, references=references) + + print(json.dumps(res, indent=2)) From 3e04d8a4f7d2eae92bf1b01f762152b52acee115 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 9 Mar 2022 20:37:16 +0300 Subject: [PATCH 2/5] TER metric added. --- jury/metrics/__init__.py | 1 + .../ter/ter_for_language_generation.py | 107 +++++++++++------- tests/jury/metrics/test_ter.py | 47 ++++++++ .../expected_outputs/metrics/test_ter.json | 29 +++++ 4 files changed, 140 insertions(+), 44 deletions(-) create mode 100644 tests/jury/metrics/test_ter.py create mode 100644 tests/test_data/expected_outputs/metrics/test_ter.json diff --git a/jury/metrics/__init__.py b/jury/metrics/__init__.py index 853ff97..258dc82 100644 --- a/jury/metrics/__init__.py +++ b/jury/metrics/__init__.py @@ -26,4 +26,5 @@ from jury.metrics.rouge import Rouge from jury.metrics.sacrebleu import Sacrebleu from jury.metrics.squad import Squad +from jury.metrics.ter import TER from jury.metrics.wer import WER diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py index 70f12f2..0355c00 100644 --- a/jury/metrics/ter/ter_for_language_generation.py +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -16,11 +16,11 @@ datasets package implementation of CER metric. 
See https://github.com/huggingface/datasets/blob/master/metrics/wer/wer.py """ -import warnings -from typing import Callable, List, Tuple, Union +from typing import Callable, Dict, Sequence import datasets +from jury.collator import Collator from jury.metrics import LanguageGenerationInstance, MetricForLanguageGeneration from jury.metrics._core.utils import PackagePlaceholder, requirement_message @@ -118,10 +118,20 @@ def _download_and_prepare(self, dl_manager): else: super(TERForLanguageGeneration, self)._download_and_prepare(dl_manager) - def _compute_ter_score(self, predictions: LanguageGenerationInstance, references: LanguageGenerationInstance, **kwargs): + def _validate_references(self, references: Collator) -> None: + references_per_prediction = len(references[0]) + if any(len(refs) != references_per_prediction for refs in references): + raise ValueError("Sacrebleu requires the same number of references for each prediction") + + def _compute_ter_score( + self, predictions: Sequence[str], references: Sequence[Sequence[str]], sentence_level: bool = False, **kwargs + ): sb_ter = TERScorer(**kwargs) - output = sb_ter.corpus_score(predictions, references) - return {"score": output.score, "num_edits": output.num_edits, "ref_length": output.ref_length} + if sentence_level: + output = sb_ter.sentence_score(predictions, references) + else: + output = sb_ter.corpus_score(predictions, references) + return {"score": float(output.score / 100), "num_edits": output.num_edits, "ref_length": output.ref_length} def _compute_single_pred_single_ref( self, @@ -133,14 +143,13 @@ def _compute_single_pred_single_ref( asian_support: bool = False, case_sensitive: bool = False, ): - transformed_references = [[r] for r in references] return self._compute_ter_score( - predictions=predictions, - references=transformed_references, - normalized=normalized, - no_punct=no_punct, - asian_support=asian_support, - case_sensitive=case_sensitive + predictions=predictions, + references=references, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive, ) def _compute_single_pred_multi_ref( @@ -153,18 +162,14 @@ def _compute_single_pred_multi_ref( asian_support: bool = False, case_sensitive: bool = False, ): - references_per_prediction = len(references[0]) - if any(len(refs) != references_per_prediction for refs in references): - raise ValueError("Sacrebleu requires the same number of references for each prediction") - transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] # SacreBleu inherently supports multiple references. 
return self._compute_ter_score( - predictions=predictions, - references=transformed_references, - normalized=normalized, - no_punct=no_punct, - asian_support=asian_support, - case_sensitive=case_sensitive + predictions=predictions, + references=references, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive, ) def _compute_multi_pred_multi_ref( @@ -178,31 +183,45 @@ def _compute_multi_pred_multi_ref( case_sensitive: bool = False, ): scores = [] + avg_num_edits = 0 + avg_ref_length = 0 for preds, refs in zip(predictions, references): pred_scores = [] + num_edits = [] + ref_lengths = [] for pred in preds: - score = self._compute_single_pred_multi_ref( - predictions=[pred], - references=[refs], - normalized=normalized, - no_punct=no_punct, - asian_support=asian_support, - case_sensitive=case_sensitive + score = self._compute_ter_score( + predictions=pred, + references=refs, + sentence_level=True, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive, ) pred_scores.append(score["score"]) - pred_score = reduce_fn(pred_scores) + num_edits.append(score["num_edits"]) + ref_lengths.append(score["ref_length"]) + pred_score = reduce_fn(pred_scores).item() + avg_num_edits += sum(num_edits) / len(num_edits) + avg_ref_length += sum(ref_lengths) / len(ref_lengths) scores.append(pred_score) - return {"score": sum(scores) / len(scores)} - - -if __name__ == "__main__": - import json - - predictions = [["hello there general kenobi", "hi there"], ["foo bar foobar"]] - # references = ["hello there general kenobi", "foo bar foobar"] - references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] - - ter = TERForLanguageGeneration() - res = ter.compute(predictions=predictions, references=references) - - print(json.dumps(res, indent=2)) + return { + "score": sum(scores) / len(scores), + "avg_num_edits": avg_num_edits / len(predictions), + "avg_ref_length": avg_ref_length / len(predictions), + } + + def evaluate( + self, predictions: Collator, references: Collator, reduce_fn: Callable = None, **kwargs + ) -> Dict[str, float]: + if predictions.can_collapse() and references.can_collapse(): + predictions = predictions.collapse() + eval_fn = self._compute_single_pred_single_ref + elif predictions.can_collapse() and not references.can_collapse(): + predictions = predictions.collapse() + eval_fn = self._compute_single_pred_multi_ref + else: + eval_fn = self._compute_multi_pred_multi_ref + self._validate_references(references) + return eval_fn(predictions=predictions, references=references, reduce_fn=reduce_fn, **kwargs) diff --git a/tests/jury/metrics/test_ter.py b/tests/jury/metrics/test_ter.py new file mode 100644 index 0000000..3542ac4 --- /dev/null +++ b/tests/jury/metrics/test_ter.py @@ -0,0 +1,47 @@ +import pytest + +from jury import Jury +from jury.metrics import AutoMetric +from tests.jury.conftest import get_expected_output +from tests.utils import assert_almost_equal_dict + + +@pytest.fixture(scope="module") +def jury_ter(): + metric = AutoMetric.load("ter", compute_kwargs={"normalized": True}) + return Jury(metrics=metric) + + +@pytest.fixture +@get_expected_output(prefix="metrics") +def output_basic(): + return output_basic.output + + +@pytest.fixture +@get_expected_output(prefix="metrics") +def output_multiple_ref(): + return output_multiple_ref.output + + +@pytest.fixture +@get_expected_output(prefix="metrics") +def output_multiple_pred_multiple_ref(): + 
return output_multiple_pred_multiple_ref.output + + +def test_basic(predictions, references, jury_ter, output_basic): + scores = jury_ter(predictions=predictions, references=references) + assert_almost_equal_dict(actual=scores, desired=output_basic) + + +def test_multiple_ref(predictions, multiple_references, jury_ter, output_multiple_ref): + scores = jury_ter(predictions=predictions, references=multiple_references) + assert_almost_equal_dict(actual=scores, desired=output_multiple_ref) + + +def test_multiple_pred_multiple_ref( + multiple_predictions, multiple_references, jury_ter, output_multiple_pred_multiple_ref +): + scores = jury_ter(predictions=multiple_predictions, references=multiple_references) + assert_almost_equal_dict(actual=scores, desired=output_multiple_pred_multiple_ref) diff --git a/tests/test_data/expected_outputs/metrics/test_ter.json b/tests/test_data/expected_outputs/metrics/test_ter.json new file mode 100644 index 0000000..d4a2e16 --- /dev/null +++ b/tests/test_data/expected_outputs/metrics/test_ter.json @@ -0,0 +1,29 @@ +{ + "basic": { + "total_items": 2, + "empty_items": 0, + "ter": { + "score": 0.4615384615384615, + "num_edits": 3, + "ref_length": 6.5 + } + }, + "multiple_ref": { + "total_items": 2, + "empty_items": 0, + "ter": { + "score": 0.6153846153846154, + "num_edits": 8, + "ref_length": 13.0 + } + }, + "multiple_pred_multiple_ref": { + "total_items": 2, + "empty_items": 0, + "ter": { + "score": 0.812121212121212, + "avg_num_edits": 3.25, + "avg_ref_length": 6.5 + } + } +} \ No newline at end of file From ef4e979dc126789627a496ded2c2067937f34caf Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Thu, 10 Mar 2022 12:04:03 +0300 Subject: [PATCH 3/5] Typo fixed in docstring. --- jury/metrics/ter/ter_for_language_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py index 0355c00..77a9fe7 100644 --- a/jury/metrics/ter/ter_for_language_generation.py +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -12,9 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Word Error Ratio (WER) metric. The part of this file is adapted from HuggingFace's -datasets package implementation of CER metric. See -https://github.com/huggingface/datasets/blob/master/metrics/wer/wer.py +""" Translation Edit Rate(TER) metric. The part of this file is adapted from HuggingFace's +datasets package implementation of TER metric. See +https://github.com/huggingface/datasets/blob/master/metrics/ter/ter.py """ from typing import Callable, Dict, Sequence From e1841841210d08adf179f838338d6ac30df4fe13 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Thu, 10 Mar 2022 13:28:09 +0300 Subject: [PATCH 4/5] Typo fixed in docstring. --- jury/metrics/ter/ter_for_language_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py index 77a9fe7..f7010fc 100644 --- a/jury/metrics/ter/ter_for_language_generation.py +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Translation Edit Rate(TER) metric. 
The part of this file is adapted from HuggingFace's +""" Translation Edit Rate (TER) metric. The part of this file is adapted from HuggingFace's datasets package implementation of TER metric. See https://github.com/huggingface/datasets/blob/master/metrics/ter/ter.py """ @@ -78,11 +78,11 @@ asian_support: Whether to support Asian character processing. case_sensitive: Whether to disable lowercasing. Returns: - 'score': TER score (num_edits / sum_ref_lengths * 100), + 'score': TER score (num_edits / sum_ref_lengths), 'num_edits': The cumulative number of edits, 'ref_length': The cumulative average reference length. Examples: - >>> predictions = ["hello there general kenobi", "foo bar foobar"] + >>> predictions = [["hello there general kenobi", "foo bar foobar"]] >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] >>> ter = jury.load_metric("ter") >>> results = ter.compute(predictions=predictions, references=references) From af01cfbb2265cdcac282250c4b3e7e3fd4fc12d9 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Tue, 15 Mar 2022 12:52:08 +0300 Subject: [PATCH 5/5] Docstring example corrected. - requirement_message corrected. --- jury/metrics/ter/ter_for_language_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py index f7010fc..1dc7986 100644 --- a/jury/metrics/ter/ter_for_language_generation.py +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -82,7 +82,7 @@ 'num_edits': The cumulative number of edits, 'ref_length': The cumulative average reference length. Examples: - >>> predictions = [["hello there general kenobi", "foo bar foobar"]] + >>> predictions = [["hello there general kenobi"], ["foo bar foobar"]] >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] >>> ter = jury.load_metric("ter") >>> results = ter.compute(predictions=predictions, references=references) @@ -114,7 +114,7 @@ def _download_and_prepare(self, dl_manager): import sacrebleu as scb from sacrebleu import TER as TERScorer except ModuleNotFoundError: - raise ModuleNotFoundError(requirement_message(path="WER", package_name="jiwer")) + raise ModuleNotFoundError(requirement_message(path="TER", package_name="sacrebleu")) else: super(TERForLanguageGeneration, self)._download_and_prepare(dl_manager)
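
For orientation, here is a minimal usage sketch of the TER metric introduced by this patch series. It follows the API exercised in tests/jury/metrics/test_ter.py (AutoMetric.load plus the Jury scorer) and reuses the input shapes from the metric docstring and the patch-1 __main__ block; the compute_kwargs mirror the test fixture and are optional, and the exact score values (and surrounding keys such as total_items / empty_items) depend on the installed jury and sacrebleu versions, so treat the printed results as illustrative rather than exact.

    from jury import Jury
    from jury.metrics import AutoMetric

    # Load TER via jury's AutoMetric factory; compute_kwargs are forwarded to
    # sacrebleu's TER scorer (normalized, no_punct, asian_support, case_sensitive).
    metric = AutoMetric.load("ter", compute_kwargs={"normalized": True})
    scorer = Jury(metrics=metric)

    # One candidate per item, multiple references per item
    # (handled by _compute_single_pred_multi_ref).
    predictions = [["hello there general kenobi"], ["foo bar foobar"]]
    references = [
        ["hello there general kenobi", "hello there !"],
        ["foo bar foobar", "foo bar foobar"],
    ]
    print(scorer(predictions=predictions, references=references))

    # Several candidates per item (handled by _compute_multi_pred_multi_ref);
    # per-candidate sentence-level scores are combined with reduce_fn and then
    # averaged over items, as in the patch-2 implementation.
    multiple_predictions = [["hello there general kenobi", "hi there"], ["foo bar foobar"]]
    print(scorer(predictions=multiple_predictions, references=references))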