From 7437f09ba608bfce34b10f7aa5ff9098923228ca Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Fri, 4 Mar 2022 20:34:34 +0300 Subject: [PATCH 1/5] WIP: Adding TER. Multi pred & multi ref has not been completed yet. --- jury/metrics/ter/__init__.py | 1 + jury/metrics/ter/ter.py | 8 + .../ter/ter_for_language_generation.py | 208 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 jury/metrics/ter/__init__.py create mode 100644 jury/metrics/ter/ter.py create mode 100644 jury/metrics/ter/ter_for_language_generation.py diff --git a/jury/metrics/ter/__init__.py b/jury/metrics/ter/__init__.py new file mode 100644 index 0000000..e03f17b --- /dev/null +++ b/jury/metrics/ter/__init__.py @@ -0,0 +1 @@ +from jury.metrics.ter.ter import TER diff --git a/jury/metrics/ter/ter.py b/jury/metrics/ter/ter.py new file mode 100644 index 0000000..24f4be9 --- /dev/null +++ b/jury/metrics/ter/ter.py @@ -0,0 +1,8 @@ +from jury.metrics._core import MetricAlias +from jury.metrics.ter.ter_for_language_generation import TERForLanguageGeneration + +__main_class__ = "TER" + + +class TER(MetricAlias): + _SUBCLASS = TERForLanguageGeneration diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py new file mode 100644 index 0000000..70f12f2 --- /dev/null +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -0,0 +1,208 @@ +# coding=utf-8 +# Copyright 2020 Open Business Software Solutions, The HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Word Error Ratio (WER) metric. The part of this file is adapted from HuggingFace's +datasets package implementation of CER metric. 
See +https://github.com/huggingface/datasets/blob/master/metrics/wer/wer.py +""" +import warnings +from typing import Callable, List, Tuple, Union + +import datasets + +from jury.metrics import LanguageGenerationInstance, MetricForLanguageGeneration +from jury.metrics._core.utils import PackagePlaceholder, requirement_message + +# `import sacrebleu as scb` placeholder +scb = PackagePlaceholder(version="2.0.0") + + +_CITATION = """\ +@inproceedings{snover-etal-2006-study, + title = "A Study of Translation Edit Rate with Targeted Human Annotation", + author = "Snover, Matthew and + Dorr, Bonnie and + Schwartz, Rich and + Micciulla, Linnea and + Makhoul, John", + booktitle = "Proceedings of the 7th Conference of the Association for Machine Translation in the Americas: Technical Papers", + month = aug # " 8-12", + year = "2006", + address = "Cambridge, Massachusetts, USA", + publisher = "Association for Machine Translation in the Americas", + url = "https://aclanthology.org/2006.amta-papers.25", + pages = "223--231", +} +@inproceedings{post-2018-call, + title = "A Call for Clarity in Reporting {BLEU} Scores", + author = "Post, Matt", + booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers", + month = oct, + year = "2018", + address = "Belgium, Brussels", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/W18-6319", + pages = "186--191", +} +""" + +_DESCRIPTION = """\ +TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a +hypothesis requires to match a reference translation. We use the implementation that is already present in sacrebleu +(https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the TERCOM implementation, which can be found +here: https://github.com/jhclark/tercom. +The implementation here is slightly different from sacrebleu in terms of the required input format. The length of +the references and hypotheses lists need to be the same, so you may need to transpose your references compared to +sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534 +See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information. +""" + +_KWARGS_DESCRIPTION = """ +Produces TER scores alongside the number of edits and reference length. +Args: + predictions: The system stream (a sequence of segments). + references: A list of one or more reference streams (each a sequence of segments). + normalized: Whether to apply basic tokenization to sentences. + no_punct: Whether to remove punctuations from sentences. + asian_support: Whether to support Asian character processing. + case_sensitive: Whether to disable lowercasing. +Returns: + 'score': TER score (num_edits / sum_ref_lengths * 100), + 'num_edits': The cumulative number of edits, + 'ref_length': The cumulative average reference length. 
+Examples: + >>> predictions = ["hello there general kenobi", "foo bar foobar"] + >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] + >>> ter = jury.load_metric("ter") + >>> results = ter.compute(predictions=predictions, references=references) + >>> print(results) + {'score': 0.0, 'num_edits': 0, 'ref_length': 6.5} +""" + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class TERForLanguageGeneration(MetricForLanguageGeneration): + def _info(self): + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + homepage="http://www.cs.umd.edu/~snover/tercom/", + inputs_description=_KWARGS_DESCRIPTION, + features=self._default_features, + codebase_urls=["https://github.com/mjpost/sacreBLEU#ter"], + reference_urls=[ + "https://github.com/jhclark/tercom", + ], + ) + + def _download_and_prepare(self, dl_manager): + global scb + global TERScorer + + try: + import sacrebleu as scb + from sacrebleu import TER as TERScorer + except ModuleNotFoundError: + raise ModuleNotFoundError(requirement_message(path="WER", package_name="jiwer")) + else: + super(TERForLanguageGeneration, self)._download_and_prepare(dl_manager) + + def _compute_ter_score(self, predictions: LanguageGenerationInstance, references: LanguageGenerationInstance, **kwargs): + sb_ter = TERScorer(**kwargs) + output = sb_ter.corpus_score(predictions, references) + return {"score": output.score, "num_edits": output.num_edits, "ref_length": output.ref_length} + + def _compute_single_pred_single_ref( + self, + predictions: LanguageGenerationInstance, + references: LanguageGenerationInstance, + reduce_fn: Callable = None, + normalized: bool = False, + no_punct: bool = False, + asian_support: bool = False, + case_sensitive: bool = False, + ): + transformed_references = [[r] for r in references] + return self._compute_ter_score( + predictions=predictions, + references=transformed_references, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive + ) + + def _compute_single_pred_multi_ref( + self, + predictions: LanguageGenerationInstance, + references: LanguageGenerationInstance, + reduce_fn: Callable = None, + normalized: bool = False, + no_punct: bool = False, + asian_support: bool = False, + case_sensitive: bool = False, + ): + references_per_prediction = len(references[0]) + if any(len(refs) != references_per_prediction for refs in references): + raise ValueError("Sacrebleu requires the same number of references for each prediction") + transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] + # SacreBleu inherently supports multiple references. 
+ return self._compute_ter_score( + predictions=predictions, + references=transformed_references, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive + ) + + def _compute_multi_pred_multi_ref( + self, + predictions: LanguageGenerationInstance, + references: LanguageGenerationInstance, + reduce_fn: Callable = None, + normalized: bool = False, + no_punct: bool = False, + asian_support: bool = False, + case_sensitive: bool = False, + ): + scores = [] + for preds, refs in zip(predictions, references): + pred_scores = [] + for pred in preds: + score = self._compute_single_pred_multi_ref( + predictions=[pred], + references=[refs], + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive + ) + pred_scores.append(score["score"]) + pred_score = reduce_fn(pred_scores) + scores.append(pred_score) + return {"score": sum(scores) / len(scores)} + + +if __name__ == "__main__": + import json + + predictions = [["hello there general kenobi", "hi there"], ["foo bar foobar"]] + # references = ["hello there general kenobi", "foo bar foobar"] + references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] + + ter = TERForLanguageGeneration() + res = ter.compute(predictions=predictions, references=references) + + print(json.dumps(res, indent=2)) From 3e04d8a4f7d2eae92bf1b01f762152b52acee115 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Wed, 9 Mar 2022 20:37:16 +0300 Subject: [PATCH 2/5] TER metric added. --- jury/metrics/__init__.py | 1 + .../ter/ter_for_language_generation.py | 107 +++++++++++------- tests/jury/metrics/test_ter.py | 47 ++++++++ .../expected_outputs/metrics/test_ter.json | 29 +++++ 4 files changed, 140 insertions(+), 44 deletions(-) create mode 100644 tests/jury/metrics/test_ter.py create mode 100644 tests/test_data/expected_outputs/metrics/test_ter.json diff --git a/jury/metrics/__init__.py b/jury/metrics/__init__.py index 853ff97..258dc82 100644 --- a/jury/metrics/__init__.py +++ b/jury/metrics/__init__.py @@ -26,4 +26,5 @@ from jury.metrics.rouge import Rouge from jury.metrics.sacrebleu import Sacrebleu from jury.metrics.squad import Squad +from jury.metrics.ter import TER from jury.metrics.wer import WER diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py index 70f12f2..0355c00 100644 --- a/jury/metrics/ter/ter_for_language_generation.py +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -16,11 +16,11 @@ datasets package implementation of CER metric. 
See https://github.com/huggingface/datasets/blob/master/metrics/wer/wer.py """ -import warnings -from typing import Callable, List, Tuple, Union +from typing import Callable, Dict, Sequence import datasets +from jury.collator import Collator from jury.metrics import LanguageGenerationInstance, MetricForLanguageGeneration from jury.metrics._core.utils import PackagePlaceholder, requirement_message @@ -118,10 +118,20 @@ def _download_and_prepare(self, dl_manager): else: super(TERForLanguageGeneration, self)._download_and_prepare(dl_manager) - def _compute_ter_score(self, predictions: LanguageGenerationInstance, references: LanguageGenerationInstance, **kwargs): + def _validate_references(self, references: Collator) -> None: + references_per_prediction = len(references[0]) + if any(len(refs) != references_per_prediction for refs in references): + raise ValueError("Sacrebleu requires the same number of references for each prediction") + + def _compute_ter_score( + self, predictions: Sequence[str], references: Sequence[Sequence[str]], sentence_level: bool = False, **kwargs + ): sb_ter = TERScorer(**kwargs) - output = sb_ter.corpus_score(predictions, references) - return {"score": output.score, "num_edits": output.num_edits, "ref_length": output.ref_length} + if sentence_level: + output = sb_ter.sentence_score(predictions, references) + else: + output = sb_ter.corpus_score(predictions, references) + return {"score": float(output.score / 100), "num_edits": output.num_edits, "ref_length": output.ref_length} def _compute_single_pred_single_ref( self, @@ -133,14 +143,13 @@ def _compute_single_pred_single_ref( asian_support: bool = False, case_sensitive: bool = False, ): - transformed_references = [[r] for r in references] return self._compute_ter_score( - predictions=predictions, - references=transformed_references, - normalized=normalized, - no_punct=no_punct, - asian_support=asian_support, - case_sensitive=case_sensitive + predictions=predictions, + references=references, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive, ) def _compute_single_pred_multi_ref( @@ -153,18 +162,14 @@ def _compute_single_pred_multi_ref( asian_support: bool = False, case_sensitive: bool = False, ): - references_per_prediction = len(references[0]) - if any(len(refs) != references_per_prediction for refs in references): - raise ValueError("Sacrebleu requires the same number of references for each prediction") - transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)] # SacreBleu inherently supports multiple references. 
return self._compute_ter_score( - predictions=predictions, - references=transformed_references, - normalized=normalized, - no_punct=no_punct, - asian_support=asian_support, - case_sensitive=case_sensitive + predictions=predictions, + references=references, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive, ) def _compute_multi_pred_multi_ref( @@ -178,31 +183,45 @@ def _compute_multi_pred_multi_ref( case_sensitive: bool = False, ): scores = [] + avg_num_edits = 0 + avg_ref_length = 0 for preds, refs in zip(predictions, references): pred_scores = [] + num_edits = [] + ref_lengths = [] for pred in preds: - score = self._compute_single_pred_multi_ref( - predictions=[pred], - references=[refs], - normalized=normalized, - no_punct=no_punct, - asian_support=asian_support, - case_sensitive=case_sensitive + score = self._compute_ter_score( + predictions=pred, + references=refs, + sentence_level=True, + normalized=normalized, + no_punct=no_punct, + asian_support=asian_support, + case_sensitive=case_sensitive, ) pred_scores.append(score["score"]) - pred_score = reduce_fn(pred_scores) + num_edits.append(score["num_edits"]) + ref_lengths.append(score["ref_length"]) + pred_score = reduce_fn(pred_scores).item() + avg_num_edits += sum(num_edits) / len(num_edits) + avg_ref_length += sum(ref_lengths) / len(ref_lengths) scores.append(pred_score) - return {"score": sum(scores) / len(scores)} - - -if __name__ == "__main__": - import json - - predictions = [["hello there general kenobi", "hi there"], ["foo bar foobar"]] - # references = ["hello there general kenobi", "foo bar foobar"] - references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] - - ter = TERForLanguageGeneration() - res = ter.compute(predictions=predictions, references=references) - - print(json.dumps(res, indent=2)) + return { + "score": sum(scores) / len(scores), + "avg_num_edits": avg_num_edits / len(predictions), + "avg_ref_length": avg_ref_length / len(predictions), + } + + def evaluate( + self, predictions: Collator, references: Collator, reduce_fn: Callable = None, **kwargs + ) -> Dict[str, float]: + if predictions.can_collapse() and references.can_collapse(): + predictions = predictions.collapse() + eval_fn = self._compute_single_pred_single_ref + elif predictions.can_collapse() and not references.can_collapse(): + predictions = predictions.collapse() + eval_fn = self._compute_single_pred_multi_ref + else: + eval_fn = self._compute_multi_pred_multi_ref + self._validate_references(references) + return eval_fn(predictions=predictions, references=references, reduce_fn=reduce_fn, **kwargs) diff --git a/tests/jury/metrics/test_ter.py b/tests/jury/metrics/test_ter.py new file mode 100644 index 0000000..3542ac4 --- /dev/null +++ b/tests/jury/metrics/test_ter.py @@ -0,0 +1,47 @@ +import pytest + +from jury import Jury +from jury.metrics import AutoMetric +from tests.jury.conftest import get_expected_output +from tests.utils import assert_almost_equal_dict + + +@pytest.fixture(scope="module") +def jury_ter(): + metric = AutoMetric.load("ter", compute_kwargs={"normalized": True}) + return Jury(metrics=metric) + + +@pytest.fixture +@get_expected_output(prefix="metrics") +def output_basic(): + return output_basic.output + + +@pytest.fixture +@get_expected_output(prefix="metrics") +def output_multiple_ref(): + return output_multiple_ref.output + + +@pytest.fixture +@get_expected_output(prefix="metrics") +def output_multiple_pred_multiple_ref(): + 
return output_multiple_pred_multiple_ref.output + + +def test_basic(predictions, references, jury_ter, output_basic): + scores = jury_ter(predictions=predictions, references=references) + assert_almost_equal_dict(actual=scores, desired=output_basic) + + +def test_multiple_ref(predictions, multiple_references, jury_ter, output_multiple_ref): + scores = jury_ter(predictions=predictions, references=multiple_references) + assert_almost_equal_dict(actual=scores, desired=output_multiple_ref) + + +def test_multiple_pred_multiple_ref( + multiple_predictions, multiple_references, jury_ter, output_multiple_pred_multiple_ref +): + scores = jury_ter(predictions=multiple_predictions, references=multiple_references) + assert_almost_equal_dict(actual=scores, desired=output_multiple_pred_multiple_ref) diff --git a/tests/test_data/expected_outputs/metrics/test_ter.json b/tests/test_data/expected_outputs/metrics/test_ter.json new file mode 100644 index 0000000..d4a2e16 --- /dev/null +++ b/tests/test_data/expected_outputs/metrics/test_ter.json @@ -0,0 +1,29 @@ +{ + "basic": { + "total_items": 2, + "empty_items": 0, + "ter": { + "score": 0.4615384615384615, + "num_edits": 3, + "ref_length": 6.5 + } + }, + "multiple_ref": { + "total_items": 2, + "empty_items": 0, + "ter": { + "score": 0.6153846153846154, + "num_edits": 8, + "ref_length": 13.0 + } + }, + "multiple_pred_multiple_ref": { + "total_items": 2, + "empty_items": 0, + "ter": { + "score": 0.812121212121212, + "avg_num_edits": 3.25, + "avg_ref_length": 6.5 + } + } +} \ No newline at end of file From ef4e979dc126789627a496ded2c2067937f34caf Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Thu, 10 Mar 2022 12:04:03 +0300 Subject: [PATCH 3/5] Typo fixed in docstring. --- jury/metrics/ter/ter_for_language_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py index 0355c00..77a9fe7 100644 --- a/jury/metrics/ter/ter_for_language_generation.py +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -12,9 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Word Error Ratio (WER) metric. The part of this file is adapted from HuggingFace's -datasets package implementation of CER metric. See -https://github.com/huggingface/datasets/blob/master/metrics/wer/wer.py +""" Translation Edit Rate(TER) metric. The part of this file is adapted from HuggingFace's +datasets package implementation of TER metric. See +https://github.com/huggingface/datasets/blob/master/metrics/ter/ter.py """ from typing import Callable, Dict, Sequence From e1841841210d08adf179f838338d6ac30df4fe13 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Thu, 10 Mar 2022 13:28:09 +0300 Subject: [PATCH 4/5] Typo fixed in docstring. --- jury/metrics/ter/ter_for_language_generation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py index 77a9fe7..f7010fc 100644 --- a/jury/metrics/ter/ter_for_language_generation.py +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Translation Edit Rate(TER) metric. 
The part of this file is adapted from HuggingFace's +""" Translation Edit Rate (TER) metric. The part of this file is adapted from HuggingFace's datasets package implementation of TER metric. See https://github.com/huggingface/datasets/blob/master/metrics/ter/ter.py """ @@ -78,11 +78,11 @@ asian_support: Whether to support Asian character processing. case_sensitive: Whether to disable lowercasing. Returns: - 'score': TER score (num_edits / sum_ref_lengths * 100), + 'score': TER score (num_edits / sum_ref_lengths), 'num_edits': The cumulative number of edits, 'ref_length': The cumulative average reference length. Examples: - >>> predictions = ["hello there general kenobi", "foo bar foobar"] + >>> predictions = [["hello there general kenobi", "foo bar foobar"]] >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] >>> ter = jury.load_metric("ter") >>> results = ter.compute(predictions=predictions, references=references) From af01cfbb2265cdcac282250c4b3e7e3fd4fc12d9 Mon Sep 17 00:00:00 2001 From: devrimcavusoglu Date: Tue, 15 Mar 2022 12:52:08 +0300 Subject: [PATCH 5/5] Docstring example corrected. - requirement_message corrected. --- jury/metrics/ter/ter_for_language_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jury/metrics/ter/ter_for_language_generation.py b/jury/metrics/ter/ter_for_language_generation.py index f7010fc..1dc7986 100644 --- a/jury/metrics/ter/ter_for_language_generation.py +++ b/jury/metrics/ter/ter_for_language_generation.py @@ -82,7 +82,7 @@ 'num_edits': The cumulative number of edits, 'ref_length': The cumulative average reference length. Examples: - >>> predictions = [["hello there general kenobi", "foo bar foobar"]] + >>> predictions = [["hello there general kenobi"], ["foo bar foobar"]] >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]] >>> ter = jury.load_metric("ter") >>> results = ter.compute(predictions=predictions, references=references) @@ -114,7 +114,7 @@ def _download_and_prepare(self, dl_manager): import sacrebleu as scb from sacrebleu import TER as TERScorer except ModuleNotFoundError: - raise ModuleNotFoundError(requirement_message(path="WER", package_name="jiwer")) + raise ModuleNotFoundError(requirement_message(path="TER", package_name="sacrebleu")) else: super(TERForLanguageGeneration, self)._download_and_prepare(dl_manager)
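
For orientation, here is a minimal usage sketch of the TER metric introduced by this patch series. It follows the API exercised in tests/jury/metrics/test_ter.py (AutoMetric.load plus the Jury scorer) and reuses the input shapes from the metric docstring and the patch-1 __main__ block; the compute_kwargs mirror the test fixture and are optional, and the exact score values (and surrounding keys such as total_items / empty_items) depend on the installed jury and sacrebleu versions, so treat the printed results as illustrative rather than exact.

    from jury import Jury
    from jury.metrics import AutoMetric

    # Load TER via jury's AutoMetric factory; compute_kwargs are forwarded to
    # sacrebleu's TER scorer (normalized, no_punct, asian_support, case_sensitive).
    metric = AutoMetric.load("ter", compute_kwargs={"normalized": True})
    scorer = Jury(metrics=metric)

    # One candidate per item, multiple references per item
    # (handled by _compute_single_pred_multi_ref).
    predictions = [["hello there general kenobi"], ["foo bar foobar"]]
    references = [
        ["hello there general kenobi", "hello there !"],
        ["foo bar foobar", "foo bar foobar"],
    ]
    print(scorer(predictions=predictions, references=references))

    # Several candidates per item (handled by _compute_multi_pred_multi_ref);
    # per-candidate sentence-level scores are combined with reduce_fn and then
    # averaged over items, as in the patch-2 implementation.
    multiple_predictions = [["hello there general kenobi", "hi there"], ["foo bar foobar"]]
    print(scorer(predictions=multiple_predictions, references=references))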