Skip to content

Commit

Permalink
machinery: Add DeepL support for glossaries
Browse files Browse the repository at this point in the history
- Add generic mixin to handle service side glossaries
- Implement DeepL integration

Fixes #7086
Fixes #10468
  • Loading branch information
nijel committed Nov 30, 2023
1 parent a4637ab commit 161ee89
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 6 deletions.
2 changes: 2 additions & 0 deletions docs/admin/machine.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ Replace the XXX with your auth_key. If you receive a JSON object which contains
Weblate supports DeepL formality; it chooses the matching one based on the
language (for example, there is ``de@formal`` and ``de@informal``).

The service automatically uses :ref:`glossary`, see :ref:`glossary-mt`.

.. seealso::

`DeepL translator <https://www.deepl.com/translator>`_,
Expand Down
2 changes: 1 addition & 1 deletion docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Not yet released.
**Improvements**

* Reduced memory usage for statistics.
* :ref:`mt-deepl` performs better in :ref:`auto-translation`.
* :ref:`mt-deepl` performs better in :ref:`auto-translation` and supports :ref:`glossary-mt`.

**Bug fixes**

Expand Down
7 changes: 7 additions & 0 deletions docs/user/glossary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,16 @@ Glossaries in automatic suggestion
The following automatic suggestion services utilize glossaries during translation:

* :ref:`mt-openai`
* :ref:`mt-deepl`

The glossary is processed before being exposed to the service:

* Duplicate source entries are not allowed, any additional entries with the same source are skipped.
* Any control characters and leading and trailing whitespace are stripped.
* :ref:`glossary-forbidden` are skipped.

.. note::

Many services store glossaries server-side and enforce a limit on the number
of saved glossaries. Weblate always deletes the oldest glossary if it runs out of
space.
112 changes: 111 additions & 1 deletion weblate/machinery/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from weblate.lang.models import Language, PluralMapper
from weblate.logger import LOGGER
from weblate.utils.errors import report_error
from weblate.utils.hash import calculate_dict_hash, calculate_hash
from weblate.utils.hash import calculate_dict_hash, calculate_hash, hash_to_checksum
from weblate.utils.requests import request
from weblate.utils.search import Comparer
from weblate.utils.site import get_site_url
Expand Down Expand Up @@ -592,3 +592,113 @@ def is_rate_limited(self):

def get_language_possibilities(self, language):
yield get_machinery_language(language)


class GlossaryMachineTranslationMixin:
glossary_name_format = (
"weblate:{project}:{source_language}:{target_language}:{checksum}"
)
glossary_count_limit = None

def is_glossary_supported(self, source_language: str, target_language: str) -> bool:
return True

def list_glossaries(self) -> dict[str:str]:
"""
Lists glossaries from the service.
Returns dictionary with names and id.
"""
raise NotImplementedError

def delete_glossary(self, glossary_id: str):
raise NotImplementedError

def delete_oldest_glossary(self):
raise NotImplementedError

def create_glossary(
self, source_language: str, target_language: str, name: str, tsv: str
):
raise NotImplementedError

def get_glossaries(self, use_cache: bool = True):
cache_key = self.get_cache_key("glossaries")
if use_cache:
cached = cache.get(cache_key)
if cached is not None:
return cached

result = self.list_glossaries()

cache.set(cache_key, result, 24 * 3600)
return result

def get_glossary_id(
self, source_language: str, target_language: str, unit
) -> int | str | None:
from weblate.glossary.models import get_glossary_tsv

if unit is None:
return None

translation = unit.translation

# Check glossary support for a language pair
if not self.is_glossary_supported(source_language, target_language):
return None

# Check if there is a glossary
glossary_tsv = get_glossary_tsv(translation)
if not glossary_tsv:
return None

# Calculate hash to check for changes
glossary_checksum = hash_to_checksum(calculate_hash(glossary_tsv))
glossary_name = self.glossary_name_format.format(
project=translation.component.project.id,
source_language=source_language,
target_language=target_language,
checksum=glossary_checksum,
)

# Fetch list of glossaries
glossaries = self.get_glossaries()
if glossary_name in glossaries:
return glossaries[glossary_name]

# Remove stale glossaries for this language pair
hashless_name = self.glossary_name_format.format(
project=translation.component.project.id,
source_language=source_language,
target_language=target_language,
checksum="",
)
for name, glossary_id in glossaries.items():
if name.startswith(hashless_name):
translation.log_debug(
"%s: removing stale glossary %s (%s)", self.mtid, name, glossary_id
)
self.delete_glossary(glossary_id)

# Ensure we are in service limits
if (
self.glossary_count_limit
and len(glossaries) + 1 >= self.glossary_count_limit
):
translation.log_debug(
"%s: approached limit of %d glossaries, removing oldest glossary",
self.mtid,
self.glossary_count_limit,
)
self.delete_oldest_glossary()

# Create new glossary
translation.log_debug("%s: creating glossary %s", self.mtid, glossary_name)
self.create_glossary(
source_language, target_language, glossary_name, glossary_tsv
)

# Fetch glossaries again, without using cache
glossaries = self.get_glossaries(use_cache=False)
return glossaries[glossary_name]
68 changes: 64 additions & 4 deletions weblate/machinery/deepl.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,17 @@
from html import escape, unescape
from typing import TYPE_CHECKING

from .base import BatchMachineTranslation
from dateutil.parser import isoparse
from django.core.cache import cache

from .base import BatchMachineTranslation, GlossaryMachineTranslationMixin
from .forms import DeepLMachineryForm

if TYPE_CHECKING:
from weblate.trans.models import Unit


class DeepLTranslation(BatchMachineTranslation):
class DeepLTranslation(BatchMachineTranslation, GlossaryMachineTranslationMixin):
"""DeepL (Linguee) machine translation support."""

name = "DeepL"
Expand All @@ -29,6 +32,7 @@ class DeepLTranslation(BatchMachineTranslation):
force_uncleanup = True
hightlight_syntax = True
settings_form = DeepLMachineryForm
glossary_count_limit = 1000

def map_language_code(self, code):
"""Convert language to service specific code."""
Expand All @@ -50,7 +54,7 @@ def download_languages(self):

# Handle formality extensions
for item in response.json():
lang_code = item["language"]
lang_code = item["language"].upper()
target_languages.add(lang_code)
if item.get("supports_formality"):
target_languages.add(f"{lang_code}@FORMAL")
Expand All @@ -74,8 +78,12 @@ def download_multiple_translations(
user=None,
threshold: int = 75,
) -> dict[str, list[dict[str, str]]]:
texts = [text for text, _unit in sources]
"""Download list of possible translations from a service."""
texts = [text for text, _unit in sources]
unit = sources[0][1]

glossary_id = self.get_glossary_id(source, language, unit)

params = {
"text": texts,
"source_lang": source,
Expand All @@ -90,6 +98,8 @@ def download_multiple_translations(
elif language.endswith("@INFORMAL"):
params["target_lang"] = language[:-9]
params["formality"] = "less"
if glossary_id is not None:
params["glossary_id"] = glossary_id
response = self.request(
"post",
self.get_api_url("translate"),
Expand Down Expand Up @@ -123,3 +133,53 @@ def format_replacement(self, h_start: int, h_end: int, h_text: str):

def make_re_placeholder(self, text: str):
return re.escape(text)

def is_glossary_supported(self, source_language: str, target_language: str) -> bool:
    """
    Tell whether DeepL offers glossaries for the given language pair.

    The supported pairs are read from the ``glossary-language-pairs``
    endpoint, upper-cased and cached for 24 hours. Both language codes are
    reduced to the part before the first ``-`` before the lookup.
    """
    pairs_key = self.get_cache_key("glossary_languages")
    pairs = cache.get(pairs_key)
    if pairs is None:
        payload = self.request(
            "get", self.get_api_url("glossary-language-pairs")
        ).json()
        pairs = []
        for entry in payload["supported_languages"]:
            pairs.append((entry["source_lang"].upper(), entry["target_lang"].upper()))

        cache.set(pairs_key, pairs, 24 * 3600)

    wanted = (source_language.split("-")[0], target_language.split("-")[0])
    return wanted in pairs

def list_glossaries(self) -> dict[str, str]:
    """
    List glossaries stored in DeepL.

    Returns a dictionary mapping each glossary name to its ``glossary_id``,
    built from the ``glossaries`` list in the service response.
    """
    response = self.request("get", self.get_api_url("glossaries"))
    return {
        glossary["name"]: glossary["glossary_id"]
        for glossary in response.json()["glossaries"]
    }

def delete_oldest_glossary(self):
    """
    Delete the glossary with the earliest ``creation_time`` from DeepL.

    The glossary list is fetched directly from the service; nothing is
    deleted when the list is empty.
    """
    listing = self.request("get", self.get_api_url("glossaries")).json()
    oldest = min(
        listing["glossaries"],
        key=lambda entry: isoparse(entry["creation_time"]),
        default=None,
    )
    if oldest is not None:
        self.delete_glossary(oldest["glossary_id"])

def delete_glossary(self, glossary_id: str):
    """Issue a delete request for the glossary with the given id."""
    endpoint = self.get_api_url("glossaries", glossary_id)
    self.request("delete", endpoint)

def create_glossary(
    self, source_language: str, target_language: str, name: str, tsv: str
):
    """
    Create a glossary in DeepL from tab-separated entries.

    The language codes are reduced to the part before the first ``-``,
    mirroring what ``is_glossary_supported`` does before consulting the
    supported language pairs; this way the glossary is created with the
    same codes that were checked for support.

    :param source_language: Source language code, may carry a variant.
    :param target_language: Target language code, may carry a variant.
    :param name: Name under which the glossary is stored.
    :param tsv: Glossary entries in TSV format.
    """
    self.request(
        "post",
        self.get_api_url("glossaries"),
        json={
            "name": name,
            # Strip regional variant (for example EN-GB -> EN)
            "source_lang": source_language.split("-")[0],
            "target_lang": target_language.split("-")[0],
            "entries": tsv,
            "entries_format": "tsv",
        },
    )
56 changes: 56 additions & 0 deletions weblate/machinery/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,16 @@ def mock_languages():
"https://api.deepl.com/v2/languages",
json=DEEPL_LANG_RESPONSE,
)
responses.add(
responses.GET,
"https://api.deepl.com/v2/glossary-language-pairs",
json={
"supported_languages": [
{"source_lang": "de", "target_lang": "en"},
{"source_lang": "en", "target_lang": "de"},
]
},
)

@classmethod
def mock_response(cls):
Expand Down Expand Up @@ -1060,6 +1070,52 @@ def request_callback(request):
"DE@INFORMAL", self.SOURCE_TRANSLATED, self.EXPECTED_LEN, machine=machine
)

@responses.activate
@patch("weblate.glossary.models.get_glossary_tsv", new=lambda _: "foo\tbar")
def test_glossary(self):
    """
    Check that a translate request carries a glossary_id.

    The glossary content is patched to a single entry; the glossaries
    endpoint is mocked with an empty listing, a creation response and a
    listing containing the created glossary, registered in that order.
    """

    def translate_callback(request):
        body = json.loads(request.body)
        self.assertIn("glossary_id", body)
        return (200, {}, json.dumps(DEEPL_RESPONSE))

    glossaries_url = "https://api.deepl.com/v2/glossaries"
    created_glossary = {
        "glossary_id": "def3a26b-3e84-45b3-84ae-0c0aaf3525f7",
        "name": "weblate:1:EN:DE:9e250d830c11d70f",
        "ready": True,
        "source_lang": "EN",
        "target_lang": "DE",
        "creation_time": "2021-08-03T14:16:18.329Z",
        "entry_count": 1,
    }

    machine = self.MACHINE_CLS(self.CONFIGURATION)
    machine.delete_cache()
    self.mock_languages()
    responses.add_callback(
        responses.POST,
        "https://api.deepl.com/v2/translate",
        callback=translate_callback,
    )
    # Listing registered first: no glossaries exist yet
    responses.add(responses.GET, glossaries_url, json={"glossaries": []})
    # Glossary creation
    responses.add(responses.POST, glossaries_url)
    # Listing registered second: includes the created glossary
    responses.add(
        responses.GET,
        glossaries_url,
        json={"glossaries": [created_glossary]},
    )
    # Fetch from service
    self.assert_translate(self.SUPPORTED, self.SOURCE_TRANSLATED, self.EXPECTED_LEN)

@responses.activate
def test_replacements(self):
def request_callback(request):
Expand Down

0 comments on commit 161ee89

Please sign in to comment.