Skip to content

Commit

Permalink
machinery: Include glossary in OpenAI translations
Browse files Browse the repository at this point in the history
Issue #7086
  • Loading branch information
nijel committed Nov 30, 2023
1 parent 3537b3d commit a4637ab
Show file tree
Hide file tree
Showing 8 changed files with 168 additions and 9 deletions.
4 changes: 4 additions & 0 deletions docs/admin/machine.rst
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,8 @@ This service uses an API, and you need to obtain key and secret from NetEase.
OpenAI
------

.. versionadded:: 5.3

:Service ID: ``openai``
:Configuration: +-------------+--------------------+---------------------------------------------------------------------------------------------------------------------------+
| ``key`` | API key | |
Expand Down Expand Up @@ -403,6 +405,8 @@ Use persona and style fields to further fine-tune translations. These will be
used in a prompt for OpenAI and allow you to change the style of the
translations.

The service automatically uses :ref:`glossary`, see :ref:`glossary-mt`.

.. seealso::

`OpenAI models <https://platform.openai.com/docs/models>`_,
Expand Down
1 change: 1 addition & 0 deletions docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Not yet released.

* :ref:`mt-openai` automatic suggestion service.
* Added labels API, see :http:get:`/api/projects/(string:project)/labels/`.
* :ref:`glossary-mt`.

**Improvements**

Expand Down
17 changes: 17 additions & 0 deletions docs/user/glossary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,20 @@ listed in the glossary sidebar when translating.
.. seealso::

:ref:`variants`

.. _glossary-mt:

Glossaries in automatic suggestion
----------------------------------

.. versionadded:: 5.3

The following automatic suggestion services utilize glossaries during translation:

* :ref:`mt-openai`

The glossary is processed before it is exposed to the service:

* Duplicate source entries are not allowed; any additional entries with the same source are skipped.
* Any control characters and leading and trailing whitespace are stripped.
* :ref:`glossary-forbidden` are skipped.
7 changes: 7 additions & 0 deletions weblate/checks/tests/test_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ def __init__(self):
self.use_shared_tm = True
self.name = "MockProject"

def get_glossary_tsv_cache_key(self, source_language, language):
    """Return the test cache key of the glossary TSV for a language pair."""
    source_code = source_language.code
    target_code = language.code
    return f"project-glossary-tsv-test-{source_code}-{target_code}"

@property
def glossaries(self):
    """Return the glossary components of the mock project (always none)."""
    no_glossaries = []
    return no_glossaries


class MockComponent:
"""Mock component object."""
Expand Down
99 changes: 94 additions & 5 deletions weblate/glossary/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
from __future__ import annotations

import re
import sys
import unicodedata
from collections import defaultdict
from itertools import chain

import ahocorasick_rs
import sentry_sdk
from django.db.models import Q, Value
from django.core.cache import cache
from django.db.models import Prefetch, Q, Value
from django.db.models.functions import MD5, Lower

from weblate.trans.models.unit import Unit
Expand All @@ -19,6 +22,15 @@

SPLIT_RE = re.compile(r"[\s,.:!?]+")
NON_WORD_RE = re.compile(r"\W")
# All control chars including tab and newline, this is different from
# weblate.formats.helpers.CONTROLCHARS which contains only chars
# problematic in XML or SQL scopes.
#
# Covers the C0/C1 control characters (category Cc) and both Unicode
# newline separators: LINE SEPARATOR (Zl, U+2028) and PARAGRAPH SEPARATOR
# (Zp, U+2029). The latter is needed so that no Unicode newline survives
# the cleanup of glossary entries.
CONTROLCHARS = [
    char
    for char in map(chr, range(sys.maxunicode + 1))
    if unicodedata.category(char) in {"Cc", "Zl", "Zp"}
]
# Translation table for str.translate() deleting all of the chars above.
CONTROLCHARS_TRANS = str.maketrans(dict.fromkeys(CONTROLCHARS))


def get_glossary_sources(component):
Expand Down Expand Up @@ -51,6 +63,14 @@ def get_glossary_automaton(project):
)


def get_glossary_units(project, source_language, target_language):
    """
    Return glossary units of a project for the given language pair.

    The queryset is limited to units from the project's glossary components
    whose source language is ``source_language`` and whose translation
    language is ``target_language``.
    """
    lookups = {
        "translation__component__in": project.glossaries,
        "translation__component__source_language": source_language,
        "translation__language": target_language,
    }
    return Unit.objects.filter(**lookups)


def get_glossary_terms(unit: Unit) -> list[Unit]:
"""Return list of term pairs for an unit."""
if unit.glossary_terms is not None:
Expand Down Expand Up @@ -93,12 +113,10 @@ def get_glossary_terms(unit: Unit) -> list[Unit]:
return []

units = list(
Unit.objects.prefetch()
get_glossary_units(project, source_language, language)
.prefetch()
.filter(
Q(source__lower__md5__in=[MD5(Value(term)) for term in positions]),
translation__component__in=project.glossaries,
translation__component__source_language=source_language,
translation__language=language,
)
.select_related("source_unit", "variant")
)
Expand Down Expand Up @@ -132,3 +150,74 @@ def get_glossary_terms(unit: Unit) -> list[Unit]:
unit.glossary_terms = units

return units


def render_glossary_units_tsv(units) -> str:
    r"""
    Render glossary units as a tab separated glossary.

    Based on the DeepL specification:

    - duplicate source entries are not allowed
    - neither source nor target entry may be empty
    - source and target entries must not contain any C0 or C1 control
      characters (including, e.g., "\t" or "\n") or any Unicode newline
    - source and target entries must not contain any leading or trailing
      Unicode whitespace character
    - source/target entry pairs are separated by a newline
    - source entries and target entries are separated by a tab

    Forbidden terms are left out, as are untranslated terms unless they are
    read-only; a read-only term uses its source as the target.
    """
    from weblate.trans.models.component import Component

    queryset = units.prefetch_related(
        "source_unit",
        "translation",
        Prefetch("translation__component", queryset=Component.objects.defer_huge()),
    )

    seen_sources = set()
    rows = []
    for unit in queryset:
        flags = unit.all_flags

        # Forbidden terms never make it into the glossary
        if "forbidden" in flags:
            continue

        # Only translated or read-only terms are usable
        readonly = "read-only" in flags
        if not (unit.translated or readonly):
            continue

        # Strip control chars and surrounding whitespace
        source = unit.source.translate(CONTROLCHARS_TRANS).strip()
        if readonly:
            target = source
        else:
            target = unit.target.translate(CONTROLCHARS_TRANS).strip()

        # Blank entries and repeated sources are not allowed
        if not source or not target or source in seen_sources:
            continue

        seen_sources.add(source)
        rows.append(f"{source}\t{target}")

    return "\n".join(rows)


def get_glossary_tsv(translation) -> str:
    """
    Return the tab separated glossary matching the given translation.

    The glossary is built from the project's glossary units for the
    translation's source and target languages, and the rendered result is
    cached for 24 hours under a per-project, per-language-pair key.
    """
    component = translation.component
    project = component.project
    source_language = component.source_language
    language = translation.language

    cache_key = project.get_glossary_tsv_cache_key(source_language, language)

    result = cache.get(cache_key)
    if result is None:
        # Cache miss: render from the translated glossary units
        translated_units = get_glossary_units(
            project, source_language, language
        ).filter(state__gte=STATE_TRANSLATED)
        result = render_glossary_units_tsv(translated_units)
        cache.set(cache_key, result, 24 * 3600)

    return result
17 changes: 16 additions & 1 deletion weblate/glossary/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@

"""Test for glossary manipulations."""

import csv
import json
from io import StringIO

from django.urls import reverse

from weblate.glossary.models import get_glossary_terms
from weblate.glossary.models import get_glossary_terms, get_glossary_tsv
from weblate.glossary.tasks import sync_terminology
from weblate.trans.models import Unit
from weblate.trans.tests.test_views import ViewTestCase
Expand Down Expand Up @@ -433,3 +435,16 @@ def test_terminology_explanation_sync(self):
),
{""},
)

def test_tsv(self):
    """Check that the imported glossary renders as two-column TSV."""
    self.import_file(TEST_CSV)

    rendered = get_glossary_tsv(self.get_translation())

    # Parse the output back using the tab-delimited CSV dialect
    rows = list(csv.reader(StringIO(rendered), "excel-tab"))

    self.assertEqual(len(rows), 163)
    self.assertTrue(all(len(row) == 2 for row in rows))
21 changes: 18 additions & 3 deletions weblate/machinery/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from django.core.cache import cache
from openai import OpenAI

from weblate.glossary.models import get_glossary_tsv

from .base import BatchMachineTranslation, MachineTranslationError
from .forms import OpenAIMachineryForm

Expand All @@ -26,6 +28,10 @@
You do not include transliteration.
{glossary}
"""
# Prompt fragment substituted for the {glossary} placeholder of PROMPT when
# a glossary is available; the {} is filled with the glossary text.
GLOSSARY_PROMPT = """
Use the following glossary during the translation:
{}
"""


class OpenAITranslation(BatchMachineTranslation):
Expand Down Expand Up @@ -61,13 +67,20 @@ def get_model(self) -> str:

raise MachineTranslationError(f"Unsupported model: {self.settings['model']}")

def get_prompt(self, source_language: str, target_language: str) -> str:
def get_prompt(
self, source_language: str, target_language: str, translation
) -> str:
glossary = ""
if translation:
glossary = get_glossary_tsv(translation)
if glossary:
glossary = GLOSSARY_PROMPT.format(glossary)
return PROMPT.format(
source_language=source_language,
target_language=target_language,
persona=self.settings["persona"],
style=self.settings["style"],
glossary="",
glossary=glossary,
)

def download_multiple_translations(
Expand All @@ -79,8 +92,10 @@ def download_multiple_translations(
threshold: int = 75,
) -> dict[str, list[dict[str, str]]]:
texts = [text for text, _unit in sources]
unit = sources[0][1]
prompt = self.get_prompt(source, language, unit.translation if unit else None)
messages = [
{"role": "system", "content": self.get_prompt(source, language)},
{"role": "system", "content": prompt},
*({"role": "user", "content": text} for text in texts),
]

Expand Down
11 changes: 11 additions & 0 deletions weblate/trans/models/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,9 @@ def child_components(self):
def source_language_cache_key(self):
return f"project-source-language-ids-{self.pk}"

def get_glossary_tsv_cache_key(self, source_language, language):
    """Return the cache key of this project's glossary TSV for a language pair."""
    source_code = source_language.code
    target_code = language.code
    return f"project-glossary-tsv-{self.pk}-{source_code}-{target_code}"

def invalidate_source_language_cache(self):
cache.delete(self.source_language_cache_key)

Expand Down Expand Up @@ -554,6 +557,14 @@ def glossaries(self):
def invalidate_glossary_cache(self):
    """
    Drop all cached glossary data of this project.

    Removes the ``glossary_automaton`` value cached on the instance (if any)
    and deletes the cached glossary TSV for every combination of a source
    language used by the project's components and a project language.
    """
    self.__dict__.pop("glossary_automaton", None)

    source_languages = Language.objects.filter(component__project=self).distinct()
    tsv_cache_keys = []
    for source_language in source_languages:
        tsv_cache_keys.extend(
            self.get_glossary_tsv_cache_key(source_language, language)
            for language in self.languages
        )
    cache.delete_many(tsv_cache_keys)

@cached_property
def glossary_automaton(self):
Expand Down

0 comments on commit a4637ab

Please sign in to comment.