From cf97ccb6c9a01540421e7a114dad8efeb172e975 Mon Sep 17 00:00:00 2001 From: Sascha Brawer Date: Tue, 4 Feb 2025 11:22:26 +0100 Subject: [PATCH 1/2] Add plugin for detecting values in Burmese Zawgyi encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background: [Zawgyi](https://en.wikipedia.org/wiki/Zawgyi_font) is an obsolete font encoding that is incompatible with proper Unicode. Structurally, Zawgyi strings look like Unicode (they can be passed around and stored as UTF-8, UTF-16, etc.), but when rendered, text gets displayed as garbled characters unless the user happens to have a non-standard font installed. With this non-standard font, the system is able to render Zawgyi, but properly encoded Unicode strings look broken. Also, because the Zawgyi encoding abuses codepoints intended for Myanmar’s minority languages, installing a Zawgyi font breaks the display of text in those minority languages. The situation is a bit like with ISO 8859 in the 1980s, but worse because Zawgyi text and fonts pretend to be Unicode. Because of the structural similarity to Unicode, detecting Zawgyi is non-trivial and can only be done probabilistically. As of early 2025, Zawgyi is on the decline on the general Internet. However, OpenStreetMap still contains thousands of objects with tag values that are encoded in Burmese Zawgyi instead of proper Unicode. This Osmose plugin makes use of Google’s open-source Zawgyi detector, which uses Markov chains to estimate the likelihood of a Burmese string being Zawgyi-encoded versus proper Unicode. The Osmose plugin suggests fixes by converting mis-encoded strings from Zawgyi to Unicode using the Unicode ICU library, which comes with a built-in converter for this purpose. Fixes https://github.com/osm-fr/osmose-backend/issues/2442. 
--- docker/Dockerfile | 2 + mypy.ini | 4 ++ plugins/TagFix_ZawgyiBurmese.py | 102 ++++++++++++++++++++++++++++++++ requirements.txt | 2 + 4 files changed, 110 insertions(+) create mode 100644 plugins/TagFix_ZawgyiBurmese.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 2d3a1919f..771f25e9b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -13,6 +13,7 @@ RUN apt-get update && \ git \ libarchive-dev \ libboost-python-dev \ + libicu-dev \ libosmpbf-dev \ libprotobuf-dev \ locales \ @@ -23,6 +24,7 @@ RUN apt-get update && \ postgresql-client \ protobuf-compiler \ python3-dev \ + python3-icu \ python3-pip \ python3-setuptools \ python3-wheel \ diff --git a/mypy.ini b/mypy.ini index f5b4fd823..f28589069 100644 --- a/mypy.ini +++ b/mypy.ini @@ -10,12 +10,16 @@ ignore_errors = True # External [mypy-pandas.*] ignore_missing_imports = True +[mypy-icu.*] +ignore_missing_imports = True [mypy-ipyleaflet.*] ignore_missing_imports = True [mypy-ipywidgets.*] ignore_missing_imports = True [mypy-antlr4.*] ignore_missing_imports = True +[mypy-myanmartools.*] +ignore_missing_imports = True [mypy-Pyro.*] ignore_missing_imports = True [mypy-shapely.*] diff --git a/plugins/TagFix_ZawgyiBurmese.py b/plugins/TagFix_ZawgyiBurmese.py new file mode 100644 index 000000000..b10e65f18 --- /dev/null +++ b/plugins/TagFix_ZawgyiBurmese.py @@ -0,0 +1,102 @@ +#-*- coding: utf-8 -*- + +########################################################################### +## ## +## Copyrights Sascha Brawer 2025 ## +## ## +## This program is free software: you can redistribute it and/or modify ## +## it under the terms of the GNU General Public License as published by ## +## the Free Software Foundation, either version 3 of the License, or ## +## (at your option) any later version. ## +## ## +## This program is distributed in the hope that it will be useful, ## +## but WITHOUT ANY WARRANTY; without even the implied warranty of ## +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the ## +## GNU General Public License for more details. ## +## ## +## You should have received a copy of the GNU General Public License ## +## along with this program. If not, see <http://www.gnu.org/licenses/>. ## +## ## +########################################################################### + +from modules.OsmoseTranslation import T_ +from plugins.Plugin import Plugin + +import myanmartools +import icu + + +# https://en.wikipedia.org/wiki/Zawgyi_font + +class TagFix_ZawgyiBurmese(Plugin): + + only_for = ['MM'] + + def init(self, logger): + Plugin.init(self, logger) + self.errors[50706] = self.def_class( + item = 5070, + level = 2, + tags = ['value', 'fix:chair'], + title = T_('Value contains Zawgyi-encoded Burmese characters'), + detail = T_( +'''Tag values in OpenStreetMap should be stored in Unicode. However, this +value contains Burmese characters in the obsolete “Zawgyi” font encoding. +As long as this value is stored in a non-standard way, modern devices cannot +display it correctly. Please change the text to be encoded in Unicode.'''), + ) + self.detector = myanmartools.ZawgyiDetector() + self.converter = icu.Transliterator.createInstance('Zawgyi-my') + + def node(self, data, tags): + errs = [] + for key, value in tags.items(): + if not any(0x1000 <= ord(c) <= 0x109F for c in value): + continue + score = self.detector.get_zawgyi_probability(value) + if score < 0.8: + continue + fixed_value = self.converter.transliterate(value) + if value == fixed_value: + continue + errs.append({'class': 50706, 'subclass': 0, 'fix': {key: fixed_value}}) + return errs + + def way(self, data, tags, nodes): + return self.node(data, tags) + + def relation(self, data, tags, members): + return self.node(data, tags) + + +########################################################################### +from plugins.Plugin import TestPluginCommon + + +class Test(TestPluginCommon): + def test(self): + a = TagFix_ZawgyiBurmese(None) + a.init(None) + for name in [ + '', + 'foo', + 'ဘားအံ', + 'ကျိုက်မရော 
အဝေးပြေးလမ်း', + ]: + assert not a.node(None, {'name': name}), name + assert not a.way(None, {'name': name}, nodes=None), name + assert not a.relation(None, {'name': name}, members=None), name + + for zawgyi, uni in [('မ္း', 'မ်း'), ('က္ေ', 'က်ေ')]: + self.check_err( + a.node(None, {'addr:street': zawgyi}), + {'class': 50706, 'subclass': 0, 'fix': {'addr:street': uni}}, + ) + self.check_err( + a.way(None, {'addr:city': zawgyi}, nodes=None), + {'class': 50706, 'subclass': 0, 'fix': {'addr:city': uni}}, + ) + self.check_err( + a.relation(None, {'fixme': zawgyi}, members=None), + {'class': 50706, 'subclass': 0, 'fix': {'fixme': uni}}, + ) diff --git a/requirements.txt b/requirements.txt index f53d5e9be..f89106d0c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,6 +17,8 @@ tiletanic sentry-sdk wikitextparser pycountry +myanmartools +PyICU # Tests pytest == 7.4.4 # In v8 it skips the plugins folder, see our issue #2266 and https://github.com/pytest-dev/pytest/issues/12605 From 486893e92f3d2d69234f8bac2a3895a19b2f251b Mon Sep 17 00:00:00 2001 From: Sascha Brawer Date: Tue, 4 Feb 2025 12:01:30 +0100 Subject: [PATCH 2/2] Change detail message for emitted errors, as suggested in code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Frédéric Rodrigo --- plugins/TagFix_ZawgyiBurmese.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/TagFix_ZawgyiBurmese.py b/plugins/TagFix_ZawgyiBurmese.py index b10e65f18..8f688518a 100644 --- a/plugins/TagFix_ZawgyiBurmese.py +++ b/plugins/TagFix_ZawgyiBurmese.py @@ -40,7 +40,7 @@ def init(self, logger): tags = ['value', 'fix:chair'], title = T_('Value contains Zawgyi-encoded Burmese characters'), detail = T_( -'''Tag values in OpenStreetMap should be stored in Unicode. However, this +'''Tag values should be stored in Unicode. However, this value contains Burmese characters in the obsolete “Zawgyi” font encoding. 
As long as this value is stored in a non-standard way, modern devices cannot display it correctly. Please change the text to be encoded in Unicode.'''),