Skip to content

Commit

Permalink
Add plugin for detecting Burmese Zawgyi
Browse files Browse the repository at this point in the history
Zawgyi is an obsolete font encoding that was shoehorned into Unicode,
incompatible with proper Unicode. Structurally, Zawgyi strings look like
Unicode (and can be passed around and stored as UTF-8, UTF-16, etc.),
but when rendered the result looks garbled. Because of the similarity
to proper Unicode, detection of Zawgyi is non-trivial and can only be done
probabilistically. This plugin makes use of Google’s open-source Zawgyi
detector, which uses Markov chains to determine the likelihood of
a Burmese string being Zawgyi-encoded versus proper Unicode. The plugin
suggests fixes by converting the detected problem cases from Zawgyi
to Unicode using the Unicode ICU library.

Fixes osm-fr#2442.
  • Loading branch information
brawer committed Feb 4, 2025
1 parent bcd4706 commit 5d57eaf
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ RUN apt-get update && \
git \
libarchive-dev \
libboost-python-dev \
libicu-dev \
libosmpbf-dev \
libprotobuf-dev \
locales \
Expand All @@ -23,6 +24,7 @@ RUN apt-get update && \
postgresql-client \
protobuf-compiler \
python3-dev \
python3-icu \
python3-pip \
python3-setuptools \
python3-wheel \
Expand Down
4 changes: 4 additions & 0 deletions mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@ ignore_errors = True
# External
[mypy-pandas.*]
ignore_missing_imports = True
[mypy-icu.*]
ignore_missing_imports = True
[mypy-ipyleaflet.*]
ignore_missing_imports = True
[mypy-ipywidgets.*]
ignore_missing_imports = True
[mypy-antlr4.*]
ignore_missing_imports = True
[mypy-myanmartools.*]
ignore_missing_imports = True
[mypy-Pyro.*]
ignore_missing_imports = True
[mypy-shapely.*]
Expand Down
102 changes: 102 additions & 0 deletions plugins/TagFix_ZawgyiBurmese.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#-*- coding: utf-8 -*-

###########################################################################
## ##
## Copyrights Sascha Brawer 2025 ##
## ##
## This program is free software: you can redistribute it and/or modify ##
## it under the terms of the GNU General Public License as published by ##
## the Free Software Foundation, either version 3 of the License, or ##
## (at your option) any later version. ##
## ##
## This program is distributed in the hope that it will be useful, ##
## but WITHOUT ANY WARRANTY; without even the implied warranty of ##
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ##
## GNU General Public License for more details. ##
## ##
## You should have received a copy of the GNU General Public License ##
## along with this program. If not, see <http://www.gnu.org/licenses/>. ##
## ##
###########################################################################

from modules.OsmoseTranslation import T_
from plugins.Plugin import Plugin

import myanmartools
import icu


# https://en.wikipedia.org/wiki/Zawgyi_font

class TagFix_ZawgyiBurmese(Plugin):
    """Report tag values that look like Zawgyi-encoded Burmese text.

    Each tag value containing at least one code point from U+1000..U+109F
    is scored by the ``myanmartools`` Zawgyi detector. Values scoring 0.8 or
    higher are run through the ICU ``Zawgyi-my`` transliterator, and the
    converted text is offered as a fix whenever it differs from the input.
    """

    # NOTE(review): presumably limits the plugin to country code "MM";
    # the filtering itself is done by the framework, not in this file.
    only_for = ["MM"]

    def init(self, logger):
        """Register error class 50706 and create the detector and converter."""
        Plugin.init(self, logger)

        title = T_('Value contains Zawgyi-encoded Burmese characters')
        detail = T_(
'''Tag values in OpenStreetMap should be stored in Unicode. However, this
value contains Burmese characters in the obsolete “Zawgyi” font encoding.
As long as this value is stored in a non-standard way, modern devices cannot
display it correctly. Please change the text to be encoded in Unicode.''')

        self.errors[50706] = self.def_class(
            item=5070,
            level=2,
            tags=['value', 'fix:chair'],
            title=title,
            detail=detail,
        )

        # Both helpers are built once here and reused for every object.
        self.detector = myanmartools.ZawgyiDetector()
        self.converter = icu.Transliterator.createInstance('Zawgyi-my')

    def node(self, data, tags):
        """Return one error dict per tag whose value is likely Zawgyi.

        ``data`` is not used. ``tags`` maps tag keys to string values. Each
        returned dict carries class 50706, subclass 0 and a ``fix`` mapping
        the offending key to its transliterated value.
        """
        found = []
        for tag_key, tag_value in tags.items():
            # Cheap pre-filter: only score values with code points in
            # the U+1000..U+109F range.
            in_range = any(0x1000 <= ord(ch) <= 0x109F for ch in tag_value)
            if not in_range:
                continue

            # Skip values the detector rates below the 0.8 threshold.
            if self.detector.get_zawgyi_probability(tag_value) < 0.8:
                continue

            converted = self.converter.transliterate(tag_value)
            # Nothing to suggest when the conversion leaves the text unchanged.
            if converted != tag_value:
                found.append(
                    {"class": 50706, "subclass": 0, "fix": {tag_key: converted}}
                )
        return found

    def way(self, data, tags, nodes):
        """Check a way's tags exactly like a node's; ``nodes`` is ignored."""
        return self.node(data, tags)

    def relation(self, data, tags, members):
        """Check a relation's tags exactly like a node's; ``members`` is ignored."""
        return self.node(data, tags)


###########################################################################
from plugins.Plugin import TestPluginCommon


class Test(TestPluginCommon):
    """Tests for TagFix_ZawgyiBurmese on nodes, ways and relations."""

    def test(self):
        """Check that Unicode values pass and Zawgyi values are reported."""
        plugin = TagFix_ZawgyiBurmese(None)
        plugin.init(None)

        # Empty, Latin-only and proper-Unicode Burmese values must not
        # produce any error.
        for name in [
            "",
            "foo",
            "ဘားအံ",
            "ကျိုက်မရော အဝေးပြေးလမ်း",
        ]:
            assert not plugin.node(None, {"name": name}), name
            assert not plugin.way(None, {"name": name}, nodes=None), name
            assert not plugin.relation(None, {"name": name}, members=None), name

        # Zawgyi-encoded values must be reported. The expected fix is keyed
        # by the same tag that was passed in, since the plugin builds
        # ``{key: fixed_value}`` from the key it inspected.
        for zawgyi, uni in [("မ္း", "မ်း"), ("က္ေ", "က်ေ")]:
            self.check_err(
                plugin.node(None, {"addr:street": zawgyi}),
                {"class": 50706, "subclass": 0, "fix": {"addr:street": uni}},
            )
            self.check_err(
                plugin.way(None, {"addr:city": zawgyi}, nodes=None),
                {"class": 50706, "subclass": 0, "fix": {"addr:city": uni}},
            )
            self.check_err(
                plugin.relation(None, {"fixme": zawgyi}, members=None),
                {"class": 50706, "subclass": 0, "fix": {"fixme": uni}},
            )
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ tiletanic
sentry-sdk
wikitextparser
pycountry
myanmartools
PyICU

# Tests
pytest == 7.4.4 # In v8 it skips the plugins folder, see our issue #2266 and https://github.com/pytest-dev/pytest/issues/12605
Expand Down

0 comments on commit 5d57eaf

Please sign in to comment.