Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement zero-width support for Hangul Jamo #111

Merged
merged 4 commits into from
Jan 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion bin/update-tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@
MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '6'))
BACKOFF_FACTOR = float(os.environ.get('BACKOFF_FACTOR', '0.1'))

# Hangul Jamo are the decomposed (conjoining) form of Hangul Syllables.
# Background reading:
#   https://www.unicode.org/faq/korean.html#3
#   https://github.com/ridiculousfish/widecharwidth/pull/17
#   https://github.com/jquast/ucs-detect/issues/9
#   https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
#
#   "Conjoining Jamo are divided into three classes: L, V, T (Leading
#   consonant, Vowel, Trailing consonant). A Hangul Syllable consists of
#   <LV> or <LVT> sequences."
#
# The code points collected here (Hangul Jamo vowels and trailing consonants)
# are the ones this script treats as zero-width.
HANGUL_JAMO_ZEROWIDTH = (
    # Hangul Jungseong Filler .. Hangul Jongseong Ssangnieun
    tuple(range(0x1160, 0x1200))
    # Hangul Jungseong O-Yeo .. Undefined Character of Hangul Jamo Extended-B
    + tuple(range(0xD7B0, 0xD800))
)


def _bisearch(ucs, table):
"""A copy of wcwidth._bisearch, to prevent having issues when depending on code that imports
Expand Down Expand Up @@ -333,6 +346,9 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0).values)

# Also subtract Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values = table[version].values.difference(HANGUL_JAMO_ZEROWIDTH)

# finally, join with atypical 'wide' characters defined by category 'Sk',
table[version].values.update(parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=2).values)
Expand All @@ -351,8 +367,11 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
table[version] = parse_category(fname=UnicodeDataFile.DerivedGeneralCategory(version),
wide=0)

# And, include NULL
# Include NULL
table[version].values.add(0)

# Add Hangul Jamo Vowels and Hangul Trailing Consonants
table[version].values.update(HANGUL_JAMO_ZEROWIDTH)
return UnicodeTableRenderCtx('ZERO_WIDTH', table)


Expand Down
3 changes: 3 additions & 0 deletions docs/intro.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,9 @@ Other Languages
=======
History
=======
Unreleased
* **Bugfix** zero-width support for Hangul Jamo (Korean)

0.2.12 *2023-11-21*
* re-release to remove .pyi file misplaced in wheel files `Issue #101`_.

Expand Down
43 changes: 37 additions & 6 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,17 +222,48 @@ def test_balinese_script():
assert length_phrase == expect_length_phrase


def test_kr_jamo():
    """
    Verify basic combining of a HANGUL CHOSEONG with a JUNGSEONG.

    The example is taken from Raymond Chen's blog post,
    https://devblogs.microsoft.com/oldnewthing/20201009-00/?p=104351
    """
    # Both characters in this example are "wide" when displayed alone.
    #
    # However, a JUNGSEONG (vowel) is designed to be combined with a
    # CHOSEONG (consonant).
    #
    # This wcwidth library only understands their width when in combination,
    # not when displayed independently -- the same as other zero-width
    # characters that may only combine with an appropriate preceding character.
    phrase = (
        u"\u1100"  # ᄀ HANGUL CHOSEONG KIYEOK (consonant)
        u"\u1161"  # ᅡ HANGUL JUNGSEONG A (vowel)
    )
    expect_length_each = (2, 0)
    expect_length_phrase = 2

    # exercise: measure each character on its own, then the phrase as a whole,
    length_each = tuple(wcwidth.wcwidth(char) for char in phrase)
    length_phrase = wcwidth.wcswidth(phrase)

    # verify.
    assert length_each == expect_length_each
    assert length_phrase == expect_length_phrase


def test_kr_jamo_filler():
u"""
Jamo filler is 0 width.

According to https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf this character and others
like it, ``\uffa0``, ``\u1160``, ``\u115f``, ``\u1160``, are not commonly viewed with a terminal,
seems it doesn't matter whether it is implemented or not, they are not typically used !
Example from https://www.unicode.org/L2/L2006/06310-hangul-decompose9.pdf
"""
phrase = u"\u1100\u1160"
expect_length_each = (2, 1)
expect_length_phrase = 3
phrase = (
u"\u1100" # HANGUL CHOSEONG KIYEOK (consonant)
u"\u1160" # HANGUL JUNGSEONG FILLER (vowel)
)
expect_length_each = (2, 0)
expect_length_phrase = 2

# exercise,
length_each = tuple(map(wcwidth.wcwidth, phrase))
Expand Down
14 changes: 1 addition & 13 deletions wcwidth/table_wide.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Exports WIDE_EASTASIAN table keyed by supporting unicode version level.

This code generated by wcwidth/bin/update-tables.py on 2024-01-03 17:16:09 UTC.
This code generated by wcwidth/bin/update-tables.py on 2024-01-06 01:39:49 UTC.
"""
WIDE_EASTASIAN = {
'4.1.0': (
Expand Down Expand Up @@ -126,8 +126,6 @@
# Date: 2009-06-09, 17:47:00 PDT [KW]
#
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
Expand All @@ -149,8 +147,6 @@
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop
Expand All @@ -169,8 +165,6 @@
# Date: 2010-08-17, 12:17:00 PDT [KW]
#
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
Expand All @@ -192,8 +186,6 @@
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop
Expand All @@ -214,8 +206,6 @@
# Date: 2011-09-19, 18:46:00 GMT [KW]
#
(0x01100, 0x0115f,), # Hangul Choseong Kiyeok ..Hangul Choseong Filler
(0x011a3, 0x011a7,), # Hangul Jungseong A-eu ..Hangul Jungseong O-yae
(0x011fa, 0x011ff,), # Hangul Jongseong Kiyeok-..Hangul Jongseong Ssangni
(0x02329, 0x0232a,), # Left-pointing Angle Brac..Right-pointing Angle Bra
(0x02e80, 0x02e99,), # Cjk Radical Repeat ..Cjk Radical Rap
(0x02e9b, 0x02ef3,), # Cjk Radical Choke ..Cjk Radical C-simplified
Expand All @@ -237,8 +227,6 @@
(0x0a490, 0x0a4c6,), # Yi Radical Qot ..Yi Radical Ke
(0x0a960, 0x0a97c,), # Hangul Choseong Tikeut-m..Hangul Choseong Ssangyeo
(0x0ac00, 0x0d7a3,), # Hangul Syllable Ga ..Hangul Syllable Hih
(0x0d7b0, 0x0d7c6,), # Hangul Jungseong O-yeo ..Hangul Jungseong Araea-e
(0x0d7cb, 0x0d7fb,), # Hangul Jongseong Nieun-r..Hangul Jongseong Phieuph
(0x0f900, 0x0faff,), # Cjk Compatibility Ideogr..(nil)
(0x0fe10, 0x0fe19,), # Presentation Form For Ve..Presentation Form For Ve
(0x0fe30, 0x0fe52,), # Presentation Form For Ve..Small Full Stop
Expand Down
Loading