From 8147ed4d9b59eeba12023916ecb57e389190c45d Mon Sep 17 00:00:00 2001
From: Wahome Macharia
Date: Tue, 21 Feb 2023 19:03:35 +0300
Subject: [PATCH] Implement RFC 5646 compliant regex to validate language tags and locales

---
 src/lib/isLocale.js     | 124 ++++++++++++++++++++++++++++++++++++++--
 test/validators.test.js |  36 +++++++++++
 2 files changed, 155 insertions(+), 5 deletions(-)

diff --git a/src/lib/isLocale.js b/src/lib/isLocale.js
index cacac8aec..ec84c8fce 100644
--- a/src/lib/isLocale.js
+++ b/src/lib/isLocale.js
@@ -1,11 +1,125 @@
 import assertString from './util/assertString';
 
-const localeReg = /^[A-Za-z]{2,4}([_-]([A-Za-z]{4}|[\d]{3}))?([_-]([A-Za-z]{2}|[\d]{3}))?$/;
+/*
+  extlang       = 3ALPHA              ; selected ISO 639 codes
+                  *2("-" 3ALPHA)      ; permanently reserved
+ */
+const extlang = '([A-Za-z]{3}(-[A-Za-z]{3}){0,2})';
+
+/*
+  language      = 2*3ALPHA            ; shortest ISO 639 code
+                  ["-" extlang]       ; sometimes followed by
+                                      ; extended language subtags
+                / 4ALPHA              ; or reserved for future use
+                / 5*8ALPHA            ; or registered language subtag
+
+  4ALPHA and 5*8ALPHA are folded into a single {4,8} alternative below.
+ */
+const language = `(([a-zA-Z]{2,3}(-${extlang})?)|([a-zA-Z]{4,8}))`;
+
+/*
+  script        = 4ALPHA              ; ISO 15924 code
+ */
+const script = '([A-Za-z]{4})';
+
+/*
+  region        = 2ALPHA              ; ISO 3166-1 code
+                / 3DIGIT              ; UN M.49 code
+ */
+const region = '([A-Za-z]{2}|\\d{3})';
+
+/*
+  variant       = 5*8alphanum         ; registered variants
+                / (DIGIT 3alphanum)
+
+  alphanum is ALPHA / DIGIT only, so "-" must never appear inside the
+  character class: it would let a variant swallow subtag delimiters.
+ */
+const variant = '([A-Za-z0-9]{5,8}|(\\d[A-Za-z0-9]{3}))';
+
+/*
+  singleton     = DIGIT               ; 0 - 9
+                / %x41-57             ; A - W
+                / %x59-5A             ; Y - Z
+                / %x61-77             ; a - w
+                / %x79-7A             ; y - z
+ */
+const singleton = '(\\d|[A-W]|[Y-Z]|[a-w]|[y-z])';
+
+/*
+  extension     = singleton 1*("-" (2*8alphanum))
+                                      ; Single alphanumerics
+                                      ; "x" reserved for private use
+ */
+const extension = `(${singleton}(-[A-Za-z0-9]{2,8})+)`;
+
+/*
+  privateuse    = "x" 1*("-" (1*8alphanum))
+ */
+const privateuse = '(x(-[A-Za-z0-9]{1,8})+)';
+
+// irregular tags do not match the 'langtag' production and would not
+// otherwise be considered 'well-formed'. These tags are all valid, but
+// most are deprecated in favor of more modern subtags or subtag combinations
+
+const irregular = '((en-GB-oed)|(i-ami)|(i-bnn)|(i-default)|(i-enochian)|' +
+  '(i-hak)|(i-klingon)|(i-lux)|(i-mingo)|(i-navajo)|(i-pwn)|(i-tao)|' +
+  '(i-tay)|(i-tsu)|(sgn-BE-FR)|(sgn-BE-NL)|(sgn-CH-DE))';
+
+// regular tags match the 'langtag' production, but their subtags are not
+// extended language or variant subtags: their meaning is defined by
+// their registration and all of these are deprecated in favor of a more
+// modern subtag or sequence of subtags
+
+const regular = '((art-lojban)|(cel-gaulish)|(no-bok)|(no-nyn)|(zh-guoyu)|' +
+  '(zh-hakka)|(zh-min)|(zh-min-nan)|(zh-xiang))';
+
+/*
+  grandfathered = irregular           ; non-redundant tags registered
+                / regular             ; during the RFC 3066 era
+
+ */
+const grandfathered = `(${irregular}|${regular})`;
+
+/*
+  RFC 5646 defines delimitation of subtags via a hyphen:
+
+  "Subtag" refers to a specific section of a tag, delimited by a
+  hyphen, such as the subtags 'zh', 'Hant', and 'CN' in the tag "zh-
+  Hant-CN". Examples of subtags in this document are enclosed in
+  single quotes ('Hant')
+
+  However, we need to add "_" to maintain the existing behaviour.
+ */
+const delimiter = '(-|_)';
+
+/*
+  langtag       = language
+                  ["-" script]
+                  ["-" region]
+                  *("-" variant)
+                  *("-" extension)
+                  ["-" privateuse]
+ */
+const langtag = `${language}(${delimiter}${script})?(${delimiter}${region})?(${delimiter}${variant})*(${delimiter}${extension})*(${delimiter}${privateuse})?`;
+
+/*
+  Regex implementation based on BCP RFC 5646
+  Tags for Identifying Languages
+  https://www.rfc-editor.org/rfc/rfc5646.html
+ */
+const languageTagRegex = new RegExp(`(^${privateuse}$)|(^${grandfathered}$)|(^${langtag}$)`);
 
+/**
+ * Check whether a string is a well-formed RFC 5646 language tag or locale.
+ *
+ * In addition to the hyphen required by the RFC, "_" is accepted between
+ * the main subtags so that inputs such as "en_US_POSIX" keep validating.
+ *
+ * @param {string} str - value to validate (first passed to assertString)
+ * @returns {boolean} true when the whole string matches the tag grammar
+ */
 export default function isLocale(str) {
   assertString(str);
-  if (str === 'en_US_POSIX' || str === 'ca_ES_VALENCIA') {
-    return true;
-  }
-  return localeReg.test(str);
+  return languageTagRegex.test(str);
 }
diff --git a/test/validators.test.js b/test/validators.test.js
index 239f172ca..611ce9de8 100644
--- a/test/validators.test.js
+++ b/test/validators.test.js
@@ -4816,16 +4816,52 @@ describe('Validators', () => {
         'uz_Latn_UZ',
         'en',
         'gsw',
+        'en-US',
         'es_ES',
+        'es-419',
         'sw_KE',
         'am_ET',
+        'zh-CHS',
         'ca_ES_VALENCIA',
         'en_US_POSIX',
+        'hak-CN',
+        'zh-Hant',
+        'zh-Hans',
+        'sr-Cyrl',
+        'sr-Latn',
+        'zh-cmn-Hans-CN',
+        'cmn-Hans-CN',
+        'zh-yue-HK',
+        'yue-HK',
+        'zh-Hans-CN',
+        'sr-Latn-RS',
+        'sl-rozaj',
+        'sl-rozaj-biske',
+        'sl-nedis',
+        'de-CH-1901',
+        'sl-IT-nedis',
+        'hy-Latn-IT-arevela',
+        'i-enochian',
+        'en-scotland-fonipa',
+        'sl-IT-rozaj-biske-1994',
+        'de-CH-x-phonebk',
+        'az-Arab-x-AZE-derbend',
+        'x-whatever',
+        'qaa-Qaaa-QM-x-southern',
+        'de-Qaaa',
+        'sr-Latn-QM',
+        'sr-Qaaa-RS',
+        'en-US-u-islamcal',
+        'zh-CN-a-myext-x-private',
+        'en-a-myext-b-another',
       ],
       invalid: [
         'lo_POP',
         '12',
         '12_DD',
+        'de-419-DE',
+        'a-DE',
+        'de-1---',
       ],
     });
   });