From d1958bada9e0045c30380ce1e9830df0fc6812f0 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Thu, 1 Jun 2023 17:39:38 -0700 Subject: [PATCH 1/6] Fix creating cultures with extensions in the name --- .../tests/CultureInfo/CultureInfoCtor.cs | 52 ++++++- .../System/Globalization/CultureData.Icu.cs | 137 ++++++++++++++---- 2 files changed, 161 insertions(+), 28 deletions(-) diff --git a/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs b/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs index f53ed73cc7f067..49afdb21fd88ea 100644 --- a/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs +++ b/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using Microsoft.DotNet.RemoteExecutor; using System.Collections.Generic; using Xunit; @@ -434,13 +435,58 @@ public void TestCreationWithTemporaryLCID(int lcid) Assert.NotEqual(lcid, new CultureInfo(lcid).LCID); } + [InlineData("zh-TW-u-co-zhuyin", "zh-TW", "zh-TW_zhuyin")] + [InlineData("de-DE-u-co-phonebk", "de-DE", "de-DE_phonebook")] + [InlineData("de-u-co-phonebk", "de", "de_phonebook")] + [InlineData("de-DE-u-co-phonebk-u-xx", "de-DE-u-xx", "de-DE-u-xx_phonebook")] + [InlineData("de-DE-u-xx-u-co-phonebk", "de-DE-u-xx-u-co-phonebk", "de-DE-u-xx-u-co-phonebk")] + [InlineData("de-DE-t-xx-u-co-phonebk", "de-DE-t-xx-u-co-phonebk", "de-DE-t-xx-u-co-phonebk_phonebook")] + [InlineData("de-DE-u-co-phonebk-t-xx", "de-DE-t-xx", "de-DE-t-xx_phonebook")] + [InlineData("de-DE-u-co-phonebk-t-xx-u-yy", "de-DE-t-xx-u-yy", "de-DE-t-xx-u-yy_phonebook")] + [InlineData("de-DE", "de-DE", "de-DE")] + [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] + public void TestCreationWithMangledSortName(string cultureName, string expectedCultureName, string expectedSortName) + { + CultureInfo ci = CultureInfo.GetCultureInfo(cultureName); + + Assert.Equal(expectedCultureName, ci.Name); + Assert.Equal(expectedSortName, ci.CompareInfo.Name); + } + + [InlineData("xx-u-XX", "xx-u-xx")] + [InlineData("xx-u-XX-u-yy", "xx-u-xx-u-yy")] + [InlineData("xx-t-ja-JP", "xx-t-ja-jp")] + [InlineData("qps-plocm", "qps-PLOCM")] // ICU normalize this name to "qps--plocm" which we normalize it back to "qps-plocm" + [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] + public void TestCreationWithICUNormalizedNames(string cultureName, string expectedCultureName) + { + CultureInfo ci = CultureInfo.GetCultureInfo(cultureName); + Assert.Equal(expectedCultureName, ci.Name); + } + + [InlineData("xx-u-XX")] + [InlineData("xx-u-XX-u-yy")] + [InlineData("xx-t-ja-JP")] + [InlineData("qps-plocm")] [InlineData("zh-TW-u-co-zhuyin")] - [InlineData("de-DE-u-co-phoneb")] + [InlineData("de-DE-u-co-phonebk")] [InlineData("de-u-co-phonebk")] + [InlineData("de-DE-u-co-phonebk-u-xx")] + [InlineData("de-DE-u-xx-u-co-phonebk")] + [InlineData("de-DE-t-xx-u-co-phonebk")] + [InlineData("de-DE-u-co-phonebk-t-xx")] + [InlineData("de-DE-u-co-phonebk-t-xx-u-yy")] + [InlineData("de-DE")] [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] - public void TestCreationWithMangledSortName(string cultureName) + public void TestWithResourceLookup(string cultureName) { - Assert.True(CultureInfo.GetCultureInfo(cultureName).CompareInfo.Name.Equals(cultureName, StringComparison.OrdinalIgnoreCase)); + RemoteExecutor.Invoke(name => { + CultureInfo.CurrentUICulture = CultureInfo.GetCultureInfo(name); + int Zero = 0; + + // This should go through the resource manager to get the localized exception message using the current UI culture + Assert.Throws(() => 1 / Zero); + }, cultureName).Dispose(); } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs index aa8cb4391d480e..d9b916ca21c471 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs @@ -13,6 +13,94 @@ internal sealed partial class CultureData private const int ICU_ULOC_KEYWORD_AND_VALUES_CAPACITY = 100; // max size of keyword or value private const int ICU_ULOC_FULLNAME_CAPACITY = 157; // max size of locale name + /// + /// This method process the locale name that ICU returns and converts it to the format that .NET expects. + /// + /// The locale name that ICU returns. + /// The original locale name that was passed to ICU. + /// The index of the extensions in the rawName. + /// The index of the collation in the name. + /// + /// BCP 47 specifications allow for extensions in the locale name, following the format language-script-region-extensions-collation. However, + /// not all extensions supported by ICU are supported in .NET. In the locale name, extensions are separated from the rest of the name using '-u-' or '-t-'. + /// In .NET, only the collation extension is supported. If the name includes a collation extension, it will be prefixed with '-u-co-'. + /// For example, en-US-u-co-search would be converted to the ICU name en_US@collation=co-search, which would then be translated to the .NET name en-US_search. + /// All extensions in the ICU names start with @. When normalizing the name to the .NET format, + /// we retain the extensions in the name to ensure differentiation between names with extensions and those without. + /// For example, we may have a name like en-US and en-US-u-xx. Although .NET doesn't support the extension xx, + /// we still include it in the name to distinguish it from the name without the extension. + /// + private static string NormalizeCultureName(string name, string rawName, int extensionsIndex, out int collationStart) + { + Debug.Assert(name is not null); + Debug.Assert(name.Length <= ICU_ULOC_FULLNAME_CAPACITY); + + collationStart = -1; + bool changed = false; + Span buffer = stackalloc char[ICU_ULOC_FULLNAME_CAPACITY]; + int bufferIndex = 0; + + for (int i = 0; i < name.Length && bufferIndex < ICU_ULOC_FULLNAME_CAPACITY; i++) + { + char c = name[i]; + if (c == '-' && i < name.Length - 1 && name[i + 1] == '-') + { + // ICU change the names like `qps_plocm` to `qps__plocm` + // The reason this occurs is because, while ICU canonicalizing, ulocimp_getCountry returns an empty string since the country code value is > 3 (rightly so). + // But append an extra '-' thinking that country code was in-fact appended (for the empty string value as well). + changed = true; + buffer[bufferIndex++] = '-'; + i++; + } + else if (c == '@') + { + changed = true; + + if (extensionsIndex > 0) // raw culture name has extensions -u- or -t- + { + int length = rawName.Length - extensionsIndex; + if (buffer.Length - bufferIndex >= length) + { + // append the extensions to the buffer to be part of the culture name + rawName.AsSpan(extensionsIndex).CopyTo(buffer.Slice(bufferIndex)); + bufferIndex += length; + } + } + + int collationIndex = name.IndexOf("collation=", i + 1, StringComparison.Ordinal); + if (collationIndex > 0) + { + collationIndex += "collation=".Length; + + // format of the locale properties is @key=value;collation=collationName;key=value;key=value + int endOfCollation = name.IndexOf(';', collationIndex); + if (endOfCollation < 0) + { + endOfCollation = name.Length; + } + + int length = endOfCollation - collationIndex; + if (buffer.Length - bufferIndex >= length + 1 ) + { + collationStart = bufferIndex; + buffer[bufferIndex++] = '_'; + name.AsSpan(collationIndex, length).CopyTo(buffer.Slice(bufferIndex)); + bufferIndex += length; + } + } + + // done getting all parts can be supported in the .NET culture names. + break; + } + else + { + buffer[bufferIndex++] = name[i]; + } + } + + return changed ? new (buffer.Slice(0, bufferIndex)) : name; + } + /// /// This method uses the sRealName field (which is initialized by the constructor before this is called) to /// initialize the rest of the state of CultureData based on the underlying OS globalization library. @@ -26,16 +114,15 @@ private bool InitIcuCultureDataCore() string realNameBuffer = _sRealName; // Basic validation - if (!IsValidCultureName(realNameBuffer, out var index)) + if (!IsValidCultureName(realNameBuffer, out var index, out int indexOfExtensions)) { return false; } // Replace _ (alternate sort) with @collation= for ICU - ReadOnlySpan alternateSortName = default; if (index > 0) { - alternateSortName = realNameBuffer.AsSpan(index + 1); + ReadOnlySpan alternateSortName = realNameBuffer.AsSpan(index + 1); realNameBuffer = string.Concat(realNameBuffer.AsSpan(0, index), ICU_COLLATION_KEYWORD, alternateSortName); } @@ -45,22 +132,9 @@ private bool InitIcuCultureDataCore() return false; // fail } - // Replace the ICU collation keyword with an _ Debug.Assert(_sWindowsName != null); - index = _sWindowsName.IndexOf(ICU_COLLATION_KEYWORD, StringComparison.Ordinal); - if (index >= 0) - { - // Use original culture name if alternateSortName is not set, which is possible even if the normalized - // culture name has "@collation=". - // "zh-TW-u-co-zhuyin" is a good example. The term "u-co-" means the following part will be the sort name - // and it will be treated in ICU as "zh-TW@collation=zhuyin". - _sName = alternateSortName.Length == 0 ? realNameBuffer : string.Concat(_sWindowsName.AsSpan(0, index), "_", alternateSortName); - } - else - { - _sName = _sWindowsName; - } - _sRealName = _sName; + + _sRealName = NormalizeCultureName(_sWindowsName, _sRealName, indexOfExtensions, out int collationStart); _iLanguage = LCID; if (_iLanguage == 0) @@ -69,11 +143,10 @@ private bool InitIcuCultureDataCore() } _bNeutral = TwoLetterISOCountryName.Length == 0; _sSpecificCulture = _bNeutral ? IcuLocaleData.GetSpecificCultureName(_sRealName) : _sRealName; + // Remove the sort from sName unless custom culture - if (index > 0 && !_bNeutral && !IsCustomCultureId(_iLanguage)) - { - _sName = _sWindowsName.Substring(0, index); - } + _sName = collationStart < 0 ? _sRealName : _sRealName.Substring(0, collationStart); + return true; } @@ -367,7 +440,7 @@ private static CultureInfo[] IcuEnumCultures(CultureTypes types) } bool enumNeutrals = (types & CultureTypes.NeutralCultures) != 0; - bool enumSpecificss = (types & CultureTypes.SpecificCultures) != 0; + bool enumSpecifics = (types & CultureTypes.SpecificCultures) != 0; List list = new List(); if (enumNeutrals) @@ -382,7 +455,7 @@ private static CultureInfo[] IcuEnumCultures(CultureTypes types) if (index + length <= bufferLength) { CultureInfo ci = CultureInfo.GetCultureInfo(new string(chars, index, length)); - if ((enumNeutrals && ci.IsNeutralCulture) || (enumSpecificss && !ci.IsNeutralCulture)) + if ((enumNeutrals && ci.IsNeutralCulture) || (enumSpecifics && !ci.IsNeutralCulture)) { list.Add(ci); } @@ -416,10 +489,14 @@ private static string IcuGetConsoleFallbackName(string cultureName) /// * Disallow input that starts or ends with '-' or '_'. /// * Disallow input that has any combination of consecutive '-' or '_'. /// * Disallow input that has multiple '_'. + /// + /// The IsValidCultureName method also identifies the presence of any extensions in the name (such as -u- or -t-) and returns the index of the extension. + /// This is necessary because we need to append the extensions to the name when normalizing it to the .NET format. /// - private static bool IsValidCultureName(string subject, out int indexOfUnderscore) + private static bool IsValidCultureName(string subject, out int indexOfUnderscore, out int indexOfExtensions) { indexOfUnderscore = -1; + indexOfExtensions = -1; if (subject.Length == 0) return true; // Invariant Culture if (subject.Length == 1 || subject.Length > LocaleNameMaxLength) return false; @@ -444,6 +521,16 @@ private static bool IsValidCultureName(string subject, out int indexOfUnderscore seenUnderscore = true; indexOfUnderscore = i; } + else + { + if (indexOfExtensions < 0 && i < subject.Length - 2 && (subject[i + 1] == 'u' || subject[i + 1] == 't') && subject[i + 2] == '-') // we have -u- or -t- which is an extension + { + if (subject[i + 1] == 't' || i >= subject.Length - 6 || subject[i + 3] != 'c' || subject[i + 4] != 'o' || subject[i + 5] != '-' ) // not -u-co- collation extension + { + indexOfExtensions = i; + } + } + } } else { From 39c2f3410e91bfc0d3e0e890f3a1c1799562157d Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Fri, 2 Jun 2023 18:42:24 -0700 Subject: [PATCH 2/6] Fix running on Windows --- .../tests/CultureInfo/CultureInfoCtor.cs | 21 ++++++++++++------- .../System/Globalization/CultureData.Icu.cs | 14 ++++++++----- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs b/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs index 49afdb21fd88ea..280fb7517f997c 100644 --- a/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs +++ b/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs @@ -435,14 +435,15 @@ public void TestCreationWithTemporaryLCID(int lcid) Assert.NotEqual(lcid, new CultureInfo(lcid).LCID); } + private static bool SupportRemoteExecutionWithIcu => RemoteExecutor.IsSupported && PlatformDetection.IsIcuGlobalization; + [InlineData("zh-TW-u-co-zhuyin", "zh-TW", "zh-TW_zhuyin")] - [InlineData("de-DE-u-co-phonebk", "de-DE", "de-DE_phonebook")] - [InlineData("de-u-co-phonebk", "de", "de_phonebook")] - [InlineData("de-DE-u-co-phonebk-u-xx", "de-DE-u-xx", "de-DE-u-xx_phonebook")] + [InlineData("de-DE-u-co-phonebk", "de-DE", "de-DE_phoneboo")] + [InlineData("de-DE-u-co-phonebk-u-xx", "de-DE-u-xx", "de-DE-u-xx_phoneboo")] [InlineData("de-DE-u-xx-u-co-phonebk", "de-DE-u-xx-u-co-phonebk", "de-DE-u-xx-u-co-phonebk")] - [InlineData("de-DE-t-xx-u-co-phonebk", "de-DE-t-xx-u-co-phonebk", "de-DE-t-xx-u-co-phonebk_phonebook")] - [InlineData("de-DE-u-co-phonebk-t-xx", "de-DE-t-xx", "de-DE-t-xx_phonebook")] - [InlineData("de-DE-u-co-phonebk-t-xx-u-yy", "de-DE-t-xx-u-yy", "de-DE-t-xx-u-yy_phonebook")] + [InlineData("de-DE-t-xx-u-co-phonebk", "de-DE-t-xx-u-co-phonebk", "de-DE-t-xx-u-co-phonebk_phoneboo")] + [InlineData("de-DE-u-co-phonebk-t-xx", "de-DE-t-xx", "de-DE-t-xx_phoneboo")] + [InlineData("de-DE-u-co-phonebk-t-xx-u-yy", "de-DE-t-xx-u-yy", "de-DE-t-xx-u-yy_phoneboo")] [InlineData("de-DE", "de-DE", "de-DE")] [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] public void TestCreationWithMangledSortName(string cultureName, string expectedCultureName, string expectedSortName) @@ -453,6 +454,13 @@ public void TestCreationWithMangledSortName(string cultureName, string expectedC Assert.Equal(expectedSortName, ci.CompareInfo.Name); } + [ConditionalFact(nameof(SupportRemoteExecutionWithIcu))] + public void TestNeutralCultureWithCollationName() + { + Assert.Throws(() => CultureInfo.GetCultureInfo("zh-u-co-zhuyin")); + Assert.Throws(() => CultureInfo.GetCultureInfo("de-u-co-phonebk")); + } + [InlineData("xx-u-XX", "xx-u-xx")] [InlineData("xx-u-XX-u-yy", "xx-u-xx-u-yy")] [InlineData("xx-t-ja-JP", "xx-t-ja-jp")] @@ -470,7 +478,6 @@ public void TestCreationWithICUNormalizedNames(string cultureName, string expect [InlineData("qps-plocm")] [InlineData("zh-TW-u-co-zhuyin")] [InlineData("de-DE-u-co-phonebk")] - [InlineData("de-u-co-phonebk")] [InlineData("de-DE-u-co-phonebk-u-xx")] [InlineData("de-DE-u-xx-u-co-phonebk")] [InlineData("de-DE-t-xx-u-co-phonebk")] diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs index d9b916ca21c471..1200d625c1a62b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs @@ -24,10 +24,9 @@ internal sealed partial class CultureData /// BCP 47 specifications allow for extensions in the locale name, following the format language-script-region-extensions-collation. However, /// not all extensions supported by ICU are supported in .NET. In the locale name, extensions are separated from the rest of the name using '-u-' or '-t-'. /// In .NET, only the collation extension is supported. If the name includes a collation extension, it will be prefixed with '-u-co-'. - /// For example, en-US-u-co-search would be converted to the ICU name en_US@collation=co-search, which would then be translated to the .NET name en-US_search. - /// All extensions in the ICU names start with @. When normalizing the name to the .NET format, - /// we retain the extensions in the name to ensure differentiation between names with extensions and those without. - /// For example, we may have a name like en-US and en-US-u-xx. Although .NET doesn't support the extension xx, + /// For example, en-US-u-co-search would be converted to the ICU name en_US@collation=search, which would then be translated to the .NET name en-US_search. + /// All extensions in the ICU names start with @. When normalizing the name to the .NET format, we retain the extensions in the name to ensure differentiation + /// between names with extensions and those without. For example, we may have a name like en-US and en-US-u-xx. Although .NET doesn't support the extension xx, /// we still include it in the name to distinguish it from the name without the extension. /// private static string NormalizeCultureName(string name, string rawName, int extensionsIndex, out int collationStart) @@ -79,7 +78,7 @@ private static string NormalizeCultureName(string name, string rawName, int exte endOfCollation = name.Length; } - int length = endOfCollation - collationIndex; + int length = Math.Min(8, endOfCollation - collationIndex); // Windows doesn't allow collation names longer than 8 characters if (buffer.Length - bufferIndex >= length + 1 ) { collationStart = bufferIndex; @@ -144,6 +143,11 @@ private bool InitIcuCultureDataCore() _bNeutral = TwoLetterISOCountryName.Length == 0; _sSpecificCulture = _bNeutral ? IcuLocaleData.GetSpecificCultureName(_sRealName) : _sRealName; + if (_bNeutral && collationStart > 0) + { + return false; // neutral cultures cannot have collation + } + // Remove the sort from sName unless custom culture _sName = collationStart < 0 ? _sRealName : _sRealName.Substring(0, collationStart); From 60ca9bb3a00d91d0141249caa7659aea20059378 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Sun, 4 Jun 2023 19:49:03 -0700 Subject: [PATCH 3/6] Fix the test on arm runs --- .../tests/CultureInfo/CultureInfoCtor.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs b/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs index 280fb7517f997c..329bf076d500c2 100644 --- a/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs +++ b/src/libraries/System.Globalization/tests/CultureInfo/CultureInfoCtor.cs @@ -435,8 +435,6 @@ public void TestCreationWithTemporaryLCID(int lcid) Assert.NotEqual(lcid, new CultureInfo(lcid).LCID); } - private static bool SupportRemoteExecutionWithIcu => RemoteExecutor.IsSupported && PlatformDetection.IsIcuGlobalization; - [InlineData("zh-TW-u-co-zhuyin", "zh-TW", "zh-TW_zhuyin")] [InlineData("de-DE-u-co-phonebk", "de-DE", "de-DE_phoneboo")] [InlineData("de-DE-u-co-phonebk-u-xx", "de-DE-u-xx", "de-DE-u-xx_phoneboo")] @@ -454,7 +452,7 @@ public void TestCreationWithMangledSortName(string cultureName, string expectedC Assert.Equal(expectedSortName, ci.CompareInfo.Name); } - [ConditionalFact(nameof(SupportRemoteExecutionWithIcu))] + [ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] public void TestNeutralCultureWithCollationName() { Assert.Throws(() => CultureInfo.GetCultureInfo("zh-u-co-zhuyin")); @@ -472,6 +470,8 @@ public void TestCreationWithICUNormalizedNames(string cultureName, string expect Assert.Equal(expectedCultureName, ci.Name); } + private static bool SupportRemoteExecutionWithIcu => RemoteExecutor.IsSupported && PlatformDetection.IsIcuGlobalization; + [InlineData("xx-u-XX")] [InlineData("xx-u-XX-u-yy")] [InlineData("xx-t-ja-JP")] @@ -484,7 +484,7 @@ public void TestCreationWithICUNormalizedNames(string cultureName, string expect [InlineData("de-DE-u-co-phonebk-t-xx")] [InlineData("de-DE-u-co-phonebk-t-xx-u-yy")] [InlineData("de-DE")] - [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] + [ConditionalTheory(nameof(SupportRemoteExecutionWithIcu))] public void TestWithResourceLookup(string cultureName) { RemoteExecutor.Invoke(name => { From 1ae110892d2d05653d0130c045af668cad9095ac Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 5 Jun 2023 10:46:47 -0700 Subject: [PATCH 4/6] Address the feedback --- .../System/Globalization/CultureData.Icu.cs | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs index 1200d625c1a62b..fd65909040d923 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs @@ -14,11 +14,10 @@ internal sealed partial class CultureData private const int ICU_ULOC_FULLNAME_CAPACITY = 157; // max size of locale name /// - /// This method process the locale name that ICU returns and converts it to the format that .NET expects. + /// Process the locale name that ICU returns and convert it to the format that .NET expects. /// /// The locale name that ICU returns. - /// The original locale name that was passed to ICU. - /// The index of the extensions in the rawName. + /// The extension part in the original culture name. /// The index of the collation in the name. /// /// BCP 47 specifications allow for extensions in the locale name, following the format language-script-region-extensions-collation. However, @@ -29,7 +28,7 @@ internal sealed partial class CultureData /// between names with extensions and those without. For example, we may have a name like en-US and en-US-u-xx. Although .NET doesn't support the extension xx, /// we still include it in the name to distinguish it from the name without the extension. /// - private static string NormalizeCultureName(string name, string rawName, int extensionsIndex, out int collationStart) + private static string NormalizeCultureName(string name, ReadOnlySpan extension, out int collationStart) { Debug.Assert(name is not null); Debug.Assert(name.Length <= ICU_ULOC_FULLNAME_CAPACITY); @@ -44,7 +43,7 @@ private static string NormalizeCultureName(string name, string rawName, int exte char c = name[i]; if (c == '-' && i < name.Length - 1 && name[i + 1] == '-') { - // ICU change the names like `qps_plocm` to `qps__plocm` + // ICU changes names like `qps_plocm` (one underscore) to `qps__plocm` (two underscores) // The reason this occurs is because, while ICU canonicalizing, ulocimp_getCountry returns an empty string since the country code value is > 3 (rightly so). // But append an extra '-' thinking that country code was in-fact appended (for the empty string value as well). changed = true; @@ -55,15 +54,9 @@ private static string NormalizeCultureName(string name, string rawName, int exte { changed = true; - if (extensionsIndex > 0) // raw culture name has extensions -u- or -t- + if (!extension.IsEmpty && extension.TryCopyTo(buffer.Slice(bufferIndex))) { - int length = rawName.Length - extensionsIndex; - if (buffer.Length - bufferIndex >= length) - { - // append the extensions to the buffer to be part of the culture name - rawName.AsSpan(extensionsIndex).CopyTo(buffer.Slice(bufferIndex)); - bufferIndex += length; - } + bufferIndex += extension.Length; } int collationIndex = name.IndexOf("collation=", i + 1, StringComparison.Ordinal); @@ -79,7 +72,7 @@ private static string NormalizeCultureName(string name, string rawName, int exte } int length = Math.Min(8, endOfCollation - collationIndex); // Windows doesn't allow collation names longer than 8 characters - if (buffer.Length - bufferIndex >= length + 1 ) + if (buffer.Length - bufferIndex >= length + 1) { collationStart = bufferIndex; buffer[bufferIndex++] = '_'; @@ -97,7 +90,7 @@ private static string NormalizeCultureName(string name, string rawName, int exte } } - return changed ? new (buffer.Slice(0, bufferIndex)) : name; + return changed ? new string(buffer.Slice(0, bufferIndex)) : name; } /// @@ -133,7 +126,7 @@ private bool InitIcuCultureDataCore() Debug.Assert(_sWindowsName != null); - _sRealName = NormalizeCultureName(_sWindowsName, _sRealName, indexOfExtensions, out int collationStart); + _sRealName = NormalizeCultureName(_sWindowsName, indexOfExtensions > 0 ? _sRealName.AsSpan(indexOfExtensions) : ReadOnlySpan.Empty, out int collationStart); _iLanguage = LCID; if (_iLanguage == 0) @@ -527,7 +520,7 @@ private static bool IsValidCultureName(string subject, out int indexOfUnderscore } else { - if (indexOfExtensions < 0 && i < subject.Length - 2 && (subject[i + 1] == 'u' || subject[i + 1] == 't') && subject[i + 2] == '-') // we have -u- or -t- which is an extension + if (indexOfExtensions < 0 && i < subject.Length - 2 && (subject[i + 1] is 'u' or 't') && subject[i + 2] == '-') // we have -u- or -t- which is an extension { if (subject[i + 1] == 't' || i >= subject.Length - 6 || subject[i + 3] != 'c' || subject[i + 4] != 'o' || subject[i + 5] != '-' ) // not -u-co- collation extension { From b69ae23193368b9edf967948724c877abe1eaf75 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 5 Jun 2023 11:05:01 -0700 Subject: [PATCH 5/6] Fix the comment. --- .../src/System/Globalization/CultureData.Icu.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs index fd65909040d923..36becea802aef0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs @@ -45,7 +45,8 @@ private static string NormalizeCultureName(string name, ReadOnlySpan exten { // ICU changes names like `qps_plocm` (one underscore) to `qps__plocm` (two underscores) // The reason this occurs is because, while ICU canonicalizing, ulocimp_getCountry returns an empty string since the country code value is > 3 (rightly so). - // But append an extra '-' thinking that country code was in-fact appended (for the empty string value as well). + // But append an extra '_' thinking that country code was in-fact appended (for the empty string value as well). + // Before processing, the name qps__plocm will be converted to its .NET name equivalent, which is qps--plocm. changed = true; buffer[bufferIndex++] = '-'; i++; From 4a413ab50c0dc7504a3b48b3919a42802a5b6554 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 5 Jun 2023 21:17:32 -0700 Subject: [PATCH 6/6] Define the max collation name length constant --- .../src/System/Globalization/CultureData.Icu.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs index 36becea802aef0..cfb829d617814e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CultureData.Icu.cs @@ -12,6 +12,7 @@ internal sealed partial class CultureData // ICU constants private const int ICU_ULOC_KEYWORD_AND_VALUES_CAPACITY = 100; // max size of keyword or value private const int ICU_ULOC_FULLNAME_CAPACITY = 157; // max size of locale name + private const int WINDOWS_MAX_COLLATION_NAME_LENGTH = 8; // max collation name length in the culture name /// /// Process the locale name that ICU returns and convert it to the format that .NET expects. @@ -72,7 +73,7 @@ private static string NormalizeCultureName(string name, ReadOnlySpan exten endOfCollation = name.Length; } - int length = Math.Min(8, endOfCollation - collationIndex); // Windows doesn't allow collation names longer than 8 characters + int length = Math.Min(WINDOWS_MAX_COLLATION_NAME_LENGTH, endOfCollation - collationIndex); // Windows doesn't allow collation names longer than 8 characters if (buffer.Length - bufferIndex >= length + 1) { collationStart = bufferIndex;