Skip to content

Commit

Permalink
Fix creating cultures with extensions in the name (#87114)
Browse files Browse the repository at this point in the history
  • Loading branch information
tarekgh authored Jun 6, 2023
1 parent 838968e commit 2248ebd
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 29 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using Microsoft.DotNet.RemoteExecutor;
using System.Collections.Generic;
using Xunit;

Expand Down Expand Up @@ -434,13 +435,65 @@ public void TestCreationWithTemporaryLCID(int lcid)
Assert.NotEqual(lcid, new CultureInfo(lcid).LCID);
}

[InlineData("zh-TW-u-co-zhuyin")]
[InlineData("de-DE-u-co-phoneb")]
[InlineData("de-u-co-phonebk")]
[InlineData("zh-TW-u-co-zhuyin", "zh-TW", "zh-TW_zhuyin")]
[InlineData("de-DE-u-co-phonebk", "de-DE", "de-DE_phoneboo")]
[InlineData("de-DE-u-co-phonebk-u-xx", "de-DE-u-xx", "de-DE-u-xx_phoneboo")]
[InlineData("de-DE-u-xx-u-co-phonebk", "de-DE-u-xx-u-co-phonebk", "de-DE-u-xx-u-co-phonebk")]
[InlineData("de-DE-t-xx-u-co-phonebk", "de-DE-t-xx-u-co-phonebk", "de-DE-t-xx-u-co-phonebk_phoneboo")]
[InlineData("de-DE-u-co-phonebk-t-xx", "de-DE-t-xx", "de-DE-t-xx_phoneboo")]
[InlineData("de-DE-u-co-phonebk-t-xx-u-yy", "de-DE-t-xx-u-yy", "de-DE-t-xx-u-yy_phoneboo")]
[InlineData("de-DE", "de-DE", "de-DE")]
[ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))]
public void TestCreationWithMangledSortName(string cultureName)
public void TestCreationWithMangledSortName(string cultureName, string expectedCultureName, string expectedSortName)
{
CultureInfo ci = CultureInfo.GetCultureInfo(cultureName);

Assert.Equal(expectedCultureName, ci.Name);
Assert.Equal(expectedSortName, ci.CompareInfo.Name);
}

[ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))]
public void TestNeutralCultureWithCollationName()
{
Assert.True(CultureInfo.GetCultureInfo(cultureName).CompareInfo.Name.Equals(cultureName, StringComparison.OrdinalIgnoreCase));
Assert.Throws<CultureNotFoundException>(() => CultureInfo.GetCultureInfo("zh-u-co-zhuyin"));
Assert.Throws<CultureNotFoundException>(() => CultureInfo.GetCultureInfo("de-u-co-phonebk"));
}

// Checks that CultureInfo.Name matches the expected normalized form for each input name.
[InlineData("xx-u-XX", "xx-u-xx")]
[InlineData("xx-u-XX-u-yy", "xx-u-xx-u-yy")]
[InlineData("xx-t-ja-JP", "xx-t-ja-jp")]
[InlineData("qps-plocm", "qps-PLOCM")] // ICU normalizes this name to "qps--plocm", which we then normalize back to the single-dash form
[ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))]
public void TestCreationWithICUNormalizedNames(string cultureName, string expectedCultureName)
{
    // Look the culture up by the raw name and read back the name it reports.
    string actualCultureName = CultureInfo.GetCultureInfo(cultureName).Name;

    Assert.Equal(expectedCultureName, actualCultureName);
}

// Condition used by tests that need both RemoteExecutor support and ICU globalization.
private static bool SupportRemoteExecutionWithIcu => RemoteExecutor.IsSupported && PlatformDetection.IsIcuGlobalization;

// Sets each culture as the current UI culture inside a remote invocation and triggers
// a runtime exception there, so the exception message lookup runs under that culture.
[InlineData("xx-u-XX")]
[InlineData("xx-u-XX-u-yy")]
[InlineData("xx-t-ja-JP")]
[InlineData("qps-plocm")]
[InlineData("zh-TW-u-co-zhuyin")]
[InlineData("de-DE-u-co-phonebk")]
[InlineData("de-DE-u-co-phonebk-u-xx")]
[InlineData("de-DE-u-xx-u-co-phonebk")]
[InlineData("de-DE-t-xx-u-co-phonebk")]
[InlineData("de-DE-u-co-phonebk-t-xx")]
[InlineData("de-DE-u-co-phonebk-t-xx-u-yy")]
[InlineData("de-DE")]
[ConditionalTheory(nameof(SupportRemoteExecutionWithIcu))]
public void TestWithResourceLookup(string cultureName)
{
    // The handle is disposed when the method scope ends, exactly as the chained Dispose() call did.
    using var remoteRun = RemoteExecutor.Invoke(name =>
    {
        CultureInfo uiCulture = CultureInfo.GetCultureInfo(name);
        CultureInfo.CurrentUICulture = uiCulture;

        // A non-constant zero, so the division below is evaluated at run time.
        int divisor = 0;

        // Expected to go through the resource manager to fetch the localized exception
        // message for the current UI culture.
        Assert.Throws<DivideByZeroException>(() => 1 / divisor);
    }, cultureName);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,88 @@ internal sealed partial class CultureData
// ICU constants
private const int ICU_ULOC_KEYWORD_AND_VALUES_CAPACITY = 100; // max size of keyword or value
private const int ICU_ULOC_FULLNAME_CAPACITY = 157; // max size of locale name
private const int WINDOWS_MAX_COLLATION_NAME_LENGTH = 8; // max collation name length in the culture name

/// <summary>
/// Converts a locale name as returned by ICU into the culture-name format .NET uses.
/// </summary>
/// <param name="name">The locale name that ICU returns.</param>
/// <param name="extension">The extension part of the original culture name; copied into the result when an '@' is encountered in <paramref name="name"/>.</param>
/// <param name="collationStart">
/// On return, the index in the result of the '_' that introduces the collation name, or -1 when no collation was appended.
/// </param>
/// <returns>
/// The normalized name, or <paramref name="name"/> itself when nothing had to be rewritten.
/// </returns>
/// <remarks>
/// BCP 47 allows extensions in a locale name, in the form language-script-region-extensions-collation, and the
/// extensions are separated from the rest of the name by '-u-' or '-t-'. ICU supports extensions that .NET does not;
/// .NET only supports the collation extension, which is introduced by '-u-co-'.
/// For example, en-US-u-co-search becomes the ICU name en_US@collation=search, which in turn maps to the .NET name en-US_search.
/// Every extension in an ICU name starts with '@'. The extensions from the original name are kept in the normalized
/// result so that a name with extensions stays distinguishable from one without: en-US and en-US-u-xx remain
/// different names even though .NET does not support the xx extension.
/// </remarks>
private static string NormalizeCultureName(string name, ReadOnlySpan<char> extension, out int collationStart)
{
    const string CollationKey = "collation=";

    Debug.Assert(name is not null);
    Debug.Assert(name.Length <= ICU_ULOC_FULLNAME_CAPACITY);

    collationStart = -1;

    Span<char> normalized = stackalloc char[ICU_ULOC_FULLNAME_CAPACITY];
    int written = 0;
    bool modified = false;

    int pos = 0;
    while (pos < name.Length && written < ICU_ULOC_FULLNAME_CAPACITY)
    {
        char current = name[pos];

        if (current == '@')
        {
            // Start of the ICU keyword section: everything from here on is rebuilt, never copied verbatim.
            modified = true;

            // Re-attach the extension text from the original name, if there is any and it fits.
            if (!extension.IsEmpty && extension.TryCopyTo(normalized.Slice(written)))
            {
                written += extension.Length;
            }

            // The locale properties have the form @key=value;collation=collationName;key=value;key=value
            int valueStart = name.IndexOf(CollationKey, pos + 1, StringComparison.Ordinal);
            if (valueStart > 0)
            {
                valueStart += CollationKey.Length;

                int valueEnd = name.IndexOf(';', valueStart);
                if (valueEnd < 0)
                {
                    // Collation is the last property, so its value runs to the end of the name.
                    valueEnd = name.Length;
                }

                // Windows doesn't allow collation names longer than 8 characters, so truncate to that limit.
                int collationLength = Math.Min(WINDOWS_MAX_COLLATION_NAME_LENGTH, valueEnd - valueStart);

                // Only append when both the '_' separator and the collation name fit.
                if (normalized.Length - written >= collationLength + 1)
                {
                    collationStart = written;
                    normalized[written++] = '_';
                    name.AsSpan(valueStart, collationLength).CopyTo(normalized.Slice(written));
                    written += collationLength;
                }
            }

            // Nothing after the keyword section can be represented in a .NET culture name.
            break;
        }

        if (current == '-' && pos + 1 < name.Length && name[pos + 1] == '-')
        {
            // ICU turns a name like `qps_plocm` (one underscore) into `qps__plocm` (two underscores): during
            // canonicalization ulocimp_getCountry returns an empty string because the country code is longer than
            // 3 characters, yet a separator is still appended as if a country code had been written.
            // By the time we get here that name has been converted to its .NET form, qps--plocm, so the doubled
            // dash is collapsed back to a single one by skipping the second dash.
            modified = true;
            pos++;
        }

        normalized[written++] = current;
        pos++;
    }

    return modified ? new string(normalized.Slice(0, written)) : name;
}

/// <summary>
/// This method uses the sRealName field (which is initialized by the constructor before this is called) to
Expand All @@ -26,16 +108,15 @@ private bool InitIcuCultureDataCore()
string realNameBuffer = _sRealName;

// Basic validation
if (!IsValidCultureName(realNameBuffer, out var index))
if (!IsValidCultureName(realNameBuffer, out var index, out int indexOfExtensions))
{
return false;
}

// Replace _ (alternate sort) with @collation= for ICU
ReadOnlySpan<char> alternateSortName = default;
if (index > 0)
{
alternateSortName = realNameBuffer.AsSpan(index + 1);
ReadOnlySpan<char> alternateSortName = realNameBuffer.AsSpan(index + 1);
realNameBuffer = string.Concat(realNameBuffer.AsSpan(0, index), ICU_COLLATION_KEYWORD, alternateSortName);
}

Expand All @@ -45,22 +126,9 @@ private bool InitIcuCultureDataCore()
return false; // fail
}

// Replace the ICU collation keyword with an _
Debug.Assert(_sWindowsName != null);
index = _sWindowsName.IndexOf(ICU_COLLATION_KEYWORD, StringComparison.Ordinal);
if (index >= 0)
{
// Use original culture name if alternateSortName is not set, which is possible even if the normalized
// culture name has "@collation=".
// "zh-TW-u-co-zhuyin" is a good example. The term "u-co-" means the following part will be the sort name
// and it will be treated in ICU as "zh-TW@collation=zhuyin".
_sName = alternateSortName.Length == 0 ? realNameBuffer : string.Concat(_sWindowsName.AsSpan(0, index), "_", alternateSortName);
}
else
{
_sName = _sWindowsName;
}
_sRealName = _sName;

_sRealName = NormalizeCultureName(_sWindowsName, indexOfExtensions > 0 ? _sRealName.AsSpan(indexOfExtensions) : ReadOnlySpan<char>.Empty, out int collationStart);

_iLanguage = LCID;
if (_iLanguage == 0)
Expand All @@ -69,11 +137,15 @@ private bool InitIcuCultureDataCore()
}
_bNeutral = TwoLetterISOCountryName.Length == 0;
_sSpecificCulture = _bNeutral ? IcuLocaleData.GetSpecificCultureName(_sRealName) : _sRealName;
// Remove the sort from sName unless custom culture
if (index > 0 && !_bNeutral && !IsCustomCultureId(_iLanguage))

if (_bNeutral && collationStart > 0)
{
_sName = _sWindowsName.Substring(0, index);
return false; // neutral cultures cannot have collation
}

// Remove the collation (sort) suffix, if any, from sName
_sName = collationStart < 0 ? _sRealName : _sRealName.Substring(0, collationStart);

return true;
}

Expand Down Expand Up @@ -367,7 +439,7 @@ private static CultureInfo[] IcuEnumCultures(CultureTypes types)
}

bool enumNeutrals = (types & CultureTypes.NeutralCultures) != 0;
bool enumSpecificss = (types & CultureTypes.SpecificCultures) != 0;
bool enumSpecifics = (types & CultureTypes.SpecificCultures) != 0;

List<CultureInfo> list = new List<CultureInfo>();
if (enumNeutrals)
Expand All @@ -382,7 +454,7 @@ private static CultureInfo[] IcuEnumCultures(CultureTypes types)
if (index + length <= bufferLength)
{
CultureInfo ci = CultureInfo.GetCultureInfo(new string(chars, index, length));
if ((enumNeutrals && ci.IsNeutralCulture) || (enumSpecificss && !ci.IsNeutralCulture))
if ((enumNeutrals && ci.IsNeutralCulture) || (enumSpecifics && !ci.IsNeutralCulture))
{
list.Add(ci);
}
Expand Down Expand Up @@ -416,10 +488,14 @@ private static string IcuGetConsoleFallbackName(string cultureName)
/// * Disallow input that starts or ends with '-' or '_'.
/// * Disallow input that has any combination of consecutive '-' or '_'.
/// * Disallow input that has multiple '_'.
///
/// The IsValidCultureName method also identifies the presence of any extensions in the name (such as -u- or -t-) and returns the index of the extension.
/// This is necessary because we need to append the extensions to the name when normalizing it to the .NET format.
/// </remarks>
private static bool IsValidCultureName(string subject, out int indexOfUnderscore)
private static bool IsValidCultureName(string subject, out int indexOfUnderscore, out int indexOfExtensions)
{
indexOfUnderscore = -1;
indexOfExtensions = -1;

if (subject.Length == 0) return true; // Invariant Culture
if (subject.Length == 1 || subject.Length > LocaleNameMaxLength) return false;
Expand All @@ -444,6 +520,16 @@ private static bool IsValidCultureName(string subject, out int indexOfUnderscore
seenUnderscore = true;
indexOfUnderscore = i;
}
else
{
if (indexOfExtensions < 0 && i < subject.Length - 2 && (subject[i + 1] is 'u' or 't') && subject[i + 2] == '-') // we have -u- or -t- which is an extension
{
if (subject[i + 1] == 't' || i >= subject.Length - 6 || subject[i + 3] != 'c' || subject[i + 4] != 'o' || subject[i + 5] != '-' ) // not -u-co- collation extension
{
indexOfExtensions = i;
}
}
}
}
else
{
Expand Down

0 comments on commit 2248ebd

Please sign in to comment.