From 917dc0b394f9c0bf2c604134db4bb6aefc150311 Mon Sep 17 00:00:00 2001 From: Ilona Tomkowicz Date: Fri, 26 May 2023 17:17:56 +0200 Subject: [PATCH 1/4] Fix + test. --- .../System/Globalization/TextInfoTests.cs | 1 + .../hybrid-globalization/change-case.ts | 66 +++++++++++++++---- 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs index 11b4dd5249843e..dba868fbda550c 100644 --- a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs +++ b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs @@ -394,6 +394,7 @@ public static IEnumerable ToUpper_TestData() // we also don't preform. // es-zed does not case to SS when uppercased. yield return new object[] { cultureName, "\u00DF", "\u00DF" }; + yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" }; // Ligatures do not expand when cased. yield return new object[] { cultureName, "\uFB00", "\uFB00" }; diff --git a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts index fddd80e9823829..7a559097cb66c0 100644 --- a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts +++ b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts @@ -13,15 +13,37 @@ export function mono_wasm_change_case_invariant(src: number, srcLength: number, const exceptionRoot = mono_wasm_new_external_root(ex_address); try{ const input = get_utf16_string(src, srcLength); - let result = toUpper ? input.toUpperCase() : input.toLowerCase(); + const result = toUpper ? input.toUpperCase() : input.toLowerCase(); + // Unicode defines some codepoints which expand into multiple codepoints, // originally we do not support this expansion - if (result.length > dstLength) - result = input; + if (result.length <= dstLength) + { + for (let i = 0; i < result.length; i++) + setU16(dst + i*2, result.charCodeAt(i)); + wrap_no_error_root(is_exception, exceptionRoot); + return; + } - for (let i = 0; i < result.length; i++) - setU16(dst + i*2, result.charCodeAt(i)); - wrap_no_error_root(is_exception, exceptionRoot); + // workaround to maintain the ICU-like behavior + if (toUpper) + { + for (let i=0; i < input.length; i++) + { + const upperChar = input[i].toUpperCase(); + const appendedChar = upperChar.length > 1 ? input[i] : upperChar; + setU16(dst + i*2, appendedChar.charCodeAt(0)); + } + } + else + { + for (let i=0; i < input.length; i++) + { + const lowerChar = input[i].toLowerCase(); + const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar; + setU16(dst + i*2, appendedChar.charCodeAt(0)); + } + } } catch (ex: any) { wrap_error_root(is_exception, ex, exceptionRoot); @@ -39,12 +61,34 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe if (!cultureName) throw new Error("Cannot change case, the culture name is null."); const input = get_utf16_string(src, srcLength); - let result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName); - if (result.length > destLength) - result = input; + const result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName); - for (let i = 0; i < destLength; i++) - setU16(dst + i*2, result.charCodeAt(i)); + if (result.length <= destLength) + { + for (let i = 0; i < result.length; i++) + setU16(dst + i*2, result.charCodeAt(i)); + wrap_no_error_root(is_exception, exceptionRoot); + return; + } + // workaround to maintain the ICU-like behavior + if (toUpper) + { + for (let i=0; i < input.length; i++) + { + const upperChar = input[i].toLocaleUpperCase(cultureName); + const appendedChar = upperChar.length > 1 ? input[i] : upperChar; + setU16(dst + i*2, appendedChar.charCodeAt(0)); + } + } + else + { + for (let i=0; i < input.length; i++) + { + const lowerChar = input[i].toLocaleLowerCase(cultureName); + const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar; + setU16(dst + i*2, appendedChar.charCodeAt(0)); + } + } wrap_no_error_root(is_exception, exceptionRoot); } catch (ex: any) { From 027882a513770bb031995711651c45279d4c9296 Mon Sep 17 00:00:00 2001 From: Ilona Tomkowicz Date: Mon, 29 May 2023 11:03:32 +0200 Subject: [PATCH 2/4] Fix surrogates problem, document final sigma. --- docs/design/features/hybrid-globalization.md | 5 + .../System/Globalization/TextInfoTests.cs | 12 +- .../hybrid-globalization/change-case.ts | 108 +++++++++++++++--- 3 files changed, 108 insertions(+), 17 deletions(-) diff --git a/docs/design/features/hybrid-globalization.md b/docs/design/features/hybrid-globalization.md index 916bada554ebaa..5d08676ea642e4 100644 --- a/docs/design/features/hybrid-globalization.md +++ b/docs/design/features/hybrid-globalization.md @@ -27,6 +27,11 @@ Affected public APIs: - TextInfo.ToTitleCase. Case change with invariant culture uses `toUpperCase` / `toLoweCase` functions that do not guarantee a full match with the original invariant culture. +Hybrid case change, same as ICU-based, does not support code points expansion e.g. "straße" -> "STRAßE". + +- Final sigma behavior correction: + +ICU-based case change does not respect final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ". **String comparison** diff --git a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs index dba868fbda550c..180820e5de32bb 100644 --- a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs +++ b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs @@ -272,8 +272,17 @@ public static IEnumerable ToLower_TestData() // these sorts of expansions, since it would cause string lengths to change when cased, // which is non-intuitive. In addition, there are some context sensitive mappings which // we also don't preform. - // Greek Capital Letter Sigma (does not to case to U+03C2 with "final sigma" rule). + // Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule). yield return new object[] { cultureName, "\u03A3", "\u03C3" }; + if (PlatformDetection.IsHybridGlobalizationOnBrowser) + { + // JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior + yield return new object[] { cultureName, "O\u03A3", "o\u03C2" }; + } + else + { + yield return new object[] { cultureName, "O\u03A3", "o\u03C3" }; + } } foreach (string cultureName in GetTestLocales()) @@ -395,6 +404,7 @@ public static IEnumerable ToUpper_TestData() // es-zed does not case to SS when uppercased. yield return new object[] { cultureName, "\u00DF", "\u00DF" }; yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" }; + yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" }; // Ligatures do not expand when cased. yield return new object[] { cultureName, "\uFB00", "\uFB00" }; diff --git a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts index d505697152f598..8be96d8171d7b3 100644 --- a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts +++ b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts @@ -8,6 +8,11 @@ import { Int32Ptr } from "../types/emscripten"; import { wrap_error_root, wrap_no_error_root } from "../invoke-js"; import { localHeapViewU16, setU16_local } from "../memory"; +const SURROGATE_HIGHER_START = "\uD800"; +const SURROGATE_HIGHER_END = "\uDBFF"; +const SURROGATE_LOWER_START = "\uDC00"; +const SURROGATE_LOWER_END = "\uDFFF"; + export function mono_wasm_change_case_invariant(src: number, srcLength: number, dst: number, dstLength: number, toUpper: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): void { const exceptionRoot = mono_wasm_new_external_root(ex_address); try { @@ -24,22 +29,50 @@ export function mono_wasm_change_case_invariant(src: number, srcLength: number, // workaround to maintain the ICU-like behavior const heapI16 = localHeapViewU16(); + let jump = 1; if (toUpper) { - for (let i=0; i < input.length; i++) + for (let i=0; i < input.length; i+=jump) { - const upperChar = input[i].toUpperCase(); - const appendedChar = upperChar.length > 1 ? input[i] : upperChar; - setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + // surrogate parts have to enter ToUpper/ToLower together to give correct output + if (IsSurrogate(input, i)) + { + jump = 2; + const surrogate = input.substring(i, i+2); + const upperSurrogate = surrogate.toUpperCase(); + const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; + AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + + } + else + { + jump = 1; + const upperChar = input[i].toUpperCase(); + const appendedChar = upperChar.length > 1 ? input[i] : upperChar; + setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + } } } else { - for (let i=0; i < input.length; i++) + for (let i=0; i < input.length; i+=jump) { - const lowerChar = input[i].toLowerCase(); - const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar; - setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + if (IsSurrogate(input, i)) + { + jump = 2; + const surrogate = input.substring(i, i+2); + const upperSurrogate = surrogate.toLowerCase(); + const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; + AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + + } + else + { + jump = 1; + const upperChar = input[i].toLowerCase(); + const appendedChar = upperChar.length > 1 ? input[i] : upperChar; + setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + } } } } @@ -69,22 +102,50 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe } // workaround to maintain the ICU-like behavior const heapI16 = localHeapViewU16(); + let jump = 1; if (toUpper) { - for (let i=0; i < input.length; i++) + for (let i=0; i < input.length; i+=jump) { - const upperChar = input[i].toLocaleUpperCase(cultureName); - const appendedChar = upperChar.length > 1 ? input[i] : upperChar; - setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + // surrogate parts have to enter ToUpper/ToLower together to give correct output + if (IsSurrogate(input, i)) + { + jump = 2; + const surrogate = input.substring(i, i+2); + const upperSurrogate = surrogate.toLocaleUpperCase(cultureName); + const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; + AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + + } + else + { + jump = 1; + const upperChar = input[i].toLocaleUpperCase(cultureName); + const appendedChar = upperChar.length > 1 ? input[i] : upperChar; + setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + } } } else { - for (let i=0; i < input.length; i++) + for (let i=0; i < input.length; i+=jump) { - const lowerChar = input[i].toLocaleLowerCase(cultureName); - const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar; - setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + // surrogate parts have to enter ToUpper/ToLower together to give correct output + if (IsSurrogate(input, i)) + { + jump = 2; + const surrogate = input.substring(i, i+2); + const upperSurrogate = surrogate.toLocaleLowerCase(cultureName); + const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; + AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + } + else + { + jump = 1; + const lowerChar = input[i].toLocaleLowerCase(cultureName); + const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar; + setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0)); + } } } wrap_no_error_root(is_exception, exceptionRoot); @@ -97,3 +158,18 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe exceptionRoot.release(); } } + +function IsSurrogate(str: string, startIdx: number) : boolean +{ + return SURROGATE_HIGHER_START <= str[startIdx] && + str[startIdx] <= SURROGATE_HIGHER_END && + startIdx+1 < str.length && + SURROGATE_LOWER_START <= str[startIdx+1] && + str[startIdx+1] <= SURROGATE_LOWER_END; +} + +function AppendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number) +{ + setU16_local(heapI16, dst + idx*2, surrogate.charCodeAt(0)); + setU16_local(heapI16, dst + (idx+1)*2, surrogate.charCodeAt(1)); +} From c9c058285f0874f194cbbd35defdce548ac7b929 Mon Sep 17 00:00:00 2001 From: Ilona Tomkowicz <32700855+ilonatommy@users.noreply.github.com> Date: Mon, 29 May 2023 15:52:44 +0200 Subject: [PATCH 3/4] Update change-case.ts Feedback --- .../wasm/runtime/hybrid-globalization/change-case.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts index 8be96d8171d7b3..c6bca7ee4407a9 100644 --- a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts +++ b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts @@ -35,7 +35,7 @@ export function mono_wasm_change_case_invariant(src: number, srcLength: number, for (let i=0; i < input.length; i+=jump) { // surrogate parts have to enter ToUpper/ToLower together to give correct output - if (IsSurrogate(input, i)) + if (isSurrogate(input, i)) { jump = 2; const surrogate = input.substring(i, i+2); @@ -57,7 +57,7 @@ export function mono_wasm_change_case_invariant(src: number, srcLength: number, { for (let i=0; i < input.length; i+=jump) { - if (IsSurrogate(input, i)) + if (isSurrogate(input, i)) { jump = 2; const surrogate = input.substring(i, i+2); @@ -108,7 +108,7 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe for (let i=0; i < input.length; i+=jump) { // surrogate parts have to enter ToUpper/ToLower together to give correct output - if (IsSurrogate(input, i)) + if (isSurrogate(input, i)) { jump = 2; const surrogate = input.substring(i, i+2); @@ -131,7 +131,7 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe for (let i=0; i < input.length; i+=jump) { // surrogate parts have to enter ToUpper/ToLower together to give correct output - if (IsSurrogate(input, i)) + if (isSurrogate(input, i)) { jump = 2; const surrogate = input.substring(i, i+2); @@ -159,7 +159,7 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe } } -function IsSurrogate(str: string, startIdx: number) : boolean +function isSurrogate(str: string, startIdx: number) : boolean { return SURROGATE_HIGHER_START <= str[startIdx] && str[startIdx] <= SURROGATE_HIGHER_END && From bfc65130e24b96453c71c35b7e592740b8532dc7 Mon Sep 17 00:00:00 2001 From: Ilona Tomkowicz <32700855+ilonatommy@users.noreply.github.com> Date: Mon, 29 May 2023 15:07:05 +0000 Subject: [PATCH 4/4] Fix NLS + feedback --- .../tests/System/Globalization/TextInfoTests.cs | 5 +++-- .../wasm/runtime/hybrid-globalization/change-case.ts | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs index 180820e5de32bb..fcc4c53f5e4b6f 100644 --- a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs +++ b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs @@ -402,9 +402,10 @@ public static IEnumerable ToUpper_TestData() // which is non-intuitive. In addition, there are some context sensitive mappings which // we also don't preform. // es-zed does not case to SS when uppercased. - yield return new object[] { cultureName, "\u00DF", "\u00DF" }; + yield return new object[] { cultureName, "\u00DF", "\u00DF" }; yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" }; - yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" }; + if (!PlatformDetection.IsNlsGlobalization) + yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" }; // Ligatures do not expand when cased. yield return new object[] { cultureName, "\uFB00", "\uFB00" }; diff --git a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts index c6bca7ee4407a9..ea6fa2dfa181cd 100644 --- a/src/mono/wasm/runtime/hybrid-globalization/change-case.ts +++ b/src/mono/wasm/runtime/hybrid-globalization/change-case.ts @@ -41,7 +41,7 @@ export function mono_wasm_change_case_invariant(src: number, srcLength: number, const surrogate = input.substring(i, i+2); const upperSurrogate = surrogate.toUpperCase(); const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; - AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); } else @@ -63,7 +63,7 @@ export function mono_wasm_change_case_invariant(src: number, srcLength: number, const surrogate = input.substring(i, i+2); const upperSurrogate = surrogate.toLowerCase(); const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; - AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); } else @@ -114,7 +114,7 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe const surrogate = input.substring(i, i+2); const upperSurrogate = surrogate.toLocaleUpperCase(cultureName); const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; - AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); } else @@ -137,7 +137,7 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe const surrogate = input.substring(i, i+2); const upperSurrogate = surrogate.toLocaleLowerCase(cultureName); const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate; - AppendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); + appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i); } else { @@ -168,7 +168,7 @@ function isSurrogate(str: string, startIdx: number) : boolean str[startIdx+1] <= SURROGATE_LOWER_END; } -function AppendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number) +function appendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number) { setU16_local(heapI16, dst + idx*2, surrogate.charCodeAt(0)); setU16_local(heapI16, dst + (idx+1)*2, surrogate.charCodeAt(1));