Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wasm][globalization] HybridGlobalization fix bug in change case #86799

Merged
merged 5 commits into from
May 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/design/features/hybrid-globalization.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ Affected public APIs:
- TextInfo.ToTitleCase.

Case change with invariant culture uses the `toUpperCase` / `toLowerCase` functions, which do not guarantee a full match with the original invariant culture.
Hybrid case change, same as ICU-based, does not support code point expansion, e.g. "straße" -> "STRAßE" (not "STRASSE").

- Final sigma behavior correction:

ICU-based case change does not respect the final-sigma rule, but the hybrid implementation does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ".

**String comparison**

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,17 @@ public static IEnumerable<object[]> ToLower_TestData()
// these sorts of expansions, since it would cause string lengths to change when cased,
// which is non-intuitive. In addition, there are some context sensitive mappings which
// we also don't perform.
// Greek Capital Letter Sigma (does not to case to U+03C2 with "final sigma" rule).
// Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule).
yield return new object[] { cultureName, "\u03A3", "\u03C3" };
if (PlatformDetection.IsHybridGlobalizationOnBrowser)
{
// JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
yield return new object[] { cultureName, "O\u03A3", "o\u03C2" };
}
else
{
yield return new object[] { cultureName, "O\u03A3", "o\u03C3" };
}
}

foreach (string cultureName in GetTestLocales())
Expand Down Expand Up @@ -393,7 +402,10 @@ public static IEnumerable<object[]> ToUpper_TestData()
// which is non-intuitive. In addition, there are some context sensitive mappings which
// we also don't perform.
// es-zed does not case to SS when uppercased.
yield return new object[] { cultureName, "\u00DF", "\u00DF" };
yield return new object[] { cultureName, "\u00DF", "\u00DF" };
yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
if (!PlatformDetection.IsNlsGlobalization)
yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };

// Ligatures do not expand when cased.
yield return new object[] { cultureName, "\uFB00", "\uFB00" };
Expand Down
143 changes: 133 additions & 10 deletions src/mono/wasm/runtime/hybrid-globalization/change-case.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,75 @@ import { monoStringToString, utf16ToStringLoop, stringToUTF16 } from "../strings
import { MonoObject, MonoObjectRef, MonoString, MonoStringRef } from "../types/internal";
import { Int32Ptr } from "../types/emscripten";
import { wrap_error_root, wrap_no_error_root } from "../invoke-js";
import { localHeapViewU16, setU16_local } from "../memory";

// Bounds of the UTF-16 surrogate code unit ranges, stored as one-character strings.
// High (leading) surrogates: U+D800..U+DBFF.
const SURROGATE_HIGHER_START = "\uD800";
const SURROGATE_HIGHER_END = "\uDBFF";
// Low (trailing) surrogates: U+DC00..U+DFFF.
const SURROGATE_LOWER_START = "\uDC00";
const SURROGATE_LOWER_END = "\uDFFF";

export function mono_wasm_change_case_invariant(src: number, srcLength: number, dst: number, dstLength: number, toUpper: number, is_exception: Int32Ptr, ex_address: MonoObjectRef): void {
const exceptionRoot = mono_wasm_new_external_root<MonoObject>(ex_address);
try {
const input = utf16ToStringLoop(src, src + 2 * srcLength);
let result = toUpper ? input.toUpperCase() : input.toLowerCase();
const result = toUpper ? input.toUpperCase() : input.toLowerCase();
// Unicode defines some codepoints which expand into multiple codepoints,
// originally we do not support this expansion
if (result.length > dstLength)
result = input;
stringToUTF16(dst, dst + 2 * dstLength, result);
wrap_no_error_root(is_exception, exceptionRoot);
if (result.length <= dstLength)
{
stringToUTF16(dst, dst + 2 * dstLength, result);
wrap_no_error_root(is_exception, exceptionRoot);
return;
}

// workaround to maintain the ICU-like behavior
const heapI16 = localHeapViewU16();
let jump = 1;
if (toUpper)
{
for (let i=0; i < input.length; i+=jump)
{
// surrogate parts have to enter ToUpper/ToLower together to give correct output
if (isSurrogate(input, i))
{
jump = 2;
const surrogate = input.substring(i, i+2);
const upperSurrogate = surrogate.toUpperCase();
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);

}
else
{
jump = 1;
const upperChar = input[i].toUpperCase();
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
}
}
}
else
{
for (let i=0; i < input.length; i+=jump)
{
if (isSurrogate(input, i))
{
jump = 2;
const surrogate = input.substring(i, i+2);
const upperSurrogate = surrogate.toLowerCase();
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);

}
else
{
jump = 1;
const upperChar = input[i].toLowerCase();
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
}
}
}
}
catch (ex: any) {
wrap_error_root(is_exception, ex, exceptionRoot);
Expand All @@ -35,11 +92,62 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
if (!cultureName)
throw new Error("Cannot change case, the culture name is null.");
const input = utf16ToStringLoop(src, src + 2 * srcLength);
let result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName);
if (result.length > dstLength)
result = input;
const result = toUpper ? input.toLocaleUpperCase(cultureName) : input.toLocaleLowerCase(cultureName);

if (result.length <= input.length)
{
stringToUTF16(dst, dst + 2 * dstLength, result);
wrap_no_error_root(is_exception, exceptionRoot);
return;
}
// workaround to maintain the ICU-like behavior
const heapI16 = localHeapViewU16();
let jump = 1;
if (toUpper)
{
for (let i=0; i < input.length; i+=jump)
{
// surrogate parts have to enter ToUpper/ToLower together to give correct output
if (isSurrogate(input, i))
{
jump = 2;
const surrogate = input.substring(i, i+2);
const upperSurrogate = surrogate.toLocaleUpperCase(cultureName);
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);

stringToUTF16(dst, dst + 2 * dstLength, result);
}
else
{
jump = 1;
const upperChar = input[i].toLocaleUpperCase(cultureName);
const appendedChar = upperChar.length > 1 ? input[i] : upperChar;
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
}
}
}
else
{
for (let i=0; i < input.length; i+=jump)
{
// surrogate parts have to enter ToUpper/ToLower together to give correct output
if (isSurrogate(input, i))
{
jump = 2;
const surrogate = input.substring(i, i+2);
const upperSurrogate = surrogate.toLocaleLowerCase(cultureName);
const appendedSurrogate = upperSurrogate.length > 2 ? surrogate : upperSurrogate;
appendSurrogateToMemory(heapI16, dst, appendedSurrogate, i);
}
else
{
jump = 1;
const lowerChar = input[i].toLocaleLowerCase(cultureName);
const appendedChar = lowerChar.length > 1 ? input[i] : lowerChar;
setU16_local(heapI16, dst + i*2, appendedChar.charCodeAt(0));
}
}
}
wrap_no_error_root(is_exception, exceptionRoot);
}
catch (ex: any) {
Expand All @@ -49,4 +157,19 @@ export function mono_wasm_change_case(culture: MonoStringRef, src: number, srcLe
cultureRoot.release();
exceptionRoot.release();
}
}
}

/**
 * Checks whether a complete UTF-16 surrogate pair starts at `startIdx`.
 *
 * Despite the name, a lone surrogate does not qualify: the code unit at
 * `startIdx` must be a high surrogate AND the following code unit must exist
 * and be a low surrogate.
 *
 * @param str string to inspect
 * @param startIdx index of the code unit where the pair would begin
 * @returns `true` only when `str[startIdx]` and `str[startIdx + 1]` form a pair
 */
function isSurrogate(str: string, startIdx: number): boolean {
    const lead = str[startIdx];
    // An out-of-range index yields `undefined`, for which both comparisons are false.
    if (!(SURROGATE_HIGHER_START <= lead && lead <= SURROGATE_HIGHER_END)) {
        return false;
    }

    // A high surrogate in the last position has no partner.
    if (!(startIdx + 1 < str.length)) {
        return false;
    }

    const trail = str[startIdx + 1];
    return SURROGATE_LOWER_START <= trail && trail <= SURROGATE_LOWER_END;
}

/**
 * Stores the first two UTF-16 code units of `surrogate` through `setU16_local`.
 *
 * The units are written at offsets `dst + idx * 2` and `dst + (idx + 1) * 2`,
 * i.e. two consecutive 16-bit slots starting at code unit index `idx`.
 *
 * @param heapI16 heap view handed to `setU16_local` (a `Uint16Array`, despite the name)
 * @param dst base offset of the destination buffer (presumably a byte address; confirm against `setU16_local`)
 * @param surrogate string whose code units 0 and 1 are written
 * @param idx code unit index within the destination at which the pair starts
 */
function appendSurrogateToMemory(heapI16: Uint16Array, dst: number, surrogate: string, idx: number) {
    const firstOffset = dst + idx * 2;
    const secondOffset = dst + (idx + 1) * 2;

    setU16_local(heapI16, firstOffset, surrogate.charCodeAt(0));
    setU16_local(heapI16, secondOffset, surrogate.charCodeAt(1));
}