Skip to content

Commit f8cb3c7

Browse files
authored
[OSX] HybridGlobalization Workaround for insufficiently sized destination buffer (#88184)
Workaround for insufficiently sized dest buffer
1 parent d406b73 commit f8cb3c7

File tree

3 files changed

+101
-52
lines changed

3 files changed

+101
-52
lines changed

docs/design/features/globalization-hybrid-mode.md

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -423,19 +423,3 @@ Below function are used from apple native functions:
423423
- [uppercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1413316-uppercasestringwithlocale?language=objc)
424424
- [lowercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1417298-lowercasestringwithlocale?language=objc)
425425

426-
Behavioural changes compared to ICU
427-
428-
- Final sigma behavior correction:
429-
430-
ICU-based case change does not respect final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ".
431-
432-
- The cases below will throw an exception because of an insufficiently sized destination buffer
433-
434-
- Capitalizing the German letter ß (sharp S) gives SS when using Apple native functions.
435-
436-
- Capitalizing ligatures gives a different result on Apple platforms, e.g. "\uFB00" (ff) uppercases to (FF)
437-
438-
- Capitalizing "\u0149" (ʼn) on Apple platforms returns combination of "\u02BC" (ʼ) and N -> (ʼN)
439-
440-
441-

src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs

Lines changed: 20 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -274,9 +274,9 @@ public static IEnumerable<object[]> ToLower_TestData()
274274
// we also don't perform.
275275
// Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule).
276276
yield return new object[] { cultureName, "\u03A3", "\u03C3" };
277-
if (PlatformDetection.IsHybridGlobalizationOnBrowser || PlatformDetection.IsHybridGlobalizationOnOSX)
277+
if (PlatformDetection.IsHybridGlobalizationOnBrowser)
278278
{
279-
// JS and Apple platforms are using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
279+
// JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
280280
yield return new object[] { cultureName, "O\u03A3", "o\u03C2" };
281281
}
282282
else
@@ -396,29 +396,24 @@ public static IEnumerable<object[]> ToUpper_TestData()
396396
// RAINBOW (outside the BMP and does not case)
397397
yield return new object[] { cultureName, "\U0001F308", "\U0001F308" };
398398

399-
if (!PlatformDetection.IsHybridGlobalizationOnOSX)
400-
{
401-
// Unicode defines some codepoints which expand into multiple codepoints
402-
// when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
403-
// these sorts of expansions, since it would cause string lengths to change when cased,
404-
// which is non-intuitive. In addition, there are some context sensitive mappings which
405-
// we also don't perform.
406-
// es-zed does not case to SS when uppercased.
407-
// on OSX, capitalizing the German letter ß (sharp S) gives SS
408-
yield return new object[] { cultureName, "\u00DF", "\u00DF" };
409-
yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
410-
if (!PlatformDetection.IsNlsGlobalization)
411-
yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
412-
413-
// Ligatures do not expand when cased.
414-
// on OSX, this is uppercase to "FF"
415-
yield return new object[] { cultureName, "\uFB00", "\uFB00" };
416-
417-
// Precomposed character with no uppercase variant, we don't want to "decompose" this
418-
// as part of casing.
419-
// on OSX, this is uppercased to "ʼN"
420-
yield return new object[] { cultureName, "\u0149", "\u0149" };
421-
}
399+
400+
// Unicode defines some codepoints which expand into multiple codepoints
401+
// when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
402+
// these sorts of expansions, since it would cause string lengths to change when cased,
403+
// which is non-intuitive. In addition, there are some context sensitive mappings which
404+
// we also don't perform.
405+
// es-zed does not case to SS when uppercased.
406+
yield return new object[] { cultureName, "\u00DF", "\u00DF" };
407+
yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
408+
if (!PlatformDetection.IsNlsGlobalization)
409+
yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
410+
411+
// Ligatures do not expand when cased.
412+
yield return new object[] { cultureName, "\uFB00", "\uFB00" };
413+
414+
// Precomposed character with no uppercase variant, we don't want to "decompose" this
415+
// as part of casing.
416+
yield return new object[] { cultureName, "\u0149", "\u0149" };
422417
}
423418

424419
// Turkish i

src/native/libs/System.Globalization.Native/pal_casing.m

Lines changed: 81 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,47 @@
99

1010
#if defined(TARGET_OSX) || defined(TARGET_MACCATALYST) || defined(TARGET_IOS) || defined(TARGET_TVOS)
1111

12+
/**
13+
* Is this code unit a lead surrogate (U+d800..U+dbff)?
14+
* @param c 16-bit code unit
15+
* @return true or false
16+
*/
17+
/* Clearing the low 10 bits leaves 0xd800 exactly for U+d800..U+dbff; c is evaluated once. */
#define IS_LEAD(c) (0xd800u == ((c) & 0xfffffc00u))
18+
19+
/**
20+
* Is this code unit a trail surrogate (U+dc00..U+dfff)?
21+
* @param c 16-bit code unit
22+
* @return true or false
23+
*/
24+
/* Clearing the low 10 bits leaves 0xdc00 exactly for U+dc00..U+dfff; c is evaluated once. */
#define IS_TRAIL(c) (0xdc00u == ((c) & 0xfffffc00u))
25+
26+
/**
27+
* Get a code point index from a string at a code point boundary offset,
28+
* and advance the offset to the next code point boundary.
29+
* (Post-incrementing forward iteration.)
30+
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
31+
*
32+
* The length can be negative for a NUL-terminated string.
33+
*
34+
* The offset may point to the lead surrogate unit
35+
* for a supplementary code point, in which case the macro also consumes
36+
* the following trail surrogate as well.
37+
* If the offset points to a trail surrogate or
38+
* to a single, unpaired lead surrogate, then only that unpaired surrogate is consumed.
39+
*
40+
* @param s const uint16_t* string
41+
* @param i in/out string offset, must be i<length
42+
* @param length string length
43+
*/
44+
/*
 * Advances offset i past exactly one code point of the UTF-16 string s:
 * two code units when s[i] is a lead surrogate immediately followed by a
 * trail surrogate, otherwise one code unit (unpaired surrogates included).
 *   s      - const uint16_t* string
 *   i      - in/out offset (modifiable lvalue); must be i<length on entry
 *   length - string length in code units; the unit after a lead surrogate
 *            is only examined when the advanced offset differs from length
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and stays correct inside an unbraced if/else. The temporary has
 * a macro-specific name so it cannot capture a caller argument spelled "c",
 * and no reserved (double-underscore) identifier is declared.
 */
#define NEXTOFFSET(s, i, length) do { \
    const uint16_t nextOffsetLeadUnit = (s)[(i)++]; \
    if (IS_LEAD(nextOffsetLeadUnit) && (i) != (length) && IS_TRAIL((s)[(i)])) { \
        ++(i); \
    } \
} while (0)
1253

1354
/**
1455
* Append a code point to a string, overwriting 1 or 2 code units.
@@ -46,6 +87,11 @@
4687
ChangeCaseNative
4788
4889
Performs upper or lower casing of a string into a new buffer, taking into account the specified locale.
90+
Two things we are considering here:
91+
1. Prohibiting code point expansions. Some code points expand into multiple code points when uppercased or lowercased, which may lead to an insufficient destination buffer.
92+
Instead, we prohibit these expansions and iterate through the string character by character, keeping the original character if casing would have expanded it.
93+
2. Properly handling surrogate pairs. A character can be comprised of more than one UTF-16 code unit
94+
(e.g. surrogate pairs like \uD801\uDC37). All code units of a character are needed to properly change its case.
4995
Returns 0 for success, non-zero on failure see ErrorCodes.
5096
*/
5197
int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t lNameLength,
@@ -61,15 +107,25 @@ int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t
61107
NSString *locName = [NSString stringWithCharacters: localeName length: lNameLength];
62108
currentLocale = [NSLocale localeWithLocaleIdentifier:locName];
63109
}
64-
NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength];
65-
NSString *result = bToUpper ? [source uppercaseStringWithLocale:currentLocale] : [source lowercaseStringWithLocale:currentLocale];
66110

67111
int32_t srcIdx = 0, dstIdx = 0, isError = 0;
68112
uint16_t dstCodepoint;
69-
while (srcIdx < result.length)
113+
while (srcIdx < cwSrcLength)
70114
{
71-
dstCodepoint = [result characterAtIndex:srcIdx++];
72-
Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
115+
int32_t startIndex = srcIdx;
116+
NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
117+
int32_t srcLength = srcIdx - startIndex;
118+
NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength];
119+
NSString *dst = bToUpper ? [src uppercaseStringWithLocale:currentLocale] : [src lowercaseStringWithLocale:currentLocale];
120+
int32_t index = 0;
121+
// iterate over all code units of the current character (two for a surrogate pair)
122+
while (index < srcLength)
123+
{
124+
// dst.length > srcLength means casing expanded the character; keep the original code units instead
125+
dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index];
126+
Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
127+
index++;
128+
}
73129
if (isError)
74130
return isError;
75131
}
@@ -81,19 +137,33 @@ int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t
81137
ChangeCaseInvariantNative
82138
83139
Performs upper or lower casing of a string into a new buffer.
140+
Two things we are considering here:
141+
1. Prohibiting code point expansions. Some code points expand into multiple code points when uppercased or lowercased, which may lead to an insufficient destination buffer.
142+
Instead, we prohibit these expansions and iterate through the string character by character, keeping the original character if casing would have expanded it.
143+
2. Properly handling surrogate pairs. A character can be comprised of more than one UTF-16 code unit
144+
(e.g. surrogate pairs like \uD801\uDC37). All code units of a character are needed to properly change its case.
84145
Returns 0 for success, non-zero on failure see ErrorCodes.
85146
*/
86147
int32_t GlobalizationNative_ChangeCaseInvariantNative(const uint16_t* lpSrc, int32_t cwSrcLength, uint16_t* lpDst, int32_t cwDstLength, int32_t bToUpper)
87148
{
88-
NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength];
89-
NSString *result = bToUpper ? source.uppercaseString : source.lowercaseString;
90-
91149
int32_t srcIdx = 0, dstIdx = 0, isError = 0;
92150
uint16_t dstCodepoint;
93-
while (srcIdx < result.length)
151+
while (srcIdx < cwSrcLength)
94152
{
95-
dstCodepoint = [result characterAtIndex:srcIdx++];
96-
Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
153+
int32_t startIndex = srcIdx;
154+
NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
155+
int32_t srcLength = srcIdx - startIndex;
156+
NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength];
157+
NSString *dst = bToUpper ? src.uppercaseString : src.lowercaseString;
158+
int32_t index = 0;
159+
// iterate over all code units of the current character (two for a surrogate pair)
160+
while (index < srcLength)
161+
{
162+
// dst.length > srcLength means casing expanded the character; keep the original code units instead
163+
dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index];
164+
Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
165+
index++;
166+
}
97167
if (isError)
98168
return isError;
99169
}

0 commit comments

Comments
 (0)