[OSX] HybridGlobalization Workaround for insufficiently sized destination buffer (#88184)

mkhamoyan · web-flow · commit f8cb3c760030 · 2023-07-04T10:20:43.000+04:00
Workaround for insufficiently sized dest buffer
diff --git a/docs/design/features/globalization-hybrid-mode.md b/docs/design/features/globalization-hybrid-mode.md
@@ -423,19 +423,3 @@ Below function are used from apple native functions:
 - [uppercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1413316-uppercasestringwithlocale?language=objc)
 - [lowercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1417298-lowercasestringwithlocale?language=objc)
 
-Behavioural changes compared to ICU
-
-   - Final sigma behavior correction:
-
-     ICU-based case change does not respect final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ".
-
-   - Below cases will throw exception because of insufficiently sized destination buffer
-
-      - Capitalizing the German letter ß (sharp S) gives SS when using Apple native functions.
-
-      - Capitalizing ligatures gives different result on Apple platforms, eg. "\uFB00" (ﬀ) uppercase (FF)
-
-      - Capitalizing "\u0149" (ŉ) on Apple platforms returns combination of  "\u02BC" (ʼ) and N -> (ʼN)
-
-
-
diff --git a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs
@@ -274,9 +274,9 @@ public static IEnumerable<object[]> ToLower_TestData()
                 // we also don't preform.
                 // Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule).
                 yield return new object[] { cultureName, "\u03A3", "\u03C3" };
-                if (PlatformDetection.IsHybridGlobalizationOnBrowser || PlatformDetection.IsHybridGlobalizationOnOSX)
+                if (PlatformDetection.IsHybridGlobalizationOnBrowser)
                 {
-                    // JS and Apple platforms are using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
+                    // JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior
                     yield return new object[] { cultureName, "O\u03A3", "o\u03C2" };
                 }
                 else
@@ -396,29 +396,24 @@ public static IEnumerable<object[]> ToUpper_TestData()
                 // RAINBOW (outside the BMP and does not case)
                 yield return new object[] { cultureName, "\U0001F308", "\U0001F308" };
 
-                if (!PlatformDetection.IsHybridGlobalizationOnOSX)
-                {
-                    // Unicode defines some codepoints which expand into multiple codepoints
-                    // when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
-                    // these sorts of expansions, since it would cause string lengths to change when cased,
-                    // which is non-intuitive. In addition, there are some context sensitive mappings which
-                    // we also don't preform.
-                    // es-zed does not case to SS when uppercased.
-                    // on OSX, capitalizing the German letter ß (sharp S) gives SS
-                    yield return new object[] { cultureName, "\u00DF", "\u00DF" };
-                    yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
-                    if (!PlatformDetection.IsNlsGlobalization)
-                        yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
-
-                    // Ligatures do not expand when cased.
-                    // on OSX, this is uppercase to "FF"
-                    yield return new object[] { cultureName, "\uFB00", "\uFB00" };
-
-                    // Precomposed character with no uppercase variant, we don't want to "decompose" this
-                    // as part of casing.
-                    // on OSX, this is uppercased to "ʼN"
-                    yield return new object[] { cultureName, "\u0149", "\u0149" };
-                }
+                
+                // Unicode defines some codepoints which expand into multiple codepoints
+                // when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
+                // these sorts of expansions, since it would cause string lengths to change when cased,
+                // which is non-intuitive. In addition, there are some context sensitive mappings which
+                // we also don't perform.
+                // es-zed does not case to SS when uppercased.
+                yield return new object[] { cultureName, "\u00DF", "\u00DF" };
+                yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" };
+                if (!PlatformDetection.IsNlsGlobalization)
+                    yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" };
+
+                // Ligatures do not expand when cased.
+                yield return new object[] { cultureName, "\uFB00", "\uFB00" };
+
+                // Precomposed character with no uppercase variant, we don't want to "decompose" this
+                // as part of casing.
+                yield return new object[] { cultureName, "\u0149", "\u0149" };
             }
 
             // Turkish i
diff --git a/src/native/libs/System.Globalization.Native/pal_casing.m b/src/native/libs/System.Globalization.Native/pal_casing.m
@@ -9,6 +9,47 @@
 
 #if defined(TARGET_OSX) || defined(TARGET_MACCATALYST) || defined(TARGET_IOS) || defined(TARGET_TVOS)
 
+/**
+ * Is this code unit a lead surrogate (U+d800..U+dbff)? The low 10 bits are ignored.
+ * @param c 16-bit code unit (expanded exactly once by this macro)
+ * @return 1 if c is in U+d800..U+dbff, 0 otherwise
+ */
+#define IS_LEAD(c) (((c)&0xfffffc00) == 0xd800)
+
+/**
+ * Is this code unit a trail surrogate (U+dc00..U+dfff)? The low 10 bits are ignored.
+ * @param c 16-bit code unit (expanded exactly once by this macro)
+ * @return 1 if c is in U+dc00..U+dfff, 0 otherwise
+ */
+#define IS_TRAIL(c) (((c)&0xfffffc00) == 0xdc00)
+
+/**
+ * Advance a UTF-16 string offset from one code point boundary to the next.
+ * No value is produced; only the offset i is modified.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * A negative length never equals a non-negative i, so the boundary check is
+ * effectively disabled and the unit after a lead surrogate is read unconditionally.
+ *
+ * If the offset points to a lead surrogate that is immediately followed by a
+ * trail surrogate, the offset advances by two code units (the whole pair).
+ * If the offset points to any other code unit, including a trail surrogate or
+ * an unpaired lead surrogate, the offset advances by exactly one code unit.
+ *
+ * @param s const uint16_t* string; evaluated more than once
+ * @param i in/out string offset (modifiable lvalue), must be i<length on entry
+ * @param length string length in code units
+ */
+#define NEXTOFFSET(s, i, length) do { \
+    uint16_t nextOffsetUnit_ = (s)[(i)++]; \
+    if (IS_LEAD(nextOffsetUnit_)) { \
+        /* consume the trail unit only when one really follows inside the string */ \
+        if ((i) != (length) && IS_TRAIL((s)[(i)])) { \
+            ++(i); \
+        } \
+    } \
+} while (0)
 
 /**
  * Append a code point to a string, overwriting 1 or 2 code units.
@@ -46,6 +87,11 @@
 ChangeCaseNative
 
 Performs upper or lower casing of a string into a new buffer, taking into account the specified locale.
+Two things we are considering here:
+1. Prohibiting code point expansions. Some code points expand into several code points when uppercased or lowercased, which may lead to an insufficient destination buffer.
+   Instead, we prohibit these expansions and iterate through the string code point by code point, keeping the original code point if it would have been expanded.
+2. Properly handling surrogate pairs. A code point can be comprised of more than one UTF-16 code unit
+   (e.g. surrogate pairs like \uD801\uDC37). All code units of a code point are needed to properly change case.
 Returns 0 for success, non-zero on failure see ErrorCodes.
 */
 int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t lNameLength,
@@ -61,15 +107,25 @@ int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t
         NSString *locName = [NSString stringWithCharacters: localeName length: lNameLength];
         currentLocale = [NSLocale localeWithLocaleIdentifier:locName];
     }
-    NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength];
-    NSString *result = bToUpper ? [source uppercaseStringWithLocale:currentLocale] : [source lowercaseStringWithLocale:currentLocale];
 
     int32_t srcIdx = 0, dstIdx = 0, isError = 0;
     uint16_t dstCodepoint;
-    while (srcIdx < result.length)
+    while (srcIdx < cwSrcLength)
     {
-        dstCodepoint = [result characterAtIndex:srcIdx++];
-        Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+        int32_t startIndex = srcIdx;
+        NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
+        int32_t srcLength = srcIdx - startIndex;
+        NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength];
+        NSString *dst = bToUpper ? [src uppercaseStringWithLocale:currentLocale] : [src lowercaseStringWithLocale:currentLocale];
+        int32_t index = 0;
+        // iterate over all UTF-16 code units of the current code point (two for a surrogate pair)
+        while (index < srcLength)
+        {
+            // the dst.length > srcLength is to prevent code point expansions
+            dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index];
+            Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+            index++;
+        }
         if (isError)
             return isError;
     }
@@ -81,19 +137,33 @@ int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t
 ChangeCaseInvariantNative
 
 Performs upper or lower casing of a string into a new buffer.
+Two things we are considering here:
+1. Prohibiting code point expansions. Some code points expand into several code points when uppercased or lowercased, which may lead to an insufficient destination buffer.
+   Instead, we prohibit these expansions and iterate through the string code point by code point, keeping the original code point if it would have been expanded.
+2. Properly handling surrogate pairs. A code point can be comprised of more than one UTF-16 code unit
+   (e.g. surrogate pairs like \uD801\uDC37). All code units of a code point are needed to properly change case.
 Returns 0 for success, non-zero on failure see ErrorCodes.
 */
 int32_t GlobalizationNative_ChangeCaseInvariantNative(const uint16_t* lpSrc, int32_t cwSrcLength, uint16_t* lpDst, int32_t cwDstLength, int32_t bToUpper)
 {
-    NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength];
-    NSString *result = bToUpper ? source.uppercaseString : source.lowercaseString;
-
     int32_t srcIdx = 0, dstIdx = 0, isError = 0;
     uint16_t dstCodepoint;
-    while (srcIdx < result.length)
+    while (srcIdx < cwSrcLength)
     {
-        dstCodepoint = [result characterAtIndex:srcIdx++];
-        Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+        int32_t startIndex = srcIdx;
+        NEXTOFFSET(lpSrc, srcIdx, cwSrcLength);
+        int32_t srcLength = srcIdx - startIndex;
+        NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength];
+        NSString *dst = bToUpper ? src.uppercaseString : src.lowercaseString;
+        int32_t index = 0;
+        // iterate over all UTF-16 code units of the current code point (two for a surrogate pair)
+        while (index < srcLength)
+        {
+            // the dst.length > srcLength is to prevent code point expansions
+            dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index];
+            Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError);
+            index++;
+        }
         if (isError)
             return isError;
     }