Skip to content

Commit cd7fc5c

Browse files
committed
Fix the string search behavior when using ICU
1 parent 1fd3222 commit cd7fc5c

File tree

5 files changed

+86
-7
lines changed

5 files changed

+86
-7
lines changed

src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,56 @@ ResultCode GlobalizationNative_GetSortHandle(const char* lpLocaleName, SortHandl
449449
return GetResultCode(err);
450450
}
451451

452+
static const char* BreakIteratorRule = "$CR = [\\p{Grapheme_Cluster_Break = CR}]; \n" \
453+
"$LF = [\\p{Grapheme_Cluster_Break = LF}]; \n" \
454+
"$Control = [[\\p{Grapheme_Cluster_Break = Control}]]; \n" \
455+
"$Extend = [[\\p{Grapheme_Cluster_Break = Extend}]]; \n" \
456+
"$ZWJ = [\\p{Grapheme_Cluster_Break = ZWJ}]; \n" \
457+
"[^$Control $CR $LF] ($Extend | $ZWJ); \n";
458+
459+
// When doing string search operations using ICU, it is internally using a break iterator which doesn't allow breaking between some characters according to
460+
// the Grapheme Cluster Boundary Rules specified in http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules.
461+
// Unfortunately, not all rules will have the desired behavior we need to get in .NET. For example, the rules don't allow breaking between CR '\r' and LF '\n' characters.
462+
// When searching for "\n" in a string like "\r\n", will get not found result.
463+
// We are customizing the break iterator to include only the rules which give the desired behavior. Mainly, we include the GB9 rule http://www.unicode.org/reports/tr29/#GB9
464+
// which doesn't allow breaking before the nonspace marks.
465+
// The general rules syntax explained in the doc https://unicode-org.github.io/icu/userguide/boundaryanalysis/break-rules.html.
466+
// The ICU rules definition exist here https://github.com/unicode-org/icu/blob/main/icu4c/source/data/brkitr/rules/char.txt.
467+
static UBreakIterator* CreateCustomizedBreakIterator()
468+
{
469+
UChar uRule[400];
470+
assert(strlen(BreakIteratorRule) < 400);
471+
u_uastrcpy(uRule, BreakIteratorRule);
472+
473+
static UChar emptyString[1];
474+
UErrorCode status = U_ZERO_ERROR;
475+
476+
UBreakIterator* breaker = ubrk_openRules(uRule, (int32_t)strlen(BreakIteratorRule), emptyString, 0, NULL, &status);
477+
478+
return U_FAILURE(status) ? NULL : breaker;
479+
}
480+
481+
static void CloseSearchIterator(UStringSearch* pSearch)
482+
{
483+
assert(pSearch != NULL);
484+
485+
#if !defined(TARGET_WINDOWS)
486+
#pragma GCC diagnostic push
487+
#pragma GCC diagnostic ignored "-Wcast-qual"
488+
#endif
489+
UBreakIterator* breakIterator = (UBreakIterator*)usearch_getBreakIterator(pSearch);
490+
#if !defined(TARGET_WINDOWS)
491+
#pragma GCC diagnostic pop
492+
#endif
493+
494+
usearch_close(pSearch);
495+
496+
if (breakIterator != NULL)
497+
{
498+
ubrk_close(breakIterator);
499+
}
500+
}
501+
452502
void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
453503
{
454504
for (int i = 0; i <= CompareOptionsMask; i++)
@@ -460,7 +510,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
460510
{
461511
if (pSearch != USED_STRING_SEARCH)
462512
{
463-
usearch_close(pSearch);
513+
CloseSearchIterator(pSearch);
464514
}
465515
pSortHandle->searchIteratorList[i].searchIterator = NULL;
466516
SearchIteratorNode* pNext = pSortHandle->searchIteratorList[i].next;
@@ -470,7 +520,7 @@ void GlobalizationNative_CloseSortHandle(SortHandle* pSortHandle)
470520
{
471521
if (pNext->searchIterator != NULL && pNext->searchIterator != USED_STRING_SEARCH)
472522
{
473-
usearch_close(pNext->searchIterator);
523+
CloseSearchIterator(pNext->searchIterator);
474524
}
475525
SearchIteratorNode* pCurrent = pNext;
476526
pNext = pCurrent->next;
@@ -581,9 +631,14 @@ static int32_t GetSearchIteratorUsingCollator(
581631

582632
if (*pSearchIterator == NULL)
583633
{
584-
*pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err);
634+
UBreakIterator* breakIterator = CreateCustomizedBreakIterator();
635+
*pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err);
585636
if (!U_SUCCESS(err))
586637
{
638+
if (breakIterator != NULL)
639+
{
640+
ubrk_close(breakIterator);
641+
}
587642
assert(false && "Couldn't open the search iterator.");
588643
return -1;
589644
}
@@ -593,7 +648,7 @@ static int32_t GetSearchIteratorUsingCollator(
593648
{
594649
if (!CreateNewSearchNode(pSortHandle, options))
595650
{
596-
usearch_close(*pSearchIterator);
651+
CloseSearchIterator(*pSearchIterator);
597652
return -1;
598653
}
599654
}
@@ -619,16 +674,22 @@ static int32_t GetSearchIteratorUsingCollator(
619674

620675
if (*pSearchIterator == NULL) // Couldn't find any available handle to borrow then create a new one.
621676
{
622-
*pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, NULL, &err);
677+
UBreakIterator* breakIterator = CreateCustomizedBreakIterator();
678+
*pSearchIterator = usearch_openFromCollator(lpTarget, cwTargetLength, lpSource, cwSourceLength, pColl, breakIterator, &err);
623679
if (!U_SUCCESS(err))
624680
{
681+
if (breakIterator != NULL)
682+
{
683+
ubrk_close(breakIterator);
684+
}
685+
625686
assert(false && "Couldn't open a new search iterator.");
626687
return -1;
627688
}
628689

629690
if (!CreateNewSearchNode(pSortHandle, options))
630691
{
631-
usearch_close(*pSearchIterator);
692+
CloseSearchIterator(*pSearchIterator);
632693
return -1;
633694
}
634695

src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len,
7373
PER_FUNCTION_BLOCK(u_tolower, libicuuc, true) \
7474
PER_FUNCTION_BLOCK(u_toupper, libicuuc, true) \
7575
PER_FUNCTION_BLOCK(u_uastrcpy, libicuuc, true) \
76+
PER_FUNCTION_BLOCK(ubrk_close, libicui18n, true) \
77+
PER_FUNCTION_BLOCK(ubrk_openRules, libicui18n, true) \
7678
PER_FUNCTION_BLOCK(ucal_add, libicui18n, true) \
7779
PER_FUNCTION_BLOCK(ucal_close, libicui18n, true) \
7880
PER_FUNCTION_BLOCK(ucal_get, libicui18n, true) \
@@ -155,6 +157,7 @@ U_CAPI int32_t U_EXPORT2 ucal_getWindowsTimeZoneID(const UChar* id, int32_t len,
155157
PER_FUNCTION_BLOCK(ures_open, libicuuc, true) \
156158
PER_FUNCTION_BLOCK(usearch_close, libicui18n, true) \
157159
PER_FUNCTION_BLOCK(usearch_first, libicui18n, true) \
160+
PER_FUNCTION_BLOCK(usearch_getBreakIterator, libicui18n, true) \
158161
PER_FUNCTION_BLOCK(usearch_getMatchedLength, libicui18n, true) \
159162
PER_FUNCTION_BLOCK(usearch_last, libicui18n, true) \
160163
PER_FUNCTION_BLOCK(usearch_openFromCollator, libicui18n, true) \
@@ -216,6 +219,8 @@ FOR_ALL_ICU_FUNCTIONS
216219
#define u_tolower(...) u_tolower_ptr(__VA_ARGS__)
217220
#define u_toupper(...) u_toupper_ptr(__VA_ARGS__)
218221
#define u_uastrcpy(...) u_uastrcpy_ptr(__VA_ARGS__)
222+
#define ubrk_close(...) ubrk_close_ptr(__VA_ARGS__)
223+
#define ubrk_openRules(...) ubrk_openRules_ptr(__VA_ARGS__)
219224
#define ucal_add(...) ucal_add_ptr(__VA_ARGS__)
220225
#define ucal_close(...) ucal_close_ptr(__VA_ARGS__)
221226
#define ucal_get(...) ucal_get_ptr(__VA_ARGS__)
@@ -310,6 +315,7 @@ FOR_ALL_ICU_FUNCTIONS
310315
#define ures_open(...) ures_open_ptr(__VA_ARGS__)
311316
#define usearch_close(...) usearch_close_ptr(__VA_ARGS__)
312317
#define usearch_first(...) usearch_first_ptr(__VA_ARGS__)
318+
#define usearch_getBreakIterator(...) usearch_getBreakIterator_ptr(__VA_ARGS__)
313319
#define usearch_getMatchedLength(...) usearch_getMatchedLength_ptr(__VA_ARGS__)
314320
#define usearch_last(...) usearch_last_ptr(__VA_ARGS__)
315321
#define usearch_openFromCollator(...) usearch_openFromCollator_ptr(__VA_ARGS__)

src/libraries/Native/Unix/System.Globalization.Native/pal_icushim_internal_android.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,6 @@ typedef struct UFieldPosition {
433433
int32_t endIndex;
434434
} UFieldPosition;
435435

436-
437436
void u_charsToUChars(const char * cs, UChar * us, int32_t length);
438437
void u_getVersion(UVersionInfo versionArray);
439438
int32_t u_strlen(const UChar * s);
@@ -443,6 +442,8 @@ UChar * u_strncpy(UChar * dst, const UChar * src, int32_t n);
443442
UChar32 u_tolower(UChar32 c);
444443
UChar32 u_toupper(UChar32 c);
445444
UChar* u_uastrcpy(UChar * dst, const char * src);
445+
void ubrk_close(UBreakIterator * bi);
446+
UBreakIterator* ubrk_openRules(const UChar * rules, int32_t rulesLength, const UChar * text, int32_t textLength, UParseError * parseErr, UErrorCode * status);
446447
void ucal_add(UCalendar * cal, UCalendarDateFields field, int32_t amount, UErrorCode * status);
447448
void ucal_close(UCalendar * cal);
448449
int32_t ucal_get(const UCalendar * cal, UCalendarDateFields field, UErrorCode * status);
@@ -532,6 +533,7 @@ const UChar * ures_getStringByIndex(const UResourceBundle * resourceBundle, int3
532533
UResourceBundle * ures_open(const char * packageName, const char * locale, UErrorCode * status);
533534
void usearch_close(UStringSearch * searchiter);
534535
int32_t usearch_first(UStringSearch * strsrch, UErrorCode * status);
536+
const UBreakIterator* usearch_getBreakIterator(const UStringSearch * strsrch);
535537
int32_t usearch_getMatchedLength(const UStringSearch * strsrch);
536538
int32_t usearch_last(UStringSearch * strsrch, UErrorCode * status);
537539
UStringSearch * usearch_openFromCollator(const UChar * pattern, int32_t patternlength, const UChar * text, int32_t textlength, const UCollator * collator, UBreakIterator * breakiter, UErrorCode * status);

src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.IndexOf.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ public static IEnumerable<object[]> IndexOf_TestData()
7272
yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 0, 6, CompareOptions.Ordinal, -1, 0 };
7373
yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 0, 11, CompareOptions.IgnoreNonSpace, 4, 7 };
7474
yield return new object[] { s_invariantCompare, "o\u0308", "o", 0, 2, CompareOptions.None, -1, 0 };
75+
yield return new object[] { s_invariantCompare, "\r\n", "\n", 0, 2, CompareOptions.None, 1, 1 };
76+
yield return new object[] { s_invariantCompare, "\u0600x", "x", 0, 2, CompareOptions.None, 1, 1 };
77+
yield return new object[] { s_invariantCompare, "\u0601x", "x", 0, 2, CompareOptions.None, 1, 1 };
78+
yield return new object[] { s_invariantCompare, "x\u0600", "x", 0, 2, CompareOptions.None, 0, 1 };
79+
yield return new object[] { s_invariantCompare, "x\u0601", "x", 0, 2, CompareOptions.None, 0, 1 };
7580

7681
// Weightless characters
7782
yield return new object[] { s_invariantCompare, "", "\u200d", 0, 0, CompareOptions.None, 0, 0 };

src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.LastIndexOf.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ public static IEnumerable<object[]> LastIndexOf_TestData()
8686
yield return new object[] { s_invariantCompare, "FooBar", "Foo\u0400Bar", 5, 6, CompareOptions.Ordinal, -1, 0 };
8787
yield return new object[] { s_invariantCompare, "TestFooBA\u0300R", "FooB\u00C0R", 10, 11, CompareOptions.IgnoreNonSpace, 4, 7 };
8888
yield return new object[] { s_invariantCompare, "o\u0308", "o", 1, 2, CompareOptions.None, -1, 0 };
89+
yield return new object[] { s_invariantCompare, "\r\n", "\n", 1, 2, CompareOptions.None, 1, 1 };
90+
yield return new object[] { s_invariantCompare, "x\u0600", "x", 1, 2, CompareOptions.None, 0, 1 };
91+
yield return new object[] { s_invariantCompare, "x\u0601", "x", 1, 2, CompareOptions.None, 0, 1 };
92+
yield return new object[] { s_invariantCompare, "\u0600x", "x", 1, 2, CompareOptions.None, 1, 1 };
93+
yield return new object[] { s_invariantCompare, "\u0601x", "x", 1, 2, CompareOptions.None, 1, 1 };
8994

9095
// Weightless characters
9196
// NLS matches weightless characters at the end of the string

0 commit comments

Comments
 (0)