Skip to content

Commit a83026a

Browse files
authored
CLDR-14032 More space adjustments in DAIP; add test. Run it on en.xml, adjust tests (#2001)
1 parent 8a2f620 commit a83026a

File tree

6 files changed

+438
-307
lines changed

6 files changed

+438
-307
lines changed

common/main/en.xml

Lines changed: 292 additions & 292 deletions
Large diffs are not rendered by default.

tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java

Lines changed: 77 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import org.unicode.cldr.test.CheckExemplars.ExemplarType;
1717
import org.unicode.cldr.util.Builder;
18+
import org.unicode.cldr.util.CLDRConfig;
1819
import org.unicode.cldr.util.CLDRFile;
1920
import org.unicode.cldr.util.CLDRLocale;
2021
import org.unicode.cldr.util.CldrUtility;
@@ -23,6 +24,7 @@
2324
import org.unicode.cldr.util.Emoji;
2425
import org.unicode.cldr.util.ICUServiceBuilder;
2526
import org.unicode.cldr.util.PatternCache;
27+
import org.unicode.cldr.util.SupplementalDataInfo;
2628
import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
2729
import org.unicode.cldr.util.With;
2830
import org.unicode.cldr.util.XPathParts;
@@ -82,6 +84,35 @@ public class DisplayAndInputProcessor {
8284
private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormatItem.*");
8385
private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");
8486

87+
// Pattern to match against paths that might have time formats with h or K (12-hour cycles)
88+
private static final Pattern HOUR_FORMAT_XPATHS = PatternCache
89+
.get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
90+
+ "timeFormats/timeFormatLength\\[@type=\"[^\"]*\"]/timeFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
91+
+ "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*|"
92+
+ "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*)");
93+
94+
private static final Pattern AMPM_SPACE_BEFORE = PatternCache.get("([Khms])[ \\u00A0]+a"); // time, space, a
95+
private static final Pattern AMPM_SPACE_AFTER = PatternCache.get("a[ \\u00A0]+([Kh])"); // a space, hour
96+
97+
// Pattern to match against paths that might have date formats with y
98+
private static final Pattern YEAR_FORMAT_XPATHS = PatternCache
99+
.get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
100+
+ "dateFormats/dateFormatLength\\[@type=\"[^\"]*\"]/dateFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
101+
+ "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*|"
102+
+ "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*)");
103+
104+
// Cyrillic year markers are or begin with (in various languages) \u0430 \u0433 \u0435 \u0436 \u043E \u0440 \u0441
105+
private static final Pattern YEAR_SPACE_YEARMARKER = PatternCache.get("y[ \\u00A0]+('?[агежорс])"); // y, space, Cyrillic year marker start
106+
107+
public static final Pattern UNIT_NARROW_XPATHS = PatternCache
108+
.get("//ldml/units/unitLength\\[@type=\"narrow\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");
109+
110+
public static final Pattern UNIT_SHORT_XPATHS = PatternCache
111+
.get("//ldml/units/unitLength\\[@type=\"short\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");
112+
113+
private static final Pattern PLACEHOLDER_SPACE_AFTER = PatternCache.get("\\}[ \\u00A0\\u202F]+");
114+
private static final Pattern PLACEHOLDER_SPACE_BEFORE = PatternCache.get("[ \\u00A0\\u202F]+\\{");
115+
85116
/**
86117
* string of whitespace not including NBSP, i.e. [\t\n\r]+
87118
*/
@@ -106,6 +137,7 @@ public class DisplayAndInputProcessor {
106137
private static final Pattern FINAL_NBSP = PatternCache.get("\\u00A0+$");
107138
private static final Pattern MULTIPLE_NBSP = PatternCache.get("\\u00A0\\u00A0+");
108139

140+
// The following includes (among others) \u0009, \u0020, \u00A0, \u2007, \u2009, \u202F, \u3000
109141
private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
110142

111143
private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
@@ -167,6 +199,7 @@ public class DisplayAndInputProcessor {
167199
private UnicodeSetPrettyPrinter pp = null;
168200

169201
final private CLDRLocale locale;
202+
private String scriptCode; // actual or default script code (not null after init)
170203
private boolean isPosix;
171204

172205
/**
@@ -212,6 +245,18 @@ void init(CLDRLocale locale, boolean needsCollator) {
212245
.setOrdering(col)
213246
.setSpaceComparator(spaceCol);
214247
}
248+
String script = locale.getScript();
249+
if (script == null || script.length() < 4) {
250+
SupplementalDataInfo sdi = CLDRConfig.getInstance().getSupplementalDataInfo();
251+
script = sdi.getDefaultScript(locale.getBaseName());
252+
if (script == null || script.length() < 4 || script.equals("Zzzz")) {
253+
script = sdi.getDefaultScript(locale.getLanguage());
254+
}
255+
if (script == null || script.length() < 4) {
256+
script = "Zzzz";
257+
}
258+
}
259+
scriptCode = script;
215260
}
216261

217262
public UnicodeSetPrettyPrinter getPrettyPrinter() {
@@ -298,7 +343,7 @@ public synchronized String processForDisplay(String path, String value) {
298343
}
299344
// Fix up hyphens, replacing with N-dash as appropriate
300345
if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
301-
value = normalizeIntervalHyphens(value);
346+
value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash
302347
} else {
303348
value = normalizeHyphens(value);
304349
}
@@ -470,7 +515,7 @@ public synchronized String processInput(String path, String value, Exception[] i
470515
}
471516
// Fix up hyphens, replacing with N-dash as appropriate
472517
if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
473-
value = normalizeIntervalHyphens(value);
518+
value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash
474519
} else if (!isUnicodeSet) {
475520
value = normalizeHyphens(value);
476521
}
@@ -646,20 +691,26 @@ private String normalizeApostrophes(String value) {
646691
}
647692
}
648693

649-
private String normalizeIntervalHyphens(String value) {
694+
private String normalizeIntervalHyphensAndSpaces(String value) {
650695
DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser();
651-
fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
696+
fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); // first format & separator including spaces
652697
List<Object> items = fp.getItems();
653698
Object last = items.get(items.size() - 1);
654699
if (last instanceof String) {
655-
String separator = last.toString();
656-
if (separator.contains("-")) {
700+
String separator = last.toString(); // separator including spaces
701+
String replacement = separator;
702+
if (scriptCode.equals("Latn") && (separator.equals(" - ") || separator.equals(" \u2013 "))) {
703+
replacement = "\u2009\u2013\u2009"; // Per CLDR-14032
704+
} else if (separator.contains("-")) {
705+
replacement = separator.replace("-", "\u2013");
706+
}
707+
if (!replacement.equals(separator)) {
657708
StringBuilder sb = new StringBuilder();
658709
sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
659710
if (sb.lastIndexOf(separator) >= 0) {
660711
sb.delete(sb.lastIndexOf(separator), sb.length());
661-
sb.append(separator.replace("-", "\u2013"));
662-
sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart());
712+
sb.append(replacement);
713+
sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart()); // second format only
663714
return sb.toString();
664715
}
665716
}
@@ -1037,6 +1088,24 @@ private String normalizeWhitespace(String path, String value) {
10371088
} else {
10381089
throw new IllegalArgumentException("Unknown PathSpaceType " + pst);
10391090
}
1091+
1092+
// Further whitespace adjustments per CLDR-14032
1093+
if (HOUR_FORMAT_XPATHS.matcher(path).matches()) {
1094+
value = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1\u202Fa");
1095+
value = AMPM_SPACE_AFTER.matcher(value).replaceAll("a\u202F$1");
1096+
}
1097+
if (scriptCode.equals("Cyrl") && YEAR_FORMAT_XPATHS.matcher(path).matches()) {
1098+
value = YEAR_SPACE_YEARMARKER.matcher(value).replaceAll("y\u202F$1");
1099+
}
1100+
if (UNIT_NARROW_XPATHS.matcher(path).matches()) {
1101+
value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u202F"); // Narrow NBSP
1102+
value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u202F{");
1103+
}
1104+
if (UNIT_SHORT_XPATHS.matcher(path).matches()) {
1105+
value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u00A0"); // Regular NBSP
1106+
value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u00A0{");
1107+
}
1108+
10401109
return value;
10411110
}
10421111

tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,10 @@ private String diff(String value, String input, String path) {
364364
if (path.contains("/foreignSpaceReplacement")) {
365365
return null; // CLDR-15384 typically inherited; no DAIP processing desired
366366
}
367+
if (logKnownIssue("CLDR-15635", "Skip TestAll() for /intervalFormatItem until we update xml data") &&
368+
(path.contains("/timeFormat") || path.contains("/dateFormatItem") || path.contains("/intervalFormatItem"))) {
369+
return null; // CLDR-14032 changing normalization for intervalFormats but xml data not yet updated
370+
}
367371
if (path.contains("/exemplarCharacters") || path.contains("/parseLenient")) {
368372
try {
369373
UnicodeSet s1 = new UnicodeSet(value);
@@ -477,6 +481,64 @@ public static PathSpaceData[] getArray() {
477481
}
478482
}
479483

484+
/**
485+
* Test whether DisplayAndInputProcessor.processInput correctly makes whitespace adjustments
486+
*/
487+
public void TestWhitespaceAdjustments() {
488+
class PathSpaceAdjustData {
489+
String locale;
490+
String xpath;
491+
String rawValue;
492+
String normValue;
493+
494+
PathSpaceAdjustData(String loc, String path, String raw, String norm) {
495+
this.locale = loc;
496+
this.xpath = path;
497+
this.rawValue = raw;
498+
this.normValue = norm;
499+
}
500+
}
501+
502+
PathSpaceAdjustData[] testItems = {
503+
new PathSpaceAdjustData("en",
504+
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/timeFormats/timeFormatLength[@type=\"short\"]/timeFormat[@type=\"standard\"]/pattern[@type=\"standard\"]",
505+
"h:mm a", "h:mm a"), // \u202F
506+
new PathSpaceAdjustData("ja",
507+
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/availableFormats/dateFormatItem[@id=\"hm\"]",
508+
"a K:mm", "a K:mm"), // \u202F
509+
new PathSpaceAdjustData("en",
510+
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id=\"hm\"]/greatestDifference[@id=\"a\"]",
511+
"h:mm - h:m a", "h:mm – h:m a"), // \u2009\u2013\u2009, \u202F
512+
new PathSpaceAdjustData("uk",
513+
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateFormats/dateFormatLength[@type=\"medium\"]/dateFormat[@type=\"standard\"]/pattern[@type=\"standard\"]",
514+
"d MMM y 'р'.", "d MMM y 'р'."), // \u202F after y
515+
new PathSpaceAdjustData("uk",
516+
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/availableFormats/dateFormatItem[@id=\"yMMMd\"]",
517+
"d MMM y 'р'.", "d MMM y 'р'."), // \u202F after y
518+
new PathSpaceAdjustData("uk",
519+
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id=\"yMMMd\"]/greatestDifference[@id=\"M\"]",
520+
"d MMM - d MMM y 'р'.", "d MMM – d MMM y 'р'."), // \u2013, \u202F after y
521+
new PathSpaceAdjustData("en",
522+
"//ldml/units/unitLength[@type=\"narrow\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
523+
"{0} g", "{0} g"), // \u202F
524+
new PathSpaceAdjustData("en",
525+
"//ldml/units/unitLength[@type=\"narrow\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
526+
"g {0}", "g {0}"), // \u202F
527+
new PathSpaceAdjustData("en",
528+
"//ldml/units/unitLength[@type=\"short\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
529+
"{0} g", "{0} g"), // \u00A0
530+
new PathSpaceAdjustData("en",
531+
"//ldml/units/unitLength[@type=\"short\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
532+
"g {0}", "g {0}"), // \u00A0
533+
};
534+
535+
for (PathSpaceAdjustData testItem: testItems) {
536+
DisplayAndInputProcessor daip = new DisplayAndInputProcessor(info.getCLDRFile(testItem.locale, true), false);
537+
String normValue = daip.processInput(testItem.xpath, testItem.rawValue, null);
538+
assertEquals("Whitespace adjustment for " + testItem.xpath, testItem.normValue, normValue);
539+
}
540+
}
541+
480542
/**
481543
* Test whether DisplayAndInputProcessor.processInput correctly normalizes annotations
482544
* containing “|” = U+007C VERTICAL LINE or its variations

tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestExampleGenerator.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -834,13 +834,13 @@ public void TestDayPeriods() {
834834
checkDayPeriod("pl", "format", "morning1", "〖06:00 – 10:00⁻〗〖❬8:00 ❭rano〗");
835835
checkDayPeriod("pl", "stand-alone", "morning1", "〖06:00 – 10:00⁻〗");
836836

837-
checkDayPeriod("en", "format", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗〖❬3:00 ❭at night〗");
837+
checkDayPeriod("en", "format", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗〖❬3:00❭at night〗");
838838
checkDayPeriod("en", "stand-alone", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗");
839839

840-
checkDayPeriod("en", "format", "noon", "〖12:00〗〖❬12:00 ❭noon〗");
841-
checkDayPeriod("en", "format", "midnight", "〖00:00〗〖❬12:00 ❭midnight〗");
842-
checkDayPeriod("en", "format", "am", "〖00:00 – 12:00⁻〗〖❬6:00 ❭AM〗");
843-
checkDayPeriod("en", "format", "pm", "〖12:00 – 24:00⁻〗〖❬6:00 ❭PM〗");
840+
checkDayPeriod("en", "format", "noon", "〖12:00〗〖❬12:00❭noon〗");
841+
checkDayPeriod("en", "format", "midnight", "〖00:00〗〖❬12:00❭midnight〗");
842+
checkDayPeriod("en", "format", "am", "〖00:00 – 12:00⁻〗〖❬6:00❭AM〗");
843+
checkDayPeriod("en", "format", "pm", "〖12:00 – 24:00⁻〗〖❬6:00❭PM〗");
844844
}
845845

846846
private void checkDayPeriod(String localeId, String type, String dayPeriodCode, String expected) {

tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestPathHeader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ public void TestAppendTimezone() {
346346
ExampleGenerator eg = new ExampleGenerator(cldrFile, cldrFile, CLDRPaths.SUPPLEMENTAL_DIRECTORY);
347347
String example = eg.getExampleHtml(APPEND_TIMEZONE, cldrFile.getStringValue(APPEND_TIMEZONE));
348348
String result = ExampleGenerator.simplify(example, false);
349-
assertEquals("", "〖❬6:25:59 PM❭ ❬GMT❭〗", result);
349+
assertEquals("", "〖❬6:25:59PM❭ ❬GMT❭〗", result);
350350
}
351351

352352
public void TestOptional() {

tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestPseudolocalization.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ public void testConverter() {
3636
result.getStringValue("//ldml/characters/exemplarCharacters[@type=\"auxiliary\"]"));
3737

3838
assertEquals("Date and time placeholders should only be bracketed",
39-
"[h:mm:ss a]",
39+
"[h:mm:ssa]",
4040
result.getStringValue(
4141
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/"
4242
+ "availableFormats/dateFormatItem[@id=\"hms\"]"));

0 commit comments

Comments
 (0)