Skip to content

Commit 93c625e

Browse files
committed
New \p{Foo} escape sequence
1 parent 626c4b4 commit 93c625e

File tree

2 files changed

+216
-28
lines changed

2 files changed

+216
-28
lines changed

tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,129 @@ public void testA() throws Exception {
115115
"s4->RuleStop_A_2\n";
116116
checkTokensRule(g, null, expecting);
117117
}
118+
/** Lexer char set listing a, b and c individually; the expected ATN dump has one set transition over code points 97..99. */
@Test public void testCharSet() throws Exception {
119+
LexerGrammar g = new LexerGrammar(
120+
"lexer grammar P;\n"+
121+
"A : [abc] ;"
122+
);
123+
String expecting =
124+
"s0->RuleStart_A_1\n" +
125+
"RuleStart_A_1->s3\n" +
126+
"s3-{97..99}->s4\n" +
127+
"s4->RuleStop_A_2\n";
128+
checkTokensRule(g, null, expecting);
129+
}
130+
/** Lexer char set written as the range a-c; the expected ATN dump has the same 97..99 set transition as the explicit [abc] form. */
@Test public void testCharSetRange() throws Exception {
131+
LexerGrammar g = new LexerGrammar(
132+
"lexer grammar P;\n"+
133+
"A : [a-c] ;"
134+
);
135+
String expecting =
136+
"s0->RuleStart_A_1\n" +
137+
"RuleStart_A_1->s3\n" +
138+
"s3-{97..99}->s4\n" +
139+
"s4->RuleStop_A_2\n";
140+
checkTokensRule(g, null, expecting);
141+
}
142+
/** Char set holding a single four-hex-digit escape for U+ABCD; the expected transition is labelled with the lone code point 43981 (0xABCD). */
@Test public void testCharSetUnicodeBMPEscape() throws Exception {
143+
LexerGrammar g = new LexerGrammar(
144+
"lexer grammar P;\n"+
145+
"A : [\\uABCD] ;"
146+
);
147+
String expecting =
148+
"s0->RuleStart_A_1\n" +
149+
"RuleStart_A_1->s3\n" +
150+
"s3-43981->s4\n" +
151+
"s4->RuleStop_A_2\n";
152+
checkTokensRule(g, null, expecting);
153+
}
154+
/** Char set mixing the plain range a-c with a range between two four-hex-digit escapes (U+ABCD to U+ABFF); expects the set {97..99, 43981..44031}. */
@Test public void testCharSetUnicodeBMPEscapeRange() throws Exception {
155+
LexerGrammar g = new LexerGrammar(
156+
"lexer grammar P;\n"+
157+
"A : [a-c\\uABCD-\\uABFF] ;"
158+
);
159+
String expecting =
160+
"s0->RuleStart_A_1\n" +
161+
"RuleStart_A_1->s3\n" +
162+
"s3-{97..99, 43981..44031}->s4\n" +
163+
"s4->RuleStop_A_2\n";
164+
checkTokensRule(g, null, expecting);
165+
}
166+
/** Char set holding a single braced escape for the supplementary code point U+10ABCD; the expected transition is labelled 1092557 (0x10ABCD). */
@Test public void testCharSetUnicodeSMPEscape() throws Exception {
167+
LexerGrammar g = new LexerGrammar(
168+
"lexer grammar P;\n"+
169+
"A : [\\u{10ABCD}] ;"
170+
);
171+
String expecting =
172+
"s0->RuleStart_A_1\n" +
173+
"RuleStart_A_1->s3\n" +
174+
"s3-1092557->s4\n" +
175+
"s4->RuleStop_A_2\n";
176+
checkTokensRule(g, null, expecting);
177+
}
178+
/** Char set mixing the plain range a-c with a range between two braced escapes (U+10ABCD to U+10ABFF); expects the set {97..99, 1092557..1092607}. */
@Test public void testCharSetUnicodeSMPEscapeRange() throws Exception {
179+
LexerGrammar g = new LexerGrammar(
180+
"lexer grammar P;\n"+
181+
"A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;"
182+
);
183+
String expecting =
184+
"s0->RuleStart_A_1\n" +
185+
"RuleStart_A_1->s3\n" +
186+
"s3-{97..99, 1092557..1092607}->s4\n" +
187+
"s4->RuleStop_A_2\n";
188+
checkTokensRule(g, null, expecting);
189+
}
190+
/** Char set holding the property escape \p{Gothic}; the expected set transition is the single interval 66352..66378. */
@Test public void testCharSetUnicodePropertyEscape() throws Exception {
191+
// The Gothic script is long dead and unlikely to change (which would
192+
// cause this test to fail)
193+
LexerGrammar g = new LexerGrammar(
194+
"lexer grammar P;\n"+
195+
"A : [\\p{Gothic}] ;"
196+
);
197+
String expecting =
198+
"s0->RuleStart_A_1\n" +
199+
"RuleStart_A_1->s3\n" +
200+
"s3-{66352..66378}->s4\n" +
201+
"s4->RuleStop_A_2\n";
202+
checkTokensRule(g, null, expecting);
203+
}
204+
/** Char set holding the inverted property escape \P{Gothic}; expects every code point from 0 to 1114111 except the interval 66352..66378. */
@Test public void testCharSetUnicodePropertyInvertEscape() throws Exception {
205+
LexerGrammar g = new LexerGrammar(
206+
"lexer grammar P;\n"+
207+
"A : [\\P{Gothic}] ;"
208+
);
209+
String expecting =
210+
"s0->RuleStart_A_1\n" +
211+
"RuleStart_A_1->s3\n" +
212+
"s3-{0..66351, 66379..1114111}->s4\n" +
213+
"s4->RuleStop_A_2\n";
214+
checkTokensRule(g, null, expecting);
215+
}
216+
/** Char set holding two property escapes, \p{Gothic} and \p{Mahajani}; expects both intervals in one set transition. */
@Test public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
217+
// As with Gothic, the Mahajani script is assumed not to change soon
// (a change to its code points would make this expectation fail).
218+
LexerGrammar g = new LexerGrammar(
219+
"lexer grammar P;\n"+
220+
"A : [\\p{Gothic}\\p{Mahajani}] ;"
221+
);
222+
String expecting =
223+
"s0->RuleStart_A_1\n" +
224+
"RuleStart_A_1->s3\n" +
225+
"s3-{66352..66378, 69968..70006}->s4\n" +
226+
"s4->RuleStop_A_2\n";
227+
checkTokensRule(g, null, expecting);
228+
}
229+
/**
 * Char set holding two property escapes, \p{ASCII_Hex_Digit} and \p{Hex_Digit}.
 * The expected transition lists each interval once; judging by the test name the
 * two properties overlap, so this presumably checks that overlapping property
 * sets merge without a collision error (confirm against checkTokensRule).
 */
@Test public void testCharSetUnicodePropertyOverlap() throws Exception {
230+
LexerGrammar g = new LexerGrammar(
231+
"lexer grammar P;\n"+
232+
"A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;"
233+
);
234+
String expecting =
235+
"s0->RuleStart_A_1\n" +
236+
"RuleStart_A_1->s3\n" +
237+
"s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n" +
238+
"s4->RuleStop_A_2\n";
239+
checkTokensRule(g, null, expecting);
240+
}
118241
@Test public void testRangeOrRange() throws Exception {
119242
LexerGrammar g = new LexerGrammar(
120243
"lexer grammar P;\n"+

tool/src/org/antlr/v4/automata/LexerATNFactory.java

Lines changed: 93 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import org.antlr.runtime.Token;
1111
import org.antlr.v4.codegen.CodeGenerator;
1212
import org.antlr.v4.misc.CharSupport;
13+
import org.antlr.v4.misc.EscapeSequenceParsing;
1314
import org.antlr.v4.parse.ANTLRParser;
1415
import org.antlr.v4.runtime.IntStream;
1516
import org.antlr.v4.runtime.Lexer;
@@ -42,6 +43,7 @@
4243
import org.antlr.v4.tool.ast.GrammarAST;
4344
import org.antlr.v4.tool.ast.RangeAST;
4445
import org.antlr.v4.tool.ast.TerminalAST;
46+
import org.antlr.v4.unicode.UnicodeData;
4547
import org.stringtemplate.v4.ST;
4648
import org.stringtemplate.v4.STGroup;
4749

@@ -365,7 +367,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) {
365367
return new Handle(left, right);
366368
}
367369

368-
/** [Aa\t \u1234a-z\]\-] char sets */
370+
/** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
369371
@Override
370372
public Handle charSetLiteral(GrammarAST charSetAST) {
371373
ATNState left = newState(charSetAST);
@@ -379,51 +381,114 @@ public Handle charSetLiteral(GrammarAST charSetAST) {
379381
/**
 * Builds the {@link IntervalSet} of code points described by a lexer char-set literal.
 *
 * NOTE(review): this block is a rendered diff. Gutter lines ending in '-' appear to
 * mark the removed implementation and those ending in '+' the added one; the
 * description below covers the added ('+') side.
 *
 * The first and last characters of the AST text are dropped (presumably the
 * surrounding brackets) and the remainder is walked one code point at a time:
 * backslash escapes are delegated to EscapeSequenceParsing.parseEscape, a '-' after
 * a pending code point starts a range, and Unicode property escapes contribute the
 * set returned by UnicodeData.getPropertyCodePoints (complemented when inverted).
 *
 * @param charSetAST AST node whose text is the char-set literal
 * @return the accumulated set; an empty set when the literal is empty or an escape
 *         sequence fails to parse (an error is reported through g.tool.errMgr in
 *         both cases)
 */
public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
380382
String chars = charSetAST.getText();
381383
chars = chars.substring(1, chars.length() - 1);
382-
String cset = '"' + chars + '"';
383384
IntervalSet set = new IntervalSet();
384385

385386
if (chars.length() == 0) {
386387
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
387388
g.fileName, charSetAST.getToken(), "[]");
388389
return set;
389390
}
390-
// unescape all valid escape char like \n, leaving escaped dashes as '\-'
391-
// so we can avoid seeing them as '-' range ops.
392-
chars = CharSupport.getStringFromGrammarStringLiteral(cset);
393-
if (chars == null) {
394-
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
395-
g.fileName, charSetAST.getToken());
396-
return set;
397-
}
391+
// prevCodePoint holds a code point that has been read but not yet added to the set
// (-1 when there is none); inRange records that a '-' range operator followed it.
392+
int prevCodePoint = -1;
393+
boolean inRange = false;
398394
int n = chars.length();
399-
// now make x-y become set of char
400395
for (int i = 0; i < n; ) {
401396
int c = chars.codePointAt(i);
402397
int offset = Character.charCount(c);
403-
if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \-
404-
checkSetCollision(charSetAST, set, '-');
405-
set.add('-');
406-
offset++;
407-
}
408-
else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y
409-
int x = c;
410-
int y = chars.codePointAt(i+offset+1);
411-
if (x <= y) {
412-
checkSetCollision(charSetAST, set, x, y);
413-
set.add(x,y);
// Backslash: hand the escape to EscapeSequenceParsing.parseEscape; a null result is
// reported as INVALID_ESCAPE_SEQUENCE and aborts with a fresh empty set, otherwise
// the loop advances by the number of code units the escape occupied.
398+
if (c == '\\') {
399+
EscapeSequenceParsing.Result escapeParseResult =
400+
EscapeSequenceParsing.parseEscape(chars, i);
401+
if (escapeParseResult == null) {
402+
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
403+
g.fileName, charSetAST.getToken());
404+
return new IntervalSet();
405+
} else {
406+
offset = escapeParseResult.codeUnitLength;
414407
}
415-
else {
408+
switch (escapeParseResult.type) {
409+
case UNICODE_CODE_POINT:
410+
if (inRange) {
// NOTE(review): a reversed range (start > end) is reported, but execution still
// continues into checkSetCollision/set.add with the reversed bounds -- confirm
// both tolerate start > end.
411+
if (prevCodePoint > escapeParseResult.codePoint) {
412+
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(),
413+
CharSupport.toRange(prevCodePoint, escapeParseResult.codePoint, CharSupport.ToRangeMode.BRACKETED));
414+
}
415+
checkSetCollision(charSetAST, set, prevCodePoint, escapeParseResult.codePoint);
416+
set.add(prevCodePoint, escapeParseResult.codePoint);
417+
inRange = false;
418+
prevCodePoint = -1;
419+
} else if (prevCodePoint != -1) {
420+
checkSetCollision(charSetAST, set, prevCodePoint);
421+
set.add(prevCodePoint);
422+
prevCodePoint = escapeParseResult.codePoint;
423+
} else {
424+
prevCodePoint = escapeParseResult.codePoint;
425+
}
426+
break;
427+
case UNICODE_PROPERTY_NAME:
428+
// fall through
429+
case UNICODE_PROPERTY_NAME_INVERTED:
430+
if (prevCodePoint != -1) {
431+
checkSetCollision(charSetAST, set, prevCodePoint);
432+
set.add(prevCodePoint);
433+
prevCodePoint = -1;
434+
}
435+
// A property escape directly after a '-' cannot close a range: the pending start
// code point has already been added above as a single element, and the dangling
// range is reported with a placeholder error.
436+
if (inRange) {
437+
// XXX make a proper error
416438
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
417-
g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED));
439+
g.fileName, charSetAST.getToken(), "[]");
440+
inRange = false;
441+
} else {
// Look up the property's code points; a null result is reported as an invalid
// escape and contributes nothing to the set.
442+
IntervalSet propertySet = UnicodeData.getPropertyCodePoints(escapeParseResult.propertyName);
443+
if (propertySet == null) {
444+
// XXX make a proper error
445+
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
446+
g.fileName, charSetAST.getToken());
447+
} else {
448+
if (escapeParseResult.type == EscapeSequenceParsing.Result.Type.UNICODE_PROPERTY_NAME_INVERTED) {
449+
propertySet = propertySet.complement(IntervalSet.COMPLETE_CHAR_SET);
450+
}
451+
// We don't check for collision, since these sets can be huge.
452+
set.addAll(propertySet);
453+
}
454+
}
455+
break;
456+
}
457+
} else if (inRange) {
// Plain (unescaped) code point closing a range opened by '-'.
// NOTE(review): as in the escaped case, a reversed range is reported but the
// reversed bounds are still passed to checkSetCollision/set.add -- confirm.
458+
if (prevCodePoint > c) {
459+
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(),
460+
CharSupport.toRange(prevCodePoint, c, CharSupport.ToRangeMode.BRACKETED));
418461
}
419-
offset += Character.charCount(y) + 1;
462+
checkSetCollision(charSetAST, set, prevCodePoint, c);
463+
set.add(prevCodePoint, c);
464+
inRange = false;
465+
prevCodePoint = -1;
466+
} else if (prevCodePoint != -1) {
467+
if (c == '-') {
468+
inRange = true;
469+
} else {
470+
checkSetCollision(charSetAST, set, prevCodePoint);
471+
set.add(prevCodePoint);
472+
prevCodePoint = c;
473+
}
474+
} else {
// NOTE(review): a '-' seen with no pending code point (first in the set, or right
// after a completed range or a property escape) is reported as an error and not
// added; the removed implementation appears to have added such a '-' as a literal
// -- confirm this behavior change is intended.
475+
if (c == '-') {
476+
// XXX make a proper error
477+
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
478+
g.fileName, charSetAST.getToken(), "[]");
479+
} else {
480+
prevCodePoint = c;
420481
}
421-
else {
422-
checkSetCollision(charSetAST, set, c);
423-
set.add(c);
424482
}
425483
i += offset;
426484
}
485+
// Whether or not we were in a range, we'll add the last code point found to the set.
486+
// If the range wasn't terminated, we'll treat it as a standalone codepoint.
// NOTE(review): when the text ends right after a '-' (inRange still true), only the
// start code point is added here and the '-' itself is dropped without an error; the
// removed implementation appears to have added the trailing '-' as a literal -- confirm.
487+
if (prevCodePoint != -1) {
488+
checkSetCollision(charSetAST, set, prevCodePoint);
489+
set.add(prevCodePoint);
490+
prevCodePoint = -1;
491+
}
427492
return set;
428493
}
429494

0 commit comments

Comments
 (0)