Skip to content

Commit 44ddf86

Browse files
committed
New \p{Foo} escape sequence
1 parent 54220ed commit 44ddf86

File tree

3 files changed

+230
-29
lines changed

3 files changed

+230
-29
lines changed

tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,129 @@ public void testA() throws Exception {
115115
"s4->RuleStop_A_2\n";
116116
checkTokensRule(g, null, expecting);
117117
}
118+
/**
 * Builds a lexer grammar whose rule A is the char set [abc] and hands the
 * expected transition listing to checkTokensRule. The three listed characters
 * are expected to show up as one set transition labelled {97..99}.
 */
@Test public void testCharSet() throws Exception {
119+
LexerGrammar g = new LexerGrammar(
120+
"lexer grammar P;\n"+
121+
"A : [abc] ;"
122+
);
123+
String expecting =
124+
"s0->RuleStart_A_1\n" +
125+
"RuleStart_A_1->s3\n" +
126+
// 97..99 are the code points of 'a', 'b' and 'c'.
"s3-{97..99}->s4\n" +
127+
"s4->RuleStop_A_2\n";
128+
checkTokensRule(g, null, expecting);
129+
}
130+
/**
 * Same shape as the [abc] case, but the set is written as the range [a-c].
 * The expected listing is identical: one set transition labelled {97..99}.
 */
@Test public void testCharSetRange() throws Exception {
131+
LexerGrammar g = new LexerGrammar(
132+
"lexer grammar P;\n"+
133+
"A : [a-c] ;"
134+
);
135+
String expecting =
136+
"s0->RuleStart_A_1\n" +
137+
"RuleStart_A_1->s3\n" +
138+
// 'a' (97) through 'c' (99), inclusive.
"s3-{97..99}->s4\n" +
139+
"s4->RuleStop_A_2\n";
140+
checkTokensRule(g, null, expecting);
141+
}
142+
/**
 * Char set containing a single four-hex-digit Unicode escape for U+ABCD.
 * The expected listing labels the transition with the plain value 43981
 * (0xABCD in decimal) rather than with a {lo..hi} set.
 */
@Test public void testCharSetUnicodeBMPEscape() throws Exception {
143+
LexerGrammar g = new LexerGrammar(
144+
"lexer grammar P;\n"+
145+
"A : [\\uABCD] ;"
146+
);
147+
String expecting =
148+
"s0->RuleStart_A_1\n" +
149+
"RuleStart_A_1->s3\n" +
150+
// 43981 == 0xABCD
"s3-43981->s4\n" +
151+
"s4->RuleStop_A_2\n";
152+
checkTokensRule(g, null, expecting);
153+
}
154+
/**
 * Char set mixing a literal range (a-c) with a range whose two ends are
 * four-hex-digit Unicode escapes (U+ABCD through U+ABFF). Both ranges are
 * expected in a single set transition.
 */
@Test public void testCharSetUnicodeBMPEscapeRange() throws Exception {
155+
LexerGrammar g = new LexerGrammar(
156+
"lexer grammar P;\n"+
157+
"A : [a-c\\uABCD-\\uABFF] ;"
158+
);
159+
String expecting =
160+
"s0->RuleStart_A_1\n" +
161+
"RuleStart_A_1->s3\n" +
162+
// 97..99 is a-c; 43981..44031 is 0xABCD..0xABFF.
"s3-{97..99, 43981..44031}->s4\n" +
163+
"s4->RuleStop_A_2\n";
164+
checkTokensRule(g, null, expecting);
165+
}
166+
/**
 * Char set containing a single braced Unicode escape for the code point
 * U+10ABCD, which lies above U+FFFF. The expected listing labels the
 * transition with the plain value 1092557 (0x10ABCD in decimal).
 */
@Test public void testCharSetUnicodeSMPEscape() throws Exception {
167+
LexerGrammar g = new LexerGrammar(
168+
"lexer grammar P;\n"+
169+
"A : [\\u{10ABCD}] ;"
170+
);
171+
String expecting =
172+
"s0->RuleStart_A_1\n" +
173+
"RuleStart_A_1->s3\n" +
174+
// 1092557 == 0x10ABCD
"s3-1092557->s4\n" +
175+
"s4->RuleStop_A_2\n";
176+
checkTokensRule(g, null, expecting);
177+
}
178+
/**
 * Char set mixing a literal range (a-c) with a range whose two ends are
 * braced Unicode escapes above U+FFFF (U+10ABCD through U+10ABFF). Both
 * ranges are expected in a single set transition.
 */
@Test public void testCharSetUnicodeSMPEscapeRange() throws Exception {
179+
LexerGrammar g = new LexerGrammar(
180+
"lexer grammar P;\n"+
181+
"A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;"
182+
);
183+
String expecting =
184+
"s0->RuleStart_A_1\n" +
185+
"RuleStart_A_1->s3\n" +
186+
// 97..99 is a-c; 1092557..1092607 is 0x10ABCD..0x10ABFF.
"s3-{97..99, 1092557..1092607}->s4\n" +
187+
"s4->RuleStop_A_2\n";
188+
checkTokensRule(g, null, expecting);
189+
}
190+
/**
 * Char set containing the single property escape \p{Gothic}. The expected
 * listing is one set transition covering 66352..66378 (U+10330..U+1034A).
 */
@Test public void testCharSetUnicodePropertyEscape() throws Exception {
191+
// Gothic is a historical script, so its set of code points is unlikely to
192+
// change; if it ever did, the hard-coded range below would make this test fail.
193+
LexerGrammar g = new LexerGrammar(
194+
"lexer grammar P;\n"+
195+
"A : [\\p{Gothic}] ;"
196+
);
197+
String expecting =
198+
"s0->RuleStart_A_1\n" +
199+
"RuleStart_A_1->s3\n" +
200+
"s3-{66352..66378}->s4\n" +
201+
"s4->RuleStop_A_2\n";
202+
checkTokensRule(g, null, expecting);
203+
}
204+
/**
 * Char set containing the inverted property escape \P{Gothic} (capital P).
 * The expected set is everything from 0 to 1114111 (0x10FFFF) except the
 * 66352..66378 range that the non-inverted \p{Gothic} test expects.
 */
@Test public void testCharSetUnicodePropertyInvertEscape() throws Exception {
205+
LexerGrammar g = new LexerGrammar(
206+
"lexer grammar P;\n"+
207+
"A : [\\P{Gothic}] ;"
208+
);
209+
String expecting =
210+
"s0->RuleStart_A_1\n" +
211+
"RuleStart_A_1->s3\n" +
212+
// Complement of 66352..66378 within 0..0x10FFFF.
"s3-{0..66351, 66379..1114111}->s4\n" +
213+
"s4->RuleStop_A_2\n";
214+
checkTokensRule(g, null, expecting);
215+
}
216+
/**
 * Char set containing two property escapes back to back, \p{Gothic} and
 * \p{Mahajani}. The expected set transition is the union of the two ranges.
 */
@Test public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
217+
// Mahajani, like Gothic, is a historical script; its code points are
// hard-coded below on the assumption that they will not change soon.
218+
LexerGrammar g = new LexerGrammar(
219+
"lexer grammar P;\n"+
220+
"A : [\\p{Gothic}\\p{Mahajani}] ;"
221+
);
222+
String expecting =
223+
"s0->RuleStart_A_1\n" +
224+
"RuleStart_A_1->s3\n" +
225+
// 66352..66378 is the Gothic range; 69968..70006 (U+11150..U+11176) is Mahajani.
"s3-{66352..66378, 69968..70006}->s4\n" +
226+
"s4->RuleStop_A_2\n";
227+
checkTokensRule(g, null, expecting);
228+
}
229+
/**
 * Char set containing two property escapes whose code points overlap:
 * \p{ASCII_Hex_Digit} followed by \p{Hex_Digit}. The expected set lists each
 * interval once, so the shared ASCII digits and letters are not duplicated.
 */
@Test public void testCharSetUnicodePropertyOverlap() throws Exception {
230+
LexerGrammar g = new LexerGrammar(
231+
"lexer grammar P;\n"+
232+
"A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;"
233+
);
234+
String expecting =
235+
"s0->RuleStart_A_1\n" +
236+
"RuleStart_A_1->s3\n" +
237+
// 48..57, 65..70, 97..102 are ASCII 0-9, A-F, a-f; the three ranges from
// 65296 (U+FF10) upward are the fullwidth forms of the same characters.
"s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n" +
238+
"s4->RuleStop_A_2\n";
239+
checkTokensRule(g, null, expecting);
240+
}
118241
@Test public void testRangeOrRange() throws Exception {
119242
LexerGrammar g = new LexerGrammar(
120243
"lexer grammar P;\n"+

tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,40 @@ public void testSetUp() throws Exception {
529529
super.testErrors(pair, true);
530530
}
531531

532+
/**
 * Feeds a lexer grammar with twelve rules, each containing a malformed or
 * misplaced Unicode escape inside a char set, to testErrors together with the
 * expected error text. The rule names describe the case each one covers:
 * empty, unterminated or out-of-range braced code point escapes; empty,
 * unterminated or unknown property escapes; and property escapes used as an
 * end point of a range.
 */
@Test public void testInvalidUnicodeEscapesInCharSet() {
533+
String grammar =
534+
"lexer grammar Test;\n" +
535+
"INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" +
536+
"INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" +
537+
// 0x110000 is one past the 1114111 (0x10FFFF) upper bound used elsewhere in these tests.
"INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" +
538+
"INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" +
539+
"INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" +
540+
"INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" +
541+
"INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" +
542+
"INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" +
543+
"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Foo}-\\p{Bar}];\n" +
544+
"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Foo}-Z];\n" +
545+
"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Foo}];\n" +
546+
"INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Foo}-\\P{Bar}];\n";
547+
548+
// NOTE(review): the expected text below does not appear to belong to the
// grammar above. It mentions 'GH', 'LM', 'F'..'A' and [f-a], none of which
// occur in the grammar, and it only references grammar lines 2-7 although
// the rules span lines 2-13. It looks like it was carried over from another
// test; confirm and replace with the errors this grammar actually produces.
String expected =
549+
"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:23: multi-character literals are not allowed in lexer sets: 'GH'\n" +
550+
"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:29: multi-character literals are not allowed in lexer sets: 'LM'\n" +
551+
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:3:26: string literals and sets cannot be empty: 'F'..'A'\n" +
552+
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:23: string literals and sets cannot be empty: [f-a]\n" +
553+
"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:29: string literals and sets cannot be empty: []\n" +
554+
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:23: invalid escape sequence\n" +
555+
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:33: invalid escape sequence\n" +
556+
"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:23: invalid escape sequence\n";
557+
558+
// testErrors receives the grammar text and the expected error text as a two-element array.
String[] pair = new String[] {
559+
grammar,
560+
expected
561+
};
562+
563+
super.testErrors(pair, true);
564+
}
565+
532566
/**
533567
* This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning
534568
* is produced as described in the documentation.

tool/src/org/antlr/v4/automata/LexerATNFactory.java

Lines changed: 73 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import org.antlr.runtime.Token;
1111
import org.antlr.v4.codegen.CodeGenerator;
1212
import org.antlr.v4.misc.CharSupport;
13+
import org.antlr.v4.misc.EscapeSequenceParsing;
1314
import org.antlr.v4.parse.ANTLRParser;
1415
import org.antlr.v4.runtime.IntStream;
1516
import org.antlr.v4.runtime.Lexer;
@@ -365,7 +366,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) {
365366
return new Handle(left, right);
366367
}
367368

368-
/** [Aa\t \u1234a-z\]\-] char sets */
369+
/** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
369370
@Override
370371
public Handle charSetLiteral(GrammarAST charSetAST) {
371372
ATNState left = newState(charSetAST);
@@ -379,51 +380,94 @@ public Handle charSetLiteral(GrammarAST charSetAST) {
379380
/**
 * Converts the text of a lexer char set literal into the set of code points it matches.
 *
 * This view is a diff: lines whose gutter number ends in '-' are the removed
 * implementation; the description below covers the added ('+') and unchanged lines.
 *
 * The first and last characters of the token text are dropped and the remainder is
 * scanned one code point at a time. prevCodePoint holds a code point that has been
 * seen but not yet added (it may still become the low end of a range), and inRange
 * records that a '-' followed it. Backslash sequences are handed to
 * EscapeSequenceParsing.parseEscape.
 *
 * EMPTY_STRINGS_AND_SETS_NOT_ALLOWED is reported for an empty set, for a range whose
 * low end is greater than its high end, for a '-' with no pending code point before
 * it, and for an escape that is not a single code point used as the end of a range.
 * INVALID_ESCAPE_SEQUENCE is reported when parseEscape returns an INVALID result.
 *
 * @param charSetAST the char set literal node whose text is converted
 * @return the collected code points; a new empty set after an invalid escape
 */
public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
380381
String chars = charSetAST.getText();
381382
// Drop the first and last character of the token text.
chars = chars.substring(1, chars.length() - 1);
382-
String cset = '"' + chars + '"';
383383
IntervalSet set = new IntervalSet();
384384

385385
if (chars.length() == 0) {
386386
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
387387
g.fileName, charSetAST.getToken(), "[]");
388388
return set;
389389
}
390-
// unescape all valid escape char like \n, leaving escaped dashes as '\-'
391-
// so we can avoid seeing them as '-' range ops.
392-
chars = CharSupport.getStringFromGrammarStringLiteral(cset);
393-
if (chars == null) {
394-
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
395-
g.fileName, charSetAST.getToken());
396-
return set;
397-
}
390+
391+
// Code point seen but not yet added to the set; -1 means nothing is pending.
int prevCodePoint = -1;
392+
// True once a '-' has followed prevCodePoint, i.e. the next code point closes a range.
boolean inRange = false;
398393
int n = chars.length();
399-
// now make x-y become set of char
400394
for (int i = 0; i < n; ) {
401395
int c = chars.codePointAt(i);
402396
int offset = Character.charCount(c);
403-
if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \-
404-
checkSetCollision(charSetAST, set, '-');
405-
set.add('-');
406-
offset++;
407-
}
408-
else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y
409-
int x = c;
410-
int y = chars.codePointAt(i+offset+1);
411-
if (x <= y) {
412-
checkSetCollision(charSetAST, set, x, y);
413-
set.add(x,y);
397+
// NOTE(review): offset is still only the width of the backslash when i is advanced
// at the bottom of the loop, and nothing visible in this branch skips the rest of
// the escape. Confirm that the remaining characters of the escape are not re-read
// as literal set members on the following iterations.
if (c == '\\') {
398+
EscapeSequenceParsing.Result escapeParseResult =
399+
EscapeSequenceParsing.parseEscape(chars, i);
400+
switch (escapeParseResult.type) {
401+
case INVALID:
402+
g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
403+
g.fileName, charSetAST.getToken());
404+
// Discard whatever was collected so far.
return new IntervalSet();
405+
case INTERVAL_SET:
406+
int codePoint = escapeParseResult.intervalSet.getSingleElement();
407+
// getSingleElement() yielding Token.INVALID_TYPE is taken to mean the escape
// did not resolve to exactly one code point.
boolean containsMultipleCodePoints = (codePoint == org.antlr.v4.runtime.Token.INVALID_TYPE);
408+
if (inRange) {
409+
if (containsMultipleCodePoints) {
410+
// XXX make a proper error: the escape cannot serve as the end of a range,
// but for now the empty-set error is reused with "[]" as its argument.
411+
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
412+
g.fileName, charSetAST.getToken(), "[]");
413+
} else if (prevCodePoint > codePoint) {
414+
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(),
415+
CharSupport.toRange(prevCodePoint, codePoint, CharSupport.ToRangeMode.BRACKETED));
416+
} else {
417+
checkSetCollision(charSetAST, set, prevCodePoint, codePoint);
418+
set.add(prevCodePoint, codePoint);
419+
}
420+
inRange = false;
421+
prevCodePoint = -1;
422+
// NOTE(review): there is no branch for prevCodePoint == -1 outside a range
// (for example an escape that is the first element of the set). In that case
// nothing below runs and the parsed interval set is not added; confirm.
} else if (prevCodePoint != -1) {
423+
checkSetCollision(charSetAST, set, prevCodePoint);
424+
set.add(prevCodePoint);
425+
426+
if (containsMultipleCodePoints) {
427+
prevCodePoint = -1;
428+
} else {
429+
prevCodePoint = codePoint;
430+
}
431+
432+
// NOTE(review): for a single code point this adds it now and also leaves it in
// prevCodePoint, so it is passed to checkSetCollision again when it is flushed
// later while already being a member of the set; confirm this is intended.
set.addAll(escapeParseResult.intervalSet);
433+
}
434+
break;
414435
}
415-
else {
416-
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
417-
g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED));
436+
} else if (inRange) {
437+
// NOTE(review): unlike the escape branch above there is no else here, so
// checkSetCollision and set.add still run after an inverted range is reported.
if (prevCodePoint > c) {
438+
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(),
439+
CharSupport.toRange(prevCodePoint, c, CharSupport.ToRangeMode.BRACKETED));
440+
}
441+
checkSetCollision(charSetAST, set, prevCodePoint, c);
442+
set.add(prevCodePoint, c);
443+
inRange = false;
444+
prevCodePoint = -1;
445+
} else if (prevCodePoint != -1) {
446+
// A '-' after a pending code point opens a range; the '-' itself is not added.
if (c == '-') {
447+
inRange = true;
448+
} else {
449+
// Not a range: commit the pending code point and make c the new pending one.
checkSetCollision(charSetAST, set, prevCodePoint);
450+
set.add(prevCodePoint);
451+
prevCodePoint = c;
418452
}
419-
offset += Character.charCount(y) + 1;
453+
} else {
454+
if (c == '-') {
455+
// XXX make a proper error: a '-' with no pending code point before it currently
// reuses the empty-set error with "[]" as its argument.
456+
g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
457+
g.fileName, charSetAST.getToken(), "[]");
458+
} else {
459+
prevCodePoint = c;
420460
}
421-
else {
422-
checkSetCollision(charSetAST, set, c);
423-
set.add(c);
424461
}
425462
i += offset;
426463
}
464+
// Whether or not we were in a range, we'll add the last code point found to the set.
465+
// If the range wasn't terminated, we'll treat it as a standalone codepoint.
466+
if (prevCodePoint != -1) {
467+
checkSetCollision(charSetAST, set, prevCodePoint);
468+
set.add(prevCodePoint);
469+
prevCodePoint = -1;
470+
}
427471
return set;
428472
}
429473

0 commit comments

Comments
 (0)