10
10
import org .antlr .runtime .Token ;
11
11
import org .antlr .v4 .codegen .CodeGenerator ;
12
12
import org .antlr .v4 .misc .CharSupport ;
13
+ import org .antlr .v4 .misc .EscapeSequenceParsing ;
13
14
import org .antlr .v4 .parse .ANTLRParser ;
14
15
import org .antlr .v4 .runtime .IntStream ;
15
16
import org .antlr .v4 .runtime .Lexer ;
42
43
import org .antlr .v4 .tool .ast .GrammarAST ;
43
44
import org .antlr .v4 .tool .ast .RangeAST ;
44
45
import org .antlr .v4 .tool .ast .TerminalAST ;
46
+ import org .antlr .v4 .unicode .UnicodeData ;
45
47
import org .stringtemplate .v4 .ST ;
46
48
import org .stringtemplate .v4 .STGroup ;
47
49
@@ -365,7 +367,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) {
365
367
return new Handle (left , right );
366
368
}
367
369
368
- /** [Aa\t \u1234a-z\]\-] char sets */
370
+ /** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
369
371
@ Override
370
372
public Handle charSetLiteral (GrammarAST charSetAST ) {
371
373
ATNState left = newState (charSetAST );
@@ -379,51 +381,114 @@ public Handle charSetLiteral(GrammarAST charSetAST) {
379
381
public IntervalSet getSetFromCharSetLiteral (GrammarAST charSetAST ) {
380
382
String chars = charSetAST .getText ();
381
383
chars = chars .substring (1 , chars .length () - 1 );
382
- String cset = '"' + chars + '"' ;
383
384
IntervalSet set = new IntervalSet ();
384
385
385
386
if (chars .length () == 0 ) {
386
387
g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED ,
387
388
g .fileName , charSetAST .getToken (), "[]" );
388
389
return set ;
389
390
}
390
- // unescape all valid escape char like \n, leaving escaped dashes as '\-'
391
- // so we can avoid seeing them as '-' range ops.
392
- chars = CharSupport .getStringFromGrammarStringLiteral (cset );
393
- if (chars == null ) {
394
- g .tool .errMgr .grammarError (ErrorType .INVALID_ESCAPE_SEQUENCE ,
395
- g .fileName , charSetAST .getToken ());
396
- return set ;
397
- }
391
+
392
+ int prevCodePoint = -1 ;
393
+ boolean inRange = false ;
398
394
int n = chars .length ();
399
- // now make x-y become set of char
400
395
for (int i = 0 ; i < n ; ) {
401
396
int c = chars .codePointAt (i );
402
397
int offset = Character .charCount (c );
403
- if (c == '\\' && i +offset < n && chars .codePointAt (i +offset ) == '-' ) { // \-
404
- checkSetCollision (charSetAST , set , '-' );
405
- set .add ('-' );
406
- offset ++;
407
- }
408
- else if (i +offset +1 < n && chars .codePointAt (i +offset ) == '-' ) { // range x-y
409
- int x = c ;
410
- int y = chars .codePointAt (i +offset +1 );
411
- if (x <= y ) {
412
- checkSetCollision (charSetAST , set , x , y );
413
- set .add (x ,y );
398
+ if (c == '\\' ) {
399
+ EscapeSequenceParsing .Result escapeParseResult =
400
+ EscapeSequenceParsing .parseEscape (chars , i );
401
+ if (escapeParseResult == null ) {
402
+ g .tool .errMgr .grammarError (ErrorType .INVALID_ESCAPE_SEQUENCE ,
403
+ g .fileName , charSetAST .getToken ());
404
+ return new IntervalSet ();
405
+ } else {
406
+ offset = escapeParseResult .codeUnitLength ;
414
407
}
415
- else {
408
+ switch (escapeParseResult .type ) {
409
+ case UNICODE_CODE_POINT :
410
+ if (inRange ) {
411
+ if (prevCodePoint > escapeParseResult .codePoint ) {
412
+ g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED , g .fileName , charSetAST .getToken (),
413
+ CharSupport .toRange (prevCodePoint , escapeParseResult .codePoint , CharSupport .ToRangeMode .BRACKETED ));
414
+ }
415
+ checkSetCollision (charSetAST , set , prevCodePoint , escapeParseResult .codePoint );
416
+ set .add (prevCodePoint , escapeParseResult .codePoint );
417
+ inRange = false ;
418
+ prevCodePoint = -1 ;
419
+ } else if (prevCodePoint != -1 ) {
420
+ checkSetCollision (charSetAST , set , prevCodePoint );
421
+ set .add (prevCodePoint );
422
+ prevCodePoint = escapeParseResult .codePoint ;
423
+ } else {
424
+ prevCodePoint = escapeParseResult .codePoint ;
425
+ }
426
+ break ;
427
+ case UNICODE_PROPERTY_NAME :
428
+ // fall through
429
+ case UNICODE_PROPERTY_NAME_INVERTED :
430
+ if (prevCodePoint != -1 ) {
431
+ checkSetCollision (charSetAST , set , prevCodePoint );
432
+ set .add (prevCodePoint );
433
+ prevCodePoint = -1 ;
434
+ }
435
+
436
+ if (inRange ) {
437
+ // XXX make a proper error
416
438
g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED ,
417
- g .fileName , charSetAST .getToken (), CharSupport .toRange (x , y , CharSupport .ToRangeMode .BRACKETED ));
439
+ g .fileName , charSetAST .getToken (), "[]" );
440
+ inRange = false ;
441
+ } else {
442
+ IntervalSet propertySet = UnicodeData .getPropertyCodePoints (escapeParseResult .propertyName );
443
+ if (propertySet == null ) {
444
+ // XXX make a proper error
445
+ g .tool .errMgr .grammarError (ErrorType .INVALID_ESCAPE_SEQUENCE ,
446
+ g .fileName , charSetAST .getToken ());
447
+ } else {
448
+ if (escapeParseResult .type == EscapeSequenceParsing .Result .Type .UNICODE_PROPERTY_NAME_INVERTED ) {
449
+ propertySet = propertySet .complement (IntervalSet .COMPLETE_CHAR_SET );
450
+ }
451
+ // We don't check for collision, since these sets can be huge.
452
+ set .addAll (propertySet );
453
+ }
454
+ }
455
+ break ;
456
+ }
457
+ } else if (inRange ) {
458
+ if (prevCodePoint > c ) {
459
+ g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED , g .fileName , charSetAST .getToken (),
460
+ CharSupport .toRange (prevCodePoint , c , CharSupport .ToRangeMode .BRACKETED ));
418
461
}
419
- offset += Character .charCount (y ) + 1 ;
462
+ checkSetCollision (charSetAST , set , prevCodePoint , c );
463
+ set .add (prevCodePoint , c );
464
+ inRange = false ;
465
+ prevCodePoint = -1 ;
466
+ } else if (prevCodePoint != -1 ) {
467
+ if (c == '-' ) {
468
+ inRange = true ;
469
+ } else {
470
+ checkSetCollision (charSetAST , set , prevCodePoint );
471
+ set .add (prevCodePoint );
472
+ prevCodePoint = c ;
473
+ }
474
+ } else {
475
+ if (c == '-' ) {
476
+ // XXX make a proper error
477
+ g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED ,
478
+ g .fileName , charSetAST .getToken (), "[]" );
479
+ } else {
480
+ prevCodePoint = c ;
420
481
}
421
- else {
422
- checkSetCollision (charSetAST , set , c );
423
- set .add (c );
424
482
}
425
483
i += offset ;
426
484
}
485
+ // Whether or not we were in a range, we'll add the last code point found to the set.
486
+ // If the range wasn't terminated, we'll treat it as a standalone codepoint.
487
+ if (prevCodePoint != -1 ) {
488
+ checkSetCollision (charSetAST , set , prevCodePoint );
489
+ set .add (prevCodePoint );
490
+ prevCodePoint = -1 ;
491
+ }
427
492
return set ;
428
493
}
429
494
0 commit comments