10
10
import org .antlr .runtime .Token ;
11
11
import org .antlr .v4 .codegen .CodeGenerator ;
12
12
import org .antlr .v4 .misc .CharSupport ;
13
+ import org .antlr .v4 .misc .EscapeSequenceParsing ;
13
14
import org .antlr .v4 .parse .ANTLRParser ;
14
15
import org .antlr .v4 .runtime .IntStream ;
15
16
import org .antlr .v4 .runtime .Lexer ;
@@ -365,7 +366,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) {
365
366
return new Handle (left , right );
366
367
}
367
368
368
- /** [Aa\t \u1234a-z\]\-] char sets */
369
+ /** [Aa\t \u1234a-z\]\p{Letter}\ -] char sets */
369
370
@ Override
370
371
public Handle charSetLiteral (GrammarAST charSetAST ) {
371
372
ATNState left = newState (charSetAST );
@@ -379,51 +380,94 @@ public Handle charSetLiteral(GrammarAST charSetAST) {
379
380
public IntervalSet getSetFromCharSetLiteral (GrammarAST charSetAST ) {
380
381
String chars = charSetAST .getText ();
381
382
chars = chars .substring (1 , chars .length () - 1 );
382
- String cset = '"' + chars + '"' ;
383
383
IntervalSet set = new IntervalSet ();
384
384
385
385
if (chars .length () == 0 ) {
386
386
g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED ,
387
387
g .fileName , charSetAST .getToken (), "[]" );
388
388
return set ;
389
389
}
390
- // unescape all valid escape char like \n, leaving escaped dashes as '\-'
391
- // so we can avoid seeing them as '-' range ops.
392
- chars = CharSupport .getStringFromGrammarStringLiteral (cset );
393
- if (chars == null ) {
394
- g .tool .errMgr .grammarError (ErrorType .INVALID_ESCAPE_SEQUENCE ,
395
- g .fileName , charSetAST .getToken ());
396
- return set ;
397
- }
390
+
391
+ int prevCodePoint = -1 ;
392
+ boolean inRange = false ;
398
393
int n = chars .length ();
399
- // now make x-y become set of char
400
394
for (int i = 0 ; i < n ; ) {
401
395
int c = chars .codePointAt (i );
402
396
int offset = Character .charCount (c );
403
- if (c == '\\' && i +offset < n && chars .codePointAt (i +offset ) == '-' ) { // \-
404
- checkSetCollision (charSetAST , set , '-' );
405
- set .add ('-' );
406
- offset ++;
407
- }
408
- else if (i +offset +1 < n && chars .codePointAt (i +offset ) == '-' ) { // range x-y
409
- int x = c ;
410
- int y = chars .codePointAt (i +offset +1 );
411
- if (x <= y ) {
412
- checkSetCollision (charSetAST , set , x , y );
413
- set .add (x ,y );
397
+ if (c == '\\' ) {
398
+ EscapeSequenceParsing .Result escapeParseResult =
399
+ EscapeSequenceParsing .parseEscape (chars , i );
400
+ switch (escapeParseResult .type ) {
401
+ case INVALID :
402
+ g .tool .errMgr .grammarError (ErrorType .INVALID_ESCAPE_SEQUENCE ,
403
+ g .fileName , charSetAST .getToken ());
404
+ return new IntervalSet ();
405
+ case INTERVAL_SET :
406
+ int codePoint = escapeParseResult .intervalSet .getSingleElement ();
407
+ boolean containsMultipleCodePoints = (codePoint == org .antlr .v4 .runtime .Token .INVALID_TYPE );
408
+ if (inRange ) {
409
+ if (containsMultipleCodePoints ) {
410
+ // XXX make a proper error
411
+ g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED ,
412
+ g .fileName , charSetAST .getToken (), "[]" );
413
+ } else if (prevCodePoint > codePoint ) {
414
+ g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED , g .fileName , charSetAST .getToken (),
415
+ CharSupport .toRange (prevCodePoint , codePoint , CharSupport .ToRangeMode .BRACKETED ));
416
+ } else {
417
+ checkSetCollision (charSetAST , set , prevCodePoint , codePoint );
418
+ set .add (prevCodePoint , codePoint );
419
+ }
420
+ inRange = false ;
421
+ prevCodePoint = -1 ;
422
+ } else if (prevCodePoint != -1 ) {
423
+ checkSetCollision (charSetAST , set , prevCodePoint );
424
+ set .add (prevCodePoint );
425
+
426
+ if (containsMultipleCodePoints ) {
427
+ prevCodePoint = -1 ;
428
+ } else {
429
+ prevCodePoint = codePoint ;
430
+ }
431
+
432
+ set .addAll (escapeParseResult .intervalSet );
433
+ }
434
+ break ;
414
435
}
415
- else {
416
- g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED ,
417
- g .fileName , charSetAST .getToken (), CharSupport .toRange (x , y , CharSupport .ToRangeMode .BRACKETED ));
436
+ } else if (inRange ) {
437
+ if (prevCodePoint > c ) {
438
+ g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED , g .fileName , charSetAST .getToken (),
439
+ CharSupport .toRange (prevCodePoint , c , CharSupport .ToRangeMode .BRACKETED ));
440
+ }
441
+ checkSetCollision (charSetAST , set , prevCodePoint , c );
442
+ set .add (prevCodePoint , c );
443
+ inRange = false ;
444
+ prevCodePoint = -1 ;
445
+ } else if (prevCodePoint != -1 ) {
446
+ if (c == '-' ) {
447
+ inRange = true ;
448
+ } else {
449
+ checkSetCollision (charSetAST , set , prevCodePoint );
450
+ set .add (prevCodePoint );
451
+ prevCodePoint = c ;
418
452
}
419
- offset += Character .charCount (y ) + 1 ;
453
+ } else {
454
+ if (c == '-' ) {
455
+ // XXX make a proper error
456
+ g .tool .errMgr .grammarError (ErrorType .EMPTY_STRINGS_AND_SETS_NOT_ALLOWED ,
457
+ g .fileName , charSetAST .getToken (), "[]" );
458
+ } else {
459
+ prevCodePoint = c ;
420
460
}
421
- else {
422
- checkSetCollision (charSetAST , set , c );
423
- set .add (c );
424
461
}
425
462
i += offset ;
426
463
}
464
+ // Whether or not we were in a range, we'll add the last code point found to the set.
465
+ // If the range wasn't terminated, we'll treat it as a standalone codepoint.
466
+ if (prevCodePoint != -1 ) {
467
+ checkSetCollision (charSetAST , set , prevCodePoint );
468
+ set .add (prevCodePoint );
469
+ prevCodePoint = -1 ;
470
+ }
427
471
return set ;
428
472
}
429
473
0 commit comments