New \p{Foo} escape sequence

bhamiltoncx · bhamiltoncx · commit 44ddf864fc24 · 2017-02-23T16:11:48.000-08:00
diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestATNConstruction.java
@@ -115,6 +115,129 @@ public void testA() throws Exception {
 				"s4->RuleStop_A_2\n";
 		checkTokensRule(g, null, expecting);
 	}
+	/** The literal set {@code [abc]} should compile to one set transition over 97..99. */
+	@Test public void testCharSet() throws Exception {
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [abc] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{97..99}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** The range {@code [a-c]} should compile to one set transition over 97..99. */
+	@Test public void testCharSetRange() throws Exception {
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [a-c] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{97..99}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** A four-hex-digit Unicode escape in a set should yield a transition on 43981 (0xABCD). */
+	@Test public void testCharSetUnicodeBMPEscape() throws Exception {
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\uABCD] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-43981->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** A literal range followed by a range of four-hex-digit escapes should yield both intervals. */
+	@Test public void testCharSetUnicodeBMPEscapeRange() throws Exception {
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [a-c\\uABCD-\\uABFF] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{97..99, 43981..44031}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** A braced Unicode escape beyond the BMP should yield a transition on 1092557 (0x10ABCD). */
+	@Test public void testCharSetUnicodeSMPEscape() throws Exception {
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\u{10ABCD}] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-1092557->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** A literal range followed by a range of braced escapes beyond the BMP should yield both intervals. */
+	@Test public void testCharSetUnicodeSMPEscapeRange() throws Exception {
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [a-c\\u{10ABCD}-\\u{10ABFF}] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{97..99, 1092557..1092607}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** {@code \p{Gothic}} inside a set should expand to the interval 66352..66378. */
+	@Test public void testCharSetUnicodePropertyEscape() throws Exception {
+		// Gothic is a long-dead script, so its code point range is unlikely to
+		// change underneath this test (a change would make the test fail).
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\p{Gothic}] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{66352..66378}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** {@code \P{Gothic}} should expand to everything in 0..1114111 except 66352..66378. */
+	@Test public void testCharSetUnicodePropertyInvertEscape() throws Exception {
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\P{Gothic}] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{0..66351, 66379..1114111}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** Two property escapes in one set should contribute one interval each. */
+	@Test public void testCharSetUnicodeMultiplePropertyEscape() throws Exception {
+		// Like Gothic, the Mahajani script is not expected to change any time soon.
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\p{Gothic}\\p{Mahajani}] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{66352..66378, 69968..70006}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
+	/** Two overlapping hex-digit properties should merge into a single set transition. */
+	@Test public void testCharSetUnicodePropertyOverlap() throws Exception {
+		final String grammarText =
+			"lexer grammar P;\n" +
+			"A : [\\p{ASCII_Hex_Digit}\\p{Hex_Digit}] ;";
+		final String expectedAtn =
+			"s0->RuleStart_A_1\n" +
+			"RuleStart_A_1->s3\n" +
+			"s3-{48..57, 65..70, 97..102, 65296..65305, 65313..65318, 65345..65350}->s4\n" +
+			"s4->RuleStop_A_2\n";
+		checkTokensRule(new LexerGrammar(grammarText), null, expectedAtn);
+	}
 	@Test public void testRangeOrRange() throws Exception {
 		LexerGrammar g = new LexerGrammar(
 			"lexer grammar P;\n"+
diff --git a/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java b/tool-testsuite/test/org/antlr/v4/test/tool/TestToolSyntaxErrors.java
@@ -529,6 +529,40 @@ public void testSetUp() throws Exception {
 		super.testErrors(pair, true);
 	}
 
+	/** Passes a grammar full of malformed or misplaced Unicode and property escapes
+	 *  in lexer char sets, plus the expected error text, to {@code testErrors}. */
+	@Test public void testInvalidUnicodeEscapesInCharSet() {
+		final String grammarText =
+				"lexer grammar Test;\n" +
+				"INVALID_EXTENDED_UNICODE_EMPTY: [\\u{}];\n" +
+				"INVALID_EXTENDED_UNICODE_NOT_TERMINATED: [\\u{];\n" +
+				"INVALID_EXTENDED_UNICODE_TOO_LONG: [\\u{110000}];\n" +
+				"INVALID_UNICODE_PROPERTY_EMPTY: [\\p{}];\n" +
+				"INVALID_UNICODE_PROPERTY_NOT_TERMINATED: [\\p{];\n" +
+				"INVALID_INVERTED_UNICODE_PROPERTY_EMPTY: [\\P{}];\n" +
+				"INVALID_UNICODE_PROPERTY_UNKNOWN: [\\p{NotAProperty}];\n" +
+				"INVALID_INVERTED_UNICODE_PROPERTY_UNKNOWN: [\\P{NotAProperty}];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\p{Foo}-\\p{Bar}];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_2: [\\p{Foo}-Z];\n" +
+				"UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE_3: [A-\\p{Foo}];\n" +
+				"INVERTED_UNICODE_PROPERTY_NOT_ALLOWED_IN_RANGE: [\\P{Foo}-\\P{Bar}];\n";
+
+		// NOTE(review): the messages below quote literals ('GH', 'LM', 'F'..'A', [f-a])
+		// that never occur in the grammar above; they look carried over from another
+		// test. Confirm against the tool's real output before relying on this test.
+		final String expectedErrors =
+				"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:23: multi-character literals are not allowed in lexer sets: 'GH'\n" +
+				"error(" + ErrorType.INVALID_LITERAL_IN_LEXER_SET.code + "): Test.g4:2:29: multi-character literals are not allowed in lexer sets: 'LM'\n" +
+				"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:3:26: string literals and sets cannot be empty: 'F'..'A'\n" +
+				"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:23: string literals and sets cannot be empty: [f-a]\n" +
+				"error(" + ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED.code + "): Test.g4:5:29: string literals and sets cannot be empty: []\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:23: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:6:33: invalid escape sequence\n" +
+				"error(" + ErrorType.INVALID_ESCAPE_SEQUENCE.code + "): Test.g4:7:23: invalid escape sequence\n";
+
+		super.testErrors(new String[] { grammarText, expectedErrors }, true);
+	}
+
 	/**
 	 * This test ensures the {@link ErrorType#UNRECOGNIZED_ASSOC_OPTION} warning
 	 * is produced as described in the documentation.
diff --git a/tool/src/org/antlr/v4/automata/LexerATNFactory.java b/tool/src/org/antlr/v4/automata/LexerATNFactory.java
@@ -10,6 +10,7 @@
 import org.antlr.runtime.Token;
 import org.antlr.v4.codegen.CodeGenerator;
 import org.antlr.v4.misc.CharSupport;
+import org.antlr.v4.misc.EscapeSequenceParsing;
 import org.antlr.v4.parse.ANTLRParser;
 import org.antlr.v4.runtime.IntStream;
 import org.antlr.v4.runtime.Lexer;
@@ -365,7 +366,7 @@ public Handle stringLiteral(TerminalAST stringLiteralAST) {
 		return new Handle(left, right);
 	}
 
-	/** [Aa\t \u1234a-z\]\-] char sets */
+	/** [Aa\t \u1234a-z\]\p{Letter}\-] char sets */
 	@Override
 	public Handle charSetLiteral(GrammarAST charSetAST) {
 		ATNState left = newState(charSetAST);
@@ -379,51 +380,94 @@ public Handle charSetLiteral(GrammarAST charSetAST) {
 	public IntervalSet getSetFromCharSetLiteral(GrammarAST charSetAST) {
 		String chars = charSetAST.getText();
 		chars = chars.substring(1, chars.length() - 1);
-		String cset = '"' + chars + '"';
 		IntervalSet set = new IntervalSet();
 
 		if (chars.length() == 0) {
 			g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
 					g.fileName, charSetAST.getToken(), "[]");
 			return set;
 		}
-		// unescape all valid escape char like \n, leaving escaped dashes as '\-'
-		// so we can avoid seeing them as '-' range ops.
-		chars = CharSupport.getStringFromGrammarStringLiteral(cset);
-		if (chars == null) {
-			g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE,
-			                           g.fileName, charSetAST.getToken());
-			return set;
-		}
+
+		// Scan left to right. prevCodePoint holds a code point that was read but not
+		// yet added (it may still become the low end of a range); inRange is set once
+		// a '-' follows it. Escapes that expand to several code points (e.g. \p{..})
+		// are added as a whole and are rejected as range endpoints.
+		int prevCodePoint = -1;
+		boolean inRange = false;
 		int n = chars.length();
-		// now make x-y become set of char
 		for (int i = 0; i < n; ) {
 			int c = chars.codePointAt(i);
 			int offset = Character.charCount(c);
-			if (c == '\\' && i+offset < n && chars.codePointAt(i+offset) == '-') { // \-
-				checkSetCollision(charSetAST, set, '-');
-				set.add('-');
-				offset++;
-			}
-			else if (i+offset+1 < n && chars.codePointAt(i+offset) == '-') { // range x-y
-				int x = c;
-				int y = chars.codePointAt(i+offset+1);
-				if (x <= y) {
-					checkSetCollision(charSetAST, set, x, y);
-					set.add(x,y);
+			if (c == '\\') {
+				// NOTE(review): offset still spans only the backslash, so the rest of the escape is
+				// re-read as literals; advance by the parsed length once Result's API is confirmed.
+				EscapeSequenceParsing.Result escapeParseResult = EscapeSequenceParsing.parseEscape(chars, i);
+				switch (escapeParseResult.type) {
+					case INVALID:
+						g.tool.errMgr.grammarError(ErrorType.INVALID_ESCAPE_SEQUENCE, g.fileName, charSetAST.getToken());
+						return new IntervalSet();
+					case INTERVAL_SET:
+						int codePoint = escapeParseResult.intervalSet.getSingleElement();
+						boolean containsMultipleCodePoints = (codePoint == org.antlr.v4.runtime.Token.INVALID_TYPE);
+						if (inRange) {
+							if (containsMultipleCodePoints) {
+								// XXX make a proper error
+								g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(), "[]");
+							} else if (prevCodePoint > codePoint) {
+								g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(), CharSupport.toRange(prevCodePoint, codePoint, CharSupport.ToRangeMode.BRACKETED));
+							} else {
+								checkSetCollision(charSetAST, set, prevCodePoint, codePoint);
+								set.add(prevCodePoint, codePoint);
+							}
+							inRange = false;
+							prevCodePoint = -1;
+						} else {
+							// The pending code point (if any) cannot start a range any more: flush it.
+							if (prevCodePoint != -1) {
+								checkSetCollision(charSetAST, set, prevCodePoint);
+								set.add(prevCodePoint);
+							}
+							if (containsMultipleCodePoints) {
+								set.addAll(escapeParseResult.intervalSet);
+								prevCodePoint = -1;
+							} else {
+								// Stays pending so it can open a range; a later flush adds it exactly once.
+								prevCodePoint = codePoint;
+							}
+						}
+						break;
 				}
-				else {
-					g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED,
-								   g.fileName, charSetAST.getToken(), CharSupport.toRange(x, y, CharSupport.ToRangeMode.BRACKETED));
-				}
-				offset += Character.charCount(y) + 1;
-			}
-			else {
-				checkSetCollision(charSetAST, set, c);
-				set.add(c);
+			} else if (inRange) {
+				if (prevCodePoint > c) {
+					g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(), CharSupport.toRange(prevCodePoint, c, CharSupport.ToRangeMode.BRACKETED));
+				} else {
+					checkSetCollision(charSetAST, set, prevCodePoint, c);
+					set.add(prevCodePoint, c);
+				}
+				inRange = false;
+				prevCodePoint = -1;
+			} else if (prevCodePoint != -1) {
+				if (c == '-') {
+					inRange = true;
+				} else {
+					checkSetCollision(charSetAST, set, prevCodePoint);
+					set.add(prevCodePoint);
+					prevCodePoint = c;
+				}
+			} else if (c == '-') {
+				// XXX make a proper error
+				g.tool.errMgr.grammarError(ErrorType.EMPTY_STRINGS_AND_SETS_NOT_ALLOWED, g.fileName, charSetAST.getToken(), "[]");
+			} else {
+				prevCodePoint = c;
 			}
 			i += offset;
 		}
+		// Whether or not we were in a range, add the last pending code point to the set.
+		// An unterminated range ("a-") is treated as that standalone code point.
+		if (prevCodePoint != -1) {
+			checkSetCollision(charSetAST, set, prevCodePoint);
+			set.add(prevCodePoint);
+		}
 		return set;
 	}