Skip to content

Commit 802f4c4

Browse files
authored
Merge pull request #1796 from bhamiltoncx/unbuffered-char-stream-code-points
Change UnbufferedCharStream to use code points
2 parents 1f0db00 + 8108b34 commit 802f4c4

File tree

2 files changed

+69
-9
lines changed

2 files changed

+69
-9
lines changed

runtime/Java/src/org/antlr/v4/runtime/UnbufferedCharStream.java

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ public class UnbufferedCharStream implements CharStream {
2525
* we keep adding to buffer. Otherwise, {@link #consume consume()} resets so
2626
* we start filling at index 0 again.
2727
*/
28-
protected char[] data;
28+
protected int[] data;
2929

3030
/**
3131
* The number of characters currently in {@link #data data}.
@@ -82,7 +82,7 @@ public UnbufferedCharStream() {
8282
/** Useful for subclasses that pull chars from a source other than this.input. */
8383
public UnbufferedCharStream(int bufferSize) {
8484
n = 0;
85-
data = new char[bufferSize];
85+
data = new int[bufferSize];
8686
}
8787

8888
public UnbufferedCharStream(InputStream input) {
@@ -145,13 +145,36 @@ protected void sync(int want) {
145145
*/
146146
protected int fill(int n) {
147147
for (int i=0; i<n; i++) {
148-
if (this.n > 0 && data[this.n - 1] == (char)IntStream.EOF) {
148+
if (this.n > 0 && data[this.n - 1] == IntStream.EOF) {
149149
return i;
150150
}
151151

152152
try {
153153
int c = nextChar();
154-
add(c);
154+
if (c > Character.MAX_VALUE || c == IntStream.EOF) {
155+
add(c);
156+
} else {
157+
char ch = (char) c;
158+
if (Character.isLowSurrogate(ch)) {
159+
throw new RuntimeException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
160+
} else if (Character.isHighSurrogate(ch)) {
161+
int lowSurrogate = nextChar();
162+
if (lowSurrogate > Character.MAX_VALUE) {
163+
throw new RuntimeException("Invalid UTF-16 (high surrogate followed by code point > U+FFFF");
164+
} else if (lowSurrogate == IntStream.EOF) {
165+
throw new RuntimeException("Invalid UTF-16 (dangling high surrogate at end of file)");
166+
} else {
167+
char lowSurrogateChar = (char) lowSurrogate;
168+
if (Character.isLowSurrogate(lowSurrogateChar)) {
169+
add(Character.toCodePoint(ch, lowSurrogateChar));
170+
} else {
171+
throw new RuntimeException("Invalid UTF-16 (dangling high surrogate");
172+
}
173+
}
174+
} else {
175+
add(c);
176+
}
177+
}
155178
}
156179
catch (IOException ioe) {
157180
throw new RuntimeException(ioe);
@@ -173,7 +196,7 @@ protected void add(int c) {
173196
if ( n>=data.length ) {
174197
data = Arrays.copyOf(data, data.length * 2);
175198
}
176-
data[n++] = (char)c;
199+
data[n++] = c;
177200
}
178201

179202
@Override
@@ -183,8 +206,8 @@ public int LA(int i) {
183206
int index = p + i - 1;
184207
if ( index < 0 ) throw new IndexOutOfBoundsException();
185208
if ( index >= n ) return IntStream.EOF;
186-
char c = data[index];
187-
if ( c==(char)IntStream.EOF ) return IntStream.EOF;
209+
int c = data[index];
210+
if ( c==IntStream.EOF ) return IntStream.EOF;
188211
return c;
189212
}
190213

tool-testsuite/test/org/antlr/v4/test/tool/TestUnbufferedCharStream.java

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,30 @@ public void testLastChar() {
313313
assertEquals(expecting, tokens.getTokens().toString());
314314
}
315315

316+
@Test public void testUnicodeSMP() throws Exception {
317+
TestingUnbufferedCharStream input = createStream("\uD83C\uDF0E");
318+
assertEquals(0x1F30E, input.LA(1));
319+
assertEquals("\uD83C\uDF0E", input.getBuffer());
320+
input.consume();
321+
assertEquals(IntStream.EOF, input.LA(1));
322+
assertEquals("\uFFFF", input.getBuffer());
323+
}
324+
325+
@Test(expected = RuntimeException.class)
326+
public void testDanglingHighSurrogateAtEOFThrows() throws Exception {
327+
createStream("\uD83C");
328+
}
329+
330+
@Test(expected = RuntimeException.class)
331+
public void testDanglingHighSurrogateThrows() throws Exception {
332+
createStream("\uD83C\u0123");
333+
}
334+
335+
@Test(expected = RuntimeException.class)
336+
public void testDanglingLowSurrogateThrows() throws Exception {
337+
createStream("\uDF0E");
338+
}
339+
316340
protected static TestingUnbufferedCharStream createStream(String text) {
317341
return new TestingUnbufferedCharStream(new StringReader(text));
318342
}
@@ -336,15 +360,28 @@ public TestingUnbufferedCharStream(Reader input, int bufferSize) {
336360
*/
337361
public String getRemainingBuffer() {
338362
if ( n==0 ) return "";
339-
return new String(data,p,n-p);
363+
int len = n;
364+
if (data[len-1] == IntStream.EOF) {
365+
// Don't pass -1 to new String().
366+
return new String(data,p,len-p-1) + "\uFFFF";
367+
} else {
368+
return new String(data,p,len-p);
369+
}
340370
}
341371

342372
/** For testing. What's in the moving window buffer into the data stream.
343373
* Elements 0..p-1 have been consumed.
344374
*/
345375
public String getBuffer() {
346376
if ( n==0 ) return "";
347-
return new String(data,0,n);
377+
int len = n;
378+
// Don't pass -1 to new String().
379+
if (data[len-1] == IntStream.EOF) {
380+
// Don't pass -1 to new String().
381+
return new String(data,0,len-1) + "\uFFFF";
382+
} else {
383+
return new String(data,0,len);
384+
}
348385
}
349386

350387
}

0 commit comments

Comments
 (0)