Skip to content

Commit 04972c2

Browse files
authored
Merge pull request #1798 from bhamiltoncx/csharp-unbuffered-char-stream-code-points
C#: Change UnbufferedCharStream to use 32-bit Unicode code points and 32-bit buffer
2 parents 802f4c4 + 366dbac commit 04972c2

File tree

1 file changed

+53
-13
lines changed

1 file changed

+53
-13
lines changed

runtime/CSharp/runtime/CSharp/Antlr4.Runtime/UnbufferedCharStream.cs

Lines changed: 53 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*/
55
using System;
66
using System.IO;
7+
using System.Text;
78
using Antlr4.Runtime;
89
using Antlr4.Runtime.Misc;
910
using Antlr4.Runtime.Sharpen;
@@ -27,7 +28,7 @@ public class UnbufferedCharStream : ICharStream
2728
/// resets so
2829
/// we start filling at index 0 again.
2930
/// </remarks>
30-
protected internal char[] data;
31+
protected internal int[] data;
3132

3233
/// <summary>
3334
/// The number of characters currently in
@@ -119,7 +120,7 @@ public UnbufferedCharStream()
119120
public UnbufferedCharStream(int bufferSize)
120121
{
121122
n = 0;
122-
data = new char[bufferSize];
123+
data = new int[bufferSize];
123124
}
124125

125126
public UnbufferedCharStream(Stream input)
@@ -211,13 +212,52 @@ protected internal virtual int Fill(int n)
211212
{
212213
for (int i = 0; i < n; i++)
213214
{
214-
if (this.n > 0 && data[this.n - 1] == unchecked((char)IntStreamConstants.EOF))
215+
if (this.n > 0 && data[this.n - 1] == IntStreamConstants.EOF)
215216
{
216217
return i;
217218
}
218219

219220
int c = NextChar();
220-
Add(c);
221+
if (c > char.MaxValue || c == IntStreamConstants.EOF)
222+
{
223+
Add(c);
224+
}
225+
else
226+
{
227+
char ch = unchecked((char)c);
228+
if (Char.IsLowSurrogate(ch))
229+
{
230+
throw new ArgumentException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
231+
}
232+
else if (Char.IsHighSurrogate(ch))
233+
{
234+
int lowSurrogate = NextChar();
235+
if (lowSurrogate > char.MaxValue)
236+
{
237+
throw new ArgumentException("Invalid UTF-16 (high surrogate followed by code point > U+FFFF");
238+
}
239+
else if (lowSurrogate == IntStreamConstants.EOF)
240+
{
241+
throw new ArgumentException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
242+
}
243+
else
244+
{
245+
char lowSurrogateChar = unchecked((char)lowSurrogate);
246+
if (Char.IsLowSurrogate(lowSurrogateChar))
247+
{
248+
Add(Char.ConvertToUtf32(ch, lowSurrogateChar));
249+
}
250+
else
251+
{
252+
throw new ArgumentException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
253+
}
254+
}
255+
}
256+
else
257+
{
258+
Add(c);
259+
}
260+
}
221261
}
222262
return n;
223263
}
@@ -239,7 +279,7 @@ protected internal virtual void Add(int c)
239279
{
240280
data = Arrays.CopyOf(data, data.Length * 2);
241281
}
242-
data[n++] = (char)c;
282+
data[n++] = c;
243283
}
244284

245285
public virtual int LA(int i)
@@ -259,12 +299,7 @@ public virtual int LA(int i)
259299
{
260300
return IntStreamConstants.EOF;
261301
}
262-
char c = data[index];
263-
if (c == unchecked((char)IntStreamConstants.EOF))
264-
{
265-
return IntStreamConstants.EOF;
266-
}
267-
return c;
302+
return data[index];
268303
}
269304

270305
/// <summary>Return a marker that we can release later.</summary>
@@ -395,7 +430,7 @@ public virtual string GetText(Interval interval)
395430
throw new ArgumentException("invalid interval");
396431
}
397432
int bufferStartIndex = BufferStartIndex;
398-
if (n > 0 && data[n - 1] == char.MaxValue)
433+
if (n > 0 && data[n - 1] == IntStreamConstants.EOF)
399434
{
400435
if (interval.a + interval.Length > bufferStartIndex + n)
401436
{
@@ -408,7 +443,12 @@ public virtual string GetText(Interval interval)
408443
}
409444
// convert from absolute to local index
410445
int i = interval.a - bufferStartIndex;
411-
return new string(data, i, interval.Length);
446+
// build a UTF-16 string from the Unicode code points in data
447+
var sb = new StringBuilder(interval.Length);
448+
for (int offset = 0; offset < interval.Length; offset++) {
449+
sb.Append(Char.ConvertFromUtf32(data[i + offset]));
450+
}
451+
return sb.ToString();
412452
}
413453

414454
protected internal int BufferStartIndex

0 commit comments

Comments
 (0)