4
4
*/
5
5
using System ;
6
6
using System . IO ;
7
+ using System . Text ;
7
8
using Antlr4 . Runtime ;
8
9
using Antlr4 . Runtime . Misc ;
9
10
using Antlr4 . Runtime . Sharpen ;
@@ -27,7 +28,7 @@ public class UnbufferedCharStream : ICharStream
27
28
/// resets so
28
29
/// we start filling at index 0 again.
29
30
/// </remarks>
30
- protected internal char [ ] data ;
31
+ protected internal int [ ] data ;
31
32
32
33
/// <summary>
33
34
/// The number of characters currently in
@@ -119,7 +120,7 @@ public UnbufferedCharStream()
119
120
public UnbufferedCharStream ( int bufferSize )
120
121
{
121
122
n = 0 ;
122
- data = new char [ bufferSize ] ;
123
+ data = new int [ bufferSize ] ;
123
124
}
124
125
125
126
public UnbufferedCharStream ( Stream input )
@@ -211,13 +212,52 @@ protected internal virtual int Fill(int n)
211
212
{
212
213
for ( int i = 0 ; i < n ; i ++ )
213
214
{
214
- if ( this . n > 0 && data [ this . n - 1 ] == unchecked ( ( char ) IntStreamConstants . EOF ) )
215
+ if ( this . n > 0 && data [ this . n - 1 ] == IntStreamConstants . EOF )
215
216
{
216
217
return i ;
217
218
}
218
219
219
220
int c = NextChar ( ) ;
220
- Add ( c ) ;
221
+ if ( c > char . MaxValue || c == IntStreamConstants . EOF )
222
+ {
223
+ Add ( c ) ;
224
+ }
225
+ else
226
+ {
227
+ char ch = unchecked ( ( char ) c ) ;
228
+ if ( Char . IsLowSurrogate ( ch ) )
229
+ {
230
+ throw new ArgumentException ( "Invalid UTF-16 (low surrogate with no preceding high surrogate)" ) ;
231
+ }
232
+ else if ( Char . IsHighSurrogate ( ch ) )
233
+ {
234
+ int lowSurrogate = NextChar ( ) ;
235
+ if ( lowSurrogate > char . MaxValue )
236
+ {
237
+ throw new ArgumentException ( "Invalid UTF-16 (high surrogate followed by code point > U+FFFF)" ) ;
238
+ }
239
+ else if ( lowSurrogate == IntStreamConstants . EOF )
240
+ {
241
+ throw new ArgumentException ( "Invalid UTF-16 (dangling high surrogate at end of file)" ) ;
242
+ }
243
+ else
244
+ {
245
+ char lowSurrogateChar = unchecked ( ( char ) lowSurrogate ) ;
246
+ if ( Char . IsLowSurrogate ( lowSurrogateChar ) )
247
+ {
248
+ Add ( Char . ConvertToUtf32 ( ch , lowSurrogateChar ) ) ;
249
+ }
250
+ else
251
+ {
252
+ throw new ArgumentException ( "Invalid UTF-16 (high surrogate with no following low surrogate)" ) ;
253
+ }
254
+ }
255
+ }
256
+ else
257
+ {
258
+ Add ( c ) ;
259
+ }
260
+ }
221
261
}
222
262
return n ;
223
263
}
@@ -239,7 +279,7 @@ protected internal virtual void Add(int c)
239
279
{
240
280
data = Arrays . CopyOf ( data , data . Length * 2 ) ;
241
281
}
242
- data [ n ++ ] = ( char ) c ;
282
+ data [ n ++ ] = c ;
243
283
}
244
284
245
285
public virtual int LA ( int i )
@@ -259,12 +299,7 @@ public virtual int LA(int i)
259
299
{
260
300
return IntStreamConstants . EOF ;
261
301
}
262
- char c = data [ index ] ;
263
- if ( c == unchecked ( ( char ) IntStreamConstants . EOF ) )
264
- {
265
- return IntStreamConstants . EOF ;
266
- }
267
- return c ;
302
+ return data [ index ] ;
268
303
}
269
304
270
305
/// <summary>Return a marker that we can release later.</summary>
@@ -395,7 +430,7 @@ public virtual string GetText(Interval interval)
395
430
throw new ArgumentException ( "invalid interval" ) ;
396
431
}
397
432
int bufferStartIndex = BufferStartIndex ;
398
- if ( n > 0 && data [ n - 1 ] == char . MaxValue )
433
+ if ( n > 0 && data [ n - 1 ] == IntStreamConstants . EOF )
399
434
{
400
435
if ( interval . a + interval . Length > bufferStartIndex + n )
401
436
{
@@ -408,7 +443,12 @@ public virtual string GetText(Interval interval)
408
443
}
409
444
// convert from absolute to local index
410
445
int i = interval . a - bufferStartIndex ;
411
- return new string ( data , i , interval . Length ) ;
446
+ // build a UTF-16 string from the Unicode code points in data
447
+ var sb = new StringBuilder ( interval . Length ) ;
448
+ for ( int offset = 0 ; offset < interval . Length ; offset ++ ) {
449
+ sb . Append ( Char . ConvertFromUtf32 ( data [ i + offset ] ) ) ;
450
+ }
451
+ return sb . ToString ( ) ;
412
452
}
413
453
414
454
protected internal int BufferStartIndex
0 commit comments