@@ -142,85 +142,119 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
142142
143143std::ostream& printEscapedJSON (std::ostream& os, const std::string_view str) {
144144 os << ' "' ;
145- for (size_t i = 0 ; i < str.size (); i++) {
146- int u0 = str[i];
147- switch (u0) {
148- case ' \t ' :
149- os << " \\ t" ;
145+ constexpr uint32_t replacementCharacter = 0xFFFD ;
146+ bool lastWasLeadingSurrogate = false ;
147+ for (size_t i = 0 ; i < str.size ();) {
148+ // Decode from WTF-8 into a unicode code point.
149+ uint8_t leading = str[i];
150+ size_t trailingBytes;
151+ uint32_t u;
152+ if ((leading & 0b10000000 ) == 0b00000000 ) {
153+ // 0xxxxxxx
154+ trailingBytes = 0 ;
155+ u = leading;
156+ } else if ((leading & 0b11100000 ) == 0b11000000 ) {
157+ // 110xxxxx 10xxxxxx
158+ trailingBytes = 1 ;
159+ u = (leading & 0b00011111 ) << 6 ;
160+ } else if ((leading & 0b11110000 ) == 0b11100000 ) {
161+ // 1110xxxx 10xxxxxx 10xxxxxx
162+ trailingBytes = 2 ;
163+ u = (leading & 0b00001111 ) << 12 ;
164+ } else if ((leading & 0b11111000 ) == 0b11110000 ) {
165+ // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
166+ trailingBytes = 3 ;
167+ u = (leading & 0b00000111 ) << 18 ;
168+ } else {
169+ std::cerr << " warning: Bad WTF-8 leading byte (" << std::hex
170+ << int (leading) << std::dec << " ). Replacing.\n " ;
171+ trailingBytes = 0 ;
172+ u = replacementCharacter;
173+ }
174+
175+ ++i;
176+
177+ if (i + trailingBytes > str.size ()) {
178+ std::cerr << " warning: Unexpected end of string. Replacing.\n " ;
179+ u = replacementCharacter;
180+ } else {
181+ for (size_t j = 0 ; j < trailingBytes; ++j) {
182+ uint8_t trailing = str[i + j];
183+ if ((trailing & 0b11000000 ) != 0b10000000 ) {
184+ std::cerr << " warning: Bad WTF-8 trailing byte (" << std::hex
185+ << int (trailing) << std::dec << " ). Replacing.\n " ;
186+ u = replacementCharacter;
187+ break ;
188+ }
189+ // Shift 6 bits for every remaining trailing byte after this one.
190+ u |= (trailing & 0b00111111 ) << (6 * (trailingBytes - j - 1 ));
191+ }
192+ }
193+
194+ i += trailingBytes;
195+
196+ bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF ;
197+ bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF ;
198+ if (lastWasLeadingSurrogate && isTrailingSurrogate) {
199+ std::cerr << " warning: Invalid surrogate sequence in WTF-8.\n " ;
200+ }
201+ lastWasLeadingSurrogate = isLeadingSurrogate;
202+
203+ // Encode unicode code point into JSON.
204+ switch (u) {
205+ case ' "' :
206+ os << " \\\" " ;
207+ continue ;
208+ case ' \\ ' :
209+ os << " \\\\ " ;
210+ continue ;
211+ case ' \b ' :
212+ os << " \\ b" ;
213+ continue ;
214+ case ' \f ' :
215+ os << " \\ f" ;
150216 continue ;
151217 case ' \n ' :
152218 os << " \\ n" ;
153219 continue ;
154220 case ' \r ' :
155221 os << " \\ r" ;
156222 continue ;
157- case ' "' :
158- os << " \\\" " ;
159- continue ;
160- case ' \' ' :
161- os << " '" ;
162- continue ;
163- case ' \\ ' :
164- os << " \\\\ " ;
223+ case ' \t ' :
224+ os << " \\ t" ;
165225 continue ;
166- default : {
167- // Emit something like \u006e, the JSON escaping of a 16-bit number.
168- auto uEscape = [&](uint32_t v) {
169- if (v > 0xffff ) {
170- std::cerr << " warning: Bad 16-bit escapee " << int (u0) << ' \n ' ;
171- }
172- os << std::hex;
173- os << " \\ u" ;
174- os << ((v >> 12 ) & 0xf );
175- os << ((v >> 8 ) & 0xf );
176- os << ((v >> 4 ) & 0xf );
177- os << (v & 0xf );
178- os << std::dec;
179- };
180-
181- // Based off of
182- // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91
183- if (!(u0 & 0x80 )) {
184- if (u0 >= 32 && u0 < 127 ) {
185- // This requires no escaping at all.
186- os << char (u0);
187- } else {
188- uEscape (u0);
189- }
190- continue ;
191- }
192-
193- // This uses 2 bytes.
194- i++;
195- int u1 = str[i] & 63 ;
196- if ((u0 & 0xE0 ) == 0xC0 ) {
197- uEscape ((((u0 & 31 ) << 6 ) | u1));
198- continue ;
199- }
226+ default :
227+ break ;
228+ }
200229
201- // This uses 3 bytes.
202- i++;
203- int u2 = str[i] & 63 ;
204- if ((u0 & 0xF0 ) == 0xE0 ) {
205- u0 = ((u0 & 15 ) << 12 ) | (u1 << 6 ) | u2;
206- } else {
207- // This uses 4 bytes.
208- if ((u0 & 0xF8 ) != 0xF0 ) {
209- std::cerr << " warning: Bad UTF-8 leading byte " << int (u0) << ' \n ' ;
210- }
211- i++;
212- u0 = ((u0 & 7 ) << 18 ) | (u1 << 12 ) | (u2 << 6 ) | (str[i] & 63 );
213- }
230+ // TODO: To minimize size, consider additionally escaping only other control
231+ // characters (u <= 0x1F) and surrogates, emitting everything else directly
232+ // assuming a UTF-8 encoding of the JSON text. We don't do this now because
233+ // Print.cpp would consider the contents unprintable, messing up our test.
234+ bool isNaivelyPrintable = 32 <= u && u < 127 ;
235+ if (isNaivelyPrintable) {
236+ assert (u < 0x80 && " need additional logic to emit valid UTF-8" );
237+ os << uint8_t (u);
238+ continue ;
239+ }
214240
215- if (u0 < 0x10000 ) {
216- uEscape (u0);
217- } else {
218- // There are two separate code points here.
219- auto ch = u0 - 0x10000 ;
220- uEscape (0xD800 | (ch >> 10 ));
221- uEscape (0xDC00 | (ch & 0x3FF ));
222- }
223- }
241+ // Escape as '\uXXXX' for code points less than 0x10000 or as a
242+ // '\uXXXX\uYYYY' surrogate pair otherwise.
243+ auto printEscape = [&os](uint32_t codePoint) {
244+ assert (codePoint < 0x10000 );
245+ os << std::hex << " \\ u" ;
246+ os << ((codePoint & 0xF000 ) >> 12 );
247+ os << ((codePoint & 0x0F00 ) >> 8 );
248+ os << ((codePoint & 0x00F0 ) >> 4 );
249+ os << (codePoint & 0x000F );
250+ os << std::dec;
251+ };
252+ if (u < 0x10000 ) {
253+ printEscape (u);
254+ } else {
255+ assert (u <= 0x10FFFF && " unexpectedly high code point" );
256+ printEscape (0xD800 + ((u - 0x10000 ) >> 10 ));
257+ printEscape (0xDC00 + ((u - 0x10000 ) & 0x3FF ));
224258 }
225259 }
226260 return os << ' "' ;
0 commit comments