Skip to content

Commit b9c876e

Browse files
tlively authored and radekdoulik committed
Improve JSON string encoding (WebAssembly#6328)
Catch and report all kinds of WTF-8 encoding errors in the source strings, including invalid leading bytes, invalid trailing bytes, unexpected ends of strings, and invalid surrogate sequences. Insert replacement characters into the output as necessary. Add a TODO about minimizing size by escaping only those code points mandated to be escaped by the JSON spec. Generally improve readability of the code.
1 parent 232ec0f commit b9c876e

File tree

2 files changed

+119
-74
lines changed

2 files changed

+119
-74
lines changed

src/support/string.cpp

Lines changed: 103 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -142,85 +142,119 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
142142

143143
std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
144144
os << '"';
145-
for (size_t i = 0; i < str.size(); i++) {
146-
int u0 = str[i];
147-
switch (u0) {
148-
case '\t':
149-
os << "\\t";
145+
constexpr uint32_t replacementCharacter = 0xFFFD;
146+
bool lastWasLeadingSurrogate = false;
147+
for (size_t i = 0; i < str.size();) {
148+
// Decode from WTF-8 into a unicode code point.
149+
uint8_t leading = str[i];
150+
size_t trailingBytes;
151+
uint32_t u;
152+
if ((leading & 0b10000000) == 0b00000000) {
153+
// 0xxxxxxx
154+
trailingBytes = 0;
155+
u = leading;
156+
} else if ((leading & 0b11100000) == 0b11000000) {
157+
// 110xxxxx 10xxxxxx
158+
trailingBytes = 1;
159+
u = (leading & 0b00011111) << 6;
160+
} else if ((leading & 0b11110000) == 0b11100000) {
161+
// 1110xxxx 10xxxxxx 10xxxxxx
162+
trailingBytes = 2;
163+
u = (leading & 0b00001111) << 12;
164+
} else if ((leading & 0b11111000) == 0b11110000) {
165+
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
166+
trailingBytes = 3;
167+
u = (leading & 0b00000111) << 18;
168+
} else {
169+
std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex
170+
<< int(leading) << std::dec << "). Replacing.\n";
171+
trailingBytes = 0;
172+
u = replacementCharacter;
173+
}
174+
175+
++i;
176+
177+
if (i + trailingBytes > str.size()) {
178+
std::cerr << "warning: Unexpected end of string. Replacing.\n";
179+
u = replacementCharacter;
180+
} else {
181+
for (size_t j = 0; j < trailingBytes; ++j) {
182+
uint8_t trailing = str[i + j];
183+
if ((trailing & 0b11000000) != 0b10000000) {
184+
std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex
185+
<< int(trailing) << std::dec << "). Replacing.\n";
186+
u = replacementCharacter;
187+
break;
188+
}
189+
// Shift 6 bits for every remaining trailing byte after this one.
190+
u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
191+
}
192+
}
193+
194+
i += trailingBytes;
195+
196+
bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF;
197+
bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF;
198+
if (lastWasLeadingSurrogate && isTrailingSurrogate) {
199+
std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n";
200+
}
201+
lastWasLeadingSurrogate = isLeadingSurrogate;
202+
203+
// Encode unicode code point into JSON.
204+
switch (u) {
205+
case '"':
206+
os << "\\\"";
207+
continue;
208+
case '\\':
209+
os << "\\\\";
210+
continue;
211+
case '\b':
212+
os << "\\b";
213+
continue;
214+
case '\f':
215+
os << "\\f";
150216
continue;
151217
case '\n':
152218
os << "\\n";
153219
continue;
154220
case '\r':
155221
os << "\\r";
156222
continue;
157-
case '"':
158-
os << "\\\"";
159-
continue;
160-
case '\'':
161-
os << "'";
162-
continue;
163-
case '\\':
164-
os << "\\\\";
223+
case '\t':
224+
os << "\\t";
165225
continue;
166-
default: {
167-
// Emit something like \u006e, the JSON escaping of a 16-bit number.
168-
auto uEscape = [&](uint32_t v) {
169-
if (v > 0xffff) {
170-
std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n';
171-
}
172-
os << std::hex;
173-
os << "\\u";
174-
os << ((v >> 12) & 0xf);
175-
os << ((v >> 8) & 0xf);
176-
os << ((v >> 4) & 0xf);
177-
os << (v & 0xf);
178-
os << std::dec;
179-
};
180-
181-
// Based off of
182-
// https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91
183-
if (!(u0 & 0x80)) {
184-
if (u0 >= 32 && u0 < 127) {
185-
// This requires no escaping at all.
186-
os << char(u0);
187-
} else {
188-
uEscape(u0);
189-
}
190-
continue;
191-
}
192-
193-
// This uses 2 bytes.
194-
i++;
195-
int u1 = str[i] & 63;
196-
if ((u0 & 0xE0) == 0xC0) {
197-
uEscape((((u0 & 31) << 6) | u1));
198-
continue;
199-
}
226+
default:
227+
break;
228+
}
200229

201-
// This uses 3 bytes.
202-
i++;
203-
int u2 = str[i] & 63;
204-
if ((u0 & 0xF0) == 0xE0) {
205-
u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
206-
} else {
207-
// This uses 4 bytes.
208-
if ((u0 & 0xF8) != 0xF0) {
209-
std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n';
210-
}
211-
i++;
212-
u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63);
213-
}
230+
// TODO: To minimize size, consider additionally escaping only other control
231+
// characters (u <= 0x1F) and surrogates, emitting everything else directly
232+
// assuming a UTF-8 encoding of the JSON text. We don't do this now because
233+
// Print.cpp would consider the contents unprintable, messing up our test.
234+
bool isNaivelyPrintable = 32 <= u && u < 127;
235+
if (isNaivelyPrintable) {
236+
assert(u < 0x80 && "need additional logic to emit valid UTF-8");
237+
os << uint8_t(u);
238+
continue;
239+
}
214240

215-
if (u0 < 0x10000) {
216-
uEscape(u0);
217-
} else {
218-
// There are two separate code points here.
219-
auto ch = u0 - 0x10000;
220-
uEscape(0xD800 | (ch >> 10));
221-
uEscape(0xDC00 | (ch & 0x3FF));
222-
}
223-
}
241+
// Escape as '\uXXXX' for code points less than 0x10000 or as a
242+
// '\uXXXX\uYYYY' surrogate pair otherwise.
243+
auto printEscape = [&os](uint32_t codePoint) {
244+
assert(codePoint < 0x10000);
245+
os << std::hex << "\\u";
246+
os << ((codePoint & 0xF000) >> 12);
247+
os << ((codePoint & 0x0F00) >> 8);
248+
os << ((codePoint & 0x00F0) >> 4);
249+
os << (codePoint & 0x000F);
250+
os << std::dec;
251+
};
252+
if (u < 0x10000) {
253+
printEscape(u);
254+
} else {
255+
assert(u <= 0x10FFFF && "unexpectedly high code point");
256+
printEscape(0xD800 + ((u - 0x10000) >> 10));
257+
printEscape(0xDC00 + ((u - 0x10000) & 0x3FF));
224258
}
225259
}
226260
return os << '"';

test/lit/passes/string-lowering.wast

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,19 @@
1414
(string.const "foo")
1515
)
1616
(drop
17-
(string.const "needs\tescaping\00.'#%\"- .\r\n\\.ꙮ")
17+
(string.const "needs\tescaping\00.'#%\"- .\r\n\\08\0C\0A\0D\09.ꙮ")
18+
)
19+
(drop
20+
(string.const "invalid WTF-8 leading byte \FF")
21+
)
22+
(drop
23+
(string.const "invalid trailing byte \C0\00")
24+
)
25+
(drop
26+
(string.const "unexpected end \C0")
27+
)
28+
(drop
29+
(string.const "invalid surrogate sequence \ED\A0\81\ED\B0\B7")
1830
)
1931
)
2032
)
@@ -24,7 +36,7 @@
2436
;;
2537
;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s
2638
;;
27-
;; CHECK: custom section "string.consts", size 59, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\.\\ua66e\"]"
39+
;; CHECK: custom section "string.consts", size 202, contents: "[\"bar\",\"foo\",\"invalid WTF-8 leading byte \\ufffd\",\"invalid surrogate sequence \\ud801\\udc37\",\"invalid trailing byte \\ufffd\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\08\\f\\n\\r\\t.\\ua66e\",\"unexpected end \\ufffd\"]"
2840

2941
;; The custom section should parse OK using JSON.parse from node.
3042
;; (Note we run --remove-unused-module-elements to remove externref-using
@@ -33,6 +45,5 @@
3345
;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm
3446
;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS
3547
;;
36-
;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.\ua66e"]
37-
;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.ꙮ"]
38-
48+
;; CHECK-JS: string: ["bar","foo","invalid WTF-8 leading byte \ufffd","invalid surrogate sequence \ud801\udc37","invalid trailing byte \ufffd","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.\ua66e","unexpected end \ufffd"]
49+
;; CHECK-JS: JSON: ["bar","foo","invalid WTF-8 leading byte �","invalid surrogate sequence 𐐷","invalid trailing byte �","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.ꙮ","unexpected end �"]

0 commit comments

Comments
 (0)