-
Notifications
You must be signed in to change notification settings - Fork 830
Improve JSON string encoding #6328
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -142,85 +142,119 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) { | |
|
|
||
| std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) { | ||
| os << '"'; | ||
| for (size_t i = 0; i < str.size(); i++) { | ||
| int u0 = str[i]; | ||
| switch (u0) { | ||
| case '\t': | ||
| os << "\\t"; | ||
| constexpr uint32_t replacementCharacter = 0xFFFD; | ||
| bool lastWasLeadingSurrogate = false; | ||
| for (size_t i = 0; i < str.size();) { | ||
| // Decode from WTF-8 into a unicode code point. | ||
| uint8_t leading = str[i]; | ||
| size_t trailingBytes; | ||
| uint32_t u; | ||
| if ((leading & 0b10000000) == 0b00000000) { | ||
| // 0xxxxxxx | ||
| trailingBytes = 0; | ||
| u = leading; | ||
| } else if ((leading & 0b11100000) == 0b11000000) { | ||
| // 110xxxxx 10xxxxxx | ||
| trailingBytes = 1; | ||
| u = (leading & 0b00011111) << 6; | ||
| } else if ((leading & 0b11110000) == 0b11100000) { | ||
| // 1110xxxx 10xxxxxx 10xxxxxx | ||
| trailingBytes = 2; | ||
| u = (leading & 0b00001111) << 12; | ||
| } else if ((leading & 0b11111000) == 0b11110000) { | ||
| // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx | ||
| trailingBytes = 3; | ||
| u = (leading & 0b00000111) << 18; | ||
| } else { | ||
| std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex | ||
| << int(leading) << std::dec << "). Replacing.\n"; | ||
| trailingBytes = 0; | ||
| u = replacementCharacter; | ||
| } | ||
|
|
||
| ++i; | ||
|
|
||
| if (i + trailingBytes > str.size()) { | ||
| std::cerr << "warning: Unexpected end of string. Replacing.\n"; | ||
| u = replacementCharacter; | ||
| } else { | ||
| for (size_t j = 0; j < trailingBytes; ++j) { | ||
| uint8_t trailing = str[i + j]; | ||
| if ((trailing & 0b11000000) != 0b10000000) { | ||
| std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex | ||
| << int(trailing) << std::dec << "). Replacing.\n"; | ||
| u = replacementCharacter; | ||
| break; | ||
| } | ||
| // Shift 6 bits for every remaining trailing byte after this one. | ||
| u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1)); | ||
| } | ||
| } | ||
|
|
||
| i += trailingBytes; | ||
|
|
||
| bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF; | ||
| bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF; | ||
| if (lastWasLeadingSurrogate && isTrailingSurrogate) { | ||
| std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n"; | ||
| } | ||
| lastWasLeadingSurrogate = isLeadingSurrogate; | ||
|
|
||
| // Encode unicode code point into JSON. | ||
| switch (u) { | ||
| case '"': | ||
| os << "\\\""; | ||
| continue; | ||
| case '\\': | ||
| os << "\\\\"; | ||
| continue; | ||
| case '\b': | ||
| os << "\\b"; | ||
| continue; | ||
| case '\f': | ||
| os << "\\f"; | ||
| continue; | ||
| case '\n': | ||
| os << "\\n"; | ||
| continue; | ||
| case '\r': | ||
| os << "\\r"; | ||
| continue; | ||
| case '"': | ||
| os << "\\\""; | ||
| continue; | ||
| case '\'': | ||
| os << "'"; | ||
| continue; | ||
| case '\\': | ||
| os << "\\\\"; | ||
| case '\t': | ||
| os << "\\t"; | ||
| continue; | ||
| default: { | ||
| // Emit something like \u006e, the JSON escaping of a 16-bit number. | ||
| auto uEscape = [&](uint32_t v) { | ||
| if (v > 0xffff) { | ||
| std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n'; | ||
| } | ||
| os << std::hex; | ||
| os << "\\u"; | ||
| os << ((v >> 12) & 0xf); | ||
| os << ((v >> 8) & 0xf); | ||
| os << ((v >> 4) & 0xf); | ||
| os << (v & 0xf); | ||
| os << std::dec; | ||
| }; | ||
|
|
||
| // Based off of | ||
| // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91 | ||
| if (!(u0 & 0x80)) { | ||
| if (u0 >= 32 && u0 < 127) { | ||
| // This requires no escaping at all. | ||
| os << char(u0); | ||
| } else { | ||
| uEscape(u0); | ||
| } | ||
| continue; | ||
| } | ||
|
|
||
| // This uses 2 bytes. | ||
| i++; | ||
| int u1 = str[i] & 63; | ||
| if ((u0 & 0xE0) == 0xC0) { | ||
| uEscape((((u0 & 31) << 6) | u1)); | ||
| continue; | ||
| } | ||
| default: | ||
| break; | ||
| } | ||
|
|
||
| // This uses 3 bytes. | ||
| i++; | ||
| int u2 = str[i] & 63; | ||
| if ((u0 & 0xF0) == 0xE0) { | ||
| u0 = ((u0 & 15) << 12) | (u1 << 6) | u2; | ||
| } else { | ||
| // This uses 4 bytes. | ||
| if ((u0 & 0xF8) != 0xF0) { | ||
| std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n'; | ||
| } | ||
| i++; | ||
| u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63); | ||
| } | ||
| // TODO: To minimize size, consider additionally escaping only other control | ||
| // characters (u <= 0x1F) and surrogates, emitting everything else directly | ||
| // assuming a UTF-8 encoding of the JSON text. We don't do this now because | ||
| // Print.cpp would consider the contents unprintable, messing up our test. | ||
| bool isNaivelyPrintable = 32 <= u && u < 127; | ||
| if (isNaivelyPrintable) { | ||
| assert(u < 0x80 && "need additional logic to emit valid UTF-8"); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We only get here if
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is meant to guard against future improvement where we would start emitting most code points directly as UTF-8 instead of escaping them. After the initial refactoring to allow most code points to hit this code path by default, this assertion will trigger if we don't add extra logic for code points |
||
| os << uint8_t(u); | ||
| continue; | ||
| } | ||
|
|
||
| if (u0 < 0x10000) { | ||
| uEscape(u0); | ||
| } else { | ||
| // There are two separate code points here. | ||
| auto ch = u0 - 0x10000; | ||
| uEscape(0xD800 | (ch >> 10)); | ||
| uEscape(0xDC00 | (ch & 0x3FF)); | ||
| } | ||
| } | ||
| // Escape as '\uXXXX` for code points less than 0x10000 or as a | ||
| // '\uXXXX\uYYYY' surrogate pair otherwise. | ||
| auto printEscape = [&os](uint32_t codePoint) { | ||
| assert(codePoint < 0x10000); | ||
| os << std::hex << "\\u"; | ||
| os << ((codePoint & 0xF000) >> 12); | ||
| os << ((codePoint & 0x0F00) >> 8); | ||
| os << ((codePoint & 0x00F0) >> 4); | ||
| os << (codePoint & 0x000F); | ||
| os << std::dec; | ||
| }; | ||
| if (u < 0x10000) { | ||
| printEscape(u); | ||
| } else { | ||
| assert(u <= 0x10FFFF && "unexpectedly high code point"); | ||
| printEscape(0xD800 + ((u - 0x10000) >> 10)); | ||
| printEscape(0xDC00 + ((u - 0x10000) & 0x3FF)); | ||
| } | ||
| } | ||
| return os << '"'; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are
\band\fnecessary here? (What spec would I read that in?)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(if you add them to the test without this PR, does the test fail?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See page 4 of the JSON spec: https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf. We could also use
\uXXXXescapes, which is what the previous code would have done, but these are shorter.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it, thanks.
Maybe worth adding those to the test?