Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 103 additions & 69 deletions src/support/string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,85 +142,119 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) {

std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
os << '"';
for (size_t i = 0; i < str.size(); i++) {
int u0 = str[i];
switch (u0) {
case '\t':
os << "\\t";
constexpr uint32_t replacementCharacter = 0xFFFD;
bool lastWasLeadingSurrogate = false;
for (size_t i = 0; i < str.size();) {
// Decode from WTF-8 into a unicode code point.
uint8_t leading = str[i];
size_t trailingBytes;
uint32_t u;
if ((leading & 0b10000000) == 0b00000000) {
// 0xxxxxxx
trailingBytes = 0;
u = leading;
} else if ((leading & 0b11100000) == 0b11000000) {
// 110xxxxx 10xxxxxx
trailingBytes = 1;
u = (leading & 0b00011111) << 6;
} else if ((leading & 0b11110000) == 0b11100000) {
// 1110xxxx 10xxxxxx 10xxxxxx
trailingBytes = 2;
u = (leading & 0b00001111) << 12;
} else if ((leading & 0b11111000) == 0b11110000) {
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
trailingBytes = 3;
u = (leading & 0b00000111) << 18;
} else {
std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex
<< int(leading) << std::dec << "). Replacing.\n";
trailingBytes = 0;
u = replacementCharacter;
}

++i;

if (i + trailingBytes > str.size()) {
std::cerr << "warning: Unexpected end of string. Replacing.\n";
u = replacementCharacter;
} else {
for (size_t j = 0; j < trailingBytes; ++j) {
uint8_t trailing = str[i + j];
if ((trailing & 0b11000000) != 0b10000000) {
std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex
<< int(trailing) << std::dec << "). Replacing.\n";
u = replacementCharacter;
break;
}
// Shift 6 bits for every remaining trailing byte after this one.
u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
}
}

i += trailingBytes;

bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF;
bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF;
if (lastWasLeadingSurrogate && isTrailingSurrogate) {
std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n";
}
lastWasLeadingSurrogate = isLeadingSurrogate;

// Encode unicode code point into JSON.
switch (u) {
case '"':
os << "\\\"";
continue;
case '\\':
os << "\\\\";
continue;
case '\b':
os << "\\b";
continue;
case '\f':
os << "\\f";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are \b and \f necessary here? (What spec would I read that in?)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(if you add them to the test without this PR, does the test fail?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See page 4 of the JSON spec: https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf. We could also use \uXXXX escapes, which is what the previous code would have done, but these are shorter.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, thanks.

Maybe worth adding those to the test?

continue;
case '\n':
os << "\\n";
continue;
case '\r':
os << "\\r";
continue;
case '"':
os << "\\\"";
continue;
case '\'':
os << "'";
continue;
case '\\':
os << "\\\\";
case '\t':
os << "\\t";
continue;
default: {
// Emit something like \u006e, the JSON escaping of a 16-bit number.
auto uEscape = [&](uint32_t v) {
if (v > 0xffff) {
std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n';
}
os << std::hex;
os << "\\u";
os << ((v >> 12) & 0xf);
os << ((v >> 8) & 0xf);
os << ((v >> 4) & 0xf);
os << (v & 0xf);
os << std::dec;
};

// Based off of
// https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91
if (!(u0 & 0x80)) {
if (u0 >= 32 && u0 < 127) {
// This requires no escaping at all.
os << char(u0);
} else {
uEscape(u0);
}
continue;
}

// This uses 2 bytes.
i++;
int u1 = str[i] & 63;
if ((u0 & 0xE0) == 0xC0) {
uEscape((((u0 & 31) << 6) | u1));
continue;
}
default:
break;
}

// This uses 3 bytes.
i++;
int u2 = str[i] & 63;
if ((u0 & 0xF0) == 0xE0) {
u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
} else {
// This uses 4 bytes.
if ((u0 & 0xF8) != 0xF0) {
std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n';
}
i++;
u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63);
}
// TODO: To minimize size, consider additionally escaping only other control
// characters (u <= 0x1F) and surrogates, emitting everything else directly
// assuming a UTF-8 encoding of the JSON text. We don't do this now because
// Print.cpp would consider the contents unprintable, messing up our test.
bool isNaivelyPrintable = 32 <= u && u < 127;
if (isNaivelyPrintable) {
assert(u < 0x80 && "need additional logic to emit valid UTF-8");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We only get here if u < 127, so the only difference with u < 0x80 = 128 is when u == 128? If so perhaps check that directly?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is meant to guard against future improvement where we would start emitting most code points directly as UTF-8 instead of escaping them. After the initial refactoring to allow most code points to hit this code path by default, this assertion will trigger if we don't add extra logic for code points 0x80 and greater. The assertion would be less robust in the context of that future change if it checked against 128 specifically.

os << uint8_t(u);
continue;
}

if (u0 < 0x10000) {
uEscape(u0);
} else {
// There are two separate code points here.
auto ch = u0 - 0x10000;
uEscape(0xD800 | (ch >> 10));
uEscape(0xDC00 | (ch & 0x3FF));
}
}
// Escape as '\uXXXX` for code points less than 0x10000 or as a
// '\uXXXX\uYYYY' surrogate pair otherwise.
auto printEscape = [&os](uint32_t codePoint) {
assert(codePoint < 0x10000);
os << std::hex << "\\u";
os << ((codePoint & 0xF000) >> 12);
os << ((codePoint & 0x0F00) >> 8);
os << ((codePoint & 0x00F0) >> 4);
os << (codePoint & 0x000F);
os << std::dec;
};
if (u < 0x10000) {
printEscape(u);
} else {
assert(u <= 0x10FFFF && "unexpectedly high code point");
printEscape(0xD800 + ((u - 0x10000) >> 10));
printEscape(0xDC00 + ((u - 0x10000) & 0x3FF));
}
}
return os << '"';
Expand Down
21 changes: 16 additions & 5 deletions test/lit/passes/string-lowering.wast
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,19 @@
(string.const "foo")
)
(drop
(string.const "needs\tescaping\00.'#%\"- .\r\n\\.ꙮ")
(string.const "needs\tescaping\00.'#%\"- .\r\n\\08\0C\0A\0D\09.ꙮ")
)
(drop
(string.const "invalid WTF-8 leading byte \FF")
)
(drop
(string.const "invalid trailing byte \C0\00")
)
(drop
(string.const "unexpected end \C0")
)
(drop
(string.const "invalid surrogate sequence \ED\A0\81\ED\B0\B7")
)
)
)
Expand All @@ -24,7 +36,7 @@
;;
;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s
;;
;; CHECK: custom section "string.consts", size 59, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\.\\ua66e\"]"
;; CHECK: custom section "string.consts", size 202, contents: "[\"bar\",\"foo\",\"invalid WTF-8 leading byte \\ufffd\",\"invalid surrogate sequence \\ud801\\udc37\",\"invalid trailing byte \\ufffd\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\08\\f\\n\\r\\t.\\ua66e\",\"unexpected end \\ufffd\"]"

;; The custom section should parse OK using JSON.parse from node.
;; (Note we run --remove-unused-module-elements to remove externref-using
Expand All @@ -33,6 +45,5 @@
;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm
;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS
;;
;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.\ua66e"]
;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.ꙮ"]

;; CHECK-JS: string: ["bar","foo","invalid WTF-8 leading byte \ufffd","invalid surrogate sequence \ud801\udc37","invalid trailing byte \ufffd","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.\ua66e","unexpected end \ufffd"]
;; CHECK-JS: JSON: ["bar","foo","invalid WTF-8 leading byte �","invalid surrogate sequence 𐐷","invalid trailing byte �","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.ꙮ","unexpected end �"]