Improve JSON string encoding (WebAssembly#6328)

tlively · radekdoulik · commit b9c876ea2df3 · 2024-07-12T12:09:29.000+02:00
Catch and report all kinds of WTF-8 encoding errors in the source strings,
including invalid leading bytes, invalid trailing bytes, unexpected ends of
strings, and invalid surrogate sequences. Insert replacement characters into the
output as necessary. Add a TODO about minimizing size by escaping only those
code points mandated to be escaped by the JSON spec. Generally improve
readability of the code.
diff --git a/src/support/string.cpp b/src/support/string.cpp
@@ -142,85 +142,119 @@ std::ostream& printEscaped(std::ostream& os, const std::string_view str) {
 
 std::ostream& printEscapedJSON(std::ostream& os, const std::string_view str) {
   os << '"';
-  for (size_t i = 0; i < str.size(); i++) {
-    int u0 = str[i];
-    switch (u0) {
-      case '\t':
-        os << "\\t";
+  constexpr uint32_t replacementCharacter = 0xFFFD;
+  bool lastWasLeadingSurrogate = false;
+  for (size_t i = 0; i < str.size();) {
+    // Decode from WTF-8 into a unicode code point.
+    uint8_t leading = str[i];
+    size_t trailingBytes;
+    uint32_t u;
+    if ((leading & 0b10000000) == 0b00000000) {
+      // 0xxxxxxx
+      trailingBytes = 0;
+      u = leading;
+    } else if ((leading & 0b11100000) == 0b11000000) {
+      // 110xxxxx 10xxxxxx
+      trailingBytes = 1;
+      u = (leading & 0b00011111) << 6;
+    } else if ((leading & 0b11110000) == 0b11100000) {
+      // 1110xxxx 10xxxxxx 10xxxxxx
+      trailingBytes = 2;
+      u = (leading & 0b00001111) << 12;
+    } else if ((leading & 0b11111000) == 0b11110000) {
+      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      trailingBytes = 3;
+      u = (leading & 0b00000111) << 18;
+    } else {
+      std::cerr << "warning: Bad WTF-8 leading byte (" << std::hex
+                << int(leading) << std::dec << "). Replacing.\n";
+      trailingBytes = 0;
+      u = replacementCharacter;
+    }
+
+    ++i;
+
+    if (i + trailingBytes > str.size()) {
+      std::cerr << "warning: Unexpected end of string. Replacing.\n";
+      u = replacementCharacter;
+    } else {
+      for (size_t j = 0; j < trailingBytes; ++j) {
+        uint8_t trailing = str[i + j];
+        if ((trailing & 0b11000000) != 0b10000000) {
+          std::cerr << "warning: Bad WTF-8 trailing byte (" << std::hex
+                    << int(trailing) << std::dec << "). Replacing.\n";
+          u = replacementCharacter;
+          break;
+        }
+        // Shift 6 bits for every remaining trailing byte after this one.
+        u |= (trailing & 0b00111111) << (6 * (trailingBytes - j - 1));
+      }
+    }
+
+    i += trailingBytes;
+
+    bool isLeadingSurrogate = 0xD800 <= u && u <= 0xDBFF;
+    bool isTrailingSurrogate = 0xDC00 <= u && u <= 0xDFFF;
+    if (lastWasLeadingSurrogate && isTrailingSurrogate) {
+      std::cerr << "warning: Invalid surrogate sequence in WTF-8.\n";
+    }
+    lastWasLeadingSurrogate = isLeadingSurrogate;
+
+    // Encode unicode code point into JSON.
+    switch (u) {
+      case '"':
+        os << "\\\"";
+        continue;
+      case '\\':
+        os << "\\\\";
+        continue;
+      case '\b':
+        os << "\\b";
+        continue;
+      case '\f':
+        os << "\\f";
         continue;
       case '\n':
         os << "\\n";
         continue;
       case '\r':
         os << "\\r";
         continue;
-      case '"':
-        os << "\\\"";
-        continue;
-      case '\'':
-        os << "'";
-        continue;
-      case '\\':
-        os << "\\\\";
+      case '\t':
+        os << "\\t";
         continue;
-      default: {
-        // Emit something like \u006e, the JSON escaping of a 16-bit number.
-        auto uEscape = [&](uint32_t v) {
-          if (v > 0xffff) {
-            std::cerr << "warning: Bad 16-bit escapee " << int(u0) << '\n';
-          }
-          os << std::hex;
-          os << "\\u";
-          os << ((v >> 12) & 0xf);
-          os << ((v >> 8) & 0xf);
-          os << ((v >> 4) & 0xf);
-          os << (v & 0xf);
-          os << std::dec;
-        };
-
-        // Based off of
-        // https://github.com/emscripten-core/emscripten/blob/59e6b8f1262d75585d8416b728e8cbb3db176fe2/src/library_strings.js#L72-L91
-        if (!(u0 & 0x80)) {
-          if (u0 >= 32 && u0 < 127) {
-            // This requires no escaping at all.
-            os << char(u0);
-          } else {
-            uEscape(u0);
-          }
-          continue;
-        }
-
-        // This uses 2 bytes.
-        i++;
-        int u1 = str[i] & 63;
-        if ((u0 & 0xE0) == 0xC0) {
-          uEscape((((u0 & 31) << 6) | u1));
-          continue;
-        }
+      default:
+        break;
+    }
 
-        // This uses 3 bytes.
-        i++;
-        int u2 = str[i] & 63;
-        if ((u0 & 0xF0) == 0xE0) {
-          u0 = ((u0 & 15) << 12) | (u1 << 6) | u2;
-        } else {
-          // This uses 4 bytes.
-          if ((u0 & 0xF8) != 0xF0) {
-            std::cerr << "warning: Bad UTF-8 leading byte " << int(u0) << '\n';
-          }
-          i++;
-          u0 = ((u0 & 7) << 18) | (u1 << 12) | (u2 << 6) | (str[i] & 63);
-        }
+    // TODO: To minimize size, consider additionally escaping only other control
+    // characters (u <= 0x1F) and surrogates, emitting everything else directly
+    // assuming a UTF-8 encoding of the JSON text. We don't do this now because
+    // Print.cpp would consider the contents unprintable, messing up our test.
+    bool isNaivelyPrintable = 32 <= u && u < 127;
+    if (isNaivelyPrintable) {
+      assert(u < 0x80 && "need additional logic to emit valid UTF-8");
+      os << uint8_t(u);
+      continue;
+    }
 
-        if (u0 < 0x10000) {
-          uEscape(u0);
-        } else {
-          // There are two separate code points here.
-          auto ch = u0 - 0x10000;
-          uEscape(0xD800 | (ch >> 10));
-          uEscape(0xDC00 | (ch & 0x3FF));
-        }
-      }
+    // Escape as '\uXXXX' for code points less than 0x10000 or as a
+    // '\uXXXX\uYYYY' surrogate pair otherwise.
+    auto printEscape = [&os](uint32_t codePoint) {
+      assert(codePoint < 0x10000);
+      os << std::hex << "\\u";
+      os << ((codePoint & 0xF000) >> 12);
+      os << ((codePoint & 0x0F00) >> 8);
+      os << ((codePoint & 0x00F0) >> 4);
+      os << (codePoint & 0x000F);
+      os << std::dec;
+    };
+    if (u < 0x10000) {
+      printEscape(u);
+    } else {
+      assert(u <= 0x10FFFF && "unexpectedly high code point");
+      printEscape(0xD800 + ((u - 0x10000) >> 10));
+      printEscape(0xDC00 + ((u - 0x10000) & 0x3FF));
     }
   }
   return os << '"';
diff --git a/test/lit/passes/string-lowering.wast b/test/lit/passes/string-lowering.wast
@@ -14,7 +14,19 @@
       (string.const "foo")
     )
     (drop
-      (string.const "needs\tescaping\00.'#%\"- .\r\n\\.ꙮ")
+      (string.const "needs\tescaping\00.'#%\"- .\r\n\\08\0C\0A\0D\09.ꙮ")
+    )
+    (drop
+      (string.const "invalid WTF-8 leading byte \FF")
+    )
+    (drop
+      (string.const "invalid trailing byte \C0\00")
+    )
+    (drop
+      (string.const "unexpected end \C0")
+    )
+    (drop
+      (string.const "invalid surrogate sequence \ED\A0\81\ED\B0\B7")
     )
   )
 )
@@ -24,7 +36,7 @@
 ;;
 ;; RUN: wasm-opt %s --string-lowering -all -S -o - | filecheck %s
 ;;
-;; CHECK: custom section "string.consts", size 59, contents: "[\"bar\",\"foo\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\.\\ua66e\"]"
+;; CHECK: custom section "string.consts", size 202, contents: "[\"bar\",\"foo\",\"invalid WTF-8 leading byte \\ufffd\",\"invalid surrogate sequence \\ud801\\udc37\",\"invalid trailing byte \\ufffd\",\"needs\\tescaping\\u0000.'#%\\\"- .\\r\\n\\\\08\\f\\n\\r\\t.\\ua66e\",\"unexpected end \\ufffd\"]"
 
 ;; The custom section should parse OK using JSON.parse from node.
 ;; (Note we run --remove-unused-module-elements to remove externref-using
@@ -33,6 +45,5 @@
 ;; RUN: wasm-opt %s --string-lowering --remove-unused-module-elements -all -o %t.wasm
 ;; RUN: node %S/string-lowering.js %t.wasm | filecheck %s --check-prefix=CHECK-JS
 ;;
-;; CHECK-JS: string: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.\ua66e"]
-;; CHECK-JS: JSON: ["bar","foo","needs\tescaping\x00.'#%\"- .\r\n\\.ꙮ"]
-
+;; CHECK-JS: string: ["bar","foo","invalid WTF-8 leading byte \ufffd","invalid surrogate sequence \ud801\udc37","invalid trailing byte \ufffd","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.\ua66e","unexpected end \ufffd"]
+;; CHECK-JS: JSON: ["bar","foo","invalid WTF-8 leading byte �","invalid surrogate sequence 𐐷","invalid trailing byte �","needs\tescaping\x00.'#%\"- .\r\n\\08\f\n\r\t.ꙮ","unexpected end �"]