url: forbid certain confusable characters from being introduced by toASCII

TimothyGu · TimothyGu · commit 8d1461de8784 · 2021-05-11T19:52:02.000-07:00
The legacy url.parse() function attempts to convert Unicode domains (IDNs) into their ASCII/Punycode form through the use of the toASCII function. However, toASCII can introduce various characters that at best invalidate the parsed URL, and at worst cause hostname spoofing: url.parse('http://bad.c℀.good.com/').href === 'http://bad.ca/c.good.com/' (from [1]) While changes to the legacy URL parser are discouraged in general, the security implications here outweigh the desire for strict compatibility. This is because this commit only changes behavior when non-ASCII characters appear in the hostname, an unusual situation for most use cases. Additionally, despite the availability of the WHATWG URL API, url.parse() remains widely deployed in the Node.js ecosystem, as exemplified by the recent un-deprecation of the legacy API. This change is similar in spirit to CPython 3.8's change [2] fixing bpo-36216 [3] aka CVE-2019-9636, which also occurred despite potential compatibility concerns. [1]: https://hackerone.com/reports/678487 [2]: python/cpython@16e6f7d [3]: https://bugs.python.org/issue36216
diff --git a/doc/api/errors.md b/doc/api/errors.md
@@ -1677,10 +1677,10 @@ An invalid URI was passed.
 <a id="ERR_INVALID_URL"></a>
 ### `ERR_INVALID_URL`
 
-An invalid URL was passed to the [WHATWG][WHATWG URL API]
-[`URL` constructor][`new URL(input)`] to be parsed. The thrown error object
-typically has an additional property `'input'` that contains the URL that failed
-to parse.
+An invalid URL was passed to the [WHATWG][WHATWG URL API] [`URL`
+constructor][`new URL(input)`] or the legacy [`url.parse()`][] to be parsed.
+The thrown error object typically has an additional property `'input'` that
+contains the URL that failed to parse.
 
 <a id="ERR_INVALID_URL_SCHEME"></a>
 ### `ERR_INVALID_URL_SCHEME`
@@ -2824,6 +2824,7 @@ The native call from `process.cpuUsage` could not be processed.
 [`stream.write()`]: stream.md#stream_writable_write_chunk_encoding_callback
 [`subprocess.kill()`]: child_process.md#child_process_subprocess_kill_signal
 [`subprocess.send()`]: child_process.md#child_process_subprocess_send_message_sendhandle_options_callback
+[`url.parse()`]: url.md#url_url_parse_urlstring_parsequerystring_slashesdenotehost
 [`util.getSystemErrorName(error.errno)`]: util.md#util_util_getsystemerrorname_err
 [`zlib`]: zlib.md
 [crypto digest algorithm]: crypto.md#crypto_crypto_gethashes
diff --git a/lib/url.js b/lib/url.js
@@ -34,7 +34,8 @@ const { toASCII } = require('internal/idna');
 const { encodeStr, hexTable } = require('internal/querystring');
 
 const {
-  ERR_INVALID_ARG_TYPE
+  ERR_INVALID_ARG_TYPE,
+  ERR_INVALID_URL,
 } = require('internal/errors').codes;
 const { validateString } = require('internal/validators');
 
@@ -167,6 +168,20 @@ function isIpv6Hostname(hostname) {
   );
 }
 
+// This prevents some common spoofing bugs due to our use of IDNA toASCII. For
+// compatibility, the set of forbidden characters is the _intersection_ of
+// "forbidden host code point" in the WHATWG URL Standard and the characters in
+// the host parsing loop in Url.prototype.parse, with the following additions:
+//
+// - ':' since this could cause a "protocol spoofing" bug
+// - '@' since this could cause parts of the hostname to be confused with auth
+// - '[' and ']' since this could cause a non-IPv6 hostname to be interpreted
+//   as IPv6 by isIpv6Hostname above
+//
+// All four of these characters are also included in "forbidden host code
+// point". See https://url.spec.whatwg.org/#forbidden-host-code-point
+const forbiddenHostChars = /[\t\n\r #%/:<>?@[\\\]^|]/;
+
 Url.prototype.parse = function parse(url, parseQueryString, slashesDenoteHost) {
   validateString(url, 'url');
 
@@ -398,6 +413,17 @@ Url.prototype.parse = function parse(url, parseQueryString, slashesDenoteHost) {
       // Use lenient mode (`true`) to try to support even non-compliant
       // URLs.
       this.hostname = toASCII(this.hostname, true);
+
+      if (forbiddenHostChars.test(this.hostname)) {
+        // These forbidden characters could not have entered this.hostname
+        // through the URL string directly since getHostname should have
+        // filtered them out. Thus, they must have crept in due to IDNA
+        // toASCII conversion. This is probably a sign of potential hostname
+        // spoofing. Rather than moving the non-host part to the pathname as
+        // we've done in getHostname, throw an exception to convey its
+        // severity.
+        throw new ERR_INVALID_URL(url);
+      }
     }
 
     const p = this.port ? ':' + this.port : '';
diff --git a/test/parallel/test-url-parse-invalid-input.js b/test/parallel/test-url-parse-invalid-input.js
@@ -36,3 +36,33 @@ assert.throws(() => { url.parse('http://%E0%A4%A@fail'); },
                 // JS engine errors do not have the `code` property.
                 return e.code === undefined;
               });
+
+if (common.hasIntl) {
+  // Code points (each listed once) whose Unicode NFKD form contains a "bad
+  // character" that toASCII could therefore introduce into a hostname.
+  const badIDNA = (() => {
+    const BAD_CHARS = /[#%/:?@[\\\]^|]/;
+    const out = [];
+    for (let i = 0x80; i < 0x110000; i++) {
+      const cp = String.fromCodePoint(i);
+      // Normalize and match once per code point so that a code point whose
+      // NFKD contains several bad characters is still pushed only once.
+      if (BAD_CHARS.test(cp.normalize('NFKD'))) {
+        out.push(cp);
+      }
+    }
+    return out;
+  })();
+
+  // The generation logic above should at a minimum produce these two
+  // characters.
+  assert(badIDNA.includes('℀'));
+  assert(badIDNA.includes('＠'));
+
+  for (const badCodePoint of badIDNA) {
+    const badURL = `http://fail${badCodePoint}fail.com/`;
+    assert.throws(() => { url.parse(badURL); },
+                  (e) => e.code === 'ERR_INVALID_URL',
+                  `parsing ${badURL}`);
+  }
+}