httputil: Fix support for non-latin1 filenames in multipart uploads

bdarnell · bdarnell · commit 170a58af2c17 · 2025-05-22T10:59:48.000-04:00
The change to be stricter about characters allowed in HTTP headers inadvertently broke support for non-latin1 filenames in multipart uploads (this was missed in testing because our i18n test case only used characters in latin1). This commit adds a hacky workaround without changing any APIs to make it safe for a 6.5.1 patch release; a more robust solution will follow for future releases. Fixes #3502
diff --git a/tornado/httputil.py b/tornado/httputil.py
@@ -70,6 +70,10 @@
 # To be used with str.strip() and related methods.
 HTTP_WHITESPACE = " \t"
 
+# Roughly the inverse of RequestHandler._VALID_HEADER_CHARS, but permits
+# chars greater than \xFF (which may appear after decoding utf8).
+_FORBIDDEN_HEADER_CHARS_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
+
 
 class _ABNF:
     """Class that holds a subset of ABNF rules from RFC 9110 and friends.
@@ -196,14 +200,18 @@ def __init__(self, *args: typing.Any, **kwargs: str) -> None:  # noqa: F811
 
     # new public methods
 
-    def add(self, name: str, value: str) -> None:
+    def add(self, name: str, value: str, *, _chars_are_bytes: bool = True) -> None:
         """Adds a new value for the given key."""
         if not _ABNF.field_name.fullmatch(name):
             raise HTTPInputError("Invalid header name %r" % name)
-        if not _ABNF.field_value.fullmatch(to_unicode(value)):
-            # TODO: the fact we still support bytes here (contrary to type annotations)
-            # and still test for it should probably be changed.
-            raise HTTPInputError("Invalid header value %r" % value)
+        if _chars_are_bytes:
+            if not _ABNF.field_value.fullmatch(to_unicode(value)):
+                # TODO: the fact we still support bytes here (contrary to type annotations)
+                # and still test for it should probably be changed.
+                raise HTTPInputError("Invalid header value %r" % value)
+        else:
+            if _FORBIDDEN_HEADER_CHARS_RE.search(value):
+                raise HTTPInputError("Invalid header value %r" % value)
         norm_name = _normalize_header(name)
         self._last_key = norm_name
         if norm_name in self:
@@ -229,7 +237,7 @@ def get_all(self) -> Iterable[Tuple[str, str]]:
             for value in values:
                 yield (name, value)
 
-    def parse_line(self, line: str) -> None:
+    def parse_line(self, line: str, *, _chars_are_bytes: bool = True) -> None:
         r"""Updates the dictionary with a single header line.
 
         >>> h = HTTPHeaders()
@@ -263,19 +271,25 @@ def parse_line(self, line: str) -> None:
             if self._last_key is None:
                 raise HTTPInputError("first header line cannot start with whitespace")
             new_part = " " + line.strip(HTTP_WHITESPACE)
-            if not _ABNF.field_value.fullmatch(new_part[1:]):
-                raise HTTPInputError("Invalid header continuation %r" % new_part)
+            if _chars_are_bytes:
+                if not _ABNF.field_value.fullmatch(new_part[1:]):
+                    raise HTTPInputError("Invalid header continuation %r" % new_part)
+            else:
+                if _FORBIDDEN_HEADER_CHARS_RE.search(new_part):
+                    raise HTTPInputError("Invalid header value %r" % new_part)
             self._as_list[self._last_key][-1] += new_part
             self._dict[self._last_key] += new_part
         else:
             try:
                 name, value = line.split(":", 1)
             except ValueError:
                 raise HTTPInputError("no colon in header line")
-            self.add(name, value.strip(HTTP_WHITESPACE))
+            self.add(
+                name, value.strip(HTTP_WHITESPACE), _chars_are_bytes=_chars_are_bytes
+            )
 
     @classmethod
-    def parse(cls, headers: str) -> "HTTPHeaders":
+    def parse(cls, headers: str, *, _chars_are_bytes: bool = True) -> "HTTPHeaders":
         """Returns a dictionary from HTTP header text.
 
         >>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
@@ -288,17 +302,31 @@ def parse(cls, headers: str) -> "HTTPHeaders":
            mix of `KeyError`, and `ValueError`.
 
         """
+        # _chars_are_bytes is a hack. This method is used in two places, HTTP headers (in which
+        # non-ascii characters are to be interpreted as latin-1) and multipart/form-data (in which
+        # they are to be interpreted as utf-8). For historical reasons, this method handled this by
+        # expecting both callers to decode the headers to strings before parsing them. This wasn't a
+        # problem until we started doing stricter validation of the characters allowed in HTTP
+        # headers (using ABNF rules defined in terms of byte values), which inadvertently started
+        # disallowing non-latin1 characters in multipart/form-data filenames.
+        #
+        # This method should have accepted bytes and a desired encoding, but this change is being
+        # introduced in a patch release that shouldn't change the API. Instead, the _chars_are_bytes
+        # flag decides whether to use HTTP-style ABNF validation (treating the string as bytes
+        # smuggled through the latin1 encoding) or to accept any non-control unicode characters
+        # as required by multipart/form-data. This method will change to accept bytes in a future
+        # release.
         h = cls()
 
         start = 0
         while True:
             lf = headers.find("\n", start)
             if lf == -1:
-                h.parse_line(headers[start:])
+                h.parse_line(headers[start:], _chars_are_bytes=_chars_are_bytes)
                 break
             line = headers[start : lf + 1]
             start = lf + 1
-            h.parse_line(line)
+            h.parse_line(line, _chars_are_bytes=_chars_are_bytes)
         return h
 
     # MutableMapping abstract method implementations.
@@ -946,7 +974,7 @@ def parse_multipart_form_data(
         eoh = part.find(b"\r\n\r\n")
         if eoh == -1:
             raise HTTPInputError("multipart/form-data missing headers")
-        headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"))
+        headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"), _chars_are_bytes=False)
         disp_header = headers.get("Content-Disposition", "")
         disposition, disp_params = _parse_header(disp_header)
         if disposition != "form-data" or not part.endswith(b"\r\n"):
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py
@@ -155,7 +155,7 @@ def test_special_filenames(self):
             self.assertEqual(file["filename"], filename)
             self.assertEqual(file["body"], b"Foo")
 
-    def test_non_ascii_filename(self):
+    def test_non_ascii_filename_rfc5987(self):
         data = b"""\
 --1234
 Content-Disposition: form-data; name="files"; filename="ab.txt"; filename*=UTF-8''%C3%A1b.txt
@@ -170,6 +170,23 @@ def test_non_ascii_filename(self):
         self.assertEqual(file["filename"], "áb.txt")
         self.assertEqual(file["body"], b"Foo")
 
+    def test_non_ascii_filename_raw(self):
+        data = """\
+--1234
+Content-Disposition: form-data; name="files"; filename="测试.txt"
+
+Foo
+--1234--""".encode(
+            "utf-8"
+        ).replace(
+            b"\n", b"\r\n"
+        )
+        args, files = form_data_args()
+        parse_multipart_form_data(b"1234", data, args, files)
+        file = files["files"][0]
+        self.assertEqual(file["filename"], "测试.txt")
+        self.assertEqual(file["body"], b"Foo")
+
     def test_boundary_starts_and_ends_with_quotes(self):
         data = b"""\
 --1234