7070# To be used with str.strip() and related methods.
7171HTTP_WHITESPACE = " \t "
7272
# Roughly the inverse of RequestHandler._VALID_HEADER_CHARS, but permits
# chars greater than \xFF (which may appear after decoding utf8).
#
# Matches every C0 control character except horizontal tab (\x09), plus DEL
# (\x7F). CR (\x0D) and LF (\x0A) are deliberately part of the forbidden set:
# RFC 9110 section 5.5 treats field values containing CR, LF, or NUL as
# invalid and dangerous, and allowing them here would let a bare line break
# be smuggled into a header value on the code path that skips the ABNF check.
_FORBIDDEN_HEADER_CHARS_RE = re.compile(r"[\x00-\x08\x0A-\x1F\x7F]")
76+
7377
7478class _ABNF :
7579 """Class that holds a subset of ABNF rules from RFC 9110 and friends.
@@ -196,14 +200,18 @@ def __init__(self, *args: typing.Any, **kwargs: str) -> None: # noqa: F811
196200
197201 # new public methods
198202
199- def add (self , name : str , value : str ) -> None :
203+ def add (self , name : str , value : str , * , _chars_are_bytes : bool = True ) -> None :
200204 """Adds a new value for the given key."""
201205 if not _ABNF .field_name .fullmatch (name ):
202206 raise HTTPInputError ("Invalid header name %r" % name )
203- if not _ABNF .field_value .fullmatch (to_unicode (value )):
204- # TODO: the fact we still support bytes here (contrary to type annotations)
205- # and still test for it should probably be changed.
206- raise HTTPInputError ("Invalid header value %r" % value )
207+ if _chars_are_bytes :
208+ if not _ABNF .field_value .fullmatch (to_unicode (value )):
209+ # TODO: the fact we still support bytes here (contrary to type annotations)
210+ # and still test for it should probably be changed.
211+ raise HTTPInputError ("Invalid header value %r" % value )
212+ else :
213+ if _FORBIDDEN_HEADER_CHARS_RE .search (value ):
214+ raise HTTPInputError ("Invalid header value %r" % value )
207215 norm_name = _normalize_header (name )
208216 self ._last_key = norm_name
209217 if norm_name in self :
@@ -229,7 +237,7 @@ def get_all(self) -> Iterable[Tuple[str, str]]:
229237 for value in values :
230238 yield (name , value )
231239
232- def parse_line (self , line : str ) -> None :
240+ def parse_line (self , line : str , * , _chars_are_bytes : bool = True ) -> None :
233241 r"""Updates the dictionary with a single header line.
234242
235243 >>> h = HTTPHeaders()
@@ -263,19 +271,25 @@ def parse_line(self, line: str) -> None:
263271 if self ._last_key is None :
264272 raise HTTPInputError ("first header line cannot start with whitespace" )
265273 new_part = " " + line .strip (HTTP_WHITESPACE )
266- if not _ABNF .field_value .fullmatch (new_part [1 :]):
267- raise HTTPInputError ("Invalid header continuation %r" % new_part )
274+ if _chars_are_bytes :
275+ if not _ABNF .field_value .fullmatch (new_part [1 :]):
276+ raise HTTPInputError ("Invalid header continuation %r" % new_part )
277+ else :
278+ if _FORBIDDEN_HEADER_CHARS_RE .search (new_part ):
279+ raise HTTPInputError ("Invalid header value %r" % new_part )
268280 self ._as_list [self ._last_key ][- 1 ] += new_part
269281 self ._dict [self ._last_key ] += new_part
270282 else :
271283 try :
272284 name , value = line .split (":" , 1 )
273285 except ValueError :
274286 raise HTTPInputError ("no colon in header line" )
275- self .add (name , value .strip (HTTP_WHITESPACE ))
287+ self .add (
288+ name , value .strip (HTTP_WHITESPACE ), _chars_are_bytes = _chars_are_bytes
289+ )
276290
277291 @classmethod
278- def parse (cls , headers : str ) -> "HTTPHeaders" :
292+ def parse (cls , headers : str , * , _chars_are_bytes : bool = True ) -> "HTTPHeaders" :
279293 """Returns a dictionary from HTTP header text.
280294
281295 >>> h = HTTPHeaders.parse("Content-Type: text/html\\ r\\ nContent-Length: 42\\ r\\ n")
@@ -288,17 +302,31 @@ def parse(cls, headers: str) -> "HTTPHeaders":
288302 mix of `KeyError`, and `ValueError`.
289303
290304 """
305+ # _chars_are_bytes is a hack. This method is used in two places, HTTP headers (in which
306+ # non-ascii characters are to be interpreted as latin-1) and multipart/form-data (in which
307+ # they are to be interpreted as utf-8). For historical reasons, this method handled this by
308+ # expecting both callers to decode the headers to strings before parsing them. This wasn't a
309+ # problem until we started doing stricter validation of the characters allowed in HTTP
310+ # headers (using ABNF rules defined in terms of byte values), which inadvertently started
311+ # disallowing non-latin1 characters in multipart/form-data filenames.
312+ #
313+ # This method should have accepted bytes and a desired encoding, but this change is being
314+ # introduced in a patch release that shouldn't change the API. Instead, the _chars_are_bytes
315+ # flag decides whether to use HTTP-style ABNF validation (treating the string as bytes
316+ # smuggled through the latin1 encoding) or to accept any non-control unicode characters
317+ # as required by multipart/form-data. This method will change to accept bytes in a future
318+ # release.
291319 h = cls ()
292320
293321 start = 0
294322 while True :
295323 lf = headers .find ("\n " , start )
296324 if lf == - 1 :
297- h .parse_line (headers [start :])
325+ h .parse_line (headers [start :], _chars_are_bytes = _chars_are_bytes )
298326 break
299327 line = headers [start : lf + 1 ]
300328 start = lf + 1
301- h .parse_line (line )
329+ h .parse_line (line , _chars_are_bytes = _chars_are_bytes )
302330 return h
303331
304332 # MutableMapping abstract method implementations.
@@ -946,7 +974,7 @@ def parse_multipart_form_data(
946974 eoh = part .find (b"\r \n \r \n " )
947975 if eoh == - 1 :
948976 raise HTTPInputError ("multipart/form-data missing headers" )
949- headers = HTTPHeaders .parse (part [:eoh ].decode ("utf-8" ))
977+ headers = HTTPHeaders .parse (part [:eoh ].decode ("utf-8" ), _chars_are_bytes = False )
950978 disp_header = headers .get ("Content-Disposition" , "" )
951979 disposition , disp_params = _parse_header (disp_header )
952980 if disposition != "form-data" or not part .endswith (b"\r \n " ):
0 commit comments