@@ -227,8 +227,8 @@ def html_to_unicode(
227227
228228 It will try in order:
229229
230- * http content type header
231230 * BOM (byte-order mark)
231+ * http content type header
232232 * meta or xml tag declarations
233233 * auto-detection, if the `auto_detect_fun` keyword argument is not ``None``
234234 * default encoding in keyword arg (which defaults to utf8)
@@ -281,27 +281,16 @@ def html_to_unicode(
281281 >>>
282282
283283 '''
284-
285- enc = http_content_type_encoding (content_type_header )
286284 bom_enc , bom = read_bom (html_body_str )
287- if enc is not None :
288- # remove BOM if it agrees with the encoding
289- if enc == bom_enc :
290- bom = cast (bytes , bom )
291- html_body_str = html_body_str [len (bom ) :]
292- elif enc == "utf-16" or enc == "utf-32" :
293- # read endianness from BOM, or default to big endian
294- # tools.ietf.org/html/rfc2781 section 4.3
295- if bom_enc is not None and bom_enc .startswith (enc ):
296- enc = bom_enc
297- bom = cast (bytes , bom )
298- html_body_str = html_body_str [len (bom ) :]
299- else :
300- enc += "-be"
301- return enc , to_unicode (html_body_str , enc )
302285 if bom_enc is not None :
303286 bom = cast (bytes , bom )
304287 return bom_enc , to_unicode (html_body_str [len (bom ) :], bom_enc )
288+
289+ enc = http_content_type_encoding (content_type_header )
290+ if enc is not None :
291+ if enc == "utf-16" or enc == "utf-32" :
292+ enc += "-be"
293+ return enc , to_unicode (html_body_str , enc )
305294 enc = html_body_declared_encoding (html_body_str )
306295 if enc is None and (auto_detect_fun is not None ):
307296 enc = auto_detect_fun (html_body_str )
0 commit comments