Skip to content

Commit bcf722f

Browse files
authored
html_to_unicode(): prioritize the BOM to determine the encoding (#191)
1 parent 1c6c96a commit bcf722f

File tree

2 files changed: 11 additions & 23 deletions

2 files changed: 11 additions & 23 deletions

tests/test_encoding.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -220,13 +220,12 @@ def _assert_encoding_detected(
220220
def test_BOM(self):
221221
# utf-16 cases already tested, as is the BOM detection function
222222

223-
# http header takes precedence, irrespective of BOM
223+
# BOM takes precedence, ahead of the http header
224224
bom_be_str = codecs.BOM_UTF16_BE + "hi".encode("utf-16-be")
225-
expected = "\ufffd\ufffd\x00h\x00i"
226-
self._assert_encoding("utf-8", bom_be_str, "utf-8", expected)
225+
expected = "hi"
226+
self._assert_encoding("utf-8", bom_be_str, "utf-16-be", expected)
227227

228-
# BOM is stripped when it agrees with the encoding, or used to
229-
# determine encoding
228+
# BOM is stripped when present
230229
bom_utf8_str = codecs.BOM_UTF8 + b"hi"
231230
self._assert_encoding("utf-8", bom_utf8_str, "utf-8", "hi")
232231
self._assert_encoding(None, bom_utf8_str, "utf-8", "hi")

w3lib/encoding.py

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,8 @@ def html_to_unicode(
227227
228228
It will try in order:
229229
230-
* http content type header
231230
* BOM (byte-order mark)
231+
* http content type header
232232
* meta or xml tag declarations
233233
* auto-detection, if the `auto_detect_fun` keyword argument is not ``None``
234234
* default encoding in keyword arg (which defaults to utf8)
@@ -281,27 +281,16 @@ def html_to_unicode(
281281
>>>
282282
283283
'''
284-
285-
enc = http_content_type_encoding(content_type_header)
286284
bom_enc, bom = read_bom(html_body_str)
287-
if enc is not None:
288-
# remove BOM if it agrees with the encoding
289-
if enc == bom_enc:
290-
bom = cast(bytes, bom)
291-
html_body_str = html_body_str[len(bom) :]
292-
elif enc == "utf-16" or enc == "utf-32":
293-
# read endianness from BOM, or default to big endian
294-
# tools.ietf.org/html/rfc2781 section 4.3
295-
if bom_enc is not None and bom_enc.startswith(enc):
296-
enc = bom_enc
297-
bom = cast(bytes, bom)
298-
html_body_str = html_body_str[len(bom) :]
299-
else:
300-
enc += "-be"
301-
return enc, to_unicode(html_body_str, enc)
302285
if bom_enc is not None:
303286
bom = cast(bytes, bom)
304287
return bom_enc, to_unicode(html_body_str[len(bom) :], bom_enc)
288+
289+
enc = http_content_type_encoding(content_type_header)
290+
if enc is not None:
291+
if enc == "utf-16" or enc == "utf-32":
292+
enc += "-be"
293+
return enc, to_unicode(html_body_str, enc)
305294
enc = html_body_declared_encoding(html_body_str)
306295
if enc is None and (auto_detect_fun is not None):
307296
enc = auto_detect_fun(html_body_str)

0 commit comments

Comments
 (0)