ROB: MultiLine bfrange in cmap (#1299)

pubpub-zz · web-flow · commit eb0be4d2ffad · 2022-09-02T21:17:04.000+02:00
ROB: ending list with only one item on the line. Fixes #1274. Fixes #1285.
diff --git a/PyPDF2/_cmap.py b/PyPDF2/_cmap.py
@@ -180,10 +180,13 @@ def parse_to_unicode(
         return {}, space_code, []
     process_rg: bool = False
     process_char: bool = False
+    multiline_rg: Union[
+        None, Tuple[int, int]
+    ] = None  # tuple = (current_char, remaining size) ; cf #1285 for example of file
     cm = prepare_cm(ft)
     for l in cm.split(b"\n"):
-        process_rg, process_char = process_cm_line(
-            l.strip(b" "), process_rg, process_char, map_dict, int_entry
+        process_rg, process_char, multiline_rg = process_cm_line(
+            l.strip(b" "), process_rg, process_char, multiline_rg, map_dict, int_entry
         )
 
     for a, value in map_dict.items():
@@ -228,11 +231,12 @@ def process_cm_line(
     l: bytes,
     process_rg: bool,
     process_char: bool,
+    multiline_rg: Union[None, Tuple[int, int]],
     map_dict: Dict[Any, Any],
     int_entry: List[int],
-) -> Tuple[bool, bool]:
+) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
     if l in (b"", b" ") or l[0] == 37:  # 37 = %
-        return process_rg, process_char
+        return process_rg, process_char, multiline_rg
     if b"beginbfrange" in l:
         process_rg = True
     elif b"endbfrange" in l:
@@ -242,22 +246,29 @@ def process_cm_line(
     elif b"endbfchar" in l:
         process_char = False
     elif process_rg:
-        parse_bfrange(l, map_dict, int_entry)
+        multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg)
     elif process_char:
         parse_bfchar(l, map_dict, int_entry)
-    return process_rg, process_char
+    return process_rg, process_char, multiline_rg
 
 
-def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
+def parse_bfrange(
+    l: bytes,
+    map_dict: Dict[Any, Any],
+    int_entry: List[int],
+    multiline_rg: Union[None, Tuple[int, int]],
+) -> Union[None, Tuple[int, int]]:
     lst = [x for x in l.split(b" ") if x]
-    a = int(lst[0], 16)
-    b = int(lst[1], 16)
+    closure_found = False
     nbi = len(lst[0])
     map_dict[-1] = nbi // 2
     fmt = b"%%0%dX" % nbi
-    if lst[2] == b"[":
-        for sq in lst[3:]:
+    if multiline_rg is not None:
+        a = multiline_rg[0]  # a, b not in the current line
+        b = multiline_rg[1]
+        for sq in lst[1:]:
             if sq == b"]":
+                closure_found = True
                 break
             map_dict[
                 unhexlify(fmt % a).decode(
@@ -268,18 +279,36 @@ def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> N
             int_entry.append(a)
             a += 1
     else:
-        c = int(lst[2], 16)
-        fmt2 = b"%%0%dX" % max(4, len(lst[2]))
-        while a <= b:
-            map_dict[
-                unhexlify(fmt % a).decode(
-                    "charmap" if map_dict[-1] == 1 else "utf-16-be",
-                    "surrogatepass",
-                )
-            ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
-            int_entry.append(a)
-            a += 1
-            c += 1
+        a = int(lst[0], 16)
+        b = int(lst[1], 16)
+        if lst[2] == b"[":
+            for sq in lst[3:]:
+                if sq == b"]":
+                    closure_found = True
+                    break
+                map_dict[
+                    unhexlify(fmt % a).decode(
+                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
+                        "surrogatepass",
+                    )
+                ] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
+                int_entry.append(a)
+                a += 1
+        else:  # case without list
+            c = int(lst[2], 16)
+            fmt2 = b"%%0%dX" % max(4, len(lst[2]))
+            closure_found = True
+            while a <= b:
+                map_dict[
+                    unhexlify(fmt % a).decode(
+                        "charmap" if map_dict[-1] == 1 else "utf-16-be",
+                        "surrogatepass",
+                    )
+                ] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
+                int_entry.append(a)
+                a += 1
+                c += 1
+    return None if closure_found else (a, b)
 
 
 def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
diff --git a/tests/test_cmap.py b/tests/test_cmap.py
@@ -48,6 +48,20 @@ def test_get_font_width_from_default():  # L40
         page.extract_text()
 
 
+def test_multiline_bfrange():
+    # non-regression test for issue #1285
+    url = "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf"
+    name = "tika-908104.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    for page in reader.pages:
+        page.extract_text()
+    url = "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf"
+    name = "Giacalone.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    for page in reader.pages:
+        page.extract_text()
+
+
 def test_bfchar_on_2_chars():
     # iss #1293
     url = "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf"