Skip to content

Commit eb0be4d

Browse files
authored
ROB: MultiLine bfrange in cmap (#1299)
ROB: ending list with only one item on the line. Fixes #1274. Fixes #1285.
1 parent ba2d32a commit eb0be4d

File tree

2 files changed

+66
-23
lines changed

2 files changed

+66
-23
lines changed

PyPDF2/_cmap.py

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,13 @@ def parse_to_unicode(
180180
return {}, space_code, []
181181
process_rg: bool = False
182182
process_char: bool = False
183+
multiline_rg: Union[
184+
None, Tuple[int, int]
185+
] = None # tuple = (next char code to map, last char code of the range) ; cf. #1285 for an example file
183186
cm = prepare_cm(ft)
184187
for l in cm.split(b"\n"):
185-
process_rg, process_char = process_cm_line(
186-
l.strip(b" "), process_rg, process_char, map_dict, int_entry
188+
process_rg, process_char, multiline_rg = process_cm_line(
189+
l.strip(b" "), process_rg, process_char, multiline_rg, map_dict, int_entry
187190
)
188191

189192
for a, value in map_dict.items():
@@ -228,11 +231,12 @@ def process_cm_line(
228231
l: bytes,
229232
process_rg: bool,
230233
process_char: bool,
234+
multiline_rg: Union[None, Tuple[int, int]],
231235
map_dict: Dict[Any, Any],
232236
int_entry: List[int],
233-
) -> Tuple[bool, bool]:
237+
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
234238
if l in (b"", b" ") or l[0] == 37: # 37 = %
235-
return process_rg, process_char
239+
return process_rg, process_char, multiline_rg
236240
if b"beginbfrange" in l:
237241
process_rg = True
238242
elif b"endbfrange" in l:
@@ -242,22 +246,29 @@ def process_cm_line(
242246
elif b"endbfchar" in l:
243247
process_char = False
244248
elif process_rg:
245-
parse_bfrange(l, map_dict, int_entry)
249+
multiline_rg = parse_bfrange(l, map_dict, int_entry, multiline_rg)
246250
elif process_char:
247251
parse_bfchar(l, map_dict, int_entry)
248-
return process_rg, process_char
252+
return process_rg, process_char, multiline_rg
249253

250254

251-
def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
255+
def parse_bfrange(
256+
l: bytes,
257+
map_dict: Dict[Any, Any],
258+
int_entry: List[int],
259+
multiline_rg: Union[None, Tuple[int, int]],
260+
) -> Union[None, Tuple[int, int]]:
252261
lst = [x for x in l.split(b" ") if x]
253-
a = int(lst[0], 16)
254-
b = int(lst[1], 16)
262+
closure_found = False
255263
nbi = len(lst[0])
256264
map_dict[-1] = nbi // 2
257265
fmt = b"%%0%dX" % nbi
258-
if lst[2] == b"[":
259-
for sq in lst[3:]:
266+
if multiline_rg is not None:
267+
a = multiline_rg[0] # a, b not in the current line
268+
b = multiline_rg[1]
269+
for sq in lst[1:]:
260270
if sq == b"]":
271+
closure_found = True
261272
break
262273
map_dict[
263274
unhexlify(fmt % a).decode(
@@ -268,18 +279,36 @@ def parse_bfrange(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> N
268279
int_entry.append(a)
269280
a += 1
270281
else:
271-
c = int(lst[2], 16)
272-
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
273-
while a <= b:
274-
map_dict[
275-
unhexlify(fmt % a).decode(
276-
"charmap" if map_dict[-1] == 1 else "utf-16-be",
277-
"surrogatepass",
278-
)
279-
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
280-
int_entry.append(a)
281-
a += 1
282-
c += 1
282+
a = int(lst[0], 16)
283+
b = int(lst[1], 16)
284+
if lst[2] == b"[":
285+
for sq in lst[3:]:
286+
if sq == b"]":
287+
closure_found = True
288+
break
289+
map_dict[
290+
unhexlify(fmt % a).decode(
291+
"charmap" if map_dict[-1] == 1 else "utf-16-be",
292+
"surrogatepass",
293+
)
294+
] = unhexlify(sq).decode("utf-16-be", "surrogatepass")
295+
int_entry.append(a)
296+
a += 1
297+
else: # case without list
298+
c = int(lst[2], 16)
299+
fmt2 = b"%%0%dX" % max(4, len(lst[2]))
300+
closure_found = True
301+
while a <= b:
302+
map_dict[
303+
unhexlify(fmt % a).decode(
304+
"charmap" if map_dict[-1] == 1 else "utf-16-be",
305+
"surrogatepass",
306+
)
307+
] = unhexlify(fmt2 % c).decode("utf-16-be", "surrogatepass")
308+
int_entry.append(a)
309+
a += 1
310+
c += 1
311+
return None if closure_found else (a, b)
283312

284313

285314
def parse_bfchar(l: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:

tests/test_cmap.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,20 @@ def test_get_font_width_from_default(): # L40
4848
page.extract_text()
4949

5050

51+
def test_multiline_bfrange():
52+
# non-regression test for issue #1285
53+
url = "https://github.com/alexanderquispe/1REI05/raw/main/reports/report_1/The%20lean%20times%20in%20the%20Peruvian%20economy.pdf"
54+
name = "tika-908104.pdf"
55+
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
56+
for page in reader.pages:
57+
page.extract_text()
58+
url = "https://github.com/yxj-HGNwmb5kdp8ewr/yxj-HGNwmb5kdp8ewr.github.io/raw/master/files/Giacalone%20Llobell%20Jaeger%20(2022)%20Food%20Qual%20Prefer.pdf"
59+
name = "Giacalone.pdf"
60+
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
61+
for page in reader.pages:
62+
page.extract_text()
63+
64+
5165
def test_bfchar_on_2_chars():
5266
# issue #1293
5367
url = "https://github.com/xyegithub/myBlog/raw/main/posts/c94b2364/paper_pdfs/ImageClassification/2007%2CASurveyofImageClassificationBasedTechniques.pdf"

0 commit comments

Comments
 (0)