Skip to content

Commit 7a95708

Browse files
authored
ENH: Auto-detect RTL for text extraction (#1309)
Includes some customization capabilities to extend RTL. Closes #1296.
1 parent c696192 commit 7a95708

File tree

2 files changed

+127
-16
lines changed

2 files changed

+127
-16
lines changed

PyPDF2/_page.py

Lines changed: 110 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,42 @@
7171
TextStringObject,
7272
encode_pdfdocencoding,
7373
)
74-
from .types import PdfReaderProtocol
74+
75+
# Optional user-defined range [CUSTOM_RTL_MIN; CUSTOM_RTL_MAX] of code points
# that are handled as right-to-left text; [-1; -1] defines no extra range.
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Extra code points that are inserted following the current writing direction
# (normally punctuation characters).
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []


def _rtl_code_point(value: Union[str, int, None], current: int) -> int:
    """Resolve a range bound: int is used as is, str goes through ord(),
    anything else (including None) keeps *current*."""
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        # ord() raises TypeError unless the string is exactly one character.
        return ord(value)
    return current


def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the customized Right-To-Left and special-character parameters.

    All arguments are converted before any global is modified, so a call
    that raises leaves the previous configuration untouched.

    :param _min: new value of CUSTOM_RTL_MIN; None leaves the value
        unchanged; an int is a code point; a str must be a single character
        and is converted with ord(); -1 by default.
    :param _max: new value of CUSTOM_RTL_MAX; same conventions as ``_min``.
        Both values define a range of custom characters that will be written
        right to left; [-1; -1] sets no additional range to be converted.
    :param specials: new value of CUSTOM_RTL_SPECIAL_CHARS; None leaves the
        current value unchanged; a str is converted to the list of code
        points of its characters; a list of code points is copied; [] by
        default. These characters are inserted following the current
        insertion order; this normally consists of punctuation characters.
    :return: the tuple (CUSTOM_RTL_MIN, CUSTOM_RTL_MAX,
        CUSTOM_RTL_SPECIAL_CHARS) after the update.
    :raises TypeError: if ``_min`` or ``_max`` is a str that is not exactly
        one character long (raised by ord()).
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

    # Convert everything first: a failure here must not leave the globals
    # half-updated.
    new_min = _rtl_code_point(_min, CUSTOM_RTL_MIN)
    new_max = _rtl_code_point(_max, CUSTOM_RTL_MAX)
    if isinstance(specials, str):
        new_specials = [ord(char) for char in specials]
    elif isinstance(specials, list):
        # Copy so that later changes to the caller's list do not silently
        # alter the configuration.
        new_specials = list(specials)
    else:
        new_specials = CUSTOM_RTL_SPECIAL_CHARS

    CUSTOM_RTL_MIN = new_min
    CUSTOM_RTL_MAX = new_max
    CUSTOM_RTL_SPECIAL_CHARS = new_specials
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
75110

76111

77112
def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
@@ -242,11 +277,13 @@ class PageObject(DictionaryObject):
242277

243278
def __init__(
244279
self,
245-
pdf: Optional[PdfReaderProtocol] = None,
280+
pdf: Optional[Any] = None, # PdfReader
246281
indirect_ref: Optional[IndirectObject] = None,
247282
) -> None:
283+
from ._reader import PdfReader
284+
248285
DictionaryObject.__init__(self)
249-
self.pdf: Optional[PdfReaderProtocol] = pdf
286+
self.pdf: Optional[PdfReader] = pdf
250287
self.indirect_ref = indirect_ref
251288

252289
def hash_value_data(self) -> bytes:
@@ -1103,6 +1140,18 @@ def _debug_for_extract(self) -> str: # pragma: no cover
11031140
out += enc_repr + "\n"
11041141
except Exception:
11051142
pass
1143+
try:
1144+
out += (
1145+
self["/Resources"]["/Font"][fo][ # type:ignore
1146+
"/ToUnicode"
1147+
]
1148+
.get_data()
1149+
.decode()
1150+
+ "\n"
1151+
)
1152+
except Exception:
1153+
pass
1154+
11061155
except KeyError:
11071156
out += "No Font\n"
11081157
return out
@@ -1123,6 +1172,9 @@ def _extract_text(
11231172
this function, as it will change if this function is made more
11241173
sophisticated.
11251174
1175+
Arabic, Hebrew, ... are extracted in the correct order. If required, a custom RTL range of characters
1176+
can be defined; see function set_custom_rtl
1177+
11261178
:param Tuple[int, ...] orientations: list of orientations text_extraction will look for
11271179
default = (0, 90, 180, 270)
11281180
note: currently only 0(Up),90(turned Left), 180(upside Down), 270 (turned Right)
@@ -1135,6 +1187,7 @@ def _extract_text(
11351187
"""
11361188
text: str = ""
11371189
output: str = ""
1190+
rtl_dir: bool = False # right-to-left
11381191
cmaps: Dict[
11391192
str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]]
11401193
] = {}
@@ -1209,7 +1262,9 @@ def current_spacewidth() -> float:
12091262
return _space_width / 1000.0
12101263

12111264
def process_operation(operator: bytes, operands: List) -> None:
1212-
nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap, orientations
1265+
nonlocal cm_matrix, cm_stack, tm_matrix, tm_prev, output, text, char_scale, space_scale, _space_width, TL, font_size, cmap, orientations, rtl_dir
1266+
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
1267+
12131268
check_crlf_space: bool = False
12141269
# Table 5.4 page 405
12151270
if operator == b"BT":
@@ -1251,6 +1306,7 @@ def process_operation(operator: bytes, operands: List) -> None:
12511306
) = cm_stack.pop()
12521307
except Exception:
12531308
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1309+
# rtl_dir = False
12541310
elif operator == b"cm":
12551311
output += text
12561312
text = ""
@@ -1265,6 +1321,7 @@ def process_operation(operator: bytes, operands: List) -> None:
12651321
],
12661322
cm_matrix,
12671323
)
1324+
# rtl_dir = False
12681325
# Table 5.2 page 398
12691326
elif operator == b"Tz":
12701327
char_scale = float(operands[0]) / 100.0
@@ -1276,6 +1333,7 @@ def process_operation(operator: bytes, operands: List) -> None:
12761333
if text != "":
12771334
output += text # .translate(cmap)
12781335
text = ""
1336+
# rtl_dir = False
12791337
try:
12801338
_space_width = cmaps[operands[0]][1]
12811339
cmap = (
@@ -1344,8 +1402,42 @@ def process_operation(operator: bytes, operands: List) -> None:
13441402
for x in tt
13451403
]
13461404
)
1347-
1348-
text += "".join([cmap[1][x] if x in cmap[1] else x for x in t])
1405+
# "\u0590 - \u08FF \uFB50 - \uFDFF"
1406+
for x in "".join(
1407+
[cmap[1][x] if x in cmap[1] else x for x in t]
1408+
):
1409+
xx = ord(x)
1410+
# fmt: off
1411+
if ( # cases where the current inserting order is kept (punctuation,...)
1412+
(xx <= 0x2F) # punctuations but...
1413+
or (xx >= 0x3A and xx <= 0x40) # numbers (x30-39)
1414+
or (xx >= 0x2000 and xx <= 0x206F) # upper punctuations..
1415+
or (xx >= 0x20A0 and xx <= 0x21FF) # but (numbers) indices/exponents
1416+
or xx in CUSTOM_RTL_SPECIAL_CHARS # customized....
1417+
):
1418+
text = x + text if rtl_dir else text + x
1419+
elif ( # right-to-left characters set
1420+
(xx >= 0x0590 and xx <= 0x08FF)
1421+
or (xx >= 0xFB1D and xx <= 0xFDFF)
1422+
or (xx >= 0xFE70 and xx <= 0xFEFF)
1423+
or (xx >= CUSTOM_RTL_MIN and xx <= CUSTOM_RTL_MAX)
1424+
):
1425+
# print("<",xx,x)
1426+
if not rtl_dir:
1427+
rtl_dir = True
1428+
# print("RTL",text,"*")
1429+
output += text
1430+
text = ""
1431+
text = x + text
1432+
else: # left-to-right
1433+
# print(">",xx,x,end="")
1434+
if rtl_dir:
1435+
rtl_dir = False
1436+
# print("LTR",text,"*")
1437+
output += text
1438+
text = ""
1439+
text = text + x
1440+
# fmt: on
13491441
else:
13501442
return None
13511443
if check_crlf_space:
@@ -1362,40 +1454,44 @@ def process_operation(operator: bytes, operands: List) -> None:
13621454
if o == 0:
13631455
if deltaY < -0.8 * f:
13641456
if (output + text)[-1] != "\n":
1365-
text += "\n"
1457+
output += text + "\n"
1458+
text = ""
13661459
elif (
13671460
abs(deltaY) < f * 0.3
1368-
and abs(deltaX) > current_spacewidth() * f * 10
1461+
and abs(deltaX) > current_spacewidth() * f * 15
13691462
):
13701463
if (output + text)[-1] != " ":
13711464
text += " "
13721465
elif o == 180:
13731466
if deltaY > 0.8 * f:
13741467
if (output + text)[-1] != "\n":
1375-
text += "\n"
1468+
output += text + "\n"
1469+
text = ""
13761470
elif (
13771471
abs(deltaY) < f * 0.3
1378-
and abs(deltaX) > current_spacewidth() * f * 10
1472+
and abs(deltaX) > current_spacewidth() * f * 15
13791473
):
13801474
if (output + text)[-1] != " ":
13811475
text += " "
13821476
elif o == 90:
13831477
if deltaX > 0.8 * f:
13841478
if (output + text)[-1] != "\n":
1385-
text += "\n"
1479+
output += text + "\n"
1480+
text = ""
13861481
elif (
13871482
abs(deltaX) < f * 0.3
1388-
and abs(deltaY) > current_spacewidth() * f * 10
1483+
and abs(deltaY) > current_spacewidth() * f * 15
13891484
):
13901485
if (output + text)[-1] != " ":
13911486
text += " "
13921487
elif o == 270:
13931488
if deltaX < -0.8 * f:
13941489
if (output + text)[-1] != "\n":
1395-
text += "\n"
1490+
output += text + "\n"
1491+
text = ""
13961492
elif (
13971493
abs(deltaX) < f * 0.3
1398-
and abs(deltaY) > current_spacewidth() * f * 10
1494+
and abs(deltaY) > current_spacewidth() * f * 15
13991495
):
14001496
if (output + text)[-1] != " ":
14011497
text += " "

tests/test_page.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pytest
88

99
from PyPDF2 import PdfReader, PdfWriter, Transformation
10-
from PyPDF2._page import PageObject
10+
from PyPDF2._page import PageObject, set_custom_rtl
1111
from PyPDF2.constants import PageAttributes as PG
1212
from PyPDF2.errors import PdfReadWarning
1313
from PyPDF2.generic import (
@@ -227,11 +227,26 @@ def test_multi_language():
227227
reader = PdfReader(RESOURCE_ROOT / "multilang.pdf")
228228
txt = reader.pages[0].extract_text()
229229
assert "Hello World" in txt, "English not correctly extracted"
230-
# Arabic is for the moment left on side
230+
# iss #1296
231+
assert "مرحبا بالعالم" in txt, "Arabic not correctly extracted"
231232
assert "Привет, мир" in txt, "Russian not correctly extracted"
232233
assert "你好世界" in txt, "Chinese not correctly extracted"
233234
assert "สวัสดีชาวโลก" in txt, "Thai not correctly extracted"
234235
assert "こんにちは世界" in txt, "Japanese not correctly extracted"
236+
# check customizations
237+
set_custom_rtl(None, None, "Russian:")
238+
assert (
239+
":naissuR" in reader.pages[0].extract_text()
240+
), "(1) CUSTOM_RTL_SPECIAL_CHARS failed"
241+
set_custom_rtl(None, None, [ord(x) for x in "Russian:"])
242+
assert (
243+
":naissuR" in reader.pages[0].extract_text()
244+
), "(2) CUSTOM_RTL_SPECIAL_CHARS failed"
245+
set_custom_rtl(0, 255, None)
246+
assert ":hsilgnE" in reader.pages[0].extract_text(), "CUSTOM_RTL_MIN/MAX failed"
247+
set_custom_rtl("A", "z", [])
248+
assert ":hsilgnE" in reader.pages[0].extract_text(), "CUSTOM_RTL_MIN/MAX failed"
249+
set_custom_rtl(-1, -1, []) # to prevent further errors
235250

236251

237252
def test_extract_text_single_quote_op():

0 commit comments

Comments
 (0)