7171 TextStringObject ,
7272 encode_pdfdocencoding ,
7373)
74- from .types import PdfReaderProtocol
74+
# Inclusive code-point range of additional characters that text extraction
# treats as right-to-left. The default [-1, -1] defines no additional range.
CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
# Additional code points that text extraction inserts following the current
# writing direction (normally punctuation characters). Empty by default.
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []


def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the custom right-to-left (RTL) and special-character parameters.

    The settings are module-level: they are stored in CUSTOM_RTL_MIN,
    CUSTOM_RTL_MAX and CUSTOM_RTL_SPECIAL_CHARS. Calling the function
    without arguments changes nothing and simply returns the current values.

    :param _min: lower bound (inclusive) of the custom RTL range, stored in
        CUSTOM_RTL_MIN. ``None`` keeps the current value; an ``int`` is used
        as a code point; a one-character ``str`` is converted with ``ord``.
        -1 by default.
    :param _max: upper bound (inclusive) of the custom RTL range, stored in
        CUSTOM_RTL_MAX. Same conventions as ``_min``. -1 by default.
        The range [-1; -1] defines no additional range to be converted.
    :param specials: code points stored in CUSTOM_RTL_SPECIAL_CHARS, i.e.
        characters that are inserted applying the current insertion order;
        this normally consists of a list of punctuation characters.
        ``None`` keeps the current value; a ``str`` is converted to the list
        of code points of its characters; a ``list`` of code points is
        copied. ``[]`` by default.
    :return: the tuple (CUSTOM_RTL_MIN, CUSTOM_RTL_MAX,
        CUSTOM_RTL_SPECIAL_CHARS) after the update.
    :raises TypeError: raised by ``ord`` if ``_min`` or ``_max`` is a string
        whose length is not exactly 1.
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Store a copy: keeping the caller's list by reference would let a
        # later mutation of that list silently alter the module-wide setting.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
75110
76111
77112def _get_rectangle (self : Any , name : str , defaults : Iterable [str ]) -> RectangleObject :
@@ -242,11 +277,13 @@ class PageObject(DictionaryObject):
242277
243278 def __init__ (
244279 self ,
245- pdf : Optional [PdfReaderProtocol ] = None ,
280+ pdf : Optional [Any ] = None , # PdfReader
246281 indirect_ref : Optional [IndirectObject ] = None ,
247282 ) -> None :
283+ from ._reader import PdfReader
284+
248285 DictionaryObject .__init__ (self )
249- self .pdf : Optional [PdfReaderProtocol ] = pdf
286+ self .pdf : Optional [PdfReader ] = pdf
250287 self .indirect_ref = indirect_ref
251288
252289 def hash_value_data (self ) -> bytes :
@@ -1103,6 +1140,18 @@ def _debug_for_extract(self) -> str: # pragma: no cover
11031140 out += enc_repr + "\n "
11041141 except Exception :
11051142 pass
1143+ try :
1144+ out += (
1145+ self ["/Resources" ]["/Font" ][fo ][ # type:ignore
1146+ "/ToUnicode"
1147+ ]
1148+ .get_data ()
1149+ .decode ()
1150+ + "\n "
1151+ )
1152+ except Exception :
1153+ pass
1154+
11061155 except KeyError :
11071156 out += "No Font\n "
11081157 return out
@@ -1123,6 +1172,9 @@ def _extract_text(
11231172 this function, as it will change if this function is made more
11241173 sophisticated.
11251174
1175+ Arabic, Hebrew,... are extracted in the correct order. If required, a custom RTL range of characters
1176+ can be defined; see function set_custom_rtl
1177+
11261178 :param Tuple[int, ...] orientations: list of orientations text_extraction will look for
11271179 default = (0, 90, 180, 270)
11281180 note: currently only 0(Up),90(turned Left), 180(upside Down), 270 (turned Right)
@@ -1135,6 +1187,7 @@ def _extract_text(
11351187 """
11361188 text : str = ""
11371189 output : str = ""
1190+ rtl_dir : bool = False # right-to-left
11381191 cmaps : Dict [
11391192 str , Tuple [str , float , Union [str , Dict [int , str ]], Dict [str , str ]]
11401193 ] = {}
@@ -1209,7 +1262,9 @@ def current_spacewidth() -> float:
12091262 return _space_width / 1000.0
12101263
12111264 def process_operation (operator : bytes , operands : List ) -> None :
1212- nonlocal cm_matrix , cm_stack , tm_matrix , tm_prev , output , text , char_scale , space_scale , _space_width , TL , font_size , cmap , orientations
1265+ nonlocal cm_matrix , cm_stack , tm_matrix , tm_prev , output , text , char_scale , space_scale , _space_width , TL , font_size , cmap , orientations , rtl_dir
1266+ global CUSTOM_RTL_MIN , CUSTOM_RTL_MAX , CUSTOM_RTL_SPECIAL_CHARS
1267+
12131268 check_crlf_space : bool = False
12141269 # Table 5.4 page 405
12151270 if operator == b"BT" :
@@ -1251,6 +1306,7 @@ def process_operation(operator: bytes, operands: List) -> None:
12511306 ) = cm_stack .pop ()
12521307 except Exception :
12531308 cm_matrix = [1.0 , 0.0 , 0.0 , 1.0 , 0.0 , 0.0 ]
1309+ # rtl_dir = False
12541310 elif operator == b"cm" :
12551311 output += text
12561312 text = ""
@@ -1265,6 +1321,7 @@ def process_operation(operator: bytes, operands: List) -> None:
12651321 ],
12661322 cm_matrix ,
12671323 )
1324+ # rtl_dir = False
12681325 # Table 5.2 page 398
12691326 elif operator == b"Tz" :
12701327 char_scale = float (operands [0 ]) / 100.0
@@ -1276,6 +1333,7 @@ def process_operation(operator: bytes, operands: List) -> None:
12761333 if text != "" :
12771334 output += text # .translate(cmap)
12781335 text = ""
1336+ # rtl_dir = False
12791337 try :
12801338 _space_width = cmaps [operands [0 ]][1 ]
12811339 cmap = (
@@ -1344,8 +1402,42 @@ def process_operation(operator: bytes, operands: List) -> None:
13441402 for x in tt
13451403 ]
13461404 )
1347-
1348- text += "" .join ([cmap [1 ][x ] if x in cmap [1 ] else x for x in t ])
1405+ # "\u0590 - \u08FF \uFB50 - \uFDFF"
1406+ for x in "" .join (
1407+ [cmap [1 ][x ] if x in cmap [1 ] else x for x in t ]
1408+ ):
1409+ xx = ord (x )
1410+ # fmt: off
1411+ if ( # cases where the current inserting order is kept (punctuation,...)
1412+ (xx <= 0x2F ) # punctuations but...
1413+ or (xx >= 0x3A and xx <= 0x40 ) # numbers (x30-39)
1414+ or (xx >= 0x2000 and xx <= 0x206F ) # upper punctuations..
1415+ or (xx >= 0x20A0 and xx <= 0x21FF ) # but (numbers) indices/exponents
1416+ or xx in CUSTOM_RTL_SPECIAL_CHARS # customized....
1417+ ):
1418+ text = x + text if rtl_dir else text + x
1419+ elif ( # right-to-left characters set
1420+ (xx >= 0x0590 and xx <= 0x08FF )
1421+ or (xx >= 0xFB1D and xx <= 0xFDFF )
1422+ or (xx >= 0xFE70 and xx <= 0xFEFF )
1423+ or (xx >= CUSTOM_RTL_MIN and xx <= CUSTOM_RTL_MAX )
1424+ ):
1425+ # print("<",xx,x)
1426+ if not rtl_dir :
1427+ rtl_dir = True
1428+ # print("RTL",text,"*")
1429+ output += text
1430+ text = ""
1431+ text = x + text
1432+ else : # left-to-right
1433+ # print(">",xx,x,end="")
1434+ if rtl_dir :
1435+ rtl_dir = False
1436+ # print("LTR",text,"*")
1437+ output += text
1438+ text = ""
1439+ text = text + x
1440+ # fmt: on
13491441 else :
13501442 return None
13511443 if check_crlf_space :
@@ -1362,40 +1454,44 @@ def process_operation(operator: bytes, operands: List) -> None:
13621454 if o == 0 :
13631455 if deltaY < - 0.8 * f :
13641456 if (output + text )[- 1 ] != "\n " :
1365- text += "\n "
1457+ output += text + "\n "
1458+ text = ""
13661459 elif (
13671460 abs (deltaY ) < f * 0.3
1368- and abs (deltaX ) > current_spacewidth () * f * 10
1461+ and abs (deltaX ) > current_spacewidth () * f * 15
13691462 ):
13701463 if (output + text )[- 1 ] != " " :
13711464 text += " "
13721465 elif o == 180 :
13731466 if deltaY > 0.8 * f :
13741467 if (output + text )[- 1 ] != "\n " :
1375- text += "\n "
1468+ output += text + "\n "
1469+ text = ""
13761470 elif (
13771471 abs (deltaY ) < f * 0.3
1378- and abs (deltaX ) > current_spacewidth () * f * 10
1472+ and abs (deltaX ) > current_spacewidth () * f * 15
13791473 ):
13801474 if (output + text )[- 1 ] != " " :
13811475 text += " "
13821476 elif o == 90 :
13831477 if deltaX > 0.8 * f :
13841478 if (output + text )[- 1 ] != "\n " :
1385- text += "\n "
1479+ output += text + "\n "
1480+ text = ""
13861481 elif (
13871482 abs (deltaX ) < f * 0.3
1388- and abs (deltaY ) > current_spacewidth () * f * 10
1483+ and abs (deltaY ) > current_spacewidth () * f * 15
13891484 ):
13901485 if (output + text )[- 1 ] != " " :
13911486 text += " "
13921487 elif o == 270 :
13931488 if deltaX < - 0.8 * f :
13941489 if (output + text )[- 1 ] != "\n " :
1395- text += "\n "
1490+ output += text + "\n "
1491+ text = ""
13961492 elif (
13971493 abs (deltaX ) < f * 0.3
1398- and abs (deltaY ) > current_spacewidth () * f * 10
1494+ and abs (deltaY ) > current_spacewidth () * f * 15
13991495 ):
14001496 if (output + text )[- 1 ] != " " :
14011497 text += " "
0 commit comments