@@ -96,7 +96,7 @@ def encode_example(self, value: Union[str, bytes, dict, "pdfplumber.pdf.PDF"]) -
96
96
return {"path" : None , "bytes" : value }
97
97
elif pdfplumber is not None and isinstance (value , pdfplumber .pdf .PDF ):
98
98
# convert the pdfplumber.pdf.PDF to bytes
99
- return self . encode_pdfplumber_pdf (value )
99
+ return encode_pdfplumber_pdf (value )
100
100
elif value .get ("path" ) is not None and os .path .isfile (value ["path" ]):
101
101
# we set "bytes": None to not duplicate the data if they're already available locally
102
102
return {"bytes" : None , "path" : value .get ("path" )}
@@ -108,26 +108,6 @@ def encode_example(self, value: Union[str, bytes, dict, "pdfplumber.pdf.PDF"]) -
108
108
f"A pdf sample should have one of 'path' or 'bytes' but they are missing or None in { value } ."
109
109
)
110
110
111
- def encode_pdfplumber_pdf (pdf : "pdfplumber.pdf.PDF" ) -> dict :
112
- """
113
- Encode a pdfplumber.pdf.PDF object into a dictionary.
114
-
115
- If the PDF has an associated file path, returns the path. Otherwise, serializes
116
- the PDF content into bytes.
117
-
118
- Args:
119
- pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.
120
-
121
- Returns:
122
- dict: A dictionary with "path" or "bytes" field.
123
- """
124
- if hasattr (pdf , "stream" ) and hasattr (pdf .stream , "name" ) and pdf .stream .name :
125
- # Return the path if the PDF has an associated file path
126
- return {"path" : pdf .stream .name , "bytes" : None }
127
- else :
128
- # Convert the PDF to bytes if no path is available
129
- return {"path" : None , "bytes" : pdf_to_bytes (pdf )}
130
-
131
111
def decode_example (self , value : dict , token_per_repo_id = None ) -> "pdfplumber.pdf.PDF" :
132
112
"""Decode example pdf file into pdf data.
133
113
@@ -235,3 +215,24 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr
235
215
path_array = pa .array ([None ] * len (storage ), type = pa .string ())
236
216
storage = pa .StructArray .from_arrays ([bytes_array , path_array ], ["bytes" , "path" ], mask = storage .is_null ())
237
217
return array_cast (storage , self .pa_type )
218
+
219
+
220
def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict:
    """
    Encode a pdfplumber.pdf.PDF object into a dictionary.

    If the PDF's underlying stream carries a usable file name, that name is
    returned as the path and no bytes are stored. Otherwise the PDF content is
    serialized into bytes with `pdf_to_bytes`.

    Args:
        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

    Returns:
        dict: A dictionary with the keys "path" and "bytes"; exactly one of
        them is not None.
    """
    # `stream` or `stream.name` may be absent (e.g. an in-memory buffer).
    name = getattr(getattr(pdf, "stream", None), "name", None)

    # A file object's `name` is not always a string path: it is an int when the
    # file was opened from a file descriptor, and bytes when opened with a bytes
    # path. Only path-like names are valid for the "path" field; everything else
    # (including empty names) falls through to byte serialization.
    if isinstance(name, (str, bytes, os.PathLike)) and name:
        # Normalize bytes / PathLike names to `str` so "path" is always a string.
        return {"path": os.fsdecode(name), "bytes": None}

    # Convert the PDF to bytes if no usable path is available
    return {"path": None, "bytes": pdf_to_bytes(pdf)}
0 commit comments