@@ -96,7 +96,7 @@ def encode_example(self, value: Union[str, bytes, dict, "pdfplumber.pdf.PDF"]) -
9696 return {"path" : None , "bytes" : value }
9797 elif pdfplumber is not None and isinstance (value , pdfplumber .pdf .PDF ):
9898 # convert the pdfplumber.pdf.PDF to bytes
99- return self . encode_pdfplumber_pdf (value )
99+ return encode_pdfplumber_pdf (value )
100100 elif value .get ("path" ) is not None and os .path .isfile (value ["path" ]):
101101 # we set "bytes": None to not duplicate the data if they're already available locally
102102 return {"bytes" : None , "path" : value .get ("path" )}
@@ -108,26 +108,6 @@ def encode_example(self, value: Union[str, bytes, dict, "pdfplumber.pdf.PDF"]) -
108108 f"A pdf sample should have one of 'path' or 'bytes' but they are missing or None in { value } ."
109109 )
110110
111- def encode_pdfplumber_pdf (pdf : "pdfplumber.pdf.PDF" ) -> dict :
112- """
113- Encode a pdfplumber.pdf.PDF object into a dictionary.
114-
115- If the PDF has an associated file path, returns the path. Otherwise, serializes
116- the PDF content into bytes.
117-
118- Args:
119- pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.
120-
121- Returns:
122- dict: A dictionary with "path" or "bytes" field.
123- """
124- if hasattr (pdf , "stream" ) and hasattr (pdf .stream , "name" ) and pdf .stream .name :
125- # Return the path if the PDF has an associated file path
126- return {"path" : pdf .stream .name , "bytes" : None }
127- else :
128- # Convert the PDF to bytes if no path is available
129- return {"path" : None , "bytes" : pdf_to_bytes (pdf )}
130-
131111 def decode_example (self , value : dict , token_per_repo_id = None ) -> "pdfplumber.pdf.PDF" :
132112 """Decode example pdf file into pdf data.
133113
@@ -235,3 +215,24 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.ListArr
235215 path_array = pa .array ([None ] * len (storage ), type = pa .string ())
236216 storage = pa .StructArray .from_arrays ([bytes_array , path_array ], ["bytes" , "path" ], mask = storage .is_null ())
237217 return array_cast (storage , self .pa_type )
218+
219+
def encode_pdfplumber_pdf(pdf: "pdfplumber.pdf.PDF") -> dict:
    """
    Encode a pdfplumber.pdf.PDF object into a dictionary.

    If the PDF's underlying stream carries a usable file path, that path is
    returned as a string and no bytes are produced. Otherwise the PDF content
    is serialized into bytes with ``pdf_to_bytes``.

    Args:
        pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

    Returns:
        dict: A dictionary with the keys "path" and "bytes"; exactly one of
        them is not None.
    """
    # Tolerate objects without a `stream`, and streams without a `name`
    # (e.g. in-memory buffers), by defaulting to None at each step.
    stream_name = getattr(getattr(pdf, "stream", None), "name", None)

    # A file object's `name` is not always a string path: it is the integer
    # file descriptor when the file was opened from a descriptor, and it may
    # be a path-like object or bytes when that is what was passed to `open`.
    # Only path-like values are usable as a "path"; normalize them to `str`.
    if stream_name and isinstance(stream_name, (str, bytes, os.PathLike)):
        return {"path": os.fsdecode(stream_name), "bytes": None}

    # No usable path is available: fall back to serializing the PDF content.
    return {"path": None, "bytes": pdf_to_bytes(pdf)}
0 commit comments