16
16
from gradio .data_classes import FileData
17
17
import numpy as np
18
18
19
+ from io import BytesIO
20
+ import base64
21
+
19
22
from fastchat .constants import (
20
23
TEXT_MODERATION_MSG ,
21
24
IMAGE_MODERATION_MSG ,
@@ -217,29 +220,38 @@ def wrap_pdfchat_query(query, document):
217
220
218
221
# def parse_pdf(file_path):
219
222
# from llama_parse import LlamaParse
223
+ # from llama_index.core.schema import ImageDocument, TextNode
224
+
225
+ # from PIL import Image
220
226
221
- # assert (
222
- # "LLAMA_CLOUD_API_KEY" in os.environ
223
- # ), "Make sure to specify LlamaParse API key."
224
-
225
- # for _ in range(LLAMA_PARSE_MAX_RETRY):
226
- # try:
227
- # documents = LlamaParse(
228
- # result_type="markdown",
229
- # verbose=True,
230
- # languages=list(LLAMAPARSE_SUPPORTED_LANGS.values()),
231
- # accurate_mode=True,
232
- # ).load_data(file_path)
233
- # assert len(documents) > 0
234
- # break
235
- # except AssertionError as e:
236
- # continue
237
-
238
- # output = "\n".join(
239
- # [f"Page {i+1}:\n{doc.text}\n" for i, doc in enumerate(documents)]
227
+ # parser = LlamaParse(
228
+ # api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
229
+ # result_type="markdown",
240
230
# )
241
231
242
- # return output
232
+ # def get_image_nodes(json_objs: List[dict], download_path: str):
233
+ # image_dicts = parser.get_images(json_objs, download_path=download_path)
234
+ # return [ImageDocument(image_path=image_dict["path"]) for image_dict in image_dicts]
235
+
236
+ # json_objs = parser.get_json_result(file_path)
237
+ # json_list = json_objs[0]["pages"]
238
+
239
+ # text = ""
240
+ # for page in json_list:
241
+ # text += f"Page {page['page']}:\n{page['md']}\n"
242
+ # if (page['images']):
243
+ # for i, image in enumerate(page['images']):
244
+ # text += f"page{page['page']}_figure{i + 1}\n"
245
+
246
+ # image_documents = get_image_nodes(json_objs, ".")
247
+ # images = []
248
+
249
+ # for image_doc in image_documents:
250
+ # image_path = image_doc.image_path
251
+ # image = Image.open(image_path)
252
+ # images.append(image)
253
+
254
+ # return text, images
243
255
244
256
245
257
# Presumably the maximum number of PDF parse attempts.
# NOTE(review): the requests-based parse_pdf in this file does not read this
# constant — confirm it is still used elsewhere before keeping it.
PDFPARSE_MAX_RETRY = 2
@@ -259,29 +271,48 @@ def wrap_pdfchat_query(query, document):
259
271
"languages" : "," .join (PDFPARSE_SUPPORTED_LANGS .values ()),
260
272
}
261
273
274
def convert_base64_to_pil_image(b64_string):
    """Decode a base64-encoded image and open it with PIL.

    Args:
        b64_string: Base64 text (or bytes) holding the encoded image file.

    Returns:
        The object returned by ``PIL.Image.open`` for the decoded bytes.
    """
    # PIL is imported lazily, only when a conversion is actually requested.
    from PIL import Image

    # Decode -> wrap in an in-memory stream -> hand the stream to PIL.
    return Image.open(BytesIO(base64.b64decode(b64_string)))
262
282
263
283
def parse_pdf(file_path):
    """Convert a PDF to markdown and images through the hosted marker API.

    The file at ``file_path`` is uploaded as multipart form data. The
    ``request_check_url`` from the submission response is then polled every
    2 seconds, for at most 300 polls, until the job status is ``"complete"``.

    Args:
        file_path: Path of the PDF file to upload.

    Returns:
        A ``(output_md, output_images)`` tuple: the ``"markdown"`` field of
        the final response, and a list of PIL images decoded from the base64
        values of its ``"images"`` mapping.

    Raises:
        requests.HTTPError: If the submission or a poll returns an HTTP
            error status.
        TimeoutError: If the job is still not complete after the last poll.
    """
    import requests

    url = "https://www.datalab.to/api/v1/marker"
    headers = {"X-Api-Key": os.getenv("X-Api-Key")}

    # Upload inside ``with`` so the file handle is closed as soon as the
    # request has been sent, even if the request raises.
    with open(file_path, "rb") as pdf_file:
        form_data = {
            "file": ("test.pdf", pdf_file, "application/pdf"),
            "langs": (None, "English"),
            "force_ocr": (None, False),
            "paginate": (None, False),
            "output_format": (None, "markdown"),
            "use_llm": (None, True),
            "strip_existing_ocr": (None, False),
            "disable_image_extraction": (None, False),
        }
        response = requests.post(url, files=form_data, headers=headers)

    # Surface HTTP failures directly instead of an opaque KeyError below.
    response.raise_for_status()
    data = response.json()

    max_polls = 300
    check_url = data["request_check_url"]

    for _ in range(max_polls):
        time.sleep(2)
        # The check endpoint is called with the same auth headers.
        response = requests.get(check_url, headers=headers)
        response.raise_for_status()
        data = response.json()

        if data["status"] == "complete":
            break
    else:
        # Polling ran out: do not read results from an unfinished job.
        raise TimeoutError(
            f"PDF parsing did not complete after {max_polls} polls: {check_url}"
        )

    output_md = data["markdown"]
    output_images = [
        convert_base64_to_pil_image(b64_image)
        for b64_image in data["images"].values()
    ]

    return output_md, output_images
287
318
0 commit comments