1
+ import base64
1
2
import importlib
2
3
import json
3
4
import logging
4
5
import ssl
5
6
import tempfile
7
+ import time
6
8
from pathlib import Path
7
9
8
10
import certifi
@@ -149,6 +151,11 @@ def set_outputs_visibility_direct(x, y):
149
151
return content , file
150
152
151
153
154
def set_task_id_visibility(x):
    """Build a ``gr.Row`` update whose ``visible`` flag is ``x``.

    Args:
        x: Value forwarded unchanged as the row's ``visible`` argument.

    Returns:
        A new ``gr.Row`` constructed with ``visible=x``.
    """
    return gr.Row(visible=x)
157
+
158
+
152
159
def set_outputs_visibility_process (x ):
153
160
content = gr .Row (visible = not x )
154
161
file = gr .Row (visible = x )
@@ -160,6 +167,7 @@ def set_download_button_label(label_text: gr.State):
160
167
161
168
162
169
def clear_outputs ():
170
+ task_id_rendered = ""
163
171
markdown_content = ""
164
172
json_content = ""
165
173
json_rendered_content = ""
@@ -168,6 +176,7 @@ def clear_outputs():
168
176
doctags_content = ""
169
177
170
178
return (
179
+ task_id_rendered ,
171
180
markdown_content ,
172
181
markdown_content ,
173
182
json_content ,
@@ -210,6 +219,51 @@ def change_ocr_lang(ocr_engine):
210
219
return "english,chinese"
211
220
212
221
222
def wait_task_finish(task_id: str, return_as_file: bool):
    """Poll the conversion task until it finishes, then fetch its result.

    The status endpoint ``/v1alpha/status/poll/{task_id}?wait=5`` is queried
    repeatedly. Polling stops as soon as the reported ``task_status`` is
    ``"success"``; the statuses ``"failure"`` and ``"revoked"`` abort with an
    error. Any other status triggers a 5 second pause before the next poll.

    Args:
        task_id: Identifier of the task to wait for, interpolated into the
            status and result URLs.
        return_as_file: Forwarded unchanged to ``response_to_output``.

    Returns:
        Whatever ``response_to_output(response, return_as_file)`` returns for
        the ``/v1alpha/result/{task_id}`` response.

    Raises:
        gr.Error: If polling fails (request error, HTTP error status,
            malformed payload), if the task ends as ``"failure"`` or
            ``"revoked"``, or if the result cannot be fetched or converted.
    """
    ssl_ctx = get_ssl_context()
    task_status = ""

    while True:
        try:
            response = httpx.get(
                f"{get_api_endpoint()}/v1alpha/status/poll/{task_id}?wait=5",
                verify=ssl_ctx,
                timeout=15,
            )
            # Surface HTTP errors explicitly instead of an opaque
            # KeyError on the missing "task_status" field.
            response.raise_for_status()
            task_status = response.json()["task_status"]
            if task_status in ("failure", "revoked"):
                raise RuntimeError(f"Task failed with status {task_status!r}")
        except Exception as e:
            logger.error(f"Error processing file(s): {e}")
            raise gr.Error(
                f"Error processing file(s): {e}", print_exception=False
            ) from e

        if task_status == "success":
            # Leave immediately: sleeping here would only delay the result.
            break
        time.sleep(5)

    try:
        response = httpx.get(
            f"{get_api_endpoint()}/v1alpha/result/{task_id}",
            timeout=15,
            verify=ssl_ctx,
        )
        return response_to_output(response, return_as_file)
    except Exception as e:
        logger.error(f"Error getting task result: {e}")
        # Keep the original user-facing message but chain the cause so the
        # underlying failure is not lost.
        raise gr.Error(
            f"Error getting task result, conversion finished with status: {task_status}"
        ) from e
265
+
266
+
213
267
def process_url (
214
268
input_sources ,
215
269
to_formats ,
@@ -256,7 +310,7 @@ def process_url(
256
310
try :
257
311
ssl_ctx = get_ssl_context ()
258
312
response = httpx .post (
259
- f"{ get_api_endpoint ()} /v1alpha/convert/source" ,
313
+ f"{ get_api_endpoint ()} /v1alpha/convert/source/async " ,
260
314
json = parameters ,
261
315
verify = ssl_ctx ,
262
316
timeout = 60 ,
@@ -269,12 +323,19 @@ def process_url(
269
323
error_message = data .get ("detail" , "An unknown error occurred." )
270
324
logger .error (f"Error processing file: { error_message } " )
271
325
raise gr .Error (f"Error processing file: { error_message } " , print_exception = False )
272
- output = response_to_output (response , return_as_file )
273
- return output
326
+
327
+ task_id_rendered = response .json ()["task_id" ]
328
+ return task_id_rendered
329
+
330
+
331
def file_to_base64(file):
    """Return the contents of ``file`` as a base64-encoded UTF-8 string.

    Args:
        file: Object whose ``name`` attribute is the path of the file to read.

    Returns:
        The base64 encoding of the file's raw bytes, decoded to ``str``.
    """
    raw_bytes = Path(file.name).read_bytes()
    return base64.b64encode(raw_bytes).decode("utf-8")
274
335
275
336
276
337
def process_file (
277
- files ,
338
+ file ,
278
339
to_formats ,
279
340
image_export_mode ,
280
341
ocr ,
@@ -290,12 +351,13 @@ def process_file(
290
351
do_picture_classification ,
291
352
do_picture_description ,
292
353
):
293
- if not files or len ( files ) == 0 or files [ 0 ] == "" :
354
+ if not file or file == "" :
294
355
logger .error ("No files provided." )
295
356
raise gr .Error ("No files provided." , print_exception = False )
296
- files_data = [( "files" , (file . name , open ( file . name , "rb" ))) for file in files ]
357
+ files_data = [{ "base64_string" : file_to_base64 (file ), "filename" : file . name } ]
297
358
298
359
parameters = {
360
+ "file_sources" : files_data ,
299
361
"to_formats" : to_formats ,
300
362
"image_export_mode" : image_export_mode ,
301
363
"ocr" : str (ocr ).lower (),
@@ -315,9 +377,8 @@ def process_file(
315
377
try :
316
378
ssl_ctx = get_ssl_context ()
317
379
response = httpx .post (
318
- f"{ get_api_endpoint ()} /v1alpha/convert/file" ,
319
- files = files_data ,
320
- data = parameters ,
380
+ f"{ get_api_endpoint ()} /v1alpha/convert/source/async" ,
381
+ json = parameters ,
321
382
verify = ssl_ctx ,
322
383
timeout = 60 ,
323
384
)
@@ -329,8 +390,9 @@ def process_file(
329
390
error_message = data .get ("detail" , "An unknown error occurred." )
330
391
logger .error (f"Error processing file: { error_message } " )
331
392
raise gr .Error (f"Error processing file: { error_message } " , print_exception = False )
332
- output = response_to_output (response , return_as_file )
333
- return output
393
+
394
+ task_id_rendered = response .json ()["task_id" ]
395
+ return task_id_rendered
334
396
335
397
336
398
def response_to_output (response , return_as_file ):
@@ -444,24 +506,24 @@ def response_to_output(response, return_as_file):
444
506
)
445
507
446
508
# URL Processing Tab
447
- with gr .Tab ("Convert URL(s) " ):
509
+ with gr .Tab ("Convert URL" ):
448
510
with gr .Row ():
449
511
with gr .Column (scale = 4 ):
450
512
url_input = gr .Textbox (
451
- label = "Input Sources (comma-separated URLs) " ,
513
+ label = "URL Input Source " ,
452
514
placeholder = "https://arxiv.org/pdf/2206.01062" ,
453
515
)
454
516
with gr .Column (scale = 1 ):
455
- url_process_btn = gr .Button ("Process URL(s) " , scale = 1 )
517
+ url_process_btn = gr .Button ("Process URL" , scale = 1 )
456
518
url_reset_btn = gr .Button ("Reset" , scale = 1 )
457
519
458
520
# File Processing Tab
459
- with gr .Tab ("Convert File(s) " ):
521
+ with gr .Tab ("Convert File" ):
460
522
with gr .Row ():
461
523
with gr .Column (scale = 4 ):
462
524
file_input = gr .File (
463
525
elem_id = "file_input_zone" ,
464
- label = "Upload Files " ,
526
+ label = "Upload File " ,
465
527
file_types = [
466
528
".pdf" ,
467
529
".docx" ,
@@ -476,11 +538,11 @@ def response_to_output(response, return_as_file):
476
538
".png" ,
477
539
".gif" ,
478
540
],
479
- file_count = "multiple " ,
541
+ file_count = "single " ,
480
542
scale = 4 ,
481
543
)
482
544
with gr .Column (scale = 1 ):
483
- file_process_btn = gr .Button ("Process File(s) " , scale = 1 )
545
+ file_process_btn = gr .Button ("Process File" , scale = 1 )
484
546
file_reset_btn = gr .Button ("Reset" , scale = 1 )
485
547
486
548
# Options
@@ -540,7 +602,9 @@ def response_to_output(response, return_as_file):
540
602
)
541
603
with gr .Column (scale = 1 ):
542
604
abort_on_error = gr .Checkbox (label = "Abort on Error" , value = False )
543
- return_as_file = gr .Checkbox (label = "Return as File" , value = False )
605
+ return_as_file = gr .Checkbox (
606
+ label = "Return as File" , visible = False , value = False
607
+ ) # Disable until async handle output as file
544
608
with gr .Row ():
545
609
with gr .Column ():
546
610
do_code_enrichment = gr .Checkbox (
@@ -557,6 +621,10 @@ def response_to_output(response, return_as_file):
557
621
label = "Enable picture description" , value = False
558
622
)
559
623
624
+ # Task id output
625
+ with gr .Row (visible = False ) as task_id_output :
626
+ task_id_rendered = gr .Textbox (label = "Task id" , interactive = False )
627
+
560
628
# Document output
561
629
with gr .Row (visible = False ) as content_output :
562
630
with gr .Tab ("Markdown" ):
@@ -586,36 +654,34 @@ def response_to_output(response, return_as_file):
586
654
# UI Actions #
587
655
##############
588
656
657
+ # Disabled until the async flow can handle returning the output as a file
589
658
# Handle Return as File
590
- url_input .change (
591
- auto_set_return_as_file ,
592
- inputs = [url_input , file_input , image_export_mode ],
593
- outputs = [return_as_file ],
594
- )
595
- file_input .change (
596
- auto_set_return_as_file ,
597
- inputs = [url_input , file_input , image_export_mode ],
598
- outputs = [return_as_file ],
599
- )
600
- image_export_mode .change (
601
- auto_set_return_as_file ,
602
- inputs = [url_input , file_input , image_export_mode ],
603
- outputs = [return_as_file ],
604
- )
659
+ # url_input.change(
660
+ # auto_set_return_as_file,
661
+ # inputs=[url_input, file_input, image_export_mode],
662
+ # outputs=[return_as_file],
663
+ # )
664
+ # file_input.change(
665
+ # auto_set_return_as_file,
666
+ # inputs=[url_input, file_input, image_export_mode],
667
+ # outputs=[return_as_file],
668
+ # )
669
+ # image_export_mode.change(
670
+ # auto_set_return_as_file,
671
+ # inputs=[url_input, file_input, image_export_mode],
672
+ # outputs=[return_as_file],
673
+ # )
605
674
606
675
# URL processing
607
676
url_process_btn .click (
608
677
set_options_visibility , inputs = [false_bool ], outputs = [options ]
609
678
).then (
610
679
set_download_button_label , inputs = [processing_text ], outputs = [download_file_btn ]
611
- ).then (
612
- set_outputs_visibility_process ,
613
- inputs = [return_as_file ],
614
- outputs = [content_output , file_output ],
615
680
).then (
616
681
clear_outputs ,
617
682
inputs = None ,
618
683
outputs = [
684
+ task_id_rendered ,
619
685
output_markdown ,
620
686
output_markdown_rendered ,
621
687
output_json ,
@@ -625,6 +691,10 @@ def response_to_output(response, return_as_file):
625
691
output_text ,
626
692
output_doctags ,
627
693
],
694
+ ).then (
695
+ set_task_id_visibility ,
696
+ inputs = [true_bool ],
697
+ outputs = [task_id_output ],
628
698
).then (
629
699
process_url ,
630
700
inputs = [
@@ -644,6 +714,16 @@ def response_to_output(response, return_as_file):
644
714
do_picture_classification ,
645
715
do_picture_description ,
646
716
],
717
+ outputs = [
718
+ task_id_rendered ,
719
+ ],
720
+ ).then (
721
+ set_outputs_visibility_process ,
722
+ inputs = [return_as_file ],
723
+ outputs = [content_output , file_output ],
724
+ ).then (
725
+ wait_task_finish ,
726
+ inputs = [task_id_rendered , return_as_file ],
647
727
outputs = [
648
728
output_markdown ,
649
729
output_markdown_rendered ,
@@ -674,21 +754,20 @@ def response_to_output(response, return_as_file):
674
754
set_outputs_visibility_direct ,
675
755
inputs = [false_bool , false_bool ],
676
756
outputs = [content_output , file_output ],
677
- ).then (clear_url_input , inputs = None , outputs = [url_input ])
757
+ ).then (set_task_id_visibility , inputs = [false_bool ], outputs = [task_id_output ]).then (
758
+ clear_url_input , inputs = None , outputs = [url_input ]
759
+ )
678
760
679
761
# File processing
680
762
file_process_btn .click (
681
763
set_options_visibility , inputs = [false_bool ], outputs = [options ]
682
764
).then (
683
765
set_download_button_label , inputs = [processing_text ], outputs = [download_file_btn ]
684
- ).then (
685
- set_outputs_visibility_process ,
686
- inputs = [return_as_file ],
687
- outputs = [content_output , file_output ],
688
766
).then (
689
767
clear_outputs ,
690
768
inputs = None ,
691
769
outputs = [
770
+ task_id_rendered ,
692
771
output_markdown ,
693
772
output_markdown_rendered ,
694
773
output_json ,
@@ -698,6 +777,10 @@ def response_to_output(response, return_as_file):
698
777
output_text ,
699
778
output_doctags ,
700
779
],
780
+ ).then (
781
+ set_task_id_visibility ,
782
+ inputs = [true_bool ],
783
+ outputs = [task_id_output ],
701
784
).then (
702
785
process_file ,
703
786
inputs = [
@@ -717,6 +800,16 @@ def response_to_output(response, return_as_file):
717
800
do_picture_classification ,
718
801
do_picture_description ,
719
802
],
803
+ outputs = [
804
+ task_id_rendered ,
805
+ ],
806
+ ).then (
807
+ set_outputs_visibility_process ,
808
+ inputs = [return_as_file ],
809
+ outputs = [content_output , file_output ],
810
+ ).then (
811
+ wait_task_finish ,
812
+ inputs = [task_id_rendered , return_as_file ],
720
813
outputs = [
721
814
output_markdown ,
722
815
output_markdown_rendered ,
@@ -747,4 +840,6 @@ def response_to_output(response, return_as_file):
747
840
set_outputs_visibility_direct ,
748
841
inputs = [false_bool , false_bool ],
749
842
outputs = [content_output , file_output ],
750
- ).then (clear_file_input , inputs = None , outputs = [file_input ])
843
+ ).then (set_task_id_visibility , inputs = [false_bool ], outputs = [task_id_output ]).then (
844
+ clear_file_input , inputs = None , outputs = [file_input ]
845
+ )
0 commit comments