Skip to content

Commit 6b3d281

Browse files
authored
feat: Expose more conversion options (#142)
Signed-off-by: Michele Dolfi <[email protected]>
1 parent b598872 commit 6b3d281

File tree

8 files changed

+1083
-955
lines changed

8 files changed

+1083
-955
lines changed

docling_serve/datamodel/convert.py

Lines changed: 24 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,11 @@
88
EasyOcrOptions,
99
PdfBackend,
1010
TableFormerMode,
11+
TableStructureOptions,
12+
)
13+
from docling.datamodel.settings import (
14+
DEFAULT_PAGE_RANGE,
15+
PageRange,
1116
)
1217
from docling.models.factories import get_ocr_factory
1318
from docling_core.types.doc import ImageRefMode
@@ -121,16 +126,32 @@ class ConvertDocumentsOptions(BaseModel):
121126
table_mode: Annotated[
122127
TableFormerMode,
123128
Field(
124-
TableFormerMode.FAST,
125129
description=(
126130
"Mode to use for table structure, String. "
127131
f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
128132
"Optional, defaults to fast."
129133
),
130-
examples=[TableFormerMode.FAST],
134+
examples=[TableStructureOptions().mode],
131135
# pattern="fast|accurate",
132136
),
133-
] = TableFormerMode.FAST
137+
] = TableStructureOptions().mode
138+
139+
page_range: Annotated[
140+
PageRange,
141+
Field(
142+
description="Only convert a range of pages. The page number starts at 1.",
143+
examples=[(1, 4)],
144+
),
145+
] = DEFAULT_PAGE_RANGE
146+
147+
document_timeout: Annotated[
148+
float,
149+
Field(
150+
description="The timeout for processing each document, in seconds.",
151+
gt=0,
152+
le=docling_serve_settings.max_document_timeout,
153+
),
154+
] = docling_serve_settings.max_document_timeout
134155

135156
abort_on_error: Annotated[
136157
bool,

docling_serve/docling_conversion.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def get_pdf_pipeline_opts(
110110
ocr_options.lang = request.ocr_lang
111111

112112
pipeline_options = PdfPipelineOptions(
113+
document_timeout=request.document_timeout,
113114
do_ocr=request.do_ocr,
114115
ocr_options=ocr_options,
115116
do_table_structure=request.do_table_structure,
@@ -180,6 +181,9 @@ def convert_documents(
180181
results: Iterator[ConversionResult] = converter.convert_all(
181182
sources,
182183
headers=headers,
184+
page_range=options.page_range,
185+
max_file_size=docling_serve_settings.max_file_size,
186+
max_num_pages=docling_serve_settings.max_num_pages,
183187
)
184188

185189
return results

docling_serve/gradio_ui.py

Lines changed: 36 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111
import gradio as gr
1212
import httpx
1313

14+
from docling.datamodel.pipeline_options import (
15+
PdfBackend,
16+
TableFormerMode,
17+
TableStructureOptions,
18+
)
19+
1420
from docling_serve.helper_functions import _to_list_of_strings
1521
from docling_serve.settings import docling_serve_settings, uvicorn_settings
1622

@@ -358,20 +364,22 @@ def process_file(
358364

359365
parameters = {
360366
"file_sources": files_data,
361-
"to_formats": to_formats,
362-
"image_export_mode": image_export_mode,
363-
"ocr": str(ocr).lower(),
364-
"force_ocr": str(force_ocr).lower(),
365-
"ocr_engine": ocr_engine,
366-
"ocr_lang": _to_list_of_strings(ocr_lang),
367-
"pdf_backend": pdf_backend,
368-
"table_mode": table_mode,
369-
"abort_on_error": str(abort_on_error).lower(),
370-
"return_as_file": str(return_as_file).lower(),
371-
"do_code_enrichment": str(do_code_enrichment).lower(),
372-
"do_formula_enrichment": str(do_formula_enrichment).lower(),
373-
"do_picture_classification": str(do_picture_classification).lower(),
374-
"do_picture_description": str(do_picture_description).lower(),
367+
"options": {
368+
"to_formats": to_formats,
369+
"image_export_mode": image_export_mode,
370+
"ocr": ocr,
371+
"force_ocr": force_ocr,
372+
"ocr_engine": ocr_engine,
373+
"ocr_lang": _to_list_of_strings(ocr_lang),
374+
"pdf_backend": pdf_backend,
375+
"table_mode": table_mode,
376+
"abort_on_error": abort_on_error,
377+
"return_as_file": return_as_file,
378+
"do_code_enrichment": do_code_enrichment,
379+
"do_formula_enrichment": do_formula_enrichment,
380+
"do_picture_classification": do_picture_classification,
381+
"do_picture_description": do_picture_description,
382+
},
375383
}
376384

377385
try:
@@ -511,7 +519,7 @@ def response_to_output(response, return_as_file):
511519
with gr.Column(scale=4):
512520
url_input = gr.Textbox(
513521
label="URL Input Source",
514-
placeholder="https://arxiv.org/pdf/2206.01062",
522+
placeholder="https://arxiv.org/pdf/2501.17887",
515523
)
516524
with gr.Column(scale=1):
517525
url_process_btn = gr.Button("Process URL", scale=1)
@@ -530,6 +538,7 @@ def response_to_output(response, return_as_file):
530538
".pptx",
531539
".html",
532540
".xlsx",
541+
".json",
533542
".asciidoc",
534543
".txt",
535544
".md",
@@ -551,14 +560,14 @@ def response_to_output(response, return_as_file):
551560
with gr.Column(scale=1):
552561
to_formats = gr.CheckboxGroup(
553562
[
554-
("Markdown", "md"),
555563
("Docling (JSON)", "json"),
564+
("Markdown", "md"),
556565
("HTML", "html"),
557566
("Plain Text", "text"),
558567
("Doc Tags", "doctags"),
559568
],
560569
label="To Formats",
561-
value=["md"],
570+
value=["json", "md"],
562571
)
563572
with gr.Column(scale=1):
564573
image_export_mode = gr.Radio(
@@ -590,15 +599,17 @@ def response_to_output(response, return_as_file):
590599
)
591600
ocr_engine.change(change_ocr_lang, inputs=[ocr_engine], outputs=[ocr_lang])
592601
with gr.Row():
593-
with gr.Column(scale=2):
602+
with gr.Column(scale=4):
594603
pdf_backend = gr.Radio(
595-
["pypdfium2", "dlparse_v1", "dlparse_v2"],
604+
[v.value for v in PdfBackend],
596605
label="PDF Backend",
597-
value="dlparse_v2",
606+
value=PdfBackend.DLPARSE_V4.value,
598607
)
599608
with gr.Column(scale=2):
600609
table_mode = gr.Radio(
601-
["fast", "accurate"], label="Table Mode", value="fast"
610+
[(v.value.capitalize(), v.value) for v in TableFormerMode],
611+
label="Table Mode",
612+
value=TableStructureOptions().mode.value,
602613
)
603614
with gr.Column(scale=1):
604615
abort_on_error = gr.Checkbox(label="Abort on Error", value=False)
@@ -627,16 +638,16 @@ def response_to_output(response, return_as_file):
627638

628639
# Document output
629640
with gr.Row(visible=False) as content_output:
641+
with gr.Tab("Docling (JSON)"):
642+
output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
643+
with gr.Tab("Docling-Rendered"):
644+
output_json_rendered = gr.HTML(label="Response")
630645
with gr.Tab("Markdown"):
631646
output_markdown = gr.Code(
632647
language="markdown", wrap_lines=True, show_label=False
633648
)
634649
with gr.Tab("Markdown-Rendered"):
635650
output_markdown_rendered = gr.Markdown(label="Response")
636-
with gr.Tab("Docling (JSON)"):
637-
output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
638-
with gr.Tab("Docling-Rendered"):
639-
output_json_rendered = gr.HTML()
640651
with gr.Tab("HTML"):
641652
output_html = gr.Code(language="html", wrap_lines=True, show_label=False)
642653
with gr.Tab("HTML-Rendered"):

docling_serve/settings.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import sys
12
from pathlib import Path
23
from typing import Optional, Union
34

@@ -38,6 +39,10 @@ class DoclingServeSettings(BaseSettings):
3839
options_cache_size: int = 2
3940
allow_external_plugins: bool = False
4041

42+
max_document_timeout: float = 3_600 * 24 * 7 # 7 days
43+
max_num_pages: int = sys.maxsize
44+
max_file_size: int = sys.maxsize
45+
4146
cors_origins: list[str] = ["*"]
4247
cors_methods: list[str] = ["*"]
4348
cors_headers: list[str] = ["*"]

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ dependencies = [
4343

4444
[project.optional-dependencies]
4545
ui = [
46-
"gradio~=5.9"
46+
"gradio~=5.9",
47+
"pydantic<2.11.0", # fix compatibility between gradio and new pydantic 2.11
4748
]
4849
tesserocr = [
4950
"tesserocr~=2.7"

tests/test_1-file-all-outputs.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -92,16 +92,11 @@ def safe_slice(value, length=100):
9292
msg=f'JSON document should contain \'{{\\n "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
9393
)
9494
# HTML check
95-
check.is_in(
96-
"html_content",
97-
data.get("document", {}),
98-
msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
99-
)
10095
if data.get("document", {}).get("html_content") is not None:
10196
check.is_in(
102-
'<!DOCTYPE html>\n<html lang="en">\n<head>',
97+
"<!DOCTYPE html>\n<html>\n<head>",
10398
data["document"]["html_content"],
104-
msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
99+
msg=f"HTML document should contain '<!DOCTYPE html>\\n<html>'. Received: {safe_slice(data['document']['html_content'])}",
105100
)
106101
# Text check
107102
check.is_in(
@@ -123,7 +118,7 @@ def safe_slice(value, length=100):
123118
)
124119
if data.get("document", {}).get("doctags_content") is not None:
125120
check.is_in(
126-
"<document>\n<section_header_level_1><location>",
121+
"<doctag><page_header><loc",
127122
data["document"]["doctags_content"],
128-
msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
123+
msg=f"DocTags document should contain '<doctag><page_header><loc'. Received: {safe_slice(data['document']['doctags_content'])}",
129124
)

tests/test_1-url-all-outputs.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,9 @@ def safe_slice(value, length=100):
9393
)
9494
if data.get("document", {}).get("html_content") is not None:
9595
check.is_in(
96-
'<!DOCTYPE html>\n<html lang="en">\n<head>',
96+
"<!DOCTYPE html>\n<html>\n<head>",
9797
data["document"]["html_content"],
98-
msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
98+
msg=f"HTML document should contain '<!DOCTYPE html>\\n<html>'. Received: {safe_slice(data['document']['html_content'])}",
9999
)
100100
# Text check
101101
check.is_in(
@@ -117,7 +117,7 @@ def safe_slice(value, length=100):
117117
)
118118
if data.get("document", {}).get("doctags_content") is not None:
119119
check.is_in(
120-
"<document>\n<section_header_level_1><location>",
120+
"<doctag><page_header><loc",
121121
data["document"]["doctags_content"],
122-
msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
122+
msg=f"DocTags document should contain '<doctag><page_header><loc'. Received: {safe_slice(data['document']['doctags_content'])}",
123123
)

0 commit comments

Comments
 (0)