feat: Expose more conversion options (#142)

dolfim-ibm · web-flow · commit 6b3d281f0290 · 2025-04-22T10:41:47.000+02:00
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
diff --git a/docling_serve/datamodel/convert.py b/docling_serve/datamodel/convert.py
@@ -8,6 +8,11 @@
     EasyOcrOptions,
     PdfBackend,
     TableFormerMode,
+    TableStructureOptions,
+)
+from docling.datamodel.settings import (
+    DEFAULT_PAGE_RANGE,
+    PageRange,
 )
 from docling.models.factories import get_ocr_factory
 from docling_core.types.doc import ImageRefMode
@@ -121,16 +126,32 @@ class ConvertDocumentsOptions(BaseModel):
     table_mode: Annotated[
         TableFormerMode,
         Field(
-            TableFormerMode.FAST,
             description=(
                 "Mode to use for table structure, String. "
                 f"Allowed values: {', '.join([v.value for v in TableFormerMode])}. "
                 "Optional, defaults to fast."
             ),
-            examples=[TableFormerMode.FAST],
+            examples=[TableStructureOptions().mode],
             # pattern="fast|accurate",
         ),
-    ] = TableFormerMode.FAST
+    ] = TableStructureOptions().mode
+
+    page_range: Annotated[
+        PageRange,
+        Field(
+            description="Only convert a range of pages. The page number starts at 1.",
+            examples=[(1, 4)],
+        ),
+    ] = DEFAULT_PAGE_RANGE
+
+    document_timeout: Annotated[
+        float,
+        Field(
+            description="The timeout for processing each document, in seconds.",
+            gt=0,
+            le=docling_serve_settings.max_document_timeout,
+        ),
+    ] = docling_serve_settings.max_document_timeout
 
     abort_on_error: Annotated[
         bool,
diff --git a/docling_serve/docling_conversion.py b/docling_serve/docling_conversion.py
@@ -110,6 +110,7 @@ def get_pdf_pipeline_opts(
             ocr_options.lang = request.ocr_lang
 
     pipeline_options = PdfPipelineOptions(
+        document_timeout=request.document_timeout,
         do_ocr=request.do_ocr,
         ocr_options=ocr_options,
         do_table_structure=request.do_table_structure,
@@ -180,6 +181,9 @@ def convert_documents(
     results: Iterator[ConversionResult] = converter.convert_all(
         sources,
         headers=headers,
+        page_range=options.page_range,
+        max_file_size=docling_serve_settings.max_file_size,
+        max_num_pages=docling_serve_settings.max_num_pages,
     )
 
     return results
diff --git a/docling_serve/gradio_ui.py b/docling_serve/gradio_ui.py
@@ -11,6 +11,12 @@
 import gradio as gr
 import httpx
 
+from docling.datamodel.pipeline_options import (
+    PdfBackend,
+    TableFormerMode,
+    TableStructureOptions,
+)
+
 from docling_serve.helper_functions import _to_list_of_strings
 from docling_serve.settings import docling_serve_settings, uvicorn_settings
 
@@ -358,20 +364,22 @@ def process_file(
 
     parameters = {
         "file_sources": files_data,
-        "to_formats": to_formats,
-        "image_export_mode": image_export_mode,
-        "ocr": str(ocr).lower(),
-        "force_ocr": str(force_ocr).lower(),
-        "ocr_engine": ocr_engine,
-        "ocr_lang": _to_list_of_strings(ocr_lang),
-        "pdf_backend": pdf_backend,
-        "table_mode": table_mode,
-        "abort_on_error": str(abort_on_error).lower(),
-        "return_as_file": str(return_as_file).lower(),
-        "do_code_enrichment": str(do_code_enrichment).lower(),
-        "do_formula_enrichment": str(do_formula_enrichment).lower(),
-        "do_picture_classification": str(do_picture_classification).lower(),
-        "do_picture_description": str(do_picture_description).lower(),
+        "options": {
+            "to_formats": to_formats,
+            "image_export_mode": image_export_mode,
+            "ocr": ocr,
+            "force_ocr": force_ocr,
+            "ocr_engine": ocr_engine,
+            "ocr_lang": _to_list_of_strings(ocr_lang),
+            "pdf_backend": pdf_backend,
+            "table_mode": table_mode,
+            "abort_on_error": abort_on_error,
+            "return_as_file": return_as_file,
+            "do_code_enrichment": do_code_enrichment,
+            "do_formula_enrichment": do_formula_enrichment,
+            "do_picture_classification": do_picture_classification,
+            "do_picture_description": do_picture_description,
+        },
     }
 
     try:
@@ -511,7 +519,7 @@ def response_to_output(response, return_as_file):
             with gr.Column(scale=4):
                 url_input = gr.Textbox(
                     label="URL Input Source",
-                    placeholder="https://arxiv.org/pdf/2206.01062",
+                    placeholder="https://arxiv.org/pdf/2501.17887",
                 )
             with gr.Column(scale=1):
                 url_process_btn = gr.Button("Process URL", scale=1)
@@ -530,6 +538,7 @@ def response_to_output(response, return_as_file):
                         ".pptx",
                         ".html",
                         ".xlsx",
+                        ".json",
                         ".asciidoc",
                         ".txt",
                         ".md",
@@ -551,14 +560,14 @@ def response_to_output(response, return_as_file):
             with gr.Column(scale=1):
                 to_formats = gr.CheckboxGroup(
                     [
-                        ("Markdown", "md"),
                         ("Docling (JSON)", "json"),
+                        ("Markdown", "md"),
                         ("HTML", "html"),
                         ("Plain Text", "text"),
                         ("Doc Tags", "doctags"),
                     ],
                     label="To Formats",
-                    value=["md"],
+                    value=["json", "md"],
                 )
             with gr.Column(scale=1):
                 image_export_mode = gr.Radio(
@@ -590,15 +599,17 @@ def response_to_output(response, return_as_file):
                 )
             ocr_engine.change(change_ocr_lang, inputs=[ocr_engine], outputs=[ocr_lang])
         with gr.Row():
-            with gr.Column(scale=2):
+            with gr.Column(scale=4):
                 pdf_backend = gr.Radio(
-                    ["pypdfium2", "dlparse_v1", "dlparse_v2"],
+                    [v.value for v in PdfBackend],
                     label="PDF Backend",
-                    value="dlparse_v2",
+                    value=PdfBackend.DLPARSE_V4.value,
                 )
             with gr.Column(scale=2):
                 table_mode = gr.Radio(
-                    ["fast", "accurate"], label="Table Mode", value="fast"
+                    [(v.value.capitalize(), v.value) for v in TableFormerMode],
+                    label="Table Mode",
+                    value=TableStructureOptions().mode.value,
                 )
             with gr.Column(scale=1):
                 abort_on_error = gr.Checkbox(label="Abort on Error", value=False)
@@ -627,16 +638,16 @@ def response_to_output(response, return_as_file):
 
     # Document output
     with gr.Row(visible=False) as content_output:
+        with gr.Tab("Docling (JSON)"):
+            output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
+        with gr.Tab("Docling-Rendered"):
+            output_json_rendered = gr.HTML(label="Response")
         with gr.Tab("Markdown"):
             output_markdown = gr.Code(
                 language="markdown", wrap_lines=True, show_label=False
             )
         with gr.Tab("Markdown-Rendered"):
             output_markdown_rendered = gr.Markdown(label="Response")
-        with gr.Tab("Docling (JSON)"):
-            output_json = gr.Code(language="json", wrap_lines=True, show_label=False)
-        with gr.Tab("Docling-Rendered"):
-            output_json_rendered = gr.HTML()
         with gr.Tab("HTML"):
             output_html = gr.Code(language="html", wrap_lines=True, show_label=False)
         with gr.Tab("HTML-Rendered"):
diff --git a/docling_serve/settings.py b/docling_serve/settings.py
@@ -1,3 +1,4 @@
+import sys
 from pathlib import Path
 from typing import Optional, Union
 
@@ -38,6 +39,10 @@ class DoclingServeSettings(BaseSettings):
     options_cache_size: int = 2
     allow_external_plugins: bool = False
 
+    max_document_timeout: float = 3_600 * 24 * 7  # 7 days
+    max_num_pages: int = sys.maxsize
+    max_file_size: int = sys.maxsize
+
     cors_origins: list[str] = ["*"]
     cors_methods: list[str] = ["*"]
     cors_headers: list[str] = ["*"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -43,7 +43,8 @@ dependencies = [
 
 [project.optional-dependencies]
 ui = [
-    "gradio~=5.9"
+    "gradio~=5.9",
+    "pydantic<2.11.0",  # fix compatibility between gradio and new pydantic 2.11
 ]
 tesserocr = [
     "tesserocr~=2.7"
diff --git a/tests/test_1-file-all-outputs.py b/tests/test_1-file-all-outputs.py
@@ -92,16 +92,11 @@ def safe_slice(value, length=100):
             msg=f'JSON document should contain \'{{\\n  "schema_name": "DoclingDocument\'". Received: {safe_slice(data["document"]["json_content"])}',
         )
     # HTML check
-    check.is_in(
-        "html_content",
-        data.get("document", {}),
-        msg=f"Response should contain 'html_content' key. Received keys: {list(data.get('document', {}).keys())}",
-    )
     if data.get("document", {}).get("html_content") is not None:
         check.is_in(
-            '<!DOCTYPE html>\n<html lang="en">\n<head>',
+            "<!DOCTYPE html>\n<html>\n<head>",
             data["document"]["html_content"],
-            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
+            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html>'. Received: {safe_slice(data['document']['html_content'])}",
         )
     # Text check
     check.is_in(
@@ -123,7 +118,7 @@ def safe_slice(value, length=100):
     )
     if data.get("document", {}).get("doctags_content") is not None:
         check.is_in(
-            "<document>\n<section_header_level_1><location>",
+            "<doctag><page_header><loc",
             data["document"]["doctags_content"],
-            msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
+            msg=f"DocTags document should contain '<doctag><page_header><loc'. Received: {safe_slice(data['document']['doctags_content'])}",
         )
diff --git a/tests/test_1-url-all-outputs.py b/tests/test_1-url-all-outputs.py
@@ -93,9 +93,9 @@ def safe_slice(value, length=100):
     )
     if data.get("document", {}).get("html_content") is not None:
         check.is_in(
-            '<!DOCTYPE html>\n<html lang="en">\n<head>',
+            "<!DOCTYPE html>\n<html>\n<head>",
             data["document"]["html_content"],
-            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",
+            msg=f"HTML document should contain '<!DOCTYPE html>\\n<html>'. Received: {safe_slice(data['document']['html_content'])}",
         )
     # Text check
     check.is_in(
@@ -117,7 +117,7 @@ def safe_slice(value, length=100):
     )
     if data.get("document", {}).get("doctags_content") is not None:
         check.is_in(
-            "<document>\n<section_header_level_1><location>",
+            "<doctag><page_header><loc",
             data["document"]["doctags_content"],
-            msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",
+            msg=f"DocTags document should contain '<doctag><page_header><loc'. Received: {safe_slice(data['document']['doctags_content'])}",
         )
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,8 @@ dependencies = [`
`43`	`43`
`44`	`44`	`[project.optional-dependencies]`
`45`	`45`	`ui = [`
`46`		`- "gradio~=5.9"`
	`46`	`+ "gradio~=5.9",`
	`47`	`+ "pydantic<2.11.0", # fix compatibility between gradio and new pydantic 2.11`
`47`	`48`	`]`
`48`	`49`	`tesserocr = [`
`49`	`50`	`"tesserocr~=2.7"`
Original file line number	Diff line number	Diff line change
`@@ -93,9 +93,9 @@ def safe_slice(value, length=100):`
`93`	`93`	`)`
`94`	`94`	`if data.get("document", {}).get("html_content") is not None:`
`95`	`95`	`check.is_in(`
`96`		`- '<!DOCTYPE html>\n<html lang="en">\n<head>',`
	`96`	`+ "<!DOCTYPE html>\n<html>\n<head>",`
`97`	`97`	`data["document"]["html_content"],`
`98`		`- msg=f"HTML document should contain '<!DOCTYPE html>\\n<html lang=\"en'>. Received: {safe_slice(data['document']['html_content'])}",`
	`98`	`+ msg=f"HTML document should contain '<!DOCTYPE html>\\n<html>'. Received: {safe_slice(data['document']['html_content'])}",`
`99`	`99`	`)`
`100`	`100`	`# Text check`
`101`	`101`	`check.is_in(`
`@@ -117,7 +117,7 @@ def safe_slice(value, length=100):`
`117`	`117`	`)`
`118`	`118`	`if data.get("document", {}).get("doctags_content") is not None:`
`119`	`119`	`check.is_in(`
`120`		`- "<document>\n<section_header_level_1><location>",`
	`120`	`+ "<doctag><page_header><loc",`
`121`	`121`	`data["document"]["doctags_content"],`
`122`		`- msg=f"DocTags document should contain '<document>\\n<section_header_level_1><location>'. Received: {safe_slice(data['document']['doctags_content'])}",`
	`122`	`+ msg=f"DocTags document should contain '<doctag><page_header><loc'. Received: {safe_slice(data['document']['doctags_content'])}",`
`123`	`123`	`)`