Manager for running millions of PDFs through a batch inference pipeline
@@ -273,6 +273,10 @@ options:
   --markdown            Also write natural text to markdown files preserving the folder structure of the input pdfs
   --model MODEL         List of paths where you can find the model to convert this pdf. You can specify several different paths here, and the script will try to use the
                         one which is fastest to access
+  --gpu-memory-utilization GPU_MEMORY_UTILIZATION
+                        Fraction of VRAM vLLM may pre-allocate for KV-cache (passed through to vllm serve).
+  --max_model_len MAX_MODEL_LEN
+                        Upper bound (tokens) vLLM will allocate KV-cache for; passed through to vllm serve as --max-model-len.
   --model_max_context MODEL_MAX_CONTEXT
                         Maximum context length that the model was fine tuned under
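
For reference, the new knobs compose with the existing flags; a hypothetical invocation (the workspace path and --pdfs glob are assumed from typical olmOCR usage, not shown in this diff):

python -m olmocr.pipeline ./localworkspace --markdown \
    --pdfs tests/*.pdf \
    --gpu-memory-utilization 0.80 \
    --max_model_len 16384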
help="List of paths where you can find the model to convert this pdf. You can specify several different paths here, and the script will try to use the one which is fastest to access",
1009
1015
default="allenai/olmOCR-7B-0225-preview",
1010
1016
)
1017
+
1018
+
parser.add_argument("--gpu-memory-utilization", type=float, help="Fraction of VRAM vLLM may pre-allocate for KV-cache ""(passed through to vllm serve).")
1019
+
parser.add_argument("--max_model_len", type=int, help="Upper bound (tokens) vLLM will allocate KV-cache for; ""passed through to vllm serve as --max-model-len.",)
1020
+
1011
1021
parser.add_argument("--model_max_context", type=int, default="8192", help="Maximum context length that the model was fine tuned under")
1012
1022
parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1288)
1013
1023
parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1)
@@ -1028,6 +1038,10 @@ async def main():
     parser.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
     args = parser.parse_args()
 
+    logger.info(
+        "If you run out of GPU memory during start-up or get 'KV cache is larger than available memory' errors, retry with lower values, e.g. --gpu-memory-utilization 0.80 --max_model_len 16384"
+    )