Manager for running millions of PDFs through a batch inference pipeline
@@ -273,6 +273,10 @@ options:
   --markdown            Also write natural text to markdown files preserving the folder structure of the input pdfs
   --model MODEL         List of paths where you can find the model to convert this pdf. You can specify several different paths here, and the script will try to use the
                         one which is fastest to access
+  --gpu-memory-utilization GPU_MEMORY_UTILIZATION
+                        Fraction of VRAM vLLM may pre-allocate for KV-cache (passed through to vllm serve).
+  --max_model_len MAX_MODEL_LEN
+                        Upper bound (tokens) vLLM will allocate KV-cache for; passed through to vllm serve as --max-model-len.
   --model_max_context MODEL_MAX_CONTEXT
                         Maximum context length that the model was fine tuned under
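
For reference, the new knobs compose with the existing flags; a hypothetical invocation (the workspace path and --pdfs glob are assumed from typical olmOCR usage, not shown in this diff):

python -m olmocr.pipeline ./localworkspace --markdown \
    --pdfs tests/*.pdf \
    --gpu-memory-utilization 0.80 \
    --max_model_len 16384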
help="List of paths where you can find the model to convert this pdf. You can specify several different paths here, and the script will try to use the one which is fastest to access",
1009
1015
default="allenai/olmOCR-7B-0225-preview",
1010
1016
)
1017
+
1018
+
parser.add_argument("--gpu-memory-utilization", type=float, help="Fraction of VRAM vLLM may pre-allocate for KV-cache ""(passed through to vllm serve).")
1019
+
parser.add_argument("--max_model_len", type=int, help="Upper bound (tokens) vLLM will allocate KV-cache for; ""passed through to vllm serve as --max-model-len.",)
1020
+
1011
1021
parser.add_argument("--model_max_context", type=int, default="8192", help="Maximum context length that the model was fine tuned under")
1012
1022
parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1288)
1013
1023
parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1)
@@ -1028,6 +1038,10 @@ async def main():
     parser.add_argument("--data-parallel-size", "-dp", type=int, default=1, help="Data parallel size for vLLM")
     args = parser.parse_args()
 
+    logger.info(
+        "If you run out of GPU memory during start-up or get 'KV cache is larger than available memory' errors, retry with lower values, e.g. --gpu-memory-utilization 0.80 --max_model_len 16384"
+    )