
Commit 6e82724

Lint fixes
1 parent 5ec4967 commit 6e82724

File tree

10 files changed (+404, -474 lines)


olmocr/pipeline.py

Lines changed: 20 additions & 13 deletions
@@ -17,7 +17,7 @@
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from concurrent.futures.process import BrokenProcessPool
 from dataclasses import dataclass
-from functools import cache, partial
+from functools import cache
 from io import BytesIO
 from urllib.parse import urlparse
 

@@ -34,7 +34,6 @@
     check_torch_gpu_available,
 )
 from olmocr.data.renderpdf import render_pdf_to_base64png
-from olmocr.train.dataloader import FrontMatterParser
 from olmocr.filter.filter import Language, PdfFilter
 from olmocr.image_utils import convert_image_to_pdf_bytes, is_jpeg, is_png
 from olmocr.metrics import MetricsKeeper, WorkerTracker
@@ -48,6 +47,7 @@
     get_s3_bytes_with_backoff,
     parse_s3_path,
 )
+from olmocr.train.dataloader import FrontMatterParser
 from olmocr.version import VERSION
 from olmocr.work_queue import LocalWorkQueue, S3WorkQueue, WorkQueue
 

@@ -227,7 +227,9 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
 
     # Enable guided decoding regex if needed
     if args.guided_decoding:
-        query["guided_regex"] = r"---\nprimary_language: (?:[a-z]{2}|null)\nis_rotation_valid: (?:True|False|true|false)\nrotation_correction: (?:0|90|180|270)\nis_table: (?:True|False|true|false)\nis_diagram: (?:True|False|true|false)\n(?:---|---\n[\s\S]+)"
+        query["guided_regex"] = (
+            r"---\nprimary_language: (?:[a-z]{2}|null)\nis_rotation_valid: (?:True|False|true|false)\nrotation_correction: (?:0|90|180|270)\nis_table: (?:True|False|true|false)\nis_diagram: (?:True|False|true|false)\n(?:---|---\n[\s\S]+)"
+        )
 
     logger.info(f"Built page query for {pdf_orig_path}-{page_num}")
 
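Note: the regex in this hunk constrains guided decoding to emit a YAML front matter block before the page text. A minimal sketch of what it accepts (not part of the commit; the sample response is invented for illustration):

```python
# Exercise the guided-decoding regex from the hunk above against a
# hand-written sample response. Only the standard library is needed.
import re

GUIDED_REGEX = (
    r"---\nprimary_language: (?:[a-z]{2}|null)\nis_rotation_valid: (?:True|False|true|false)"
    r"\nrotation_correction: (?:0|90|180|270)\nis_table: (?:True|False|true|false)"
    r"\nis_diagram: (?:True|False|true|false)\n(?:---|---\n[\s\S]+)"
)

sample = (
    "---\n"
    "primary_language: en\n"
    "is_rotation_valid: true\n"
    "rotation_correction: 0\n"
    "is_table: false\n"
    "is_diagram: false\n"
    "---\n"
    "The plain text of the page goes here."
)

# fullmatch succeeds: the front matter fields match in order, and the
# trailing (?:---|---\n[\s\S]+) alternative consumes the body text.
assert re.fullmatch(GUIDED_REGEX, sample) is not None
```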

@@ -247,7 +249,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
             local_anchor_text_len = max(1, local_anchor_text_len // 2)
             logger.info(f"Reducing anchor text len to {local_anchor_text_len} for {pdf_orig_path}-{page_num}")
             raise ValueError("Response exceeded model_max_context, cannot use this response")
-
+
         if base_response_data["choices"][0]["finish_reason"] != "stop":
             local_anchor_text_len = max(1, local_anchor_text_len // 2)
             logger.info(f"Reducing anchor text len to {local_anchor_text_len} for {pdf_orig_path}-{page_num}")
@@ -329,6 +331,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
         is_fallback=True,
     )
 
+
 async def process_pdf(args, worker_id: int, pdf_orig_path: str):
     with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf:
         try:
@@ -586,9 +589,9 @@ async def vllm_server_task(model_name_or_path, args, semaphore):
 
     if args.gpu_memory_utilization is not None:
         cmd.extend(["--gpu-memory-utilization", str(args.gpu_memory_utilization)])
-
+
     if args.max_model_len is not None:
-        cmd.extend(["--max-model-len", str(args.max_model_len)])
+        cmd.extend(["--max-model-len", str(args.max_model_len)])
 
     proc = await asyncio.create_subprocess_exec(
         *cmd,
@@ -1016,7 +1019,11 @@ async def main():
     )
 
     parser.add_argument("--gpu-memory-utilization", type=float, help="Fraction of VRAM vLLM may pre-allocate for KV-cache " "(passed through to vllm serve).")
-    parser.add_argument("--max_model_len", type=int, help="Upper bound (tokens) vLLM will allocate KV-cache for; " "passed through to vllm serve as --max-model-len.",)
+    parser.add_argument(
+        "--max_model_len",
+        type=int,
+        help="Upper bound (tokens) vLLM will allocate KV-cache for; " "passed through to vllm serve as --max-model-len.",
+    )
 
     parser.add_argument("--model_max_context", type=int, default="8192", help="Maximum context length that the model was fine tuned under")
     parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1288)
@@ -1041,7 +1048,7 @@ async def main():
         logger.info(
             "If you run out of GPU memory during start-up or get 'KV cache is larger than available memory' errors, retry with lower values, e.g. --gpu_memory_utilization 0.80 --max_model_len 16384"
         )
-
+
     global workspace_s3, pdf_s3
     # set the global BASE_SERVER_PORT from args
     global BASE_SERVER_PORT
@@ -1227,12 +1234,12 @@ async def main():
 
     # Output finished_on_attempt statistics
     logger.info("\nPages finished by attempt number:")
-    total_finished = sum(total_metrics.get(f'finished_on_attempt_{i}', 0) for i in range(args.max_page_retries))
+    total_finished = sum(total_metrics.get(f"finished_on_attempt_{i}", 0) for i in range(args.max_page_retries))
     cumulative = 0
-
+
     for i in range(args.max_page_retries):
-        if f'finished_on_attempt_{i}' in total_metrics:
-            count = total_metrics[f'finished_on_attempt_{i}']
+        if f"finished_on_attempt_{i}" in total_metrics:
+            count = total_metrics[f"finished_on_attempt_{i}"]
             cumulative += count
             percentage = (count / total_finished * 100) if total_finished > 0 else 0
             cumulative_percentage = (cumulative / total_finished * 100) if total_finished > 0 else 0
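Note: this loop reports, per retry attempt, how many pages first succeeded on that attempt and the running share of all finished pages. A toy run of the same arithmetic (the metrics dict and max_page_retries value are invented):

```python
# Toy reproduction of the statistics loop above, assuming max_page_retries = 3.
total_metrics = {"finished_on_attempt_0": 80, "finished_on_attempt_1": 15, "finished_on_attempt_2": 5}
max_page_retries = 3

total_finished = sum(total_metrics.get(f"finished_on_attempt_{i}", 0) for i in range(max_page_retries))
cumulative = 0
for i in range(max_page_retries):
    if f"finished_on_attempt_{i}" in total_metrics:
        count = total_metrics[f"finished_on_attempt_{i}"]
        cumulative += count
        percentage = (count / total_finished * 100) if total_finished > 0 else 0
        cumulative_percentage = (cumulative / total_finished * 100) if total_finished > 0 else 0
        print(f"Attempt {i}: {count} pages ({percentage:.1f}%), cumulative {cumulative_percentage:.1f}%")

# Output:
# Attempt 0: 80 pages (80.0%), cumulative 80.0%
# Attempt 1: 15 pages (15.0%), cumulative 95.0%
# Attempt 2: 5 pages (5.0%), cumulative 100.0%
```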
@@ -1253,4 +1260,4 @@
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())

olmocr/prompts/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,8 +1,8 @@
 from .prompts import (
     PageResponse,
     build_finetuning_prompt,
-    build_openai_silver_data_prompt,
     build_no_anchoring_yaml_prompt,
+    build_openai_silver_data_prompt,
     extract_raw_text,
     openai_response_format_schema,
 )

olmocr/prompts/prompts.py

Lines changed: 3 additions & 3 deletions
@@ -109,9 +109,9 @@ def build_finetuning_prompt(base_text: str) -> str:
 
 def build_no_anchoring_yaml_prompt() -> str:
     return (
-        f"Attached is one page of a document that you must process. "
-        f"Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n"
-        f"Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
+        "Attached is one page of a document that you must process. "
+        "Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n"
+        "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
     )
 
 
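Note: dropping the `f` prefixes is behavior-preserving here, since none of the three literals contains a placeholder; this is the kind of thing pyflakes/ruff flag as F541 (f-string without placeholders). A quick check:

```python
# An f-string with no placeholders equals the plain literal, so removing
# the prefix cannot change the built prompt.
with_prefix = f"Attached is one page of a document that you must process. "
without_prefix = "Attached is one page of a document that you must process. "
assert with_prefix == without_prefix
```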
