Skip to content

Commit 587b73f

Browse files
committed
Try with more aggressive anchor changing
1 parent 8f5d5bd commit 587b73f

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

olmocr/pipeline.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ class PageResult:
106 106
107 107
108 108   async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, target_anchor_text_len: int, image_rotation: int = 0) -> dict:
109     - MAX_TOKENS = 5000
    109 + MAX_TOKENS = 4500
110 110   assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query"
111 111
112 112   # Allow the page rendering to process in the background while we get the anchor text (which blocks the main thread)
@@ -294,6 +294,10 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
294 294   raise
295 295   except json.JSONDecodeError as e:
296 296   logger.warning(f"JSON decode error on attempt {attempt} for {pdf_orig_path}-{page_num}: {e}")
    297 +
    298 + local_anchor_text_len = max(1, local_anchor_text_len // 2)
    299 + logger.info(f"Reducing anchor text len to {local_anchor_text_len} for {pdf_orig_path}-{page_num}")
    300 +
297 301   attempt += 1
298 302   except ValueError as e:
299 303   logger.warning(f"ValueError on attempt {attempt} for {pdf_orig_path}-{page_num}: {type(e)} - {e}")

0 commit comments

Comments
 (0)