Skip to content

Commit 287c827

Browse files
committed
Starting to clean up and merge YAML front matter stuff in
1 parent 1092213 commit 287c827

File tree

3 files changed

+29
-15
lines changed

3 files changed

+29
-15
lines changed

olmocr/pipeline.py

Lines changed: 20 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,11 @@
3434
check_torch_gpu_available,
3535
)
3636
from olmocr.data.renderpdf import render_pdf_to_base64png
37+
from olmocr.train.dataloader import FrontMatterParser
3738
from olmocr.filter.filter import Language, PdfFilter
3839
from olmocr.image_utils import convert_image_to_pdf_bytes, is_jpeg, is_png
3940
from olmocr.metrics import MetricsKeeper, WorkerTracker
40-
from olmocr.prompts import PageResponse, build_finetuning_prompt
41+
from olmocr.prompts import PageResponse, build_no_anchoring_yaml_prompt
4142
from olmocr.prompts.anchor import get_anchor_text
4243
from olmocr.s3_utils import (
4344
download_directory,
@@ -103,7 +104,7 @@ class PageResult:
103104
is_fallback: bool
104105

105106

106-
async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, target_anchor_text_len: int, image_rotation: int = 0) -> dict:
107+
async def build_page_query(local_pdf_path: str, page: int, target_longest_image_dim: int, image_rotation: int = 0) -> dict:
107108
MAX_TOKENS = 4500
108109
assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query"
109110

@@ -122,18 +123,14 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
122123
# Encode the rotated image back to base64
123124
image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
124125

125-
instruction_prompt = (f"Attached is one page of a document that you must process. "
126-
f"Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n"
127-
f"Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters.")
128-
129126
return {
130127
"model": "olmocr",
131128
"messages": [
132129
{
133130
"role": "user",
134131
"content": [
135132
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
136-
{"type": "text", "text": instruction_prompt},
133+
{"type": "text", "text": build_no_anchoring_yaml_prompt()},
137134
],
138135
}
139136
],
@@ -211,26 +208,27 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
211208
COMPLETION_URL = f"http://localhost:{BASE_SERVER_PORT}/v1/chat/completions"
212209
MAX_RETRIES = args.max_page_retries
213210
TEMPERATURE_BY_ATTEMPT = [0.1, 0.1, 0.2, 0.3, 0.5, 0.8, 0.1, 0.8]
214-
FORCE_NO_DOCUMENT_ANCHORING_BY_ATTEMPT = [False, False, False, False, False, False, True, True]
215-
assert len(TEMPERATURE_BY_ATTEMPT) == len(FORCE_NO_DOCUMENT_ANCHORING_BY_ATTEMPT)
216211
exponential_backoffs = 0
217212
local_anchor_text_len = args.target_anchor_text_len
218213
local_image_rotation = 0
219214
attempt = 0
220215
await tracker.track_work(worker_id, f"{pdf_orig_path}-{page_num}", "started")
221216

222217
while attempt < MAX_RETRIES:
223-
lookup_attempt = min(attempt, len(FORCE_NO_DOCUMENT_ANCHORING_BY_ATTEMPT) - 1)
218+
lookup_attempt = min(attempt, len(TEMPERATURE_BY_ATTEMPT) - 1)
224219
query = await build_page_query(
225220
pdf_local_path,
226221
page_num,
227222
args.target_longest_image_dim,
228-
local_anchor_text_len if not FORCE_NO_DOCUMENT_ANCHORING_BY_ATTEMPT[lookup_attempt] else -1,
229223
image_rotation=local_image_rotation,
230224
)
231225
# Change temperature as number of attempts increases to overcome repetition issues at expense of quality
232226
query["temperature"] = TEMPERATURE_BY_ATTEMPT[lookup_attempt]
233227

228+
# Enable guided decoding regex if needed
229+
if args.guided_decoding:
230+
query["guided_regex"] = r"---\nprimary_language: (?:[a-z]{2}|null)\nis_rotation_valid: (?:True|False|true|false)\nrotation_correction: (?:0|90|180|270)\nis_table: (?:True|False|true|false)\nis_diagram: (?:True|False|true|false)\n(?:---|---\n[\s\S]+)"
231+
234232
logger.info(f"Built page query for {pdf_orig_path}-{page_num}")
235233

236234
try:
@@ -249,14 +247,22 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
249247
local_anchor_text_len = max(1, local_anchor_text_len // 2)
250248
logger.info(f"Reducing anchor text len to {local_anchor_text_len} for {pdf_orig_path}-{page_num}")
251249
raise ValueError("Response exceeded model_max_context, cannot use this response")
250+
251+
if base_response_data["choices"][0]["finish_reason"] != "stop":
252+
local_anchor_text_len = max(1, local_anchor_text_len // 2)
253+
logger.info(f"Reducing anchor text len to {local_anchor_text_len} for {pdf_orig_path}-{page_num}")
254+
raise ValueError("Response did not finish with reason code 'stop', cannot use this response")
252255

253256
metrics.add_metrics(
254257
server_input_tokens=base_response_data["usage"].get("prompt_tokens", 0),
255258
server_output_tokens=base_response_data["usage"].get("completion_tokens", 0),
256259
)
257260

258-
model_response_json = json.loads(base_response_data["choices"][0]["message"]["content"])
259-
page_response = PageResponse(**model_response_json)
261+
model_response_markdown = base_response_data["choices"][0]["message"]["content"]
262+
263+
parser = FrontMatterParser(front_matter_class=PageResponse)
264+
front_matter, text = parser._extract_front_matter_and_text(model_response_markdown)
265+
page_response = parser._parse_front_matter(front_matter, text)
260266

261267
if not page_response.is_rotation_valid and attempt < MAX_RETRIES - 1:
262268
logger.info(
@@ -323,7 +329,6 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
323329
is_fallback=True,
324330
)
325331

326-
327332
async def process_pdf(args, worker_id: int, pdf_orig_path: str):
328333
with tempfile.NamedTemporaryFile("wb+", suffix=".pdf", delete=False) as tf:
329334
try:
@@ -1005,7 +1010,7 @@ async def main():
10051010
)
10061011
parser.add_argument("--model_max_context", type=int, default="8192", help="Maximum context length that the model was fine tuned under")
10071012
parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1280)
1008-
parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters)", default=-1)
1013+
parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters), not used for new models", default=-1)
10091014
parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs")
10101015

10111016
# Beaker/job running stuff

olmocr/prompts/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
PageResponse,
33
build_finetuning_prompt,
44
build_openai_silver_data_prompt,
5+
build_no_anchoring_yaml_prompt,
56
extract_raw_text,
67
openai_response_format_schema,
78
)

olmocr/prompts/prompts.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,14 @@ def build_finetuning_prompt(base_text: str) -> str:
107107
)
108108

109109

110+
def build_no_anchoring_yaml_prompt() -> str:
    """Return the fixed instruction prompt requesting markdown with front matter.

    The prompt takes no inputs (no anchor text is embedded). It instructs the
    model to transcribe the attached page as markdown and to begin the output
    with a front matter section specifying primary_language,
    is_rotation_valid, rotation_correction, is_table, and is_diagram.

    Returns:
        The prompt text: two lines separated by a single newline.
    """
    # Plain string literals: nothing is interpolated here, so the f-prefix
    # the pieces used to carry was redundant and has been removed.
    # NOTE(review): the wording (including the "LateX" spelling) is kept
    # byte-for-byte; presumably the model expects this exact text — confirm
    # before editing it.
    return (
        "Attached is one page of a document that you must process. "
        "Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n"
        "Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters."
    )
116+
117+
110118
# Extracts the anchor text component from an existing prompt string
111119
def extract_raw_text(prompt: str) -> str:
112120
pattern = r"RAW_TEXT_START\s*\n(.*?)\nRAW_TEXT_END"

0 commit comments

Comments
 (0)