Skip to content

Commit 1092213

Browse files
committed
Merge branch 'jakep/new_traininer_nojson_newprompt' into jakep/new_trainer
2 parents 679063a + f014c2a commit 1092213

File tree

2 files changed

+229
-12
lines changed

olmocr/pipeline.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -108,16 +108,8 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
108108
assert image_rotation in [0, 90, 180, 270], "Invalid image rotation provided in build_page_query"
109109

110110
# Allow the page rendering to process in the background while we get the anchor text (which blocks the main thread)
111-
image_base64 = asyncio.to_thread(render_pdf_to_base64png, local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)
111+
image_base64 = await asyncio.to_thread(render_pdf_to_base64png, local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)
112112

113-
# GET ANCHOR TEXT IS NOT THREAD SAFE!! Ahhhh..... don't try to do it
114-
# and it's also CPU bound, so it needs to run in a process pool
115-
loop = asyncio.get_running_loop()
116-
anchor_text = loop.run_in_executor(
117-
process_pool, partial(get_anchor_text, pdf_engine="pdfreport", target_length=target_anchor_text_len), local_pdf_path, page
118-
)
119-
120-
image_base64, anchor_text = await asyncio.gather(image_base64, anchor_text) # type: ignore
121113
if image_rotation != 0:
122114
image_bytes = base64.b64decode(image_base64)
123115
with Image.open(BytesIO(image_bytes)) as img:
@@ -130,14 +122,18 @@ async def build_page_query(local_pdf_path: str, page: int, target_longest_image_
130122
# Encode the rotated image back to base64
131123
image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
132124

125+
instruction_prompt = (f"Attached is one page of a document that you must process. "
126+
f"Just return the plain text representation of this document as if you were reading it naturally. Convert equations to LateX and tables to markdown.\n"
127+
f"Return your output as markdown, with a front matter section on top specifying values for the primary_language, is_rotation_valid, rotation_correction, is_table, and is_diagram parameters.")
128+
133129
return {
134130
"model": "olmocr",
135131
"messages": [
136132
{
137133
"role": "user",
138134
"content": [
139135
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
140-
{"type": "text", "text": build_finetuning_prompt(anchor_text)},
136+
{"type": "text", "text": instruction_prompt},
141137
],
142138
}
143139
],
@@ -1008,8 +1004,9 @@ async def main():
10081004
default="allenai/olmOCR-7B-0225-preview",
10091005
)
10101006
parser.add_argument("--model_max_context", type=int, default="8192", help="Maximum context length that the model was fine tuned under")
1011-
parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1024)
1012-
parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters)", default=6000)
1007+
parser.add_argument("--target_longest_image_dim", type=int, help="Dimension on longest side to use for rendering the pdf pages", default=1280)
1008+
parser.add_argument("--target_anchor_text_len", type=int, help="Maximum amount of anchor text to use (characters)", default=-1)
1009+
parser.add_argument("--guided_decoding", action="store_true", help="Enable guided decoding for model YAML type outputs")
10131010

10141011
# Beaker/job running stuff
10151012
parser.add_argument("--beaker", action="store_true", help="Submit this job to beaker instead of running locally")
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
#!/bin/bash

# Runs an olmocr-bench run using the full pipeline (no fallback).
# Without a model parameter (default behavior), uses the default image from Hugging Face:
#   ./scripts/run_benchmark.sh
# With a model parameter, for testing custom models:
#   ./scripts/run_benchmark.sh --model your-model-name

set -e

# Parse command line arguments
MODEL=""
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            MODEL="$2"
            shift 2
            ;;
        *)
            echo "Unknown option: $1"
            echo "Usage: $0 [--model MODEL_NAME]"
            exit 1
            ;;
    esac
done

# Refuse to run with uncommitted changes: benchmark results must be
# reproducible from the recorded git hash.
if ! git diff-index --quiet HEAD --; then
    echo "Error: There are uncommitted changes in the repository."
    echo "Please commit or stash your changes before running the benchmark."
    echo ""
    echo "Uncommitted changes:"
    git status --short
    exit 1
fi

# Use conda environment Python if available, otherwise use system Python
if [ -n "$CONDA_PREFIX" ]; then
    PYTHON="$CONDA_PREFIX/bin/python"
    echo "Using conda Python from: $CONDA_PREFIX"
else
    PYTHON="python"
    echo "Warning: No conda environment detected, using system Python"
fi

# Get version from version.py
VERSION=$("$PYTHON" -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "OlmOCR version: $VERSION"

# Get first 10 characters of git hash
GIT_HASH=$(git rev-parse HEAD | cut -c1-10)
echo "Git hash: $GIT_HASH"

# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Create full image tag
IMAGE_TAG="olmocr-benchmark-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"

# Build the Docker image
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .

# Get Beaker username
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
echo "Beaker user: $BEAKER_USER"

# Push image to beaker; if a previous run already pushed this exact
# version+hash tag, reuse the existing image rather than failing.
echo "Trying to push image to Beaker..."
if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG" 2>/dev/null; then
    echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
fi

# Write the experiment-submission helper to a unique temp file and make sure
# it is removed on any exit path (set -e aborts included).
EXPERIMENT_SCRIPT=$(mktemp /tmp/run_benchmark_experiment.XXXXXX.py)
trap 'rm -f "$EXPERIMENT_SCRIPT"' EXIT

cat << 'EOF' > "$EXPERIMENT_SCRIPT"
import sys
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar

# Get image tag, beaker user, git branch, git hash, and optional model from command line
image_tag = sys.argv[1]
beaker_user = sys.argv[2]
git_branch = sys.argv[3]
git_hash = sys.argv[4]
model = sys.argv[5] if len(sys.argv) > 5 else None

# Initialize Beaker client
b = Beaker.from_env(default_workspace="ai2/olmocr")

# Build the pipeline command with optional model parameter
pipeline_cmd = "python -m olmocr.pipeline ./localworkspace --guided_decoding --markdown --pdfs ./olmOCR-bench/bench_data/pdfs/**/*.pdf"
if model:
    pipeline_cmd += f" --model {model}"

# Check if AWS credentials secret exists
aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE"
try:
    # Try to get the secret to see if it exists
    b.secret.get(aws_creds_secret, workspace="ai2/olmocr")
    has_aws_creds = True
    print(f"Found AWS credentials secret: {aws_creds_secret}")
except Exception:
    # Any failure here means the secret is unusable; fall back to running
    # without AWS credentials rather than aborting the whole submission.
    has_aws_creds = False
    print(f"AWS credentials secret not found: {aws_creds_secret}")

# First experiment: Original benchmark job
commands = []
if has_aws_creds:
    commands.extend([
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
    ])
commands.extend([
    "git clone https://huggingface.co/datasets/allenai/olmOCR-bench",
    "cd olmOCR-bench && git lfs pull && cd ..",
    pipeline_cmd,
    "python olmocr/bench/scripts/workspace_to_bench.py localworkspace/ olmOCR-bench/bench_data/olmocr --bench-path ./olmOCR-bench/",
    "python -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data"
])

# Build task spec with optional env vars
task_spec_args = {
    "name": "olmocr-benchmark",
    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    "command": [
        "bash", "-c",
        " && ".join(commands)
    ],
    "context": TaskContext(
        priority=Priority.normal,
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
}

# Add env vars if AWS credentials exist
if has_aws_creds:
    task_spec_args["env_vars"] = [
        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
    ]

# Create first experiment spec
experiment_spec = ExperimentSpec(
    description=f"OlmOCR Benchmark Run - Branch: {git_branch}, Commit: {git_hash}",
    budget="ai2/oe-data",
    tasks=[TaskSpec(**task_spec_args)],
)

# Create the first experiment
experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
print(f"Created benchmark experiment: {experiment.id}")
print(f"View at: https://beaker.org/ex/{experiment.id}")
print("-------")
print("")

# Second experiment: Performance test job
perf_pipeline_cmd = "python -m olmocr.pipeline ./localworkspace --guided_decoding --markdown --pdfs s3://ai2-oe-data/jakep/olmocr/olmOCR-mix-0225/benchmark_set/*.pdf"
if model:
    perf_pipeline_cmd += f" --model {model}"

perf_commands = []
if has_aws_creds:
    perf_commands.extend([
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
    ])
perf_commands.append(perf_pipeline_cmd)

# Build performance task spec
perf_task_spec_args = {
    "name": "olmocr-performance",
    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    "command": [
        "bash", "-c",
        " && ".join(perf_commands)
    ],
    "context": TaskContext(
        priority=Priority.normal,
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
}

# Add env vars if AWS credentials exist
if has_aws_creds:
    perf_task_spec_args["env_vars"] = [
        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
    ]

# Create performance experiment spec
perf_experiment_spec = ExperimentSpec(
    description=f"OlmOCR Performance Test - Branch: {git_branch}, Commit: {git_hash}",
    budget="ai2/oe-data",
    tasks=[TaskSpec(**perf_task_spec_args)],
)

# Create the performance experiment
perf_experiment = b.experiment.create(spec=perf_experiment_spec, workspace="ai2/olmocr")
print(f"Created performance experiment: {perf_experiment.id}")
print(f"View at: https://beaker.org/ex/{perf_experiment.id}")
EOF

# Run the Python script to create the experiments
echo "Creating Beaker experiments..."
if [ -n "$MODEL" ]; then
    echo "Using model: $MODEL"
    "$PYTHON" "$EXPERIMENT_SCRIPT" "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH" "$MODEL"
else
    "$PYTHON" "$EXPERIMENT_SCRIPT" "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH"
fi

echo "Benchmark experiments submitted successfully!"

0 commit comments

Comments
 (0)