Skip to content

Commit 4dbbf91

Browse files
committed
Compression script
1 parent feb2dab commit 4dbbf91

File tree

1 file changed

+178
-0
lines changed

1 file changed

+178
-0
lines changed

scripts/compress_model.sh

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
#!/bin/bash

# Compresses an OlmOCR model using quantization
# Usage: ./scripts/compress_model.sh <recipe_path> <input_model_path> <output_model_path> [--calibration-pdfs PATTERN]

# -e: abort on the first unhandled failing command.
# -o pipefail: a pipeline fails if ANY stage fails, so a failure on the left
#   side of a pipe is not masked by the exit status of the last stage.
set -eo pipefail

# Default calibration PDFs pattern (used when --calibration-pdfs is not given).
# Kept as a literal glob string; it is not expanded by this script.
readonly DEFAULT_CALIBRATION_PDFS="/weka/oe-data-default/jakep/olmOCR-mix-0225-benchmark_set/*.pdf"
10+
# Parse arguments
# Three positional arguments are required; anything after them must be a
# recognised option. Usage and error text go to stderr.
if [[ $# -lt 3 ]]; then
  {
    echo "Usage: $0 <recipe_path> <input_model_path> <output_model_path> [--calibration-pdfs PATTERN]"
    echo "Example: $0 olmocr/train/quantization_configs/qwen2_5vl_w8a8_int8.yaml ./olmocrv2-base/ s3://ai2-oe-data/jakep/olmocr/compressed-model"
    echo "Example with custom PDFs: $0 recipe.yaml ./model/ s3://output/ --calibration-pdfs '/path/to/pdfs/*.pdf'"
  } >&2
  exit 1
fi

RECIPE="$1"
INPUT_MODEL="$2"
OUTPUT_MODEL="$3"
CALIBRATION_PDFS="$DEFAULT_CALIBRATION_PDFS"

# Check for optional calibration-pdfs argument
shift 3
while [[ $# -gt 0 ]]; do
  case "$1" in
    --calibration-pdfs)
      # Without this guard a trailing "--calibration-pdfs" makes `shift 2`
      # fail, and the script exits under `set -e` with no message at all.
      if [[ $# -lt 2 || -z "$2" ]]; then
        echo "Error: --calibration-pdfs requires a PATTERN argument" >&2
        exit 1
      fi
      CALIBRATION_PDFS="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done
38+
# Check for uncommitted changes
# `git diff-index` compares against cached stat info and does not refresh it,
# so a file that was merely touched can be reported as modified. Refresh the
# index first; `--refresh` exits non-zero when files really differ, hence the
# `|| true` so that `set -e` leaves the decision to the diff-index check below.
git update-index -q --refresh >/dev/null 2>&1 || true

if ! git diff-index --quiet HEAD --; then
  {
    echo "Error: There are uncommitted changes in the repository."
    echo "Please commit or stash your changes before running the compression."
    echo ""
    echo "Uncommitted changes:"
    git status --short
  } >&2
  exit 1
fi
48+
# Choose the interpreter stored in PYTHON: the active conda environment's
# bin/python when CONDA_PREFIX is non-empty, otherwise "python" from PATH.
PYTHON="python"
if [[ -z "$CONDA_PREFIX" ]]; then
  echo "Warning: No conda environment detected, using system Python"
else
  PYTHON="$CONDA_PREFIX/bin/python"
  echo "Using conda Python from: $CONDA_PREFIX"
fi
57+
# Get version from version.py
# "$PYTHON" is quoted so an interpreter path containing spaces still works.
VERSION=$("$PYTHON" -c 'import olmocr.version; print(olmocr.version.VERSION)')
if [[ -z "$VERSION" ]]; then
  echo "Error: olmocr.version.VERSION is empty; cannot build an image tag." >&2
  exit 1
fi
echo "OlmOCR version: $VERSION"

# Get first 10 characters of git hash
# Done with parameter expansion rather than `git rev-parse HEAD | cut`, so a
# failing `git rev-parse` aborts the script instead of being hidden behind the
# exit status of `cut` and producing an empty hash.
GIT_FULL_HASH=$(git rev-parse HEAD)
GIT_HASH="${GIT_FULL_HASH:0:10}"
echo "Git hash: $GIT_HASH"

# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Create full image tag
IMAGE_TAG="olmocr-compress-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"
73+
# Build the Docker image
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .

# Get Beaker username
# The two commands are run separately so a failing `beaker` call aborts the
# script on its own. `// empty` makes jq print nothing (rather than the literal
# string "null") when the name field is missing.
BEAKER_WHOAMI_JSON=$(beaker account whoami --format json)
BEAKER_USER=$(jq -r '.[0].name // empty' <<<"$BEAKER_WHOAMI_JSON")
if [[ -z "$BEAKER_USER" ]]; then
  echo "Error: Could not determine Beaker user from 'beaker account whoami'." >&2
  exit 1
fi
echo "Beaker user: $BEAKER_USER"

# Push image to beaker
# Best-effort: a failure here does not abort the script, because the image may
# already exist from an earlier run. Beaker's own stderr is left visible so
# that other failures (auth, network, ...) are not silently reported as
# "already exists".
# NOTE(review): the image is created in workspace ai2/oe-data-pdf while the
# experiment below is created in ai2/olmocr — confirm this is intended.
echo "Trying to push image to Beaker..."
if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG"; then
  echo "Warning: Could not create Beaker image $IMAGE_TAG (it may already exist; see the error above). Using existing image." >&2
fi
87+
# Create Python script to run beaker experiment
# The script is written to a unique mktemp file (not a fixed /tmp name) and is
# removed by an EXIT trap, so it is cleaned up even when submission fails.
EXPERIMENT_SCRIPT=$(mktemp "${TMPDIR:-/tmp}/run_compress_experiment.XXXXXX")
trap 'rm -f -- "$EXPERIMENT_SCRIPT"' EXIT

# Quoted 'EOF': the heredoc body is written verbatim, with no shell expansion.
cat << 'EOF' > "$EXPERIMENT_SCRIPT"
import shlex
import sys
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar

# Get parameters from command line
if len(sys.argv) != 9:
    sys.exit(f"Expected 8 arguments, got {len(sys.argv) - 1}")

(
    image_tag,
    beaker_user,
    git_branch,
    git_hash,
    recipe,
    input_model,
    output_model,
    calibration_pdfs,
) = sys.argv[1:9]

# Initialize Beaker client
b = Beaker.from_env(default_workspace="ai2/olmocr")

# Check if AWS credentials secret exists
aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE"
try:
    # Try to get the secret to see if it exists
    b.secret.get(aws_creds_secret, workspace="ai2/olmocr")
    has_aws_creds = True
    print(f"Found AWS credentials secret: {aws_creds_secret}")
except Exception as secret_err:
    # `except Exception` (not a bare `except:`) so Ctrl-C / SystemExit still
    # propagate; the underlying reason is printed because a lookup can fail
    # for reasons other than the secret being absent.
    has_aws_creds = False
    print(f"AWS credentials secret not found: {aws_creds_secret} ({type(secret_err).__name__}: {secret_err})")

# Build commands for compression job
commands = []
if has_aws_creds:
    commands.extend([
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
    ])

# The commands are joined into a single `bash -c` string, so every
# user-supplied value is shell-quoted. Values made only of safe characters
# (e.g. ./model/ or s3://bucket/path) are left as-is; anything else, such as
# a glob pattern or a path with spaces, is single-quoted and reaches the
# compression module literally, with no shell expansion inside the job.
compress_command = " ".join([
    "python -m olmocr.train.compress_checkpoint",
    "--recipe", shlex.quote(recipe),
    shlex.quote(input_model),
    shlex.quote(output_model),
    "--calibration-pdfs", shlex.quote(calibration_pdfs),
])

commands.extend([
    # Install llmcompressor
    "pip install llmcompressor==0.6.0",
    # Run compression
    compress_command,
])

# Build task spec with optional env vars
task_spec_args = {
    "name": "olmocr-compress",
    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    "command": [
        "bash", "-c",
        " && ".join(commands)
    ],
    "context": TaskContext(
        priority=Priority.normal,
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
}

# Add env vars if AWS credentials exist
if has_aws_creds:
    task_spec_args["env_vars"] = [
        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
    ]

# Create experiment spec
experiment_spec = ExperimentSpec(
    description=f"OlmOCR Model Compression - Branch: {git_branch}, Commit: {git_hash}, Recipe: {recipe}",
    budget="ai2/oe-data",
    tasks=[TaskSpec(**task_spec_args)],
)

# Create the experiment
experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
print(f"Created compression experiment: {experiment.id}")
print(f"View at: https://beaker.org/ex/{experiment.id}")
EOF

# Run the Python script to create the experiment
# Every argument is quoted so values containing spaces or glob characters are
# passed through as single, unexpanded words.
echo "Creating Beaker experiment..."
echo "Compressing model from: $INPUT_MODEL to: $OUTPUT_MODEL"
echo "Using recipe: $RECIPE"
echo "Using calibration PDFs: $CALIBRATION_PDFS"
"$PYTHON" "$EXPERIMENT_SCRIPT" "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH" "$RECIPE" "$INPUT_MODEL" "$OUTPUT_MODEL" "$CALIBRATION_PDFS"

# The temporary script is removed by the EXIT trap installed above.

echo "Compression experiment submitted successfully!"

0 commit comments

Comments
 (0)