#!/bin/bash

# Compresses an OlmOCR model using quantization.
#
# Usage:
#   ./scripts/compress_model.sh <recipe_path> <input_model_path> <output_model_path> [--calibration-pdfs PATTERN]
#
# Positional arguments:
#   recipe_path        quantization recipe passed on to the compression job
#   input_model_path   model to compress
#   output_model_path  destination for the compressed model
#
# Options:
#   --calibration-pdfs PATTERN  glob pattern of PDFs used for calibration
#                               (defaults to DEFAULT_CALIBRATION_PDFS below)

# -e: abort on the first failing command.
# pipefail: a pipeline fails when any stage fails, not only the last one.
set -eo pipefail

# Default calibration PDFs pattern. Kept as a quoted string so it is NOT
# expanded by this shell; it is forwarded verbatim.
DEFAULT_CALIBRATION_PDFS="/weka/oe-data-default/jakep/olmOCR-mix-0225-benchmark_set/*.pdf"

# Parse arguments: the three positional arguments are mandatory.
if [[ $# -lt 3 ]]; then
    {
        echo "Usage: $0 <recipe_path> <input_model_path> <output_model_path> [--calibration-pdfs PATTERN]"
        echo "Example: $0 olmocr/train/quantization_configs/qwen2_5vl_w8a8_int8.yaml ./olmocrv2-base/ s3://ai2-oe-data/jakep/olmocr/compressed-model"
        echo "Example with custom PDFs: $0 recipe.yaml ./model/ s3://output/ --calibration-pdfs '/path/to/pdfs/*.pdf'"
    } >&2
    exit 1
fi

RECIPE="$1"
INPUT_MODEL="$2"
OUTPUT_MODEL="$3"
CALIBRATION_PDFS="$DEFAULT_CALIBRATION_PDFS"

# Check for optional calibration-pdfs argument
shift 3
while [[ $# -gt 0 ]]; do
    case "$1" in
        --calibration-pdfs)
            # Without this guard a missing value makes `shift 2` fail and
            # `set -e` kills the script with no message at all.
            if [[ $# -lt 2 ]]; then
                echo "Error: --calibration-pdfs requires a PATTERN argument" >&2
                exit 1
            fi
            CALIBRATION_PDFS="$2"
            shift 2
            ;;
        *)
            echo "Unknown option: $1" >&2
            exit 1
            ;;
    esac
done

# Check for uncommitted changes.
# The image tag built later embeds the current commit hash, so the working
# tree must match HEAD for that tag to describe what is actually built.

# Refresh cached stat information first; otherwise `git diff-index` can report
# files as modified when only their timestamps changed. Best effort only: a
# refresh failure must not abort the script, the real check is below.
git update-index -q --refresh > /dev/null 2>&1 || true

if ! git diff-index --quiet HEAD --; then
    {
        echo "Error: There are uncommitted changes in the repository."
        echo "Please commit or stash your changes before running the compression."
        echo ""
        echo "Uncommitted changes:"
        git status --short
    } >&2
    exit 1
fi

# Use conda environment Python if available, otherwise use system Python.
# ${CONDA_PREFIX:-} keeps the test safe even if the variable is unset.
if [[ -n "${CONDA_PREFIX:-}" ]]; then
    PYTHON="${CONDA_PREFIX}/bin/python"
    echo "Using conda Python from: $CONDA_PREFIX"
else
    PYTHON="python"
    echo "Warning: No conda environment detected, using system Python"
fi

# Get version from version.py.
# The code passed to -c must start at column 0 (leading whitespace is an
# IndentationError), and $PYTHON is quoted so a prefix containing spaces works.
VERSION=$("$PYTHON" -c 'import olmocr.version; print(olmocr.version.VERSION)')
echo "OlmOCR version: $VERSION"

# Get first 10 characters of git hash.
# Done in two steps (no pipe) so a failing `git rev-parse` aborts the script
# instead of being masked by a successful `cut`.
GIT_HASH=$(git rev-parse HEAD)
GIT_HASH="${GIT_HASH:0:10}"
echo "Git hash: $GIT_HASH"

# Get current git branch name
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD)
echo "Git branch: $GIT_BRANCH"

# Create full image tag: olmocr-compress-<version>-<short hash>
IMAGE_TAG="olmocr-compress-${VERSION}-${GIT_HASH}"
echo "Building Docker image with tag: $IMAGE_TAG"

# Build the Docker image from ./Dockerfile for linux/amd64.
echo "Building Docker image..."
docker build --platform linux/amd64 -f ./Dockerfile -t "$IMAGE_TAG" .

# Get Beaker username.
# The beaker call and the jq extraction are separate steps so that a failing
# `beaker account whoami` aborts the script rather than being hidden behind a
# successful jq.
BEAKER_WHOAMI_JSON=$(beaker account whoami --format json)
BEAKER_USER=$(jq -r '.[0].name' <<< "$BEAKER_WHOAMI_JSON")
# `jq -r` prints the literal string "null" when the field is missing.
if [[ -z "$BEAKER_USER" || "$BEAKER_USER" == "null" ]]; then
    echo "Error: could not determine Beaker user name from 'beaker account whoami'." >&2
    exit 1
fi
echo "Beaker user: $BEAKER_USER"

# Push image to beaker.
# NOTE(review): stderr is discarded and ANY failure of `beaker image create`
# is reported as "already exists"; a genuine push failure (auth, network) is
# indistinguishable here and will only surface when the experiment starts.
echo "Trying to push image to Beaker..."
if ! beaker image create --workspace ai2/oe-data-pdf --name "$IMAGE_TAG" "$IMAGE_TAG" 2> /dev/null; then
    echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
fi

# Create Python script to run beaker experiment.
# The script lives in a private mktemp file (not a predictable /tmp name) and
# is removed by the EXIT trap on every exit path, including failures.
EXPERIMENT_SCRIPT=$(mktemp "${TMPDIR:-/tmp}/run_compress_experiment.XXXXXX")
trap 'rm -f -- "$EXPERIMENT_SCRIPT"' EXIT

# Quoted delimiter: the body is written literally, with no shell expansion
# (so "$AWS_CREDENTIALS_FILE" below reaches the container command untouched).
cat << 'EOF' > "$EXPERIMENT_SCRIPT"
import shlex
import sys
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar

# Get parameters from command line
image_tag = sys.argv[1]
beaker_user = sys.argv[2]
git_branch = sys.argv[3]
git_hash = sys.argv[4]
recipe = sys.argv[5]
input_model = sys.argv[6]
output_model = sys.argv[7]
calibration_pdfs = sys.argv[8]

# Initialize Beaker client
b = Beaker.from_env(default_workspace="ai2/olmocr")

# Check if AWS credentials secret exists
aws_creds_secret = f"{beaker_user}-AWS_CREDENTIALS_FILE"
try:
    # Try to get the secret to see if it exists
    b.secret.get(aws_creds_secret, workspace="ai2/olmocr")
    has_aws_creds = True
    print(f"Found AWS credentials secret: {aws_creds_secret}")
except Exception:
    # Any lookup failure is treated as "no secret"; unlike a bare `except:`
    # this still lets KeyboardInterrupt / SystemExit propagate.
    has_aws_creds = False
    print(f"AWS credentials secret not found: {aws_creds_secret}")

# Build commands for compression job
commands = []
if has_aws_creds:
    commands.extend([
        "mkdir -p ~/.aws",
        'echo "$AWS_CREDENTIALS_FILE" > ~/.aws/credentials'
    ])

# Every user-supplied value is shell-quoted: the commands are joined and run
# through `bash -c`, so spaces or metacharacters in a path would otherwise be
# re-parsed by the shell. Quoting also keeps the calibration glob literal.
compress_command = (
    "python -m olmocr.train.compress_checkpoint"
    f" --recipe {shlex.quote(recipe)}"
    f" {shlex.quote(input_model)} {shlex.quote(output_model)}"
    f" --calibration-pdfs {shlex.quote(calibration_pdfs)}"
)

commands.extend([
    # Install llmcompressor
    "pip install llmcompressor==0.6.0",
    # Run compression
    compress_command,
])

# Build task spec with optional env vars
task_spec_args = {
    "name": "olmocr-compress",
    "image": ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    "command": [
        "bash", "-c",
        " && ".join(commands)
    ],
    "context": TaskContext(
        priority=Priority.normal,
        preemptible=True,
    ),
    "resources": TaskResources(gpu_count=1),
    "constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
    "result": ResultSpec(path="/noop-results"),
}

# Add env vars if AWS credentials exist
if has_aws_creds:
    task_spec_args["env_vars"] = [
        EnvVar(name="AWS_CREDENTIALS_FILE", secret=aws_creds_secret)
    ]

# Create experiment spec
experiment_spec = ExperimentSpec(
    description=f"OlmOCR Model Compression - Branch: {git_branch}, Commit: {git_hash}, Recipe: {recipe}",
    budget="ai2/oe-data",
    tasks=[TaskSpec(**task_spec_args)],
)

# Create the experiment
experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
print(f"Created compression experiment: {experiment.id}")
print(f"View at: https://beaker.org/ex/{experiment.id}")
EOF

# Run the Python script to create the experiment.
# Every argument is quoted so each one arrives as exactly one sys.argv entry
# and the calibration glob is not expanded by this shell.
echo "Creating Beaker experiment..."
echo "Compressing model from: $INPUT_MODEL to: $OUTPUT_MODEL"
echo "Using recipe: $RECIPE"
echo "Using calibration PDFs: $CALIBRATION_PDFS"
"$PYTHON" "$EXPERIMENT_SCRIPT" \
    "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH" \
    "$RECIPE" "$INPUT_MODEL" "$OUTPUT_MODEL" "$CALIBRATION_PDFS"

# The temporary script is cleaned up by the EXIT trap installed above.

echo "Compression experiment submitted successfully!"