@@ -28,8 +28,9 @@ IMAGE_TAG="olmocr-train-${VERSION}-${GIT_HASH}"
28
28
echo " Building Docker image with tag: $IMAGE_TAG "
29
29
30
30
# Build the Docker image
31
- echo " Building Docker image..."
32
- docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .
31
# NOTE: the Docker build step below is disabled, so this run reuses an image
# that was built earlier instead of the tag computed from VERSION/GIT_HASH.
# The reused tag defaults to the previously hard-coded value and can be
# overridden by exporting PREBUILT_IMAGE_TAG, so switching images does not
# require editing this script.
# echo "Building Docker image..."
# docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .
IMAGE_TAG="${PREBUILT_IMAGE_TAG:-olmocr-train-0.1.76-9f0f912101}"
# Report the tag that is actually used from here on, because it replaces the
# tag that was announced before this point.
echo "Using prebuilt Docker image with tag: $IMAGE_TAG"
33
34
34
35
# Get Beaker username
35
36
# Ask the beaker CLI for the current account as JSON and keep the `name` field
# of the first array element (`jq -r` prints it as a raw, unquoted string).
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
# jq prints nothing when it receives no input and the literal "null" when the
# field is missing; either would produce an invalid "<user>/<image>" reference
# later, so stop here with a clear message instead.
if [ -z "$BEAKER_USER" ] || [ "$BEAKER_USER" = "null" ]; then
  echo "Error: could not determine Beaker username from 'beaker account whoami'." >&2
  exit 1
fi
@@ -41,26 +42,78 @@ if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TA
41
42
echo " Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
42
43
fi
43
44
44
- gantry run \
45
- --description " OlmOCR Training Run" \
46
- --task-name " ${run_name} " \
47
- --allow-dirty \
48
- --host-networking \
49
- --workspace ai2/olmocr \
50
- --beaker-image $BEAKER_USER /$IMAGE_TAG \
51
- --priority normal \
52
- --gpus 1 \
53
- --preemptible \
54
- --cluster " ai2/titan-cirrascale" \
55
- --budget ai2/oe-data \
56
- --env LOG_FILTER_TYPE=local_rank0_only \
57
- --env OMP_NUM_THREADS=8 \
58
- --env BEAKER_USER_ID=$( beaker account whoami --format json | jq ' .[0].name' -cr) \
59
- --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
60
- --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
61
- --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
62
- --weka oe-data-default:/weka/oe-data-default \
63
- --weka oe-training-default:/weka/oe-training-default \
64
- --shared-memory 10GiB \
65
- --yes \
66
- -- /bin/bash -c " pip install -r gantry-train-requirements.txt && pip install transformers==4.52.4 && pip install flash-attn==2.8.0.post2 --no-build-isolation && python -m olmocr.train.train --config olmocr/train/configs/qwen25_vl_b100_x1_default.yaml"
45
# Create Python script to run beaker experiment.
# The script is written into a private directory created by mktemp rather than
# a fixed, predictable path under /tmp.
train_script_dir=$(mktemp -d) || {
  echo "Error: could not create a temporary directory." >&2
  exit 1
}
train_script_path="$train_script_dir/run_training_experiment.py"

# The quoted delimiter keeps the Python source literal (no shell expansion).
cat << 'EOF' > "$train_script_path"
import sys
from beaker import (
    Beaker,
    ExperimentSpec,
    TaskSpec,
    TaskContext,
    ResultSpec,
    TaskResources,
    ImageSource,
    Priority,
    Constraints,
    EnvVar,
    DataMount,
)

# Get image tag, beaker user, git branch, and git hash from command line
if len(sys.argv) != 5:
    sys.exit("Usage: run_training_experiment.py IMAGE_TAG BEAKER_USER GIT_BRANCH GIT_HASH")
image_tag = sys.argv[1]
beaker_user = sys.argv[2]
git_branch = sys.argv[3]
git_hash = sys.argv[4]

# Initialize Beaker client
b = Beaker.from_env(default_workspace="ai2/olmocr")

# Build the training command
commands = [
    "pip install -r gantry-train-requirements.txt",
    "pip install transformers==4.52.4",
    "pip install flash-attn==2.8.0.post2 --no-build-isolation",
    "python -m olmocr.train.train --config olmocr/train/configs/qwen25_vl_b100_x1_default.yaml"
]

# Build task spec
task_spec = TaskSpec(
    name="olmocr-training",
    image=ImageSource(beaker=f"{beaker_user}/{image_tag}"),
    command=[
        "bash", "-c",
        " && ".join(commands)
    ],
    context=TaskContext(
        priority=Priority.normal,
        preemptible=True,
    ),
    resources=TaskResources(
        gpu_count=1,
        shared_memory="10GiB"
    ),
    constraints=Constraints(cluster=["ai2/titan-cirrascale"]),
    result=ResultSpec(path="/noop-results"),
    env_vars=[
        EnvVar(name="LOG_FILTER_TYPE", value="local_rank0_only"),
        EnvVar(name="OMP_NUM_THREADS", value="8"),
        EnvVar(name="BEAKER_USER_ID", value=beaker_user),
        EnvVar(name="AWS_ACCESS_KEY_ID", secret="S2_AWS_ACCESS_KEY_ID"),
        EnvVar(name="AWS_SECRET_ACCESS_KEY", secret="S2_AWS_SECRET_ACCESS_KEY"),
        EnvVar(name="WANDB_API_KEY", secret="JAKE_WANDB_API_KEY")
    ],
    datasets=[
        DataMount.new(mount_path="/weka/oe-data-default", weka="oe-data-default"),
        DataMount.new(mount_path="/weka/oe-training-default", weka="oe-training-default"),
    ]
)

# Create experiment spec
experiment_spec = ExperimentSpec(
    description=f"OlmOCR Training Run - Branch: {git_branch}, Commit: {git_hash}",
    budget="ai2/oe-data",
    tasks=[task_spec],
)

# Create the experiment
experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
print(f"Created training experiment: {experiment.id}")
print(f"View at: https://beaker.org/ex/{experiment.id}")
EOF

# Run the Python script to create the experiment
echo " Creating Beaker experiment..."
# Every argument is quoted so that an empty value (for example an empty
# GIT_BRANCH) still occupies its own position in sys.argv.
# $PYTHON is left unquoted exactly as before: it is defined outside this
# section and may carry interpreter flags - TODO confirm and quote it if it is
# always a single path.
train_submit_status=0
# shellcheck disable=SC2086
$PYTHON "$train_script_path" "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH" \
  || train_submit_status=$?

# Clean up temporary file (done before the status check so it also happens
# when the submission failed)
rm -rf -- "$train_script_dir"

if [ "$train_submit_status" -ne 0 ]; then
  echo "Error: creating the Beaker experiment failed (exit code $train_submit_status)." >&2
  exit "$train_submit_status"
fi

echo " Training experiment submitted successfully!"
0 commit comments