Skip to content

Commit dcf026a

Browse files
committed
Better script
1 parent 9f0f912 commit dcf026a

File tree

1 file changed

+78
-25
lines changed

1 file changed

+78
-25
lines changed

scripts/train/newtrainer-gantry.sh

Lines changed: 78 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ IMAGE_TAG="olmocr-train-${VERSION}-${GIT_HASH}"
2828
echo "Building Docker image with tag: $IMAGE_TAG"
2929

3030
# Build the Docker image
31-
echo "Building Docker image..."
32-
docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .
31+
# echo "Building Docker image..."
32+
# docker build --platform linux/amd64 -f ./Dockerfile -t $IMAGE_TAG .
33+
# NOTE(review): pins a fixed image tag, replacing the IMAGE_TAG computed earlier, while the docker build is commented out — confirm this is intentional.
IMAGE_TAG=olmocr-train-0.1.76-9f0f912101
3334

3435
# Get Beaker username
3536
BEAKER_USER=$(beaker account whoami --format json | jq -r '.[0].name')
@@ -41,26 +42,78 @@ if ! beaker image create --workspace ai2/oe-data-pdf --name $IMAGE_TAG $IMAGE_TA
4142
echo "Warning: Beaker image with tag $IMAGE_TAG already exists. Using existing image."
4243
fi
4344

44-
gantry run \
45-
--description "OlmOCR Training Run"\
46-
--task-name "${run_name}"\
47-
--allow-dirty \
48-
--host-networking \
49-
--workspace ai2/olmocr \
50-
--beaker-image $BEAKER_USER/$IMAGE_TAG \
51-
--priority normal \
52-
--gpus 1 \
53-
--preemptible \
54-
--cluster "ai2/titan-cirrascale" \
55-
--budget ai2/oe-data \
56-
--env LOG_FILTER_TYPE=local_rank0_only \
57-
--env OMP_NUM_THREADS=8 \
58-
--env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
59-
--env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
60-
--env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
61-
--env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
62-
--weka oe-data-default:/weka/oe-data-default \
63-
--weka oe-training-default:/weka/oe-training-default \
64-
--shared-memory 10GiB \
65-
--yes \
66-
-- /bin/bash -c "pip install -r gantry-train-requirements.txt && pip install transformers==4.52.4 && pip install flash-attn==2.8.0.post2 --no-build-isolation && python -m olmocr.train.train --config olmocr/train/configs/qwen25_vl_b100_x1_default.yaml"
45+
# Create Python script to run beaker experiment
46+
cat << 'EOF' > /tmp/run_training_experiment.py
47+
import sys
48+
from beaker import Beaker, ExperimentSpec, TaskSpec, TaskContext, ResultSpec, TaskResources, ImageSource, Priority, Constraints, EnvVar, DataMount
49+
50+
# Get image tag, beaker user, git branch, and git hash from command line
51+
image_tag = sys.argv[1]
52+
beaker_user = sys.argv[2]
53+
git_branch = sys.argv[3]
54+
git_hash = sys.argv[4]
55+
56+
# Initialize Beaker client
57+
b = Beaker.from_env(default_workspace="ai2/olmocr")
58+
59+
# Build the training command
60+
commands = [
61+
"pip install -r gantry-train-requirements.txt",
62+
"pip install transformers==4.52.4",
63+
"pip install flash-attn==2.8.0.post2 --no-build-isolation",
64+
"python -m olmocr.train.train --config olmocr/train/configs/qwen25_vl_b100_x1_default.yaml"
65+
]
66+
67+
# Build task spec
68+
task_spec = TaskSpec(
69+
name="olmocr-training",
70+
image=ImageSource(beaker=f"{beaker_user}/{image_tag}"),
71+
command=[
72+
"bash", "-c",
73+
" && ".join(commands)
74+
],
75+
context=TaskContext(
76+
priority=Priority.normal,
77+
preemptible=True,
78+
),
79+
resources=TaskResources(
80+
gpu_count=1,
81+
shared_memory="10GiB"
82+
),
83+
constraints=Constraints(cluster=["ai2/titan-cirrascale"]),
84+
result=ResultSpec(path="/noop-results"),
85+
env_vars=[
86+
EnvVar(name="LOG_FILTER_TYPE", value="local_rank0_only"),
87+
EnvVar(name="OMP_NUM_THREADS", value="8"),
88+
EnvVar(name="BEAKER_USER_ID", value=beaker_user),
89+
EnvVar(name="AWS_ACCESS_KEY_ID", secret="S2_AWS_ACCESS_KEY_ID"),
90+
EnvVar(name="AWS_SECRET_ACCESS_KEY", secret="S2_AWS_SECRET_ACCESS_KEY"),
91+
EnvVar(name="WANDB_API_KEY", secret="JAKE_WANDB_API_KEY")
92+
],
93+
datasets=[
94+
DataMount.new(mount_path="/weka/oe-data-default", weka="oe-data-default"),
95+
DataMount.new(mount_path="/weka/oe-training-default", weka="oe-training-default"),
96+
]
97+
)
98+
99+
# Create experiment spec
100+
experiment_spec = ExperimentSpec(
101+
description=f"OlmOCR Training Run - Branch: {git_branch}, Commit: {git_hash}",
102+
budget="ai2/oe-data",
103+
tasks=[task_spec],
104+
)
105+
106+
# Create the experiment
107+
experiment = b.experiment.create(spec=experiment_spec, workspace="ai2/olmocr")
108+
print(f"Created training experiment: {experiment.id}")
109+
print(f"View at: https://beaker.org/ex/{experiment.id}")
110+
EOF
111+
112+
# Run the Python script to create the experiment
113+
echo "Creating Beaker experiment..."
114+
# Quote each argument so an empty or space-containing value (e.g. an empty
# GIT_BRANCH) cannot shift the positional arguments the script reads from sys.argv.
# $PYTHON is left unquoted so an interpreter command carrying flags still word-splits.
# On failure, remove the temp script and exit with the interpreter's status
# instead of falling through to the success message.
$PYTHON /tmp/run_training_experiment.py "$IMAGE_TAG" "$BEAKER_USER" "$GIT_BRANCH" "$GIT_HASH" || {
    status=$?
    rm -f /tmp/run_training_experiment.py
    exit "$status"
}
115+
116+
# Clean up temporary file
117+
rm /tmp/run_training_experiment.py
118+
119+
echo "Training experiment submitted successfully!"

0 commit comments

Comments
 (0)