
Commit 5c2d69a

Some cleanup stuff

1 parent e86511e
4 files changed: +12 −6 lines

olmocr/train/configs/example_config.yaml

Lines changed: 4 additions & 2 deletions
@@ -10,7 +10,7 @@ model:
   trust_remote_code: true
   torch_dtype: auto
   use_flash_attention: true
-  attn_implementation: sdpa
+  attn_implementation: flash_attention_2

   # LoRA settings (disabled by default)
   use_lora: false

@@ -65,9 +65,11 @@ training:
   per_device_train_batch_size: 1
   per_device_eval_batch_size: 1
   gradient_accumulation_steps: 8
+
+  gradient_checkpointing: False

   # Learning rate
-  learning_rate: 2e-5
+  learning_rate: 1e-6
   lr_scheduler_type: cosine
   warmup_ratio: 0.1
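The switch from sdpa to flash_attention_2 takes effect at model load time and requires the flash-attn package on a CUDA machine (hence the pip install added to newtrainer-gantry.sh below). A minimal sketch of how these config keys are typically forwarded to a Hugging Face loader; the model name is a placeholder, and the exact code path in olmocr/train may differ:

from transformers import AutoModelForCausalLM

# Sketch only: "some-org/some-model" is a placeholder, not the checkpoint olmocr trains.
model = AutoModelForCausalLM.from_pretrained(
    "some-org/some-model",
    trust_remote_code=True,                   # mirrors model.trust_remote_code
    torch_dtype="auto",                       # mirrors model.torch_dtype
    attn_implementation="flash_attention_2",  # the new value; raises if flash-attn is missing
)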

olmocr/train/train.py

Lines changed: 5 additions & 1 deletion
@@ -162,9 +162,13 @@ def main():
     total_eval_samples = sum(len(dataset) for dataset in eval_datasets.values())
     logger.info(f"Total evaluation samples across {len(eval_datasets)} datasets: {total_eval_samples}")

+    # Construct full output directory by appending run_name to base output_dir
+    full_output_dir = os.path.join(config.training.output_dir, config.run_name)
+    logger.info(f"Setting output directory to: {full_output_dir}")
+
     # Set up training arguments
     training_args = TrainingArguments(
-        output_dir=config.training.output_dir,
+        output_dir=full_output_dir,
         num_train_epochs=config.training.num_train_epochs,
         per_device_train_batch_size=config.training.per_device_train_batch_size,
         per_device_eval_batch_size=config.training.per_device_eval_batch_size,
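The practical effect is that each run now checkpoints into its own run_name subdirectory rather than every run sharing output_dir. A hypothetical illustration of the resulting path (both values are made up, not taken from the repo's configs):

import os

output_dir = "/weka/oe-training-default/checkpoints"  # hypothetical config.training.output_dir
run_name = "qwen-lr1e-6"                              # hypothetical config.run_name
print(os.path.join(output_dir, run_name))             # -> /weka/oe-training-default/checkpoints/qwen-lr1e-6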

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ dependencies = [
     "boto3",
     "httpx",
     "torch>=2.7.0",
-    "transformers>=4.51.1",
+    "transformers==4.52.4",
     "img2pdf",
     "beaker-py",
 ]
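Moving from transformers>=4.51.1 to an exact ==4.52.4 pin trades automatic upgrades for reproducibility. If a runtime guard were wanted on top of the pyproject pin, a sketch (not part of this commit) could check the installed version:

from importlib.metadata import version

installed = version("transformers")
if installed != "4.52.4":
    raise RuntimeError(f"expected pinned transformers==4.52.4, found {installed}")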

scripts/train/newtrainer-gantry.sh

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,7 @@ gantry run \
   --priority normal \
   --gpus 1 \
   --preemptible \
-  --cluster "ai2/jupiter-cirrascale-2" \
+  --cluster "ai2/titan-cirrascale" \
   --budget ai2/oe-data \
   --env LOG_FILTER_TYPE=local_rank0_only \
   --env OMP_NUM_THREADS=8 \

@@ -64,4 +64,4 @@ gantry run \
   --weka oe-training-default:/weka/oe-training-default \
   --shared-memory 10GiB \
   --yes \
-  -- /bin/bash -c "source scripts/beaker/jupiter-ib.sh && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"
+  -- /bin/bash -c "pip install flash-attn==2.8.0.post2 --no-build-isolation && python -m olmocr.train.train --config olmocr/train/configs/example_config.yaml"
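Since the job no longer sources scripts/beaker/jupiter-ib.sh and instead installs flash-attn at startup, one way to confirm the new attention backend is actually usable before training begins (a sketch using a real transformers utility, not code from this commit):

import torch
from transformers.utils import is_flash_attn_2_available

# flash_attention_2 needs a CUDA device plus a working flash-attn install.
print("cuda available:", torch.cuda.is_available())
print("flash-attn 2 usable:", is_flash_attn_2_available())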
