Skip to content

Commit ba49fd5

Browse files
committed
frontier train script let's see what happens
1 parent bde6f29 commit ba49fd5

File tree

1 file changed

+31
-0
lines changed

1 file changed

+31
-0
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/usr/bin/env bash
#
# Slurm batch job: launch olmocr.train.train on a single node.
#
# Usage:
#   sbatch <this-script> [CONFIG_YAML]
#
#   CONFIG_YAML  Optional path to the training config. Defaults to
#                olmocr/train/configs/example_config.yaml, resolved relative
#                to the job's working directory.
#
# Notes:
#   * Job stdout/stderr goes to logs/<jobid>.out. Slurm does not create the
#     logs/ directory, so it must exist before submitting.
#   * Paths below point at the csc652 project area on /lustre/orion
#     (presumably OLCF Frontier, per the commit message -- confirm).

#SBATCH -A csc652
#SBATCH -J olmocr-train
#SBATCH -o logs/%j.out
#SBATCH -N 1
#SBATCH -t 02:00:00

# Print an error to stderr and abort the job.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

readonly CONDA_ENV="/lustre/orion/csc652/proj-shared/jakep/conda_env_312_olmocr_train"
readonly HF_SHARED="/lustre/orion/csc652/proj-shared/huggingface-shared"

# Training config; overridable via the first script argument.
config="${1:-olmocr/train/configs/example_config.yaml}"

# Fail before touching modules/conda if the config is missing, instead of
# discovering it only once Python has started.
[[ -f "$config" ]] || die "config file not found: $config"

# Start from the default module set, then add what the job needs.
module reset || die "module reset failed"
for mod in PrgEnv-gnu olcf-container-tools apptainer-enable-mpi; do
  module load "$mod" || die "failed to load module: $mod"
done

# Abort rather than fall through to whatever 'python' happens to be on PATH.
# shellcheck disable=SC1091  # 'activate' is resolved via PATH at runtime
source activate "$CONDA_ENV" || die "failed to activate conda env: $CONDA_ENV"

# Environment for the training process. Exported after activation so the
# values seen by Python are exactly the ones set here.

# Run in offline mode to make sure it doesn't timeout loading the model
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
export HF_HUB_OFFLINE=1

export HF_DATASETS_CACHE="${HF_SHARED}/datasets"
export HF_HUB_CACHE="${HF_SHARED}/hub"

# Was getting MIOpen errors with caching, had to disable for now
export MIOPEN_DISABLE_CACHE=1

python -m olmocr.train.train --config "$config"

0 commit comments

Comments
 (0)