
Commit 336b000

Adding wsd as an option
1 parent 69581cc commit 336b000

4 files changed (+18, -104 lines)

olmocr/train/config.py

Lines changed: 1 addition & 0 deletions
@@ -155,6 +155,7 @@ class TrainingConfig:
     learning_rate: float = 2e-5
     lr_scheduler_type: str = "cosine"
     warmup_ratio: float = 0.1
+    lr_scheduler_kwargs: Dict[str, Any] = field(default_factory=dict)
 
     # Optimization
     optim: str = "adamw_torch"
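For orientation, here is a minimal sketch (not the project's actual loader; the YAML parsing shown and anything beyond the fields in this hunk are assumptions) of how a mapping under lr_scheduler_kwargs in a YAML config would land in the new dataclass field:

from dataclasses import dataclass, field
from typing import Any, Dict

import yaml  # assumed available; olmocr's real config loader may differ


@dataclass
class TrainingConfig:
    # Abbreviated to the scheduler-related fields from this hunk.
    learning_rate: float = 2e-5
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.1
    lr_scheduler_kwargs: Dict[str, Any] = field(default_factory=dict)


# Hypothetical YAML fragment mirroring the renamed config below.
raw = yaml.safe_load("""
learning_rate: 2e-5
lr_scheduler_type: warmup_stable_decay
lr_scheduler_kwargs:
  num_stable_steps: 4000
warmup_ratio: 0.1
""")

cfg = TrainingConfig(**raw)
print(cfg.lr_scheduler_kwargs)  # {'num_stable_steps': 4000}
# Note: PyYAML may load "2e-5" as a string, which is likely why train.py wraps learning_rate in float().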

olmocr/train/configs/example_config_frontier.yaml

Lines changed: 0 additions & 92 deletions
This file was deleted.

olmocr/train/configs/example_config.yaml renamed to olmocr/train/configs/qwen25_vl_b100_x1_day3_json_wsd.yaml

Lines changed: 16 additions & 12 deletions
@@ -2,7 +2,7 @@
 
 # Project metadata
 project_name: olmocr-qwen-vl-training
-run_name: qwen2.5-vl-7b-finetune
+run_name: qwen2.5-vl-7b-finetune-day3-json
 
 # Model configuration
 model:
@@ -35,17 +35,16 @@ dataset:
       - name: PDFRenderer
         target_longest_image_dim: 1024
       - name: StaticLengthDocumentAnchoring
-        target_anchor_text_len: 1000
+        target_anchor_text_len: 3000
       - name: FinetuningPrompt
-      - name: FrontMatterOutputFormat
+      - name: JSONOutputFormat
       - name: InstructUserMessages
       - name: Tokenizer
         masking_index: -100
         end_of_message_token: "<|im_end|>"
-    # Not putting in big bulk of data to speed up loading for debugging for now
-    # - name: processed_00_documents_train_s2pdf
-    #   root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
-    #   pipeline: *basic_pipeline
+    - name: processed_00_documents_train_s2pdf
+      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
+      pipeline: *basic_pipeline
 
   eval:
     - name: processed_00_documents_eval_s2pdf
@@ -56,6 +55,7 @@ dataset:
       pipeline: *basic_pipeline
 
 
+
 # Training configuration
 training:
   output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
@@ -64,13 +64,17 @@ training:
   # Batch size and accumulation
   per_device_train_batch_size: 1
   per_device_eval_batch_size: 1
-  gradient_accumulation_steps: 8
+  gradient_accumulation_steps: 32
 
   gradient_checkpointing: False
+
+  collator_max_token_len: 8192
 
   # Learning rate
-  learning_rate: 1e-6
-  lr_scheduler_type: cosine
+  learning_rate: 2e-5
+  lr_scheduler_type: warmup_stable_decay
+  lr_scheduler_kwargs:
+    num_stable_steps: 4000
   warmup_ratio: 0.1
 
   # Optimization
@@ -84,8 +88,8 @@ training:
   eval_steps: 500
   save_strategy: steps
   save_steps: 500
-  save_total_limit: 3
-  load_best_model_at_end: true
+  save_total_limit: 5
+  load_best_model_at_end: false  # Needs to be false because it has a problem restoring checkpoints for some reason
   metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
   greater_is_better: false
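The scheduler change above swaps the cosine schedule for warmup-stable-decay (WSD): a short warmup, a long flat plateau at the peak learning rate, then a final decay. Below is a rough, self-contained sketch of that shape, not the Trainer's exact implementation; the linear decay and every step count other than num_stable_steps: 4000 are illustrative assumptions.

def wsd_lr(step: int, peak_lr: float, num_warmup: int, num_stable: int, num_decay: int) -> float:
    """Rough warmup-stable-decay schedule: linear warmup, flat plateau, linear decay to zero."""
    if step < num_warmup:
        return peak_lr * step / max(1, num_warmup)           # warmup phase
    if step < num_warmup + num_stable:
        return peak_lr                                        # stable plateau (num_stable_steps)
    decay_step = step - num_warmup - num_stable
    return peak_lr * max(0.0, 1.0 - decay_step / max(1, num_decay))  # decay phase


# e.g. peak lr 2e-5, 500 warmup steps, 4000 stable steps, 500 decay steps
for s in (0, 250, 500, 2500, 4500, 4750, 5000):
    print(s, wsd_lr(s, 2e-5, num_warmup=500, num_stable=4000, num_decay=500))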

olmocr/train/train.py

Lines changed: 1 addition & 0 deletions
@@ -202,6 +202,7 @@ def main():
         learning_rate=float(config.training.learning_rate),
         lr_scheduler_type=config.training.lr_scheduler_type,
         warmup_ratio=config.training.warmup_ratio,
+        lr_scheduler_kwargs=config.training.lr_scheduler_kwargs,
         optim=config.training.optim,
         adam_beta1=config.training.adam_beta1,
         adam_beta2=config.training.adam_beta2,
