@@ -2,7 +2,7 @@
 
 # Project metadata
 project_name: olmocr-qwen-vl-training
-run_name: qwen2.5-vl-7b-finetune
+run_name: qwen2.5-vl-7b-finetune-day3-json
 
 # Model configuration
 model:
@@ -35,17 +35,16 @@ dataset:
         - name: PDFRenderer
           target_longest_image_dim: 1024
         - name: StaticLengthDocumentAnchoring
-          target_anchor_text_len: 1000
+          target_anchor_text_len: 3000
         - name: FinetuningPrompt
-        - name: FrontMatterOutputFormat
+        - name: JSONOutputFormat
         - name: InstructUserMessages
         - name: Tokenizer
           masking_index: -100
           end_of_message_token: "<|im_end|>"
-    # Not putting in big bulk of data to speed up loading for debugging for now
-    # - name: processed_00_documents_train_s2pdf
-    #   root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
-    #   pipeline: *basic_pipeline
+    - name: processed_00_documents_train_s2pdf
+      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
+      pipeline: *basic_pipeline
 
   eval:
     - name: processed_00_documents_eval_s2pdf
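For context on the two pipeline edits above: the anchor-text budget passed to StaticLengthDocumentAnchoring triples from 1000 to 3000 characters, the training target switches from front-matter output to JSON output, and the full s2pdf training split is re-enabled now that debugging is done. A minimal sketch of what those two steps plausibly do; the step names come from the config, but the function bodies and the PageResponse fields here are illustrative assumptions, not the repo's actual implementation:

```python
import json
from dataclasses import dataclass, asdict

@dataclass
class PageResponse:
    # Hypothetical stand-in for the real page schema.
    primary_language: str
    is_table: bool
    natural_text: str

def static_length_document_anchoring(anchor_text: str, target_anchor_text_len: int = 3000) -> str:
    # Clamp the raw PDF anchor text to a fixed character budget (1000 -> 3000 in this commit),
    # so more of the page's layout hints fit into the prompt.
    return anchor_text[:target_anchor_text_len]

def json_output_format(page: PageResponse) -> str:
    # Replaces FrontMatterOutputFormat: the label the model is trained to emit
    # becomes a single JSON object rather than YAML front matter plus page text.
    return json.dumps(asdict(page), ensure_ascii=False)
```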
@@ -56,6 +55,7 @@ dataset:
       pipeline: *basic_pipeline
 
 
+
 # Training configuration
 training:
   output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
@@ -64,13 +64,17 @@ training:
   # Batch size and accumulation
   per_device_train_batch_size: 1
   per_device_eval_batch_size: 1
-  gradient_accumulation_steps: 8
+  gradient_accumulation_steps: 32
 
   gradient_checkpointing: False
+
+  collator_max_token_len: 8192
 
   # Learning rate
-  learning_rate: 1e-6
-  lr_scheduler_type: cosine
+  learning_rate: 2e-5
+  lr_scheduler_type: warmup_stable_decay
+  lr_scheduler_kwargs:
+    num_stable_steps: 4000
   warmup_ratio: 0.1
 
   # Optimization
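Two of the changes in this hunk interact. With per_device_train_batch_size: 1, raising gradient_accumulation_steps from 8 to 32 gives an effective batch of 32 × (number of GPUs) sequences per optimizer step, and warmup_stable_decay replaces cosine with a three-phase schedule: linear warmup (warmup_ratio: 0.1), a flat plateau held for num_stable_steps: 4000 steps at the new 2e-5 peak, then decay. A standalone sketch of that shape, assuming linear decay to zero; transformers ships its own warmup_stable_decay schedule, so this is only for illustration:

```python
def wsd_lr(step: int, total_steps: int, peak_lr: float = 2e-5,
           warmup_ratio: float = 0.1, num_stable_steps: int = 4000) -> float:
    """Warmup-stable-decay: linear warmup, flat plateau, linear decay to zero."""
    warmup_steps = int(total_steps * warmup_ratio)
    if step < warmup_steps:                       # phase 1: ramp up to peak_lr
        return peak_lr * step / max(1, warmup_steps)
    if step < warmup_steps + num_stable_steps:    # phase 2: hold at peak_lr
        return peak_lr
    decay_steps = max(1, total_steps - warmup_steps - num_stable_steps)
    frac = (step - warmup_steps - num_stable_steps) / decay_steps
    return peak_lr * max(0.0, 1.0 - frac)         # phase 3: decay to zero
```

Compared to cosine, the plateau keeps training at the full learning rate for most of the run, which pairs with the 20× higher peak (1e-6 → 2e-5).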
@@ -84,8 +88,8 @@ training:
   eval_steps: 500
   save_strategy: steps
   save_steps: 500
-  save_total_limit: 3
-  load_best_model_at_end: true
+  save_total_limit: 5
+  load_best_model_at_end: false  # Needs to be false because it has a problem restoring checkpoints for some reason
   metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
   greater_is_better: false
 
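These keys mirror Hugging Face TrainingArguments names (lr_scheduler_type, lr_scheduler_kwargs, save_total_limit, and so on), so the trainer presumably forwards them more or less directly. Note that with load_best_model_at_end forced to false (the workaround in the inline comment), metric_for_best_model and greater_is_better no longer trigger an automatic reload of the best checkpoint, so the best of the 5 retained checkpoints has to be picked from the eval logs by hand. A hedged sketch of the mapping; the repo's actual config loader may rename or post-process fields:

```python
from transformers import TrainingArguments

# Illustrative one-to-one mapping of the YAML above onto TrainingArguments.
args = TrainingArguments(
    output_dir="/weka/oe-data-default/jakep/olmocr-trainer/",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=32,
    gradient_checkpointing=False,
    learning_rate=2e-5,
    lr_scheduler_type="warmup_stable_decay",
    lr_scheduler_kwargs={"num_stable_steps": 4000},
    warmup_ratio=0.1,
    eval_strategy="steps",   # implied by eval_steps; not shown in this hunk
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=5,
    load_best_model_at_end=False,  # workaround from the inline comment above
    metric_for_best_model="eval_processed_00_documents_eval_s2pdf_loss",
    greater_is_better=False,
)
```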