# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-olmocrv2_2epoch

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2
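
  # flash_attention_2 requires the separate flash-attn package and a
  # half-precision dtype (the bfloat16 above qualifies); Transformers raises
  # an ImportError at model load time if flash-attn is missing.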

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj
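  # To fine-tune with LoRA instead of updating all weights, set use_lora: true
  # and uncomment the block above; q/k/v/o_proj are the attention projections,
  # and rank 8 / alpha 32 / dropout 0.1 are common starting points rather than
  # values tuned for this run.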

# Dataset configuration
dataset:

  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1288
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline
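
  # &basic_pipeline / *basic_pipeline is standard YAML anchor/alias syntax:
  # the step list is written once on the first dataset and every later
  # dataset reuses the identical pipeline by reference.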

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline
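
  # As the step names suggest: FrontMatterParser parses each sample's YAML
  # front matter into a PageResponse, PDFRenderer rasterizes the page with its
  # longest side at 1288 px, the prompt/format/message steps build the chat
  # example, and Tokenizer masks prompt tokens with -100 (the default
  # ignore_index that PyTorch cross-entropy skips) so loss is computed only on
  # the response tokens.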

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 2.0

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32
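  # Effective batch size: 1 (per device) x 32 (accumulation) = 32 per GPU,
  # times the number of data-parallel GPUs for the global batch.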

  gradient_checkpointing: false

  collator_max_token_len: 8192
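  # collator_max_token_len presumably caps each collated sequence (image,
  # prompt, and response tokens) at 8192; longer samples would be truncated
  # or dropped by the collator.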

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: linear
  warmup_ratio: 0.1
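  # With the linear scheduler, the LR ramps from 0 to 2e-5 over the first 10%
  # of optimizer steps (warmup_ratio: 0.1), then decays linearly to 0 by the
  # end of the 2 epochs.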

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  seed: 300
  data_seed: 301
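  # In Hugging Face TrainingArguments terms, seed fixes model and general
  # randomness while data_seed separately fixes the data sampling order, so
  # the two can be varied independently across runs.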

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false  # Kept false: restoring the best checkpoint at the end has been unreliable in this setup
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false
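
  # With multiple eval datasets, the Trainer logs each one's loss as
  # eval_<dataset_name>_loss; metric_for_best_model selects the s2pdf
  # documents eval loss, and greater_is_better: false marks lower as better.
  # save_total_limit: 5 keeps only the five most recent checkpoints on disk.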

  report_to:
    - wandb
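
  # The run should appear in Weights & Biases under the project_name and
  # run_name set at the top of this file, assuming the trainer forwards those
  # fields to wandb.init (inferred from the field names, not verified here).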