# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-finetune-day2

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj
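  # To fine-tune with LoRA adapters instead of updating all model weights, set
  # use_lora: true and uncomment the lora_* settings above (a LoRA-enabled sketch
  # of this block is shown at the end of the file).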

# Dataset configuration
dataset:

  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1024
        - name: StaticLengthDocumentAnchoring
          target_anchor_text_len: 3000
        - name: FinetuningPrompt
        - name: JSONOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline
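  # The *basic_pipeline references here and in the eval datasets below are standard
  # YAML aliases that reuse the step list defined at the &basic_pipeline anchor, so
  # every split shares the same preprocessing pipeline.
  # In the Tokenizer step, masking_index: -100 matches PyTorch's CrossEntropyLoss
  # ignore_index, so label positions set to it are excluded from the loss.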

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline


# Training configuration
training:
  output_dir: /home/ubuntu/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32
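  # Effective batch size per optimizer step is per_device_train_batch_size x
  # gradient_accumulation_steps = 32 per GPU (times the number of GPUs when
  # training on multiple devices).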

  gradient_checkpointing: false

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: cosine
  warmup_ratio: 0.1
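  # With the cosine schedule, warmup_ratio: 0.1 spends roughly the first 10% of
  # training steps warming the learning rate up before it decays.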

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0


  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: true
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false
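  # With load_best_model_at_end, the checkpoint with the lowest loss on the
  # processed_00_documents_eval_s2pdf split is restored when training finishes.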

  report_to:
    - wandb
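
For reference, the model block above could look roughly like this with LoRA switched on. This is a sketch, not part of the original file: it assumes that enabling LoRA in this setup only requires setting use_lora: true and uncommenting the lora_* fields, and the values simply mirror the commented-out defaults shown earlier.

model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA enabled: train lightweight adapters on the attention projections
  use_lora: true
  lora_rank: 8
  lora_alpha: 32
  lora_dropout: 0.1
  lora_target_modules:
    - q_proj
    - v_proj
    - k_proj
    - o_proj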