Commit ca2609c
No doc anchoring version
1 parent 560a585
1 file changed: 96 additions, 0 deletions

# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-finetune-day2-1280-noanchor

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj
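
For reference, here is a minimal sketch of how these model settings would typically map onto Hugging Face transformers and peft calls. It is illustrative only, not this repo's trainer code: the class name assumes a recent transformers release with Qwen2.5-VL support, and the LoRA values simply mirror the commented-out block above.

import torch
from transformers import Qwen2_5_VLForConditionalGeneration

# Load the base model the way the settings above describe (sketch only).
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # requires flash-attn to be installed
)

# If use_lora were flipped to true, the commented values would plausibly
# translate to a peft LoraConfig like this:
from peft import LoraConfig, get_peft_model

model = get_peft_model(
    model,
    LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    ),
)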

# Dataset configuration
dataset:

  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1280
        - name: StaticLengthDocumentAnchoring
          target_anchor_text_len: -1
        - name: FinetuningPrompt
        - name: JSONOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline
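
One detail worth calling out above: &basic_pipeline and *basic_pipeline are plain YAML anchors and aliases, so all four datasets share the exact same step list, and editing the anchored pipeline changes every dataset that references it. A quick way to verify, assuming PyYAML and that the file is saved as config.yaml (hypothetical name):

import yaml

with open("config.yaml") as f:  # hypothetical filename for this config
    cfg = yaml.safe_load(f)

train, evals = cfg["dataset"]["train"], cfg["dataset"]["eval"]

# Aliases expand to the anchored value; PyYAML even returns the same object.
assert train[1]["pipeline"] == train[0]["pipeline"]
assert all(d["pipeline"] == train[0]["pipeline"] for d in evals)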

# Training configuration
training:
  output_dir: /home/ubuntu/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32
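  # With per_device_train_batch_size: 1 and gradient_accumulation_steps: 32,
  # each optimizer step aggregates 1 * 32 = 32 examples per device (times the
  # number of devices when training is distributed).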

  gradient_checkpointing: false

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: cosine
  warmup_ratio: 0.1

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: true
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
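
For intuition on the schedule settings: lr_scheduler_type: cosine with warmup_ratio: 0.1 means the learning rate climbs linearly from 0 to 2e-5 over the first 10% of steps, then decays along a half cosine toward 0. Written out as a self-contained sketch (this mirrors the standard Hugging Face cosine-with-warmup shape, reimplemented here for illustration):

import math

def lr_at_step(step: int, total_steps: int,
               peak_lr: float = 2e-5, warmup_ratio: float = 0.1) -> float:
    warmup_steps = int(total_steps * warmup_ratio)
    if step < warmup_steps:
        # Linear warmup from 0 to peak_lr.
        return peak_lr * step / max(1, warmup_steps)
    # Half-cosine decay from peak_lr toward 0.
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return peak_lr * 0.5 * (1.0 + math.cos(math.pi * progress))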

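Finally, the long metric name in metric_for_best_model follows the Hugging Face Trainer convention: when eval_dataset is passed as a dict, each entry's metrics are logged with an eval_<key>_ prefix. Assuming this trainer builds on HF TrainingArguments (an assumption; the field names below simply mirror the config), the checkpointing block corresponds to roughly:

from transformers import TrainingArguments

# Sketch: mirrors the "Evaluation and checkpointing" block above; not repo code.
args = TrainingArguments(
    output_dir="/home/ubuntu/olmocr-trainer/",
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_processed_00_documents_eval_s2pdf_loss",
    greater_is_better=False,
    report_to=["wandb"],
)

# With Trainer(eval_dataset={"processed_00_documents_eval_s2pdf": ds_a,
#                            "processed_01_books_eval_iabooks": ds_b}, ...)
# each dataset's loss is logged as eval_<name>_loss, which is where the
# metric_for_best_model value above comes from.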