Skip to content

Commit 7cf9879

Browse files
committed
Image 1600 configuration
1 parent d2ef9d7 commit 7cf9879

File tree

1 file changed

+93
-0
lines changed

1 file changed

+93
-0
lines changed
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
---
# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
# Run name reflects the rendered image size below (target_longest_image_dim: 1600).
run_name: qwen2.5-vl-7b-finetune-default_image1600

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  # NOTE(review): nesting of the LoRA keys under `model:` is reconstructed from a
  # flattened source — confirm against the trainer's expected config schema.
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:
  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      # Shared preprocessing pipeline, anchored here and aliased by the
      # remaining train/eval datasets so all splits are processed identically.
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          # Longest page edge is rendered to 1600 px (the point of this config).
          target_longest_image_dim: 1600
        - name: StaticLengthDocumentAnchoring
          target_anchor_text_len: 3000
        - name: FinetuningPrompt
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          # -100 is the conventional ignore index for cross-entropy label masking.
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation (effective batch = 1 x 8 x num_devices)
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 8

  # Canonical lowercase boolean ("False" is not a bool in YAML 1.2 core).
  gradient_checkpointing: false

  # Learning rate
  # Written as 1.0e-6 so it resolves to a float under PyYAML,
  # whose resolver does not accept the dotless "1e-6" form.
  learning_rate: 1.0e-6
  lr_scheduler_type: cosine
  warmup_ratio: 0.1

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: true
  # Select best checkpoint by loss on the s2pdf eval split (lower is better).
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb

0 commit comments

Comments
 (0)