Skip to content

Commit 60c3944

Browse files
committed
More configs
1 parent f44d03f commit 60c3944

File tree

4 files changed

+392
-0
lines changed

4 files changed

+392
-0
lines changed
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Example olmOCR training configuration (Qwen2.5-VL-7B-Instruct, LoRA disabled, token_flip_rate 0.001, 1 epoch)
2+
3+
# Project metadata
4+
project_name: olmocr-qwen-vl-training
5+
run_name: qwen2.5-vl-7b-olmocrv2-tokflip1k
6+
7+
# Model configuration
8+
model:
9+
name: Qwen/Qwen2.5-VL-7B-Instruct
10+
trust_remote_code: true
11+
torch_dtype: bfloat16
12+
use_flash_attention: true
13+
attn_implementation: flash_attention_2
14+
15+
# LoRA settings (disabled by default)
16+
use_lora: false
17+
# lora_rank: 8
18+
# lora_alpha: 32
19+
# lora_dropout: 0.1
20+
# lora_target_modules:
21+
# - q_proj
22+
# - v_proj
23+
# - k_proj
24+
# - o_proj
25+
26+
# Dataset configuration
27+
dataset:
28+
29+
train:
30+
- name: processed_01_books_train_iabooks
31+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
32+
pipeline: &basic_pipeline
33+
- name: FrontMatterParser
34+
front_matter_class: PageResponse
35+
- name: PDFRenderer
36+
target_longest_image_dim: 1288
37+
- name: NewYamlFinetuningPromptWithNoAnchoring
38+
- name: FrontMatterOutputFormat
39+
- name: InstructUserMessages
40+
- name: Tokenizer
41+
masking_index: -100
42+
end_of_message_token: "<|im_end|>"
43+
- name: RandomTokenFlipper
44+
token_flip_rate: 0.001
45+
- name: processed_00_documents_train_s2pdf
46+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
47+
pipeline: *basic_pipeline
48+
49+
eval:
50+
- name: processed_00_documents_eval_s2pdf
51+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
52+
pipeline: *basic_pipeline
53+
- name: processed_01_books_eval_iabooks
54+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
55+
pipeline: *basic_pipeline
56+
57+
58+
59+
# Training configuration
60+
training:
61+
output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
62+
num_train_epochs: 1.0
63+
64+
# Batch size and accumulation
65+
per_device_train_batch_size: 1
66+
per_device_eval_batch_size: 1
67+
gradient_accumulation_steps: 32
68+
69+
gradient_checkpointing: False
70+
71+
collator_max_token_len: 8192
72+
73+
# Learning rate
74+
learning_rate: 2e-5
75+
lr_scheduler_type: linear
76+
warmup_ratio: 0.1
77+
78+
# Optimization
79+
optim: adamw_torch
80+
weight_decay: 0.01
81+
max_grad_norm: 1.0
82+
83+
seed: 300
84+
data_seed: 301
85+
86+
# Evaluation and checkpointing
87+
evaluation_strategy: steps
88+
eval_steps: 500
89+
save_strategy: steps
90+
save_steps: 500
91+
save_total_limit: 5
92+
load_best_model_at_end: false # Must stay false: enabling it causes problems when restoring checkpoints (root cause not yet identified)
93+
metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
94+
greater_is_better: false
95+
96+
report_to:
97+
- wandb
98+
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Example olmOCR training configuration (Qwen2.5-VL-7B-Instruct, LoRA disabled, token_flip_rate 0.003, 1 epoch)
2+
3+
# Project metadata
4+
project_name: olmocr-qwen-vl-training
5+
run_name: qwen2.5-vl-7b-olmocrv2-tokflip3k
6+
7+
# Model configuration
8+
model:
9+
name: Qwen/Qwen2.5-VL-7B-Instruct
10+
trust_remote_code: true
11+
torch_dtype: bfloat16
12+
use_flash_attention: true
13+
attn_implementation: flash_attention_2
14+
15+
# LoRA settings (disabled by default)
16+
use_lora: false
17+
# lora_rank: 8
18+
# lora_alpha: 32
19+
# lora_dropout: 0.1
20+
# lora_target_modules:
21+
# - q_proj
22+
# - v_proj
23+
# - k_proj
24+
# - o_proj
25+
26+
# Dataset configuration
27+
dataset:
28+
29+
train:
30+
- name: processed_01_books_train_iabooks
31+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
32+
pipeline: &basic_pipeline
33+
- name: FrontMatterParser
34+
front_matter_class: PageResponse
35+
- name: PDFRenderer
36+
target_longest_image_dim: 1288
37+
- name: NewYamlFinetuningPromptWithNoAnchoring
38+
- name: FrontMatterOutputFormat
39+
- name: InstructUserMessages
40+
- name: Tokenizer
41+
masking_index: -100
42+
end_of_message_token: "<|im_end|>"
43+
- name: RandomTokenFlipper
44+
token_flip_rate: 0.003
45+
- name: processed_00_documents_train_s2pdf
46+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
47+
pipeline: *basic_pipeline
48+
49+
eval:
50+
- name: processed_00_documents_eval_s2pdf
51+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
52+
pipeline: *basic_pipeline
53+
- name: processed_01_books_eval_iabooks
54+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
55+
pipeline: *basic_pipeline
56+
57+
58+
59+
# Training configuration
60+
training:
61+
output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
62+
num_train_epochs: 1.0
63+
64+
# Batch size and accumulation
65+
per_device_train_batch_size: 1
66+
per_device_eval_batch_size: 1
67+
gradient_accumulation_steps: 32
68+
69+
gradient_checkpointing: False
70+
71+
collator_max_token_len: 8192
72+
73+
# Learning rate
74+
learning_rate: 2e-5
75+
lr_scheduler_type: linear
76+
warmup_ratio: 0.1
77+
78+
# Optimization
79+
optim: adamw_torch
80+
weight_decay: 0.01
81+
max_grad_norm: 1.0
82+
83+
seed: 300
84+
data_seed: 301
85+
86+
# Evaluation and checkpointing
87+
evaluation_strategy: steps
88+
eval_steps: 500
89+
save_strategy: steps
90+
save_steps: 500
91+
save_total_limit: 5
92+
load_best_model_at_end: false # Must stay false: enabling it causes problems when restoring checkpoints (root cause not yet identified)
93+
metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
94+
greater_is_better: false
95+
96+
report_to:
97+
- wandb
98+
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Example olmOCR training configuration (Qwen2.5-VL-7B-Instruct, LoRA disabled, token_flip_rate 0.005, 1 epoch)
2+
3+
# Project metadata
4+
project_name: olmocr-qwen-vl-training
5+
run_name: qwen2.5-vl-7b-olmocrv2-tokflip500
6+
7+
# Model configuration
8+
model:
9+
name: Qwen/Qwen2.5-VL-7B-Instruct
10+
trust_remote_code: true
11+
torch_dtype: bfloat16
12+
use_flash_attention: true
13+
attn_implementation: flash_attention_2
14+
15+
# LoRA settings (disabled by default)
16+
use_lora: false
17+
# lora_rank: 8
18+
# lora_alpha: 32
19+
# lora_dropout: 0.1
20+
# lora_target_modules:
21+
# - q_proj
22+
# - v_proj
23+
# - k_proj
24+
# - o_proj
25+
26+
# Dataset configuration
27+
dataset:
28+
29+
train:
30+
- name: processed_01_books_train_iabooks
31+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
32+
pipeline: &basic_pipeline
33+
- name: FrontMatterParser
34+
front_matter_class: PageResponse
35+
- name: PDFRenderer
36+
target_longest_image_dim: 1288
37+
- name: NewYamlFinetuningPromptWithNoAnchoring
38+
- name: FrontMatterOutputFormat
39+
- name: InstructUserMessages
40+
- name: Tokenizer
41+
masking_index: -100
42+
end_of_message_token: "<|im_end|>"
43+
- name: RandomTokenFlipper
44+
token_flip_rate: 0.005 # NOTE(review): run_name says "tokflip500"; confirm 0.005 is the intended rate
45+
- name: processed_00_documents_train_s2pdf
46+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
47+
pipeline: *basic_pipeline
48+
49+
eval:
50+
- name: processed_00_documents_eval_s2pdf
51+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
52+
pipeline: *basic_pipeline
53+
- name: processed_01_books_eval_iabooks
54+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
55+
pipeline: *basic_pipeline
56+
57+
58+
59+
# Training configuration
60+
training:
61+
output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
62+
num_train_epochs: 1.0
63+
64+
# Batch size and accumulation
65+
per_device_train_batch_size: 1
66+
per_device_eval_batch_size: 1
67+
gradient_accumulation_steps: 32
68+
69+
gradient_checkpointing: False
70+
71+
collator_max_token_len: 8192
72+
73+
# Learning rate
74+
learning_rate: 2e-5
75+
lr_scheduler_type: linear
76+
warmup_ratio: 0.1
77+
78+
# Optimization
79+
optim: adamw_torch
80+
weight_decay: 0.01
81+
max_grad_norm: 1.0
82+
83+
seed: 300
84+
data_seed: 301
85+
86+
# Evaluation and checkpointing
87+
evaluation_strategy: steps
88+
eval_steps: 500
89+
save_strategy: steps
90+
save_steps: 500
91+
save_total_limit: 5
92+
load_best_model_at_end: false # Must stay false: enabling it causes problems when restoring checkpoints (root cause not yet identified)
93+
metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
94+
greater_is_better: false
95+
96+
report_to:
97+
- wandb
98+
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Example olmOCR training configuration (Qwen2.5-VL-7B-Instruct, LoRA disabled, token_flip_rate 0.0001, 2 epochs)
2+
3+
# Project metadata
4+
project_name: olmocr-qwen-vl-training
5+
run_name: qwen2.5-vl-7b-olmocrv2-tokflip-2ep
6+
7+
# Model configuration
8+
model:
9+
name: Qwen/Qwen2.5-VL-7B-Instruct
10+
trust_remote_code: true
11+
torch_dtype: bfloat16
12+
use_flash_attention: true
13+
attn_implementation: flash_attention_2
14+
15+
# LoRA settings (disabled by default)
16+
use_lora: false
17+
# lora_rank: 8
18+
# lora_alpha: 32
19+
# lora_dropout: 0.1
20+
# lora_target_modules:
21+
# - q_proj
22+
# - v_proj
23+
# - k_proj
24+
# - o_proj
25+
26+
# Dataset configuration
27+
dataset:
28+
29+
train:
30+
- name: processed_01_books_train_iabooks
31+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
32+
pipeline: &basic_pipeline
33+
- name: FrontMatterParser
34+
front_matter_class: PageResponse
35+
- name: PDFRenderer
36+
target_longest_image_dim: 1288
37+
- name: NewYamlFinetuningPromptWithNoAnchoring
38+
- name: FrontMatterOutputFormat
39+
- name: InstructUserMessages
40+
- name: Tokenizer
41+
masking_index: -100
42+
end_of_message_token: "<|im_end|>"
43+
- name: RandomTokenFlipper
44+
token_flip_rate: 0.0001
45+
- name: processed_00_documents_train_s2pdf
46+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
47+
pipeline: *basic_pipeline
48+
49+
eval:
50+
- name: processed_00_documents_eval_s2pdf
51+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
52+
pipeline: *basic_pipeline
53+
- name: processed_01_books_eval_iabooks
54+
root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
55+
pipeline: *basic_pipeline
56+
57+
58+
59+
# Training configuration
60+
training:
61+
output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
62+
num_train_epochs: 2.0
63+
64+
# Batch size and accumulation
65+
per_device_train_batch_size: 1
66+
per_device_eval_batch_size: 1
67+
gradient_accumulation_steps: 32
68+
69+
gradient_checkpointing: False
70+
71+
collator_max_token_len: 8192
72+
73+
# Learning rate
74+
learning_rate: 2e-5
75+
lr_scheduler_type: linear
76+
warmup_ratio: 0.1
77+
78+
# Optimization
79+
optim: adamw_torch
80+
weight_decay: 0.01
81+
max_grad_norm: 1.0
82+
83+
seed: 300
84+
data_seed: 301
85+
86+
# Evaluation and checkpointing
87+
evaluation_strategy: steps
88+
eval_steps: 500
89+
save_strategy: steps
90+
save_steps: 500
91+
save_total_limit: 5
92+
load_best_model_at_end: false # Must stay false: enabling it causes problems when restoring checkpoints (root cause not yet identified)
93+
metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
94+
greater_is_better: false
95+
96+
report_to:
97+
- wandb
98+

0 commit comments

Comments
 (0)