Commit 75bfa6a

Adding full soup configs

1 parent 0f733ff

3 files changed: +288, -0 lines changed

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@

# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-olmocrv2_soupfull0

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:
  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1280
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1.0

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32

  gradient_checkpointing: false

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  seed: 100
  data_seed: 101

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false  # Needs to be false because it has a problem restoring checkpoints for some reason
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
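Every dataset entry above reuses the preprocessing pipeline defined once under the &basic_pipeline anchor; the *basic_pipeline aliases expand to the same step list at load time. A minimal sketch of that behavior, assuming PyYAML and a hypothetical config file name:

import yaml

# Load one of the soup configs (the file name here is hypothetical).
with open("soupfull0.yaml") as f:
    cfg = yaml.safe_load(f)

# The *basic_pipeline aliases resolve to the anchored list, so all four
# dataset entries share an identical preprocessing pipeline.
pipelines = [d["pipeline"] for d in cfg["dataset"]["train"] + cfg["dataset"]["eval"]]
assert all(p == pipelines[0] for p in pipelines)
print(len(pipelines), "dataset entries share", len(pipelines[0]), "pipeline steps")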
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@

# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-olmocrv2_soupfull1

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:
  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1280
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1.0

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32

  gradient_checkpointing: false

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  seed: 200
  data_seed: 201

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false  # Needs to be false because it has a problem restoring checkpoints for some reason
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
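This second config matches the first except for its seeds (200/201 instead of 100/101). With per_device_train_batch_size: 1 and gradient_accumulation_steps: 32, each GPU contributes 32 samples per optimizer step, so the effective global batch size scales with the GPU count. A quick arithmetic check, where the GPU count is an assumption not stated in the configs:

# Effective global batch size for these configs.
per_device_train_batch_size = 1
gradient_accumulation_steps = 32
world_size = 8  # hypothetical GPU count; not specified in the configs

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * world_size
print(effective_batch_size)  # 256 with 8 GPUs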
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@

# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-olmocrv2_soupfull1

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:
  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1280
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline

# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1.0

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32

  gradient_checkpointing: false

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  seed: 300
  data_seed: 301

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false  # Needs to be false because it has a problem restoring checkpoints for some reason
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
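The three files share every setting except seed and data_seed (100/101, 200/201, 300/301), giving three independently seeded training runs whose weights can later be averaged into a model "soup", as the commit message suggests. A minimal sketch of uniform weight averaging, assuming single-file PyTorch checkpoints; the paths are hypothetical and real runs may save sharded safetensors instead:

import torch

# Hypothetical checkpoint locations for the three seed runs.
paths = [
    "run0/pytorch_model.bin",
    "run1/pytorch_model.bin",
    "run2/pytorch_model.bin",
]
state_dicts = [torch.load(p, map_location="cpu") for p in paths]

souped = {}
for key, first in state_dicts[0].items():
    if first.is_floating_point():
        # Uniform soup: element-wise mean of the corresponding weights.
        souped[key] = torch.stack([sd[key] for sd in state_dicts]).mean(dim=0)
    else:
        # Non-float buffers (e.g. integer ids) are copied from the first run.
        souped[key] = first

torch.save(souped, "soup/pytorch_model.bin")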

0 commit comments