
Commit a5a0cd7

Trying a few more configs
1 parent 384a1b1 commit a5a0cd7

5 files changed: 342 additions & 0 deletions

olmocr/train/config.py

Lines changed: 11 additions & 0 deletions
@@ -83,6 +83,13 @@ class InstructUserMessagesConfig(PipelineStepConfig):
     name: str = "InstructUserMessages"
 
 
+@dataclass
+class LatexBracketNormalizerConfig(PipelineStepConfig):
+    """Configuration for LatexBracketNormalizer step."""
+
+    name: str = "LatexBracketNormalizer"
+
+
 @dataclass
 class TokenizerStepConfig(PipelineStepConfig):
     """Configuration for Tokenizer step."""

@@ -307,6 +314,7 @@ def get_pipeline_steps(self, pipeline_config: List[Dict[str, Any]], processor=None):
             FrontMatterOutputFormat,
             FrontMatterParser,
             InstructUserMessages,
+            LatexBracketNormalizer,
             NewYamlFinetuningPromptWithAnchoring,
             NewYamlFinetuningPromptWithNoAnchoring,
             JSONOutputFormat,

@@ -356,6 +364,9 @@ def get_pipeline_steps(self, pipeline_config: List[Dict[str, Any]], processor=None):
             elif step_name == "InstructUserMessages":
                 steps.append(InstructUserMessages())
 
+            elif step_name == "LatexBracketNormalizer":
+                steps.append(LatexBracketNormalizer())
+
             elif step_name == "Tokenizer":
                 if processor is None:
                     raise ValueError("Processor must be provided for Tokenizer step")
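
For orientation, a rough sketch of how the newly registered step name flows from a pipeline config into get_pipeline_steps; the cfg object and the surrounding call site are assumptions for illustration, not part of the diff.

# Hypothetical usage sketch; cfg stands in for whatever loaded config object exposes get_pipeline_steps
pipeline_config = [
    {"name": "FrontMatterParser", "front_matter_class": "PageResponse"},
    {"name": "PDFRenderer", "target_longest_image_dim": 1280},
    {"name": "LatexBracketNormalizer"},  # dispatched by the new elif branch above
    {"name": "InstructUserMessages"},
]
steps = cfg.get_pipeline_steps(pipeline_config)  # processor is only required when a "Tokenizer" step is listed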
Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@

# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-finetune-day3-yaml-1280-noanchor-latexnormalize

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:

  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1280
        - name: LatexBracketNormalizer
        - name: StaticLengthDocumentAnchoring
          target_anchor_text_len: -1
        - name: FinetuningPrompt
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline


# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32

  gradient_checkpointing: False

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
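
A quick aside on the batch settings above, as a minimal sketch (the arithmetic is mine, not part of the commit): with per_device_train_batch_size 1 and 32 accumulation steps, each optimizer update sees 32 samples per device.

# Effective per-device batch size implied by the config above (illustrative only)
per_device_train_batch_size = 1
gradient_accumulation_steps = 32
effective_batch_per_device = per_device_train_batch_size * gradient_accumulation_steps  # 32 samples per optimizer step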
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@

# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2.5-vl-7b-finetune-day3-yaml-1280-noanchor-newprompt

# Model configuration
model:
  name: Qwen/Qwen2.5-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:

  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1280
        - name: NewYamlFinetuningPromptWithNoAnchoring
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline


# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32

  gradient_checkpointing: False

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb
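
On the &basic_pipeline / *basic_pipeline notation used throughout these configs: a tiny self-contained check (not from the commit; the dataset names are made up) showing that a YAML alias simply reuses the anchored pipeline list when the file is parsed.

# Minimal illustration of YAML anchor/alias reuse, as used by the dataset configs above
import yaml

doc = """
train:
  - name: set_a
    pipeline: &basic_pipeline
      - name: FrontMatterParser
      - name: InstructUserMessages
  - name: set_b
    pipeline: *basic_pipeline
"""
data = yaml.safe_load(doc)
assert data["train"][0]["pipeline"] == data["train"][1]["pipeline"]  # the alias expands to the same list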
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@

# Example OlmOCR Training Configuration

# Project metadata
project_name: olmocr-qwen-vl-training
run_name: qwen2-vl-7b-finetune-day3-yaml

# Model configuration
model:
  name: Qwen/Qwen2-VL-7B-Instruct
  trust_remote_code: true
  torch_dtype: bfloat16
  use_flash_attention: true
  attn_implementation: flash_attention_2

  # LoRA settings (disabled by default)
  use_lora: false
  # lora_rank: 8
  # lora_alpha: 32
  # lora_dropout: 0.1
  # lora_target_modules:
  #   - q_proj
  #   - v_proj
  #   - k_proj
  #   - o_proj

# Dataset configuration
dataset:

  train:
    - name: processed_01_books_train_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_train_iabooks/
      pipeline: &basic_pipeline
        - name: FrontMatterParser
          front_matter_class: PageResponse
        - name: PDFRenderer
          target_longest_image_dim: 1280
        - name: StaticLengthDocumentAnchoring
          target_anchor_text_len: -1
        - name: FinetuningPrompt
        - name: FrontMatterOutputFormat
        - name: InstructUserMessages
        - name: Tokenizer
          masking_index: -100
          end_of_message_token: "<|im_end|>"
    - name: processed_00_documents_train_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_train_s2pdf/
      pipeline: *basic_pipeline

  eval:
    - name: processed_00_documents_eval_s2pdf
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_00_documents_eval_s2pdf/
      pipeline: *basic_pipeline
    - name: processed_01_books_eval_iabooks
      root_dir: /weka/oe-data-default/jakep/olmOCR-mix-0225/processed_01_books_eval_iabooks/
      pipeline: *basic_pipeline


# Training configuration
training:
  output_dir: /weka/oe-data-default/jakep/olmocr-trainer/
  num_train_epochs: 1

  # Batch size and accumulation
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 32

  gradient_checkpointing: False

  collator_max_token_len: 8192

  # Learning rate
  learning_rate: 2e-5
  lr_scheduler_type: linear
  warmup_ratio: 0.1

  # Optimization
  optim: adamw_torch
  weight_decay: 0.01
  max_grad_norm: 1.0

  # Evaluation and checkpointing
  evaluation_strategy: steps
  eval_steps: 500
  save_strategy: steps
  save_steps: 500
  save_total_limit: 5
  load_best_model_at_end: false # Needs to be false because it has a problem restoring checkpoints for some reason
  metric_for_best_model: eval_processed_00_documents_eval_s2pdf_loss
  greater_is_better: false

  report_to:
    - wandb

olmocr/train/dataloader.py

Lines changed: 44 additions & 0 deletions
@@ -1,5 +1,6 @@
 import base64
 import logging
+import re
 from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from dataclasses import dataclass, fields

@@ -358,6 +359,49 @@ def __call__(self, sample: Sample) -> Sample:
 
         return sample
 
+@dataclass(frozen=True, slots=True)
+class LatexBracketNormalizer(PipelineStep):
+    """Normalizes LaTeX brackets in natural text field."""
+
+    def __call__(self, sample: Sample) -> Sample:
+        """Normalize LaTeX brackets in the natural text field."""
+        # Get the page_data object
+        if "page_data" not in sample:
+            return sample
+
+        page_data = sample["page_data"]
+        if not hasattr(page_data, "natural_text") or not page_data.natural_text:
+            return sample
+
+        text = page_data.natural_text
+
+        # Define patterns for LaTeX normalization
+        # Order matters: process display math first, then inline
+        patterns = [
+            (r"\$\$(.+?)\$\$", r"\[\1\]"),  # $$...$$ to \[...\]
+            (r"\$(.+?)\$", r"\(\1\)"),  # $...$ to \(...\)
+        ]
+
+        # Apply replacements
+        for pattern, replacement in patterns:
+            text = re.sub(pattern, replacement, text, flags=re.DOTALL)
+
+        # Update the page_data with normalized text
+        # Since PageResponse is frozen, we need to create a new instance
+        from olmocr.prompts.prompts import PageResponse
+        new_page_data = PageResponse(
+            primary_language=page_data.primary_language,
+            is_rotation_valid=page_data.is_rotation_valid,
+            rotation_correction=page_data.rotation_correction,
+            is_table=page_data.is_table,
+            is_diagram=page_data.is_diagram,
+            natural_text=text
+        )
+
+        sample["page_data"] = new_page_data
+        return sample
+
+
 @dataclass(frozen=True, slots=True)
 class InstructUserMessages(PipelineStep):
     """Creates instruction-following messages format for training."""
