
Commit b96454b

Merge branch 'main' into jakep/new_trainer

2 parents 58e4fad + 633b03d · commit b96454b

21 files changed: +518 −623 lines

.github/workflows/main.yml
Lines changed: 20 additions & 19 deletions

@@ -271,25 +271,26 @@ jobs:
       outputs: type=registry
       no-cache: true

-    - name: Setup Beaker CLI
-      uses: allenai/setup-beaker@v2
-      with:
-        token: ${{ secrets.BEAKER_TOKEN }}
-        version: latest
-
-    - name: Push to Beaker
-      env:
-        BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
-      run: |
-        # Get the version without 'v' prefix
-        VERSION=${GITHUB_REF#refs/tags/v}
-
-        # Push the Docker image to Beaker
-        beaker image create \
-          --name "olmocr-inference-$VERSION" \
-          --workspace ai2/olmocr \
-          "docker://${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:$VERSION"
-
+    # jakep: push to beaker can't work because of limitted disk space on these runners
+    # jakep: (you can try by setting load: true above, but you'll need a larger runner)
+    # - name: Setup Beaker CLI
+    #   uses: allenai/setup-beaker@v2
+    #   with:
+    #     token: ${{ secrets.BEAKER_TOKEN }}
+    #     version: latest
+    # - name: Debug Docker images
+    #   run: docker images
+
+    # - name: Push to Beaker
+    #   env:
+    #     BEAKER_TOKEN: ${{ secrets.BEAKER_TOKEN }}
+    #   run: |
+    #     VERSION=${{ steps.meta.outputs.version }}
+    #     beaker image create \
+    #       --name "olmocr-inference-$VERSION" \
+    #       --workspace ai2/olmocr \
+    #       alleninstituteforai/olmocr:$VERSION
+
     - name: Clean up after build
       if: always()
       run: |

.gitignore
Lines changed: 1 addition & 0 deletions

@@ -21,6 +21,7 @@ olmOCR-bench/*
 table_data*/
 /synth*/
 dolma_samples/*
+old_train/
 /*.html
 scoreelo.csv
 debug.log

CHANGELOG.md
Lines changed: 10 additions & 0 deletions

@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## Unreleased

+## [v0.1.76](https://github.com/allenai/olmocr/releases/tag/v0.1.76) - 2025-06-23
+
+## [v0.1.75](https://github.com/allenai/olmocr/releases/tag/v0.1.75) - 2025-06-17
+
+## [v0.1.74](https://github.com/allenai/olmocr/releases/tag/v0.1.74) - 2025-06-17
+
+## [v0.1.73](https://github.com/allenai/olmocr/releases/tag/v0.1.73) - 2025-06-17
+
+## [v0.1.72](https://github.com/allenai/olmocr/releases/tag/v0.1.72) - 2025-06-17
+
 ## [v0.1.71](https://github.com/allenai/olmocr/releases/tag/v0.1.71) - 2025-05-30

 ## [v0.1.70](https://github.com/allenai/olmocr/releases/tag/v0.1.70) - 2025-05-23

Dockerfile
Lines changed: 7 additions & 7 deletions

@@ -47,19 +47,19 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \
     unzip

 ENV PYTHONUNBUFFERED=1
-WORKDIR /root
-COPY pyproject.toml pyproject.toml
-COPY olmocr/version.py olmocr/version.py
+
+# keep the build context clean
+WORKDIR /build
+COPY . /build
+

 # Needed to resolve setuptools dependencies
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-RUN uv pip install --system --no-cache -e ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128
+RUN uv pip install --system --no-cache ".[gpu]" --extra-index-url https://download.pytorch.org/whl/cu128
 RUN uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
 RUN uv pip install --system --no-cache ".[bench]"
+
 RUN playwright install-deps
 RUN playwright install chromium

-COPY olmocr olmocr
-COPY scripts scripts
-
 RUN python3 -m olmocr.pipeline --help

README.md
Lines changed: 35 additions & 30 deletions

@@ -35,6 +35,7 @@ Features:
 - (Based on a 7B parameter VLM, so it requires a GPU)

 ### News
+- June 17, 2025 - v0.1.75 - Switch from sglang to vllm based inference pipeline, updated docker image to CUDA 12.8.
 - May 23, 2025 - v0.1.70 - Official docker support and images are now available! [See Docker usage](#using-docker)
 - May 19, 2025 - v0.1.68 - [olmOCR-Bench](https://github.com/allenai/olmocr/tree/main/olmocr/bench) launch, scoring 77.4. Launch includes 2 point performance boost in olmOCR pipeline due to bug fixes with prompts.
 - Mar 17, 2025 - v0.1.60 - Performance improvements due to better temperature selection in sampling.
@@ -49,29 +50,29 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
 <thead>
 <tr>
 <th align="left"><strong>Model</strong></th>
-<th align="center">AR</th>
-<th align="center">OSM</th>
-<th align="center">TA</th>
-<th align="center">OS</th>
-<th align="center">HF</th>
-<th align="center">MC</th>
-<th align="center">LTT</th>
+<th align="center">ArXiv</th>
+<th align="center">Old Scans Math</th>
+<th align="center">Tables</th>
+<th align="center">Old Scans</th>
+<th align="center">Headers and Footers</th>
+<th align="center">Multi column</th>
+<th align="center">Long tiny text</th>
 <th align="center">Base</th>
-<th align="center">Overall Score</th>
+<th align="center">Overall</th>
 </tr>
 </thead>
 <tbody>
 <tr>
-<td align="left">Marker v1.6.2</td>
-<td align="center">24.3</td>
-<td align="center">22.1</td>
-<td align="center">69.8</td>
-<td align="center">24.3</td>
-<td align="center">87.1</td>
-<td align="center">71.0</td>
-<td align="center">76.9</td>
-<td align="center"><strong>99.5</strong></td>
-<td align="center">59.4 ± 1.1</td>
+<td align="left">Marker v1.7.5 (base)</td>
+<td align="center">76.0</td>
+<td align="center">57.9</td>
+<td align="center">57.6</td>
+<td align="center">27.8</td>
+<td align="center">84.9</td>
+<td align="center">72.9</td>
+<td align="center">84.6</td>
+<td align="center">99.1</td>
+<td align="center">70.1 ± 1.1</td>
 </tr>
 <tr>
 <td align="left">MinerU v1.3.10</td>
@@ -94,24 +95,25 @@ We also ship a comprehensive benchmark suite covering over 7,000 test cases acro
 <td align="center">93.6</td>
 <td align="center">71.3</td>
 <td align="center">77.1</td>
-<td align="center">99.4</td>
+<td align="center"><strong>99.4</strong></td>
 <td align="center">72.0 ± 1.1</td>
 </tr>
 <tr>
-<td align="left">olmOCR v0.1.68 (pipeline.py)</td>
-<td align="center">75.6</td>
-<td align="center">75.1</td>
-<td align="center">70.2</td>
-<td align="center"><strong>44.5</strong></td>
-<td align="center">93.4</td>
-<td align="center"><strong>79.4</strong></td>
-<td align="center">81.7</td>
-<td align="center">99.0</td>
-<td align="center"><strong>77.4 ± 1.0</strong></td>
+<td align="left">olmOCR v0.1.75 (Anchored)</td>
+<td align="center">74.9</td>
+<td align="center">71.2</td>
+<td align="center">71.0</td>
+<td align="center">42.2</td>
+<td align="center">94.5</td>
+<td align="center"><strong>78.3</strong></td>
+<td align="center">73.3</td>
+<td align="center">98.3</td>
+<td align="center"><strong>75.5 ± 1.0</strong></td>
 </tr>
 </tbody>
 </table>

+
 ### Installation

 Requirements:
@@ -136,7 +138,10 @@ conda activate olmocr
 pip install olmocr[bench]

 # For actually converting the files with your own GPU
-pip install olmocr[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
+pip install olmocr.[gpu] --extra-index-url https://download.pytorch.org/whl/cu128
+
+# Recommended: Install flash infer for faster inference on GPU
+pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
 ```

 ### Local Usage Example

olmocr/bench/README.md
Lines changed: 52 additions & 34 deletions

@@ -14,6 +14,9 @@ olmOCR-bench operates on single page PDFs directly. We make this choice because
 We have run the benchmark against some contemporary OCR pipelines, but it is really easy
 to run it against your own OCR tools. Your tool just needs to support Markdown or plain text output.

+<div align="center">
+  <img src="https://github.com/allenai/olmocr/blob/main/scripts/pareto/ocr_pareto.png?raw=true" width=800/>
+</div>

 ## Results

@@ -37,7 +40,7 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
 <td align="left">GOT OCR</td>
 <td align="center">52.7</td>
 <td align="center">52.0</td>
-<td align="center">0.2</td>
+<td align="center">0.20</td>
 <td align="center">22.1</td>
 <td align="center">93.6</td>
 <td align="center">42.0</td>
@@ -46,16 +49,16 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
 <td align="center">48.3 ± 1.1</td>
 </tr>
 <tr>
-<td align="left">Marker v1.6.2</td>
-<td align="center">24.3</td>
-<td align="center">22.1</td>
-<td align="center">69.8</td>
-<td align="center">24.3</td>
-<td align="center">87.1</td>
-<td align="center">71.0</td>
-<td align="center">76.9</td>
-<td align="center"><strong>99.5</strong></td>
-<td align="center">59.4 ± 1.1</td>
+<td align="left">Marker v1.7.5 (base, force_ocr)</td>
+<td align="center">76.0</td>
+<td align="center">57.9</td>
+<td align="center">57.6</td>
+<td align="center">27.8</td>
+<td align="center">84.9</td>
+<td align="center">72.9</td>
+<td align="center">84.6</td>
+<td align="center">99.1</td>
+<td align="center">70.1 ± 1.1</td>
 </tr>
 <tr>
 <td align="left">MinerU v1.3.10</td>
@@ -78,9 +81,21 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
 <td align="center">93.6</td>
 <td align="center">71.3</td>
 <td align="center">77.1</td>
-<td align="center">99.4</td>
+<td align="center"><strong>99.4</strong></td>
 <td align="center">72.0 ± 1.1</td>
 </tr>
+<tr>
+<td align="left">Nanonets OCR</td>
+<td align="center">67.0</td>
+<td align="center">68.6</td>
+<td align="center"><strong>77.7</strong></td>
+<td align="center">39.5</td>
+<td align="center">40.7</td>
+<td align="center">69.9</td>
+<td align="center">53.4</td>
+<td align="center">99.3</td>
+<td align="center">64.5 ± 1.1</td>
+</tr>
 <tr>
 <td align="left">GPT-4o (No Anchor)</td>
 <td align="center">51.5</td>
@@ -154,33 +169,39 @@ to run it against your own OCR tools. Your tool just needs to support Markdown o
 <td align="center">65.5 ± 1.2</td>
 </tr>
 <tr>
-<td align="left">olmOCR v0.1.68 (No Anchor)</td>
-<td align="center">72.1</td>
-<td align="center">74.7</td>
+<td align="left">olmOCR v0.1.75 (No Anchor)</td>
 <td align="center">71.5</td>
-<td align="center">43.7</td>
-<td align="center">91.6</td>
-<td align="center">78.5</td>
-<td align="center">80.5</td>
-<td align="center">98.1</td>
-<td align="center">76.3 ± 1.1</td>
+<td align="center">71.4</td>
+<td align="center">71.4</td>
+<td align="center"><strong>42.8</strong></td>
+<td align="center">94.1</td>
+<td align="center">77.7</td>
+<td align="center">71.0</td>
+<td align="center">97.8</td>
+<td align="center">74.7 ± 1.1</td>
 </tr>
 <tr>
-<td align="left">olmOCR v0.1.68 (Anchored)</td>
-<td align="center">75.6</td>
-<td align="center">75.1</td>
-<td align="center">70.2</td>
-<td align="center"><strong>44.5</strong></td>
-<td align="center">93.4</td>
-<td align="center"><strong>79.4</strong></td>
-<td align="center">81.7</td>
-<td align="center">99.0</td>
-<td align="center"><strong>77.4 ± 1.0</strong></td>
+<td align="left">olmOCR v0.1.75 (Anchored)</td>
+<td align="center">74.9</td>
+<td align="center">71.2</td>
+<td align="center">71.0</td>
+<td align="center">42.2</td>
+<td align="center">94.5</td>
+<td align="center"><strong>78.3</strong></td>
+<td align="center">73.3</td>
+<td align="center">98.3</td>
+<td align="center"><strong>75.5 ± 1.0</strong></td>
 </tr>
 </tbody>
 </table>


+<sup><sub>There was a small drop in scores from olmOCR v0.1.68 (77.4), which is due to two factors. One, is that we have adjusted our benchmark code to not include
+any "fallback" mechanism when measuring benchmark scores (though it still exists when you run olmocr.pipeline). Second, there is a small drop in scores as we have updated
+from sglang 0.4.2 to vllm 0.9.1. In net, we think the upgrade to vllm is the right choice, given that sglang 0.4.6 had even lower scores by one point, and vllm comes with a
+small performance boost, and great support for quantization.
+</sub></sup>
+
 ## Sourcing Documents and Tests

 We define 7 distinct document types that we found olmOCR (or its earlier iterations) often struggled to process and defined custom acquisition strategies for each (described below). We removed documents that both contained PII and were not meant for public dissemination. We also decontaminate against documents that appear in olmOCR-Mix via URL level deduplication. To scale creation of test cases over these documents, we combined manual design and review with prompting GPT-4o.
@@ -288,6 +309,3 @@ We have an internal data annotation tool that can be used to review the question
 ```bash
 python -m olmocr.bench.review_app --port 5000 --debug ./olmOCR-bench/bench_data/multi_column.jsonl --force
 ```
-
-
-

olmocr/bench/convert.py
Lines changed: 1 addition & 0 deletions

@@ -223,6 +223,7 @@ async def process_with_semaphore(task):
     available_methods = {
         "olmocr_pipeline": ("olmocr.bench.runners.run_olmocr_pipeline", "run_olmocr_pipeline"),
         "gotocr": ("olmocr.bench.runners.run_gotocr", "run_gotocr"),
+        "nanonetsocr": ("olmocr.bench.runners.run_nanonetsocr", "run_nanonetsocr"),
         "marker": ("olmocr.bench.runners.run_marker", "run_marker"),
         "mineru": ("olmocr.bench.runners.run_mineru", "run_mineru"),
         "chatgpt": ("olmocr.bench.runners.run_chatgpt", "run_chatgpt"),

olmocr/bench/runners/run_marker.py
Lines changed: 15 additions & 2 deletions

@@ -1,6 +1,7 @@
 import os
 import tempfile

+from marker.config.parser import ConfigParser
 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.output import text_from_rendered
@@ -15,10 +16,22 @@ def run_marker(pdf_path: str, page_num: int = 1) -> str:
     if _marker_converter is None:
         # Create a configuration dictionary with the necessary settings
         config = {
-            "texify_inline_spans": True,  # This enables conversion of inline math to LaTeX
+            "force_ocr": True,  # This enables conversion of inline math to LaTeX
+            "use_llm": False,  # We would prefer to run just plain marker for reporting bench results, not hybrid mode
+            "disable_tqdm": True,  # Disable tqdm for cleaner output
+            "recognition_batch_size": 256,
+            "layout_batch_size": 48,
+            "detection_batch_size": 48,
+            "equation_batch_size": 64,
+            "table_rec_batch_size": 48,
+            "ocr_error_batch_size": 64,
         }
+        config_parser = ConfigParser(config)

-        _marker_converter = PdfConverter(artifact_dict=create_model_dict(), config=config)
+        _marker_converter = PdfConverter(
+            artifact_dict=create_model_dict(),
+            config=config_parser.generate_config_dict(),
+        )

     # Extract the specific page from the PDF
     pdf_to_process = pdf_path
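The runner caches the converter in the module-level `_marker_converter`, so Marker's models are loaded once and reused for every page that gets scored. A minimal, hypothetical usage sketch follows; it assumes the marker dependency is installed and that `sample.pdf` is a local PDF (the file name is illustrative only).

```python
from olmocr.bench.runners.run_marker import run_marker

if __name__ == "__main__":
    # The first call is slow: it builds the shared PdfConverter and loads model weights.
    # Subsequent calls reuse the cached converter, so per-page cost drops substantially.
    markdown = run_marker("sample.pdf", page_num=1)
    print(markdown[:500])
```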
