Skip to content

Commit 0689676

Browse files
committed
Rendering the pdfs in the dataloader
1 parent 352287c commit 0689676

File tree

2 files changed

+33
-1
lines changed

2 files changed

+33
-1
lines changed
File renamed without changes.

olmocr/train/dataloader.py

Lines changed: 33 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,13 @@
11
from os import PathLike
22
from pathlib import Path
33
from typing import Dict, Any
4+
import base64
5+
from io import BytesIO
6+
from PIL import Image
47
from torch.utils.data import Dataset
58

9+
from olmocr.data.renderpdf import render_pdf_to_base64png
10+
611

712
class MarkdownPDFDocumentDataset(Dataset):
813
def __init__(self, root_dir: str | PathLike, transform=None):
@@ -49,6 +54,7 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
4954
5055
Returns:
5156
dict containing:
57+
- 'image': PIL Image of the rendered PDF page
5258
- 'pdf_path': Path to the PDF file
5359
- 'text': Text content without front matter
5460
- 'front_matter': Dict with parsed front matter
@@ -82,7 +88,17 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
8288
# Get text without front matter
8389
text = parts[2].strip()
8490

91+
# Render PDF to image
92+
base64_png = render_pdf_to_base64png(str(sample['pdf_path']), page_num=1)
93+
png_bytes = base64.b64decode(base64_png)
94+
image = Image.open(BytesIO(png_bytes))
95+
96+
# Apply transform if provided
97+
if self.transform:
98+
image = self.transform(image)
99+
85100
return {
101+
'image': image,
86102
'pdf_path': str(sample['pdf_path']),
87103
'text': text,
88104
'front_matter': front_matter
@@ -118,6 +134,22 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
118134
# Test __getitem__
119135
print("\nTesting __getitem__ on first sample:")
120136
first_sample = dataset[0]
137+
print(f"Image type: {type(first_sample['image'])}")
138+
print(f"Image size: {first_sample['image'].size}")
121139
print(f"PDF Path: {first_sample['pdf_path']}")
122140
print(f"Front Matter: {first_sample['front_matter']}")
123-
print(f"Text preview: {first_sample['text']}")
141+
print(f"Text preview (first 200 chars): {first_sample['text'][:200]}...")
142+
143+
# Test with transforms
144+
print("\nTesting with torchvision transforms:")
145+
import torchvision.transforms as transforms
146+
147+
transform = transforms.Compose([
148+
transforms.Resize((1024, 1024)),
149+
transforms.ToTensor(),
150+
])
151+
152+
dataset_with_transform = MarkdownPDFDocumentDataset(args.root_dir, transform=transform)
153+
transformed_sample = dataset_with_transform[0]
154+
print(f"Transformed image type: {type(transformed_sample['image'])}")
155+
print(f"Transformed image shape: {transformed_sample['image'].shape}")

0 commit comments

Comments
 (0)