1
1
from os import PathLike
2
2
from pathlib import Path
3
3
from typing import Dict , Any
4
+ import base64
5
+ from io import BytesIO
6
+ from PIL import Image
4
7
from torch .utils .data import Dataset
5
8
9
+ from olmocr .data .renderpdf import render_pdf_to_base64png
10
+
6
11
7
12
class MarkdownPDFDocumentDataset (Dataset ):
8
13
def __init__ (self , root_dir : str | PathLike , transform = None ):
@@ -49,6 +54,7 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
49
54
50
55
Returns:
51
56
dict containing:
57
+ - 'image': PIL Image of the rendered PDF page, or the output of `transform` if one was provided
52
58
- 'pdf_path': Path to the PDF file
53
59
- 'text': Text content without front matter
54
60
- 'front_matter': Dict with parsed front matter
@@ -82,7 +88,17 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
82
88
# Get text without front matter
83
89
text = parts [2 ].strip ()
84
90
91
+ # Render PDF to image
92
+ base64_png = render_pdf_to_base64png (str (sample ['pdf_path' ]), page_num = 1 )
93
+ png_bytes = base64 .b64decode (base64_png )
94
+ image = Image .open (BytesIO (png_bytes ))
95
+
96
+ # Apply transform if provided
97
+ if self .transform :
98
+ image = self .transform (image )
99
+
85
100
return {
101
+ 'image' : image ,
86
102
'pdf_path' : str (sample ['pdf_path' ]),
87
103
'text' : text ,
88
104
'front_matter' : front_matter
@@ -118,6 +134,22 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
118
134
# Test __getitem__
119
135
print ("\n Testing __getitem__ on first sample:" )
120
136
first_sample = dataset [0 ]
137
+ print (f"Image type: { type (first_sample ['image' ])} " )
138
+ print (f"Image size: { first_sample ['image' ].size } " )
121
139
print (f"PDF Path: { first_sample ['pdf_path' ]} " )
122
140
print (f"Front Matter: { first_sample ['front_matter' ]} " )
123
- print (f"Text preview: { first_sample ['text' ]} " )
141
+ print (f"Text preview (first 200 chars): { first_sample ['text' ][:200 ]} ..." )
142
+
143
+ # Test with transforms
144
+ print ("\n Testing with torchvision transforms:" )
145
+ import torchvision .transforms as transforms
146
+
147
+ transform = transforms .Compose ([
148
+ transforms .Resize ((1024 , 1024 )),
149
+ transforms .ToTensor (),
150
+ ])
151
+
152
+ dataset_with_transform = MarkdownPDFDocumentDataset (args .root_dir , transform = transform )
153
+ transformed_sample = dataset_with_transform [0 ]
154
+ print (f"Transformed image type: { type (transformed_sample ['image' ])} " )
155
+ print (f"Transformed image shape: { transformed_sample ['image' ].shape } " )
0 commit comments