Skip to content

Commit ee69faa

Browse files
committed
Dataset
1 parent bd92f08 commit ee69faa

File tree

1 file changed

+17
-30
lines changed

1 file changed

+17
-30
lines changed

olmocr/train/compress_checkpoint.py

Lines changed: 17 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@
3131
from llmcompressor import oneshot
3232
from PIL import Image
3333
from transformers import AutoTokenizer, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, AutoProcessor
34+
from qwen_vl_utils import process_vision_info
3435

3536
from olmocr.s3_utils import parse_s3_path
3637
from olmocr.pipeline import build_page_query
@@ -81,36 +82,22 @@ async def prepare_calibration_dataset(pdf_paths: List[str], processor) -> List[d
8182

8283
# Extract the image and text from the query
8384
messages = query["messages"]
84-
if messages and len(messages) > 0:
85-
content = messages[0]["content"]
86-
87-
# Extract image data and text
88-
image_data = None
89-
text = None
90-
91-
for item in content:
92-
if item["type"] == "image_url":
93-
image_data = item["image_url"]["url"]
94-
elif item["type"] == "text":
95-
text = item["text"]
96-
97-
if image_data and text:
98-
# Convert base64 image to PIL Image
99-
# Remove data URL prefix
100-
base64_str = image_data.split(",")[1] if "," in image_data else image_data
101-
image_bytes = base64.b64decode(base64_str)
102-
image = Image.open(BytesIO(image_bytes))
103-
104-
# Process with the model's processor
105-
inputs = processor(
106-
text=[text],
107-
images=[image],
108-
padding=False,
109-
truncation=True,
110-
max_length=4096
111-
)
112-
113-
dataset.append(inputs)
85+
86+
text = processor.apply_chat_template(
87+
messages, tokenize=False, add_generation_prompt=True
88+
)
89+
90+
image_inputs, video_inputs = process_vision_info(messages)
91+
92+
# tokenize
93+
return processor(
94+
text=[text],
95+
images=image_inputs,
96+
videos=video_inputs,
97+
padding=False,
98+
max_length=8192,
99+
truncation=True,
100+
)
114101

115102
return dataset
116103

0 commit comments

Comments
 (0)