|
31 | 31 | from llmcompressor import oneshot
|
32 | 32 | from PIL import Image
|
33 | 33 | from transformers import AutoTokenizer, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, AutoProcessor
|
| 34 | +from qwen_vl_utils import process_vision_info |
34 | 35 |
|
35 | 36 | from olmocr.s3_utils import parse_s3_path
|
36 | 37 | from olmocr.pipeline import build_page_query
|
@@ -81,36 +82,22 @@ async def prepare_calibration_dataset(pdf_paths: List[str], processor) -> List[d
|
81 | 82 |
|
82 | 83 | # Extract the image and text from the query
|
83 | 84 | messages = query["messages"]
|
84 |
| - if messages and len(messages) > 0: |
85 |
| - content = messages[0]["content"] |
86 |
| - |
87 |
| - # Extract image data and text |
88 |
| - image_data = None |
89 |
| - text = None |
90 |
| - |
91 |
| - for item in content: |
92 |
| - if item["type"] == "image_url": |
93 |
| - image_data = item["image_url"]["url"] |
94 |
| - elif item["type"] == "text": |
95 |
| - text = item["text"] |
96 |
| - |
97 |
| - if image_data and text: |
98 |
| - # Convert base64 image to PIL Image |
99 |
| - # Remove data URL prefix |
100 |
| - base64_str = image_data.split(",")[1] if "," in image_data else image_data |
101 |
| - image_bytes = base64.b64decode(base64_str) |
102 |
| - image = Image.open(BytesIO(image_bytes)) |
103 |
| - |
104 |
| - # Process with the model's processor |
105 |
| - inputs = processor( |
106 |
| - text=[text], |
107 |
| - images=[image], |
108 |
| - padding=False, |
109 |
| - truncation=True, |
110 |
| - max_length=4096 |
111 |
| - ) |
112 |
| - |
113 |
| - dataset.append(inputs) |
| 85 | + |
| 86 | + text = processor.apply_chat_template( |
| 87 | + messages, tokenize=False, add_generation_prompt=True |
| 88 | + ) |
| 89 | + |
| 90 | + image_inputs, video_inputs = process_vision_info(messages) |
| 91 | + |
| 92 | + # tokenize |
| 93 | + return processor( |
| 94 | + text=[text], |
| 95 | + images=image_inputs, |
| 96 | + videos=video_inputs, |
| 97 | + padding=False, |
| 98 | + max_length=8192, |
| 99 | + truncation=True, |
| 100 | + ) |
114 | 101 |
|
115 | 102 | return dataset
|
116 | 103 |
|
|
0 commit comments