Commit da6bc45

Fix for compressor

1 parent eb200e7 commit da6bc45

File tree

1 file changed (+20, -5 lines)

olmocr/train/compress_checkpoint.py

Lines changed: 20 additions & 5 deletions
@@ -118,12 +118,27 @@ async def prepare_calibration_dataset(pdf_paths: List[str], processor) -> Datase
 
     # Convert list of dicts to HuggingFace Dataset
     if dataset_items:
-        # Flatten the list of dicts into a single dict of lists
-        dataset_dict = {}
-        for key in dataset_items[0].keys():
-            dataset_dict[key] = [item[key] for item in dataset_items]
+        # Create dataset in batches to avoid overflow
+        batch_size = 50  # Process in smaller batches
+        all_datasets = []
 
-        return Dataset.from_dict(dataset_dict)
+        for i in range(0, len(dataset_items), batch_size):
+            batch = dataset_items[i:i + batch_size]
+            # Flatten the batch into a dict of lists
+            batch_dict = {}
+            for key in batch[0].keys():
+                batch_dict[key] = [item[key] for item in batch]
+
+            # Create dataset for this batch
+            batch_dataset = Dataset.from_dict(batch_dict)
+            all_datasets.append(batch_dataset)
+
+        # Concatenate all batch datasets
+        if len(all_datasets) == 1:
+            return all_datasets[0]
+        else:
+            from datasets import concatenate_datasets
+            return concatenate_datasets(all_datasets)
     else:
         return Dataset.from_dict({})

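For context, here is a minimal standalone sketch of the batching pattern this commit introduces, built on the Hugging Face datasets library. The batch size of 50 matches the diff, but the contents of dataset_items (the "text" and "label" keys) are illustrative stand-ins, not the actual calibration examples produced by compress_checkpoint.py:

from datasets import Dataset, concatenate_datasets

# Toy stand-in for dataset_items: a list of per-example feature dicts
# (the "text" and "label" keys are hypothetical, not from the repository)
dataset_items = [{"text": f"page {i}", "label": i % 2} for i in range(120)]

batch_size = 50  # same chunk size as the commit
all_datasets = []

for i in range(0, len(dataset_items), batch_size):
    batch = dataset_items[i:i + batch_size]
    # Flatten each batch into the dict-of-lists shape Dataset.from_dict expects
    batch_dict = {key: [item[key] for item in batch] for key in batch[0].keys()}
    all_datasets.append(Dataset.from_dict(batch_dict))

# Stitch the per-batch datasets back together
dataset = all_datasets[0] if len(all_datasets) == 1 else concatenate_datasets(all_datasets)
print(len(dataset))  # 120

Capping each Dataset.from_dict call at 50 items bounds the size of any single in-memory conversion, which is presumably the overflow the commit message refers to; concatenate_datasets then merges the per-batch datasets into one without pushing all items through from_dict at once.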