Skip to content

Commit e6c9823

Browse files
committed
Add more pipeline retry stats; fix compress code
1 parent 4dbbf91 commit e6c9823

File tree

3 files changed

+43
-1
lines changed

3 files changed

+43
-1
lines changed

olmocr/pipeline.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ async def process_page(args, worker_id: int, pdf_orig_path: str, pdf_local_path:
271271
local_image_rotation = page_response.rotation_correction
272272
raise ValueError(f"invalid_page rotation for {pdf_orig_path}-{page_num}")
273273

274-
metrics.add_metrics(completed_pages=1)
274+
metrics.add_metrics(**{"completed_pages": 1, f"finished_on_attempt_{attempt}": 1})
275275
await tracker.track_work(worker_id, f"{pdf_orig_path}-{page_num}", "finished")
276276
return PageResult(
277277
pdf_orig_path,
@@ -1211,6 +1211,19 @@ async def main():
12111211
f"Page Failure rate: {total_metrics.get('failed_pages', 0) / max(total_metrics.get('completed_pages', 0) + total_metrics.get('failed_pages', 0), 1) * 100:.2f}%"
12121212
)
12131213

1214+
# Output finished_on_attempt statistics
1215+
logger.info("\nPages finished by attempt number:")
1216+
total_finished = sum(total_metrics.get(f'finished_on_attempt_{i}', 0) for i in range(args.max_page_retries))
1217+
cumulative = 0
1218+
1219+
for i in range(args.max_page_retries):
1220+
if f'finished_on_attempt_{i}' in total_metrics:
1221+
count = total_metrics[f'finished_on_attempt_{i}']
1222+
cumulative += count
1223+
percentage = (count / total_finished * 100) if total_finished > 0 else 0
1224+
cumulative_percentage = (cumulative / total_finished * 100) if total_finished > 0 else 0
1225+
logger.info(f" Attempt {i}: {count:,} pages ({percentage:.1f}%) - Cumulative: {cumulative:,} ({cumulative_percentage:.1f}%)")
1226+
12141227
# Output rates
12151228
if "server_input_tokens_per_sec" in rates:
12161229
logger.info(f"Server Input tokens/sec rate: {rates['server_input_tokens_per_sec']:.2f}")
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
quant_stage:
2+
quant_modifiers:
3+
QuantizationModifier:
4+
ignore: ['re:.*lm_head', 're:model.visual.*']
5+
config_groups:
6+
group_0:
7+
weights:
8+
num_bits: 8
9+
type: float
10+
strategy: channel
11+
dynamic: false
12+
symmetric: true
13+
input_activations:
14+
num_bits: 8
15+
type: float
16+
strategy: token
17+
dynamic: true
18+
symmetric: true
19+
targets: ["Linear"]
20+
kv_cache_scheme:
21+
num_bits: 8
22+
type: float
23+
strategy: tensor
24+
dynamic: false
25+
symmetric: true

scripts/compress_model.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,10 @@ task_spec_args = {
144144
"resources": TaskResources(gpu_count=1),
145145
"constraints": Constraints(cluster=["ai2/ceres-cirrascale", "ai2/jupiter-cirrascale-2"]),
146146
"result": ResultSpec(path="/noop-results"),
147+
"datasets": [
148+
DataMount.new(mount_path="/weka/oe-data-default", weka="oe-data-default"),
149+
DataMount.new(mount_path="/weka/oe-training-default", weka="oe-training-default"),
150+
]
147151
}
148152
149153
# Add env vars if AWS credentials exist

0 commit comments

Comments
 (0)