@@ -296,3 +296,211 @@ async def mock_build_page_query(local_pdf_path, page, target_longest_image_dim,
296
296
# Verify tracker was called correctly
297
297
mock_tracker .track_work .assert_any_call (0 , "test-edgar-rotated90.pdf-1" , "started" )
298
298
mock_tracker .track_work .assert_any_call (0 , "test-edgar-rotated90.pdf-1" , "finished" )
299
+
300
+ @pytest .mark .asyncio
301
+ async def test_process_page_with_cumulative_rotation (self ):
302
+ """Test that process_page correctly accumulates rotations across multiple attempts."""
303
+
304
+ # Path to the test PDF (can use any test PDF)
305
+ test_pdf_path = "tests/gnarly_pdfs/edgar-rotated90.pdf"
306
+
307
+ # Mock arguments
308
+ args = MockArgs ()
309
+
310
+ # Counter to track number of API calls
311
+ call_count = 0
312
+
313
+ async def mock_apost (url , json_data ):
314
+ nonlocal call_count
315
+ call_count += 1
316
+
317
+ # First call - model detects rotation is needed (90 degrees)
318
+ if call_count == 1 :
319
+ response_content = """---
320
+ primary_language: en
321
+ is_rotation_valid: false
322
+ rotation_correction: 90
323
+ is_table: false
324
+ is_diagram: false
325
+ ---
326
+
327
+ This document appears to be rotated and needs correction."""
328
+
329
+ # Second call - model still detects rotation is needed (another 90 degrees)
330
+ elif call_count == 2 :
331
+ response_content = """---
332
+ primary_language: en
333
+ is_rotation_valid: false
334
+ rotation_correction: 90
335
+ is_table: false
336
+ is_diagram: false
337
+ ---
338
+
339
+ Document still needs rotation."""
340
+
341
+ # Third call - after 180 total degrees of rotation, model says it's correct
342
+ elif call_count == 3 :
343
+ response_content = """---
344
+ primary_language: en
345
+ is_rotation_valid: true
346
+ rotation_correction: 0
347
+ is_table: false
348
+ is_diagram: false
349
+ ---
350
+
351
+ UNITED STATES
352
+ SECURITIES AND EXCHANGE COMMISSION
353
+ Washington, D.C. 20549
354
+
355
+ Document is now correctly oriented after 180 degree rotation."""
356
+
357
+ else :
358
+ raise ValueError (f"Unexpected call count: { call_count } " )
359
+
360
+ # Mock response structure
361
+ response_body = {
362
+ "choices" : [{"message" : {"content" : response_content }, "finish_reason" : "stop" }],
363
+ "usage" : {"prompt_tokens" : 1000 , "completion_tokens" : 100 , "total_tokens" : 1100 },
364
+ }
365
+
366
+ return 200 , json .dumps (response_body ).encode ()
367
+
368
+ # Mock the worker tracker
369
+ mock_tracker = AsyncMock ()
370
+
371
+ # Ensure the test PDF exists
372
+ assert os .path .exists (test_pdf_path ), f"Test PDF not found at { test_pdf_path } "
373
+
374
+ # Track calls to build_page_query
375
+ build_page_query_calls = []
376
+ original_build_page_query = build_page_query
377
+
378
+ async def mock_build_page_query (local_pdf_path , page , target_longest_image_dim , image_rotation = 0 ):
379
+ build_page_query_calls .append (image_rotation )
380
+ return await original_build_page_query (local_pdf_path , page , target_longest_image_dim , image_rotation )
381
+
382
+ with patch ("olmocr.pipeline.apost" , side_effect = mock_apost ):
383
+ with patch ("olmocr.pipeline.tracker" , mock_tracker ):
384
+ with patch ("olmocr.pipeline.build_page_query" , side_effect = mock_build_page_query ):
385
+ result = await process_page (args = args , worker_id = 0 , pdf_orig_path = "test-cumulative-rotation.pdf" , pdf_local_path = test_pdf_path , page_num = 1 )
386
+
387
+ # Verify the result
388
+ assert isinstance (result , PageResult )
389
+ assert result .page_num == 1
390
+ assert result .is_fallback == False
391
+ assert result .response .is_rotation_valid == True
392
+ assert result .response .rotation_correction == 0
393
+ assert result .response .natural_text is not None
394
+ assert "180 degree rotation" in result .response .natural_text
395
+
396
+ # Verify that exactly 3 API calls were made
397
+ assert call_count == 3
398
+
399
+ # Verify build_page_query was called with correct cumulative rotations
400
+ assert len (build_page_query_calls ) == 3
401
+ assert build_page_query_calls [0 ] == 0 # First call with no rotation
402
+ assert build_page_query_calls [1 ] == 90 # Second call with 90 degree rotation
403
+ assert build_page_query_calls [2 ] == 180 # Third call with cumulative 180 degree rotation
404
+
405
+ # Verify tracker was called correctly
406
+ mock_tracker .track_work .assert_any_call (0 , "test-cumulative-rotation.pdf-1" , "started" )
407
+ mock_tracker .track_work .assert_any_call (0 , "test-cumulative-rotation.pdf-1" , "finished" )
408
+
409
+ @pytest .mark .asyncio
410
+ async def test_process_page_rotation_wraps_around (self ):
411
+ """Test that cumulative rotation correctly wraps around at 360 degrees."""
412
+
413
+ # Path to the test PDF
414
+ test_pdf_path = "tests/gnarly_pdfs/edgar-rotated90.pdf"
415
+
416
+ # Mock arguments
417
+ args = MockArgs ()
418
+
419
+ # Counter to track number of API calls
420
+ call_count = 0
421
+
422
+ async def mock_apost (url , json_data ):
423
+ nonlocal call_count
424
+ call_count += 1
425
+
426
+ # First call - model detects rotation is needed (270 degrees)
427
+ if call_count == 1 :
428
+ response_content = """---
429
+ primary_language: en
430
+ is_rotation_valid: false
431
+ rotation_correction: 270
432
+ is_table: false
433
+ is_diagram: false
434
+ ---
435
+
436
+ Document needs 270 degree rotation."""
437
+
438
+ # Second call - model detects more rotation is needed (180 degrees)
439
+ # Total would be 450, but should wrap to 90
440
+ elif call_count == 2 :
441
+ response_content = """---
442
+ primary_language: en
443
+ is_rotation_valid: false
444
+ rotation_correction: 180
445
+ is_table: false
446
+ is_diagram: false
447
+ ---
448
+
449
+ Document needs additional rotation."""
450
+
451
+ # Third call - after wrapped rotation (90 degrees), model says it's correct
452
+ elif call_count == 3 :
453
+ response_content = """---
454
+ primary_language: en
455
+ is_rotation_valid: true
456
+ rotation_correction: 0
457
+ is_table: false
458
+ is_diagram: false
459
+ ---
460
+
461
+ Document correctly oriented at 90 degrees total rotation."""
462
+
463
+ else :
464
+ raise ValueError (f"Unexpected call count: { call_count } " )
465
+
466
+ # Mock response structure
467
+ response_body = {
468
+ "choices" : [{"message" : {"content" : response_content }, "finish_reason" : "stop" }],
469
+ "usage" : {"prompt_tokens" : 1000 , "completion_tokens" : 100 , "total_tokens" : 1100 },
470
+ }
471
+
472
+ return 200 , json .dumps (response_body ).encode ()
473
+
474
+ # Mock the worker tracker
475
+ mock_tracker = AsyncMock ()
476
+
477
+ # Ensure the test PDF exists
478
+ assert os .path .exists (test_pdf_path ), f"Test PDF not found at { test_pdf_path } "
479
+
480
+ # Track calls to build_page_query
481
+ build_page_query_calls = []
482
+ original_build_page_query = build_page_query
483
+
484
+ async def mock_build_page_query (local_pdf_path , page , target_longest_image_dim , image_rotation = 0 ):
485
+ build_page_query_calls .append (image_rotation )
486
+ return await original_build_page_query (local_pdf_path , page , target_longest_image_dim , image_rotation )
487
+
488
+ with patch ("olmocr.pipeline.apost" , side_effect = mock_apost ):
489
+ with patch ("olmocr.pipeline.tracker" , mock_tracker ):
490
+ with patch ("olmocr.pipeline.build_page_query" , side_effect = mock_build_page_query ):
491
+ result = await process_page (args = args , worker_id = 0 , pdf_orig_path = "test-rotation-wrap.pdf" , pdf_local_path = test_pdf_path , page_num = 1 )
492
+
493
+ # Verify the result
494
+ assert isinstance (result , PageResult )
495
+ assert result .page_num == 1
496
+ assert result .is_fallback == False
497
+ assert result .response .is_rotation_valid == True
498
+
499
+ # Verify that exactly 3 API calls were made
500
+ assert call_count == 3
501
+
502
+ # Verify build_page_query was called with correct cumulative rotations
503
+ assert len (build_page_query_calls ) == 3
504
+ assert build_page_query_calls [0 ] == 0 # First call with no rotation
505
+ assert build_page_query_calls [1 ] == 270 # Second call with 270 degree rotation
506
+ assert build_page_query_calls [2 ] == 90 # Third call with wrapped rotation (270 + 180 = 450 % 360 = 90)
0 commit comments