|
1 | 1 | from datetime import datetime
|
2 |
| -from typing import Any, Dict |
| 2 | +import json |
| 3 | +from pathlib import Path |
| 4 | +from typing import Any, Dict, Optional |
3 | 5 |
|
4 | 6 | import pytest
|
5 | 7 | from llama_cloud import ExtractRun, File
|
6 | 8 | from llama_cloud.types.agent_data import AgentData
|
7 | 9 | from llama_cloud.types.aggregate_group import AggregateGroup
|
8 |
| -from pydantic import BaseModel, ValidationError |
| 10 | +from pydantic import BaseModel, Field, ValidationError |
9 | 11 |
|
10 | 12 | from llama_cloud_services.beta.agent_data.schema import (
|
11 | 13 | ExtractedData,
|
12 | 14 | ExtractedFieldMetadata,
|
| 15 | + FieldCitation, |
13 | 16 | InvalidExtractionData,
|
14 | 17 | TypedAgentData,
|
15 | 18 | TypedAggregateGroup,
|
@@ -81,8 +84,12 @@ def test_extracted_data_create_method():
|
81 | 84 |
|
82 | 85 | # Test with custom values using ExtractedFieldMetadata
|
83 | 86 | field_metadata = {
|
84 |
| - "name": ExtractedFieldMetadata(confidence=0.99, page_number=1), |
85 |
| - "age": ExtractedFieldMetadata(confidence=0.85, page_number=1), |
| 87 | + "name": ExtractedFieldMetadata( |
| 88 | + confidence=0.99, citation=[FieldCitation(page=1)] |
| 89 | + ), |
| 90 | + "age": ExtractedFieldMetadata( |
| 91 | + confidence=0.85, citation=[FieldCitation(page=1)] |
| 92 | + ), |
86 | 93 | }
|
87 | 94 | extracted_custom = ExtractedData.create(
|
88 | 95 | person, status="accepted", field_metadata=field_metadata
|
@@ -254,14 +261,16 @@ def test_parse_extracted_field_metadata():
|
254 | 261 | # name should have parsed citation data
|
255 | 262 | assert isinstance(result["name"], ExtractedFieldMetadata)
|
256 | 263 | assert result["name"].confidence == 0.95
|
257 |
| - assert result["name"].page_number == 1 |
258 |
| - assert result["name"].matching_text == "John Smith" |
| 264 | + assert result["name"].citation == [ |
| 265 | + FieldCitation(page=1, matching_text="John Smith") |
| 266 | + ] |
259 | 267 |
|
260 | 268 | # age should handle float page number
|
261 | 269 | assert isinstance(result["age"], ExtractedFieldMetadata)
|
262 | 270 | assert result["age"].confidence == 0.87
|
263 |
| - assert result["age"].page_number == 2 # Should be converted to int |
264 |
| - assert result["age"].matching_text == "25 years old" |
| 271 | + assert result["age"].citation == [ |
| 272 | + FieldCitation(page=2, matching_text="25 years old") |
| 273 | + ] |
265 | 274 |
|
266 | 275 | # email should handle empty citations
|
267 | 276 | assert isinstance(result["email"], ExtractedFieldMetadata)
|
@@ -327,46 +336,52 @@ def test_parse_extracted_field_metadata_complex():
|
327 | 336 | reasoning="Combined key parametrics and construction from the datasheet for a structured title.",
|
328 | 337 | confidence=0.9470628580889779,
|
329 | 338 | extraction_confidence=0.9470628580889779,
|
330 |
| - page_number=1, |
331 |
| - matching_text="PHE844/F844, Film, Metallized Polypropylene, Safety, 0.47 uF", |
| 339 | + citation=[ |
| 340 | + FieldCitation( |
| 341 | + page=1, |
| 342 | + matching_text="PHE844/F844, Film, Metallized Polypropylene, Safety, 0.47 uF", |
| 343 | + ) |
| 344 | + ], |
332 | 345 | ),
|
333 | 346 | "manufacturer": ExtractedFieldMetadata(
|
334 | 347 | reasoning="VERBATIM EXTRACTION",
|
335 | 348 | confidence=0.9997446550976602,
|
336 | 349 | extraction_confidence=0.9997446550976602,
|
337 |
| - page_number=1, |
338 |
| - matching_text="YAGEO KEMET", |
| 350 | + citation=[FieldCitation(page=1, matching_text="YAGEO KEMET")], |
339 | 351 | ),
|
340 | 352 | "features": [
|
341 | 353 | ExtractedFieldMetadata(
|
342 | 354 | reasoning="VERBATIM EXTRACTION",
|
343 | 355 | confidence=0.9999308195540074,
|
344 | 356 | extraction_confidence=0.9999308195540074,
|
345 |
| - page_number=1, |
346 |
| - matching_text="Features</td><td>EMI Safety", |
| 357 | + citation=[ |
| 358 | + FieldCitation( |
| 359 | + page=1, |
| 360 | + matching_text="Features</td><td>EMI Safety", |
| 361 | + ) |
| 362 | + ], |
347 | 363 | ),
|
348 | 364 | ExtractedFieldMetadata(
|
349 | 365 | reasoning="VERBATIM EXTRACTION",
|
350 | 366 | confidence=0.8642493886452225,
|
351 | 367 | extraction_confidence=0.8642493886452225,
|
352 |
| - page_number=1, |
353 |
| - matching_text="THB Performance</td><td>Yes", |
| 368 | + citation=[ |
| 369 | + FieldCitation(page=1, matching_text="THB Performance</td><td>Yes") |
| 370 | + ], |
354 | 371 | ),
|
355 | 372 | ],
|
356 | 373 | "dimensions": {
|
357 | 374 | "length": ExtractedFieldMetadata(
|
358 | 375 | reasoning="VERBATIM EXTRACTION",
|
359 | 376 | confidence=0.8986941382802304,
|
360 | 377 | extraction_confidence=0.8986941382802304,
|
361 |
| - page_number=1, |
362 |
| - matching_text="L</td><td>41mm MAX", |
| 378 | + citation=[FieldCitation(page=1, matching_text="L</td><td>41mm MAX")], |
363 | 379 | ),
|
364 | 380 | "width": ExtractedFieldMetadata(
|
365 | 381 | reasoning="VERBATIM EXTRACTION",
|
366 | 382 | confidence=0.9999377974447091,
|
367 | 383 | extraction_confidence=0.9999377974447091,
|
368 |
| - page_number=1, |
369 |
| - matching_text="T</td><td>13mm MAX", |
| 384 | + citation=[FieldCitation(page=1, matching_text="T</td><td>13mm MAX")], |
370 | 385 | ),
|
371 | 386 | },
|
372 | 387 | }
|
@@ -450,8 +465,9 @@ def test_extracted_data_from_extraction_result_success():
|
450 | 465 | # Verify field metadata was parsed
|
451 | 466 | assert isinstance(extracted.field_metadata["name"], ExtractedFieldMetadata)
|
452 | 467 | assert extracted.field_metadata["name"].confidence == 0.95
|
453 |
| - assert extracted.field_metadata["name"].page_number == 1 |
454 |
| - assert extracted.field_metadata["name"].matching_text == "John Doe" |
| 468 | + assert extracted.field_metadata["name"].citation == [ |
| 469 | + FieldCitation(page=1, matching_text="John Doe") |
| 470 | + ] |
455 | 471 |
|
456 | 472 | # Verify overall confidence was calculated
|
457 | 473 | expected_confidence = (0.95 + 0.87 + 0.92) / 3
|
@@ -523,3 +539,54 @@ def test_extracted_data_from_extraction_result_invalid_data():
|
523 | 539 | assert isinstance(invalid_data.field_metadata["name"], ExtractedFieldMetadata)
|
524 | 540 | assert invalid_data.field_metadata["name"].confidence == 0.9
|
525 | 541 | assert invalid_data.overall_confidence == 0.9
|
| 542 | + |
| 543 | + |
# Schema for a component's physical dimensions. Every field is an optional
# string defaulting to None; the descriptions list alternative labels for
# the same measurement.
class Dimensions(BaseModel):
    length: Optional[str] = Field(
        default=None,
        description="Length in mm (Size, Longest Side, L)",
    )
    width: Optional[str] = Field(
        default=None,
        description="Width in mm (Breadth, Side Width, W)",
    )
    height: Optional[str] = Field(
        default=None,
        description="Height in mm (Thickness, Vertical Size, H)",
    )
    diameter: Optional[str] = Field(
        default=None,
        description="Diameter in mm (for radial or cylindrical types) (Outer Diameter, dt, OD, D, d<sub>t</sub>)",
    )
    lead_spacing: Optional[str] = Field(
        default=None,
        description="Lead spacing in mm (Pin Pitch, Terminal Gap, LS)",
    )
| 561 | + |
| 562 | + |
# Top-level extraction schema used by the nested-dimensions test; its only
# field is an optional Dimensions sub-model that defaults to None.
class Capacitor(BaseModel):
    dimensions: Optional[Dimensions] = Field(default=None)
| 565 | + |
| 566 | + |
def test_full_parse_nested_dimensions():
    """Check nested ``dimensions`` field metadata parsed from the capacitor fixture.

    Loads ``data/capacitor.json`` (located relative to this file), parses it
    into an ``ExtractRun``, and builds an ``ExtractedData`` for the
    ``Capacitor`` schema. The resulting ``field_metadata`` must equal the
    expected nested mapping, both directly and after a
    ``model_dump_json`` / ``model_validate_json`` round trip.
    """
    fixture_path = Path(__file__).parent.parent.parent / "data" / "capacitor.json"
    # Pin the encoding: without it, open() falls back to the platform's locale
    # default, which is not UTF-8 everywhere and can mis-decode the fixture.
    with open(fixture_path, encoding="utf-8") as f:
        data = json.load(f)

    result = ExtractedData.from_extraction_result(ExtractRun.parse_obj(data), Capacitor)

    # Only these three dimension fields are expected to carry metadata.
    expected = {
        "dimensions": {
            "diameter": ExtractedFieldMetadata(
                reasoning="VERBATIM EXTRACTION",
                confidence=1.0,
                extraction_confidence=1.0,
            ),
            "lead_spacing": ExtractedFieldMetadata(
                reasoning="VERBATIM EXTRACTION",
                confidence=0.9999999031936799,
                extraction_confidence=0.9999999031936799,
            ),
            "length": ExtractedFieldMetadata(
                reasoning="VERBATIM EXTRACTION",
                confidence=0.9999968039036192,
                extraction_confidence=0.9999968039036192,
            ),
        }
    }
    assert result.field_metadata == expected

    # The nested metadata must survive serialization to JSON and back.
    parsed = ExtractedData.model_validate_json(result.model_dump_json())
    assert parsed.field_metadata == expected
0 commit comments