Skip to content

Commit 79fe193

Browse files
authored
Re-order extraction metadata union for better parsing (#865)
* Re-order args so that pydantic doesn't parse a nested dict into an empty extraction result
* Use a citations array instead
1 parent ab225c3 commit 79fe193

File tree

8 files changed

+2533
-2324
lines changed

8 files changed

+2533
-2324
lines changed

py/llama_cloud_services/beta/agent_data/schema.py

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,10 @@ class Person(BaseModel):
3737
"""
3838

3939
from datetime import datetime
40-
import numbers
4140
from llama_cloud import ExtractRun
4241
from llama_cloud.types.agent_data import AgentData
4342
from llama_cloud.types.aggregate_group import AggregateGroup
44-
from pydantic import BaseModel, Field, ValidationError
43+
from pydantic import BaseModel, Field, ValidationError, model_validator, ConfigDict
4544
from typing import (
4645
Generic,
4746
List,
@@ -176,6 +175,16 @@ class TypedAgentDataItems(BaseModel, Generic[AgentDataT]):
176175
)
177176

178177

178+
class FieldCitation(BaseModel):
    # One source reference for an extracted field: the page it was found on
    # and the text it was derived from. Both parts are optional.

    # Page on which the field occurred.
    page: Optional[int] = Field(
        default=None,
        description="The page number that the field occurred on",
    )
    # Source text backing the extracted value.
    matching_text: Optional[str] = Field(
        default=None,
        description="The original text this field's value was derived from",
    )
186+
187+
179188
class ExtractedFieldMetadata(BaseModel):
180189
"""
181190
Metadata for an extracted data field, such as confidence, and citation information.
@@ -193,14 +202,14 @@ class ExtractedFieldMetadata(BaseModel):
193202
None,
194203
description="The confidence score for the field based on the extracted text only",
195204
)
196-
page_number: Optional[int] = Field(
197-
None, description="The page number that the field occurred on"
198-
)
199-
matching_text: Optional[str] = Field(
205+
citation: Optional[List[FieldCitation]] = Field(
200206
None,
201-
description="The original text this field's value was derived from",
207+
description="The citation for the field, including page number and matching text",
202208
)
203209

210+
# Forbid unknown keys to avoid swallowing nested dicts
211+
model_config = ConfigDict(extra="forbid")
212+
204213

205214
ExtractedFieldMetaDataDict = Dict[
206215
str, Union[ExtractedFieldMetadata, Dict[str, Any], list[Any]]
@@ -238,19 +247,10 @@ def _parse_extracted_field_metadata_recursive(
238247
if len(indicator_fields.intersection(field_value.keys())) > 0:
239248
try:
240249
merged = {**field_value, **additional_fields}
250+
allowed_fields = ExtractedFieldMetadata.model_fields.keys()
251+
merged = {k: v for k, v in merged.items() if k in allowed_fields}
241252
validated = ExtractedFieldMetadata.model_validate(merged)
242253

243-
# grab the citation from the array. This is just an array for backwards compatibility.
244-
if "citation" in field_value and len(field_value["citation"]) > 0:
245-
first_citation = field_value["citation"][0]
246-
if "page" in first_citation and isinstance(
247-
first_citation["page"], numbers.Number
248-
):
249-
validated.page_number = int(first_citation["page"]) # type: ignore
250-
if "matching_text" in first_citation and isinstance(
251-
first_citation["matching_text"], str
252-
):
253-
validated.matching_text = first_citation["matching_text"]
254254
return validated
255255
except ValidationError:
256256
pass
@@ -340,6 +340,28 @@ class ExtractedData(BaseModel, Generic[ExtractedT]):
340340
description="Additional metadata about the extracted data, such as errors, tokens, etc.",
341341
)
342342

343+
@model_validator(mode="before")
@classmethod
def _normalize_field_metadata_on_input(cls, value: Any) -> Any:
    """Normalize a raw ``field_metadata`` mapping before field validation.

    Runs on every inbound representation (including JSON round-trips) so that
    nested dicts are handed to ``parse_extracted_field_metadata`` first.
    Input that is not a dict, or whose ``field_metadata`` is missing or not a
    dict, is returned untouched.
    """
    if not isinstance(value, dict):
        return value

    raw_metadata = value.get("field_metadata")
    if not isinstance(raw_metadata, dict):
        return value

    try:
        normalized = parse_extracted_field_metadata(raw_metadata)
    except Exception:
        # Best effort only: hand the original input back so pydantic can
        # surface detailed errors later instead of hiding them here.
        return value

    # Build a new dict rather than mutating the caller's input.
    return {**value, "field_metadata": normalized}
364+
343365
@classmethod
344366
def create(
345367
cls,

py/llama_parse/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ dev = [
1111

1212
[project]
1313
name = "llama-parse"
14-
version = "0.6.57"
14+
version = "0.6.58"
1515
description = "Parse files into RAG-Optimized formats."
1616
authors = [{name = "Logan Markewich", email = "[email protected]"}]
1717
requires-python = ">=3.9,<4.0"
1818
readme = "README.md"
1919
license = "MIT"
20-
dependencies = ["llama-cloud-services>=0.6.56"]
20+
dependencies = ["llama-cloud-services>=0.6.58"]
2121

2222
[project.scripts]
2323
llama-parse = "llama_parse.cli.main:parse"

py/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ dev = [
1818

1919
[project]
2020
name = "llama-cloud-services"
21-
version = "0.6.57"
21+
version = "0.6.58"
2222
description = "Tailored SDK clients for LlamaCloud services."
2323
authors = [{name = "Logan Markewich", email = "[email protected]"}]
2424
requires-python = ">=3.9,<4.0"

py/unit_tests/beta/agent/test_agent_data_schema.py

Lines changed: 89 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
from datetime import datetime
2-
from typing import Any, Dict
2+
import json
3+
from pathlib import Path
4+
from typing import Any, Dict, Optional
35

46
import pytest
57
from llama_cloud import ExtractRun, File
68
from llama_cloud.types.agent_data import AgentData
79
from llama_cloud.types.aggregate_group import AggregateGroup
8-
from pydantic import BaseModel, ValidationError
10+
from pydantic import BaseModel, Field, ValidationError
911

1012
from llama_cloud_services.beta.agent_data.schema import (
1113
ExtractedData,
1214
ExtractedFieldMetadata,
15+
FieldCitation,
1316
InvalidExtractionData,
1417
TypedAgentData,
1518
TypedAggregateGroup,
@@ -81,8 +84,12 @@ def test_extracted_data_create_method():
8184

8285
# Test with custom values using ExtractedFieldMetadata
8386
field_metadata = {
84-
"name": ExtractedFieldMetadata(confidence=0.99, page_number=1),
85-
"age": ExtractedFieldMetadata(confidence=0.85, page_number=1),
87+
"name": ExtractedFieldMetadata(
88+
confidence=0.99, citation=[FieldCitation(page=1)]
89+
),
90+
"age": ExtractedFieldMetadata(
91+
confidence=0.85, citation=[FieldCitation(page=1)]
92+
),
8693
}
8794
extracted_custom = ExtractedData.create(
8895
person, status="accepted", field_metadata=field_metadata
@@ -254,14 +261,16 @@ def test_parse_extracted_field_metadata():
254261
# name should have parsed citation data
255262
assert isinstance(result["name"], ExtractedFieldMetadata)
256263
assert result["name"].confidence == 0.95
257-
assert result["name"].page_number == 1
258-
assert result["name"].matching_text == "John Smith"
264+
assert result["name"].citation == [
265+
FieldCitation(page=1, matching_text="John Smith")
266+
]
259267

260268
# age should handle float page number
261269
assert isinstance(result["age"], ExtractedFieldMetadata)
262270
assert result["age"].confidence == 0.87
263-
assert result["age"].page_number == 2 # Should be converted to int
264-
assert result["age"].matching_text == "25 years old"
271+
assert result["age"].citation == [
272+
FieldCitation(page=2, matching_text="25 years old")
273+
]
265274

266275
# email should handle empty citations
267276
assert isinstance(result["email"], ExtractedFieldMetadata)
@@ -327,46 +336,52 @@ def test_parse_extracted_field_metadata_complex():
327336
reasoning="Combined key parametrics and construction from the datasheet for a structured title.",
328337
confidence=0.9470628580889779,
329338
extraction_confidence=0.9470628580889779,
330-
page_number=1,
331-
matching_text="PHE844/F844, Film, Metallized Polypropylene, Safety, 0.47 uF",
339+
citation=[
340+
FieldCitation(
341+
page=1,
342+
matching_text="PHE844/F844, Film, Metallized Polypropylene, Safety, 0.47 uF",
343+
)
344+
],
332345
),
333346
"manufacturer": ExtractedFieldMetadata(
334347
reasoning="VERBATIM EXTRACTION",
335348
confidence=0.9997446550976602,
336349
extraction_confidence=0.9997446550976602,
337-
page_number=1,
338-
matching_text="YAGEO KEMET",
350+
citation=[FieldCitation(page=1, matching_text="YAGEO KEMET")],
339351
),
340352
"features": [
341353
ExtractedFieldMetadata(
342354
reasoning="VERBATIM EXTRACTION",
343355
confidence=0.9999308195540074,
344356
extraction_confidence=0.9999308195540074,
345-
page_number=1,
346-
matching_text="Features</td><td>EMI Safety",
357+
citation=[
358+
FieldCitation(
359+
page=1,
360+
matching_text="Features</td><td>EMI Safety",
361+
)
362+
],
347363
),
348364
ExtractedFieldMetadata(
349365
reasoning="VERBATIM EXTRACTION",
350366
confidence=0.8642493886452225,
351367
extraction_confidence=0.8642493886452225,
352-
page_number=1,
353-
matching_text="THB Performance</td><td>Yes",
368+
citation=[
369+
FieldCitation(page=1, matching_text="THB Performance</td><td>Yes")
370+
],
354371
),
355372
],
356373
"dimensions": {
357374
"length": ExtractedFieldMetadata(
358375
reasoning="VERBATIM EXTRACTION",
359376
confidence=0.8986941382802304,
360377
extraction_confidence=0.8986941382802304,
361-
page_number=1,
362-
matching_text="L</td><td>41mm MAX",
378+
citation=[FieldCitation(page=1, matching_text="L</td><td>41mm MAX")],
363379
),
364380
"width": ExtractedFieldMetadata(
365381
reasoning="VERBATIM EXTRACTION",
366382
confidence=0.9999377974447091,
367383
extraction_confidence=0.9999377974447091,
368-
page_number=1,
369-
matching_text="T</td><td>13mm MAX",
384+
citation=[FieldCitation(page=1, matching_text="T</td><td>13mm MAX")],
370385
),
371386
},
372387
}
@@ -450,8 +465,9 @@ def test_extracted_data_from_extraction_result_success():
450465
# Verify field metadata was parsed
451466
assert isinstance(extracted.field_metadata["name"], ExtractedFieldMetadata)
452467
assert extracted.field_metadata["name"].confidence == 0.95
453-
assert extracted.field_metadata["name"].page_number == 1
454-
assert extracted.field_metadata["name"].matching_text == "John Doe"
468+
assert extracted.field_metadata["name"].citation == [
469+
FieldCitation(page=1, matching_text="John Doe")
470+
]
455471

456472
# Verify overall confidence was calculated
457473
expected_confidence = (0.95 + 0.87 + 0.92) / 3
@@ -523,3 +539,54 @@ def test_extracted_data_from_extraction_result_invalid_data():
523539
assert isinstance(invalid_data.field_metadata["name"], ExtractedFieldMetadata)
524540
assert invalid_data.field_metadata["name"].confidence == 0.9
525541
assert invalid_data.overall_confidence == 0.9
542+
543+
544+
class Dimensions(BaseModel):
    # Test schema: physical dimensions of a component, each an optional
    # free-form string.

    length: Optional[str] = Field(
        default=None,
        description="Length in mm (Size, Longest Side, L)",
    )
    width: Optional[str] = Field(
        default=None,
        description="Width in mm (Breadth, Side Width, W)",
    )
    height: Optional[str] = Field(
        default=None,
        description="Height in mm (Thickness, Vertical Size, H)",
    )
    diameter: Optional[str] = Field(
        default=None,
        description="Diameter in mm (for radial or cylindrical types) (Outer Diameter, dt, OD, D, d<sub>t</sub>)",
    )
    lead_spacing: Optional[str] = Field(
        default=None,
        description="Lead spacing in mm (Pin Pitch, Terminal Gap, LS)",
    )
561+
562+
563+
class Capacitor(BaseModel):
    # Test schema: a capacitor whose only modelled attribute is an optional,
    # nested Dimensions object.
    dimensions: Optional[Dimensions] = Field(default=None)
565+
566+
567+
def test_full_parse_nested_dimensions():
    """Nested ``dimensions`` metadata is parsed and survives a JSON round-trip.

    Loads the ``capacitor.json`` fixture, builds ``ExtractedData`` from it, and
    checks that each nested dimension entry is an ``ExtractedFieldMetadata``,
    both right after parsing and after dumping to JSON and validating again.
    """
    fixture_path = Path(__file__).parent.parent.parent / "data" / "capacitor.json"
    # Read with an explicit encoding so decoding does not depend on the
    # platform's locale default.
    with open(fixture_path, encoding="utf-8") as f:
        data = json.load(f)

    result = ExtractedData.from_extraction_result(ExtractRun.parse_obj(data), Capacitor)
    expected = {
        "dimensions": {
            "diameter": ExtractedFieldMetadata(
                reasoning="VERBATIM EXTRACTION",
                confidence=1.0,
                extraction_confidence=1.0,
            ),
            "lead_spacing": ExtractedFieldMetadata(
                reasoning="VERBATIM EXTRACTION",
                confidence=0.9999999031936799,
                extraction_confidence=0.9999999031936799,
            ),
            "length": ExtractedFieldMetadata(
                reasoning="VERBATIM EXTRACTION",
                confidence=0.9999968039036192,
                extraction_confidence=0.9999968039036192,
            ),
        }
    }
    assert result.field_metadata == expected

    # Serializing and re-validating must give back the same typed metadata.
    parsed = ExtractedData.model_validate_json(result.model_dump_json())
    assert parsed.field_metadata == expected

0 commit comments

Comments
 (0)