langchain-ai
diff --git a/‎libs/core/langchain_core/language_models/_utils.py
Lines changed: 219 additions & 69 deletions b/‎libs/core/langchain_core/language_models/_utils.py
Lines changed: 219 additions & 69 deletions
@@ -1,12 +1,30 @@
 import re
 from collections.abc import Sequence
-from typing import Optional, TypeVar
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    Optional,
+    TypedDict,
+    TypeVar,
+    Union,
+)
 
-from langchain_core.messages import BaseMessage
+if TYPE_CHECKING:
+    from langchain_core.messages import BaseMessage
+from langchain_core.messages.content import (
+    ContentBlock,
+)
 
 
 def _is_openai_data_block(block: dict) -> bool:
-    """Check if the block contains multimodal data in OpenAI Chat Completions format."""
+    """Check if the block contains multimodal data in OpenAI Chat Completions format.
+
+    Supports both data and ID-style blocks (e.g. ``'file_data'`` and ``'file_id'``).
+
+    Extra keys are ignored for ``'input_audio'`` and ``'file'`` blocks; ``'image_url'``
+    blocks may only carry the ``'type'``, ``'image_url'`` and ``'detail'`` keys.
+
+    """
     if block.get("type") == "image_url":
         if (
             (set(block.keys()) <= {"type", "image_url", "detail"})
@@ -15,29 +33,43 @@ def _is_openai_data_block(block: dict) -> bool:
         ):
             url = image_url.get("url")
             if isinstance(url, str):
+                # Required per OpenAI spec
+                return True
+            # Ignore `'detail'` since it's optional and specific to OpenAI
+
+    elif block.get("type") == "input_audio":
+        if (audio := block.get("input_audio")) and isinstance(audio, dict):
+            audio_data = audio.get("data")
+            audio_format = audio.get("format")
+            # Both required per OpenAI spec
+            if isinstance(audio_data, str) and isinstance(audio_format, str):
                 return True
 
     elif block.get("type") == "file":
         if (file := block.get("file")) and isinstance(file, dict):
             file_data = file.get("file_data")
-            if isinstance(file_data, str):
-                return True
-
-    elif block.get("type") == "input_audio":
-        if (input_audio := block.get("input_audio")) and isinstance(input_audio, dict):
-            audio_data = input_audio.get("data")
-            audio_format = input_audio.get("format")
-            if isinstance(audio_data, str) and isinstance(audio_format, str):
+            file_id = file.get("file_id")
+            # Files can be either base64-encoded or pre-uploaded with an ID
+            if isinstance(file_data, str) or isinstance(file_id, str):
                 return True
 
     else:
         return False
 
+    # Recognized `'type'`, but the block's payload failed validation
     return False
 
 
-def _parse_data_uri(uri: str) -> Optional[dict]:
-    """Parse a data URI into its components. If parsing fails, return None.
+class ParsedDataUri(TypedDict):
+    source_type: Literal["base64"]
+    data: str
+    mime_type: str
+
+
+def _parse_data_uri(uri: str) -> Optional[ParsedDataUri]:
+    """Parse a data URI into its components.
+
+    If parsing fails, return None. If either MIME type or data is missing, return None.
 
     Example:
 
@@ -57,90 +89,208 @@ def _parse_data_uri(uri: str) -> Optional[dict]:
     match = re.match(regex, uri)
     if match is None:
         return None
+
+    mime_type = match.group("mime_type")
+    data = match.group("data")
+    if not mime_type or not data:
+        return None
+
     return {
         "source_type": "base64",
-        "data": match.group("data"),
-        "mime_type": match.group("mime_type"),
+        "data": data,
+        "mime_type": mime_type,
     }
 
 
-def _convert_openai_format_to_data_block(block: dict) -> dict:
-    """Convert OpenAI image content block to standard data content block.
+def _normalize_messages(
+    messages: Sequence["BaseMessage"],
+) -> list["BaseMessage"]:
+    """Normalize message formats to LangChain v1 standard content blocks.
 
-    If parsing fails, pass-through.
+    Chat models already implement support for:
+    - Images in OpenAI Chat Completions format
+        These will be passed through unchanged
+    - LangChain v1 standard content blocks
 
-    Args:
-        block: The OpenAI image content block to convert.
+    This function extends support to:
+    - `Audio <https://platform.openai.com/docs/api-reference/chat/create>`__ and
+        `file <https://platform.openai.com/docs/api-reference/files>`__ data in OpenAI
+        Chat Completions format
+        - Images are technically supported but we expect chat models to handle them
+            directly; this may change in the future
+    - LangChain v0 standard content blocks for backward compatibility
 
-    Returns:
-        The converted standard data content block.
-    """
-    if block["type"] == "image_url":
-        parsed = _parse_data_uri(block["image_url"]["url"])
-        if parsed is not None:
-            parsed["type"] = "image"
-            return parsed
-        return block
-
-    if block["type"] == "file":
-        parsed = _parse_data_uri(block["file"]["file_data"])
-        if parsed is not None:
-            parsed["type"] = "file"
-            if filename := block["file"].get("filename"):
-                parsed["filename"] = filename
-            return parsed
-        return block
-
-    if block["type"] == "input_audio":
-        data = block["input_audio"].get("data")
-        audio_format = block["input_audio"].get("format")
-        if data and audio_format:
-            return {
-                "type": "audio",
-                "source_type": "base64",
-                "data": data,
-                "mime_type": f"audio/{audio_format}",
+    .. versionchanged:: 1.0.0
+        In previous versions, this function returned messages in LangChain v0 format.
+        Now, it returns messages in LangChain v1 format, which upgraded chat models now
+        expect to receive when passing back in message history. For backward
+        compatibility, this function will convert v0 message content to v1 format.
+
+    .. dropdown:: v0 Content Block Schemas
+
+        ``URLContentBlock``:
+
+        .. code-block::
+
+            {
+                mime_type: NotRequired[str],
+                type: Literal['image', 'audio', 'file'],
+                source_type: Literal['url'],
+                url: str,
             }
-        return block
 
-    return block
+        ``Base64ContentBlock``:
+
+        .. code-block::
+
+            {
+                mime_type: NotRequired[str],
+                type: Literal['image', 'audio', 'file'],
+                source_type: Literal['base64'],
+                data: str,
+            }
 
+        ``IDContentBlock``:
 
-def _normalize_messages(messages: Sequence[BaseMessage]) -> list[BaseMessage]:
-    """Extend support for message formats.
+        (In practice, this was never used)
+
+        .. code-block::
+
+            {
+                type: Literal['image', 'audio', 'file'],
+                source_type: Literal['id'],
+                id: str,
+            }
+
+        ``PlainTextContentBlock``:
+
+        .. code-block::
+
+            {
+                mime_type: NotRequired[str],
+                type: Literal['file'],
+                source_type: Literal['text'],
+                text: str,
+            }
+
+    If a v1 message is passed in, it is returned as-is, so it is always safe to run
+    messages that are already in v1 format through this function.
+
+    For posterity, here are the OpenAI Chat Completions schemas we expect:
+
+    Chat Completions image. Can be URL-based or base64-encoded. Supports MIME types
+    png, jpeg/jpg, webp, static gif:
+    {
+        "type": Literal['image_url'],
+        "image_url": {
+            "url": Union["data:$MIME_TYPE;base64,$BASE64_ENCODED_IMAGE", "$IMAGE_URL"],
+            "detail": Literal['low', 'high', 'auto'] = 'auto',  # Supported by OpenAI
+        }
+    }
+
+    Chat Completions audio:
+    {
+        "type": Literal['input_audio'],
+        "input_audio": {
+            "format": Literal['wav', 'mp3'],
+            "data": str = "$BASE64_ENCODED_AUDIO",
+        },
+    }
+
+    Chat Completions files: either base64 or pre-uploaded file ID
+    {
+        "type": Literal['file'],
+        "file": Union[
+            {
+                "filename": Optional[str] = "$FILENAME",
+                "file_data": str = "$BASE64_ENCODED_FILE",
+            },
+            {
+                "file_id": str = "$FILE_ID",  # For pre-uploaded files to OpenAI
+            },
+        ],
+    }
 
-    Chat models implement support for images in OpenAI Chat Completions format, as well
-    as other multimodal data as standard data blocks. This function extends support to
-    audio and file data in OpenAI Chat Completions format by converting them to standard
-    data blocks.
     """
+    from langchain_core.messages.block_translators.langchain import (
+        _convert_legacy_v0_content_block_to_v1,
+        _convert_openai_format_to_data_block,
+    )
+
     formatted_messages = []
     for message in messages:
+        # We preserve input messages - the caller may reuse them elsewhere and expects
+        # them to remain unchanged. We only create a copy if we need to translate.
         formatted_message = message
+
         if isinstance(message.content, list):
             for idx, block in enumerate(message.content):
+                # OpenAI Chat Completions multimodal data blocks to v1 standard
                 if (
                     isinstance(block, dict)
-                    # Subset to (PDF) files and audio, as most relevant chat models
-                    # support images in OAI format (and some may not yet support the
-                    # standard data block format)
-                    and block.get("type") in {"file", "input_audio"}
+                    and block.get("type") in {"input_audio", "file"}
+                    # Discriminate between OpenAI/LC format since they share `'type'`
                     and _is_openai_data_block(block)
                 ):
-                    if formatted_message is message:
-                        formatted_message = message.model_copy()
-                        # Also shallow-copy content
-                        formatted_message.content = list(formatted_message.content)
-
-                    formatted_message.content[idx] = (  # type: ignore[index]  # mypy confused by .model_copy
-                        _convert_openai_format_to_data_block(block)
-                    )
+                    formatted_message = _ensure_message_copy(message, formatted_message)
+
+                    converted_block = _convert_openai_format_to_data_block(block)
+                    _update_content_block(formatted_message, idx, converted_block)
+
+                # Convert multimodal LangChain v0 to v1 standard content blocks
+                elif (
+                    isinstance(block, dict)
+                    and block.get("type")
+                    in {
+                        "image",
+                        "audio",
+                        "file",
+                    }
+                    and block.get("source_type")  # v1 doesn't have `source_type`
+                    in {
+                        "url",
+                        "base64",
+                        "id",
+                        "text",
+                    }
+                ):
+                    formatted_message = _ensure_message_copy(message, formatted_message)
+
+                    converted_block = _convert_legacy_v0_content_block_to_v1(block)
+                    _update_content_block(formatted_message, idx, converted_block)
+                    continue
+
+                # else, pass through blocks that look like they have v1 format unchanged
+
         formatted_messages.append(formatted_message)
 
     return formatted_messages
 
 
-T = TypeVar("T", bound=BaseMessage)
+T = TypeVar("T", bound="BaseMessage")
+
+
+def _ensure_message_copy(message: T, formatted_message: T) -> T:
+    """Create a copy of the message if it hasn't been copied yet."""
+    if formatted_message is message:
+        formatted_message = message.model_copy()
+        # Shallow-copy content list to allow modifications
+        formatted_message.content = list(formatted_message.content)
+    return formatted_message
+
+
+def _update_content_block(
+    formatted_message: "BaseMessage", idx: int, new_block: Union[ContentBlock, dict]
+) -> None:
+    """Update a content block at the given index, handling type issues."""
+    # Type ignore needed because:
+    # - `BaseMessage.content` is typed as `Union[str, list[Union[str, dict]]]`
+    # - When content is str, item assignment is not supported (mypy `index` error)
+    # - When content is list, the items are `Union[str, dict]` but we're assigning
+    #   `Union[ContentBlock, dict]` where ContentBlock is richer than dict
+    # - This is safe because we only call this when we've verified content is a list and
+    #   we're doing content block conversions
+    formatted_message.content[idx] = new_block  # type: ignore[index, assignment]
 
 
 def _update_message_content_to_blocks(message: T, output_version: str) -> T: