@@ -70,6 +70,20 @@ class AnthropicTool(TypedDict):
70
70
cache_control : NotRequired [dict [str , str ]]
71
71
72
72
73
class _CombinedUsage(BaseModel):
    """Combined usage model for deferred token counting in streaming.

    This mimics the Anthropic Usage structure while combining stored input usage
    with final output usage for accurate token reporting during streaming.
    """

    # Cumulative input (prompt) tokens captured from the message_start event;
    # 0 until the stored usage is merged in at message_delta time.
    input_tokens: int = 0
    # Output (completion) tokens reported cumulatively at message_delta.
    output_tokens: int = 0
    # Prompt-caching counters copied from the stored input usage;
    # None when the API response omits them.
    cache_creation_input_tokens: Optional[int] = None
    cache_read_input_tokens: Optional[int] = None
    # Detailed cache-creation breakdown from the stored usage when present
    # (presumably per-TTL token counts — confirm against the Anthropic SDK).
    cache_creation: Optional[dict[str, Any]] = None
73
87
def _is_builtin_tool (tool : Any ) -> bool :
74
88
if not isinstance (tool , dict ):
75
89
return False
@@ -1493,12 +1507,18 @@ def _stream(
1493
1507
and not _thinking_in_params (payload )
1494
1508
)
1495
1509
block_start_event = None
1510
+ stored_input_usage = None
1496
1511
for event in stream :
1497
- msg , block_start_event = _make_message_chunk_from_anthropic_event (
1512
+ (
1513
+ msg ,
1514
+ block_start_event ,
1515
+ stored_input_usage ,
1516
+ ) = _make_message_chunk_from_anthropic_event (
1498
1517
event ,
1499
1518
stream_usage = stream_usage ,
1500
1519
coerce_content_to_string = coerce_content_to_string ,
1501
1520
block_start_event = block_start_event ,
1521
+ stored_input_usage = stored_input_usage ,
1502
1522
)
1503
1523
if msg is not None :
1504
1524
chunk = ChatGenerationChunk (message = msg )
@@ -1529,12 +1549,18 @@ async def _astream(
1529
1549
and not _thinking_in_params (payload )
1530
1550
)
1531
1551
block_start_event = None
1552
+ stored_input_usage = None
1532
1553
async for event in stream :
1533
- msg , block_start_event = _make_message_chunk_from_anthropic_event (
1554
+ (
1555
+ msg ,
1556
+ block_start_event ,
1557
+ stored_input_usage ,
1558
+ ) = _make_message_chunk_from_anthropic_event (
1534
1559
event ,
1535
1560
stream_usage = stream_usage ,
1536
1561
coerce_content_to_string = coerce_content_to_string ,
1537
1562
block_start_event = block_start_event ,
1563
+ stored_input_usage = stored_input_usage ,
1538
1564
)
1539
1565
if msg is not None :
1540
1566
chunk = ChatGenerationChunk (message = msg )
@@ -2167,22 +2193,40 @@ def _make_message_chunk_from_anthropic_event(
2167
2193
stream_usage : bool = True ,
2168
2194
coerce_content_to_string : bool ,
2169
2195
block_start_event : Optional [anthropic .types .RawMessageStreamEvent ] = None ,
2170
- ) -> tuple [Optional [AIMessageChunk ], Optional [anthropic .types .RawMessageStreamEvent ]]:
2171
- """Convert Anthropic event to AIMessageChunk.
2196
+ stored_input_usage : Optional [BaseModel ] = None ,
2197
+ ) -> tuple [
2198
+ Optional [AIMessageChunk ],
2199
+ Optional [anthropic .types .RawMessageStreamEvent ],
2200
+ Optional [BaseModel ],
2201
+ ]:
2202
+ """Convert Anthropic event to ``AIMessageChunk``.
2172
2203
2173
2204
Note that not all events will result in a message chunk. In these cases
2174
2205
we return ``None``.
2206
+
2207
+ Args:
2208
+ event: The Anthropic streaming event to convert.
2209
+ stream_usage: Whether to include usage metadata in the chunk.
2210
+ coerce_content_to_string: Whether to coerce content blocks to strings.
2211
+ block_start_event: Previous content block start event for context.
2212
+ stored_input_usage: Usage metadata from ``message_start`` event to be used
2213
+ in ``message_delta`` event for accurate input token counts.
2214
+
2215
+ Returns:
2216
+ Tuple of ``(message_chunk, block_start_event, stored_usage)``
2217
+
2175
2218
"""
2176
2219
message_chunk : Optional [AIMessageChunk ] = None
2220
+ updated_stored_usage = stored_input_usage
2177
2221
# See https://github.com/anthropics/anthropic-sdk-python/blob/main/src/anthropic/lib/streaming/_messages.py # noqa: E501
2178
2222
if event .type == "message_start" and stream_usage :
2179
- usage_metadata = _create_usage_metadata (event .message .usage )
2180
- # We pick up a cumulative count of output_tokens at the end of the stream,
2181
- # so here we zero out to avoid double counting.
2182
- usage_metadata ["total_tokens" ] = (
2183
- usage_metadata ["total_tokens" ] - usage_metadata ["output_tokens" ]
2223
+ # Store input usage for later use in message_delta but don't emit tokens yet
2224
+ updated_stored_usage = event .message .usage
2225
+ usage_metadata = UsageMetadata (
2226
+ input_tokens = 0 ,
2227
+ output_tokens = 0 ,
2228
+ total_tokens = 0 ,
2184
2229
)
2185
- usage_metadata ["output_tokens" ] = 0
2186
2230
if hasattr (event .message , "model" ):
2187
2231
response_metadata = {"model_name" : event .message .model }
2188
2232
else :
@@ -2270,11 +2314,37 @@ def _make_message_chunk_from_anthropic_event(
2270
2314
tool_call_chunks = tool_call_chunks ,
2271
2315
)
2272
2316
elif event .type == "message_delta" and stream_usage :
2273
- usage_metadata = UsageMetadata (
2274
- input_tokens = 0 ,
2275
- output_tokens = event .usage .output_tokens ,
2276
- total_tokens = event .usage .output_tokens ,
2277
- )
2317
+ # Create usage metadata combining stored input usage with final output usage
2318
+ #
2319
+ # Per Anthropic docs: "The token counts shown in the usage field of the
2320
+ # message_delta event are cumulative." Thus, when MCP tools are called
2321
+ # mid-stream, `input_tokens` may be updated with a higher cumulative count.
2322
+ # We prioritize `event.usage.input_tokens` when available to handle this case.
2323
+ if stored_input_usage is not None :
2324
+ # Create a combined usage object that mimics the Anthropic Usage structure
2325
+ combined_usage = _CombinedUsage (
2326
+ input_tokens = event .usage .input_tokens
2327
+ or getattr (stored_input_usage , "input_tokens" , 0 ),
2328
+ output_tokens = event .usage .output_tokens ,
2329
+ cache_creation_input_tokens = getattr (
2330
+ stored_input_usage , "cache_creation_input_tokens" , None
2331
+ ),
2332
+ cache_read_input_tokens = getattr (
2333
+ stored_input_usage , "cache_read_input_tokens" , None
2334
+ ),
2335
+ cache_creation = getattr (stored_input_usage , "cache_creation" , None )
2336
+ if hasattr (stored_input_usage , "cache_creation" )
2337
+ else None ,
2338
+ )
2339
+ usage_metadata = _create_usage_metadata (combined_usage )
2340
+ else :
2341
+ # Fallback to just output tokens if no stored usage
2342
+ usage_metadata = UsageMetadata (
2343
+ input_tokens = event .usage .input_tokens or 0 ,
2344
+ output_tokens = event .usage .output_tokens ,
2345
+ total_tokens = (event .usage .input_tokens or 0 )
2346
+ + event .usage .output_tokens ,
2347
+ )
2278
2348
message_chunk = AIMessageChunk (
2279
2349
content = "" ,
2280
2350
usage_metadata = usage_metadata ,
@@ -2286,7 +2356,7 @@ def _make_message_chunk_from_anthropic_event(
2286
2356
else :
2287
2357
pass
2288
2358
2289
- return message_chunk , block_start_event
2359
+ return message_chunk , block_start_event , updated_stored_usage
2290
2360
2291
2361
2292
2362
@deprecated (since = "0.1.0" , removal = "1.0.0" , alternative = "ChatAnthropic" )
0 commit comments