@@ -36,6 +36,15 @@ export function createTransformer(options?: { isUsingOllama?: boolean }) {
3636 let inThinkingBlock = false // Track if we're currently in a thinking block
3737 let thinkingJsonStarted = false // Track if we've sent the JSON prefix for thinking deltas
3838
39+ // Track usage from the last main assistant message (exclude sidechain/subagents).
40+ // This is used for accurate context window display in final metadata.
41+ let lastMainAssistantUsage : {
42+ input_tokens : number
43+ cache_read_input_tokens : number
44+ cache_creation_input_tokens : number
45+ output_tokens : number
46+ } | null = null
47+
3948 // Helper to create composite toolCallId: "parentId:childId" or just "childId"
4049 const makeCompositeId = ( originalId : string , parentId : string | null ) : string => {
4150 if ( parentId ) return `${ parentId } :${ originalId } `
@@ -233,6 +242,17 @@ export function createTransformer(options?: { isUsingOllama?: boolean }) {
233242 }
234243 }
235244
245+ // Track per-turn usage from main assistant messages only.
246+ // Sidechain/subagent assistant messages have parent_tool_use_id set.
247+ if ( msg . type === "assistant" && msg . message ?. usage && msg . parent_tool_use_id == null ) {
248+ lastMainAssistantUsage = {
249+ input_tokens : msg . message . usage . input_tokens ?? 0 ,
250+ cache_read_input_tokens : msg . message . usage . cache_read_input_tokens ?? 0 ,
251+ cache_creation_input_tokens : msg . message . usage . cache_creation_input_tokens ?? 0 ,
252+ output_tokens : msg . message . usage . output_tokens ?? 0 ,
253+ }
254+ }
255+
236256 // ===== ASSISTANT MESSAGE (complete, often with tool_use) =====
237257 // When streaming is enabled, text arrives via stream_event, not here
238258 if ( msg . type === "assistant" && msg . message ?. content ) {
@@ -410,51 +430,25 @@ export function createTransformer(options?: { isUsingOllama?: boolean }) {
410430 yield * endTextBlock ( )
411431 yield * endToolInput ( )
412432
413- const inputTokens = msg . usage ?. input_tokens
414- const outputTokens = msg . usage ?. output_tokens
415-
416- // Extract per-model usage from SDK (if available)
417- const modelUsage = msg . modelUsage
418- ? Object . fromEntries (
419- Object . entries ( msg . modelUsage ) . map ( ( [ model , usage ] : [ string , any ] ) => [
420- model ,
421- {
422- inputTokens : usage . inputTokens || 0 ,
423- outputTokens : usage . outputTokens || 0 ,
424- cacheReadInputTokens : usage . cacheReadInputTokens || 0 ,
425- cacheCreationInputTokens : usage . cacheCreationInputTokens || 0 ,
426- costUSD : usage . costUSD || 0 ,
427- } ,
428- ] )
429- )
430- : undefined
431-
432- // Fallback: if SDK didn't populate msg.usage, derive totals from modelUsage
433- const fallbackInputTokens = msg . modelUsage
434- ? Object . values ( msg . modelUsage ) . reduce (
435- ( sum : number , usage : any ) => sum + ( usage ?. inputTokens || 0 ) ,
436- 0 ,
437- )
438- : undefined
439- const fallbackOutputTokens = msg . modelUsage
440- ? Object . values ( msg . modelUsage ) . reduce (
441- ( sum : number , usage : any ) => sum + ( usage ?. outputTokens || 0 ) ,
442- 0 ,
443- )
444- : undefined
445-
446- const resolvedInputTokens =
447- inputTokens == null || ( inputTokens === 0 && ( fallbackInputTokens || 0 ) > 0 )
448- ? fallbackInputTokens
449- : inputTokens
450- const resolvedOutputTokens =
451- outputTokens == null || ( outputTokens === 0 && ( fallbackOutputTokens || 0 ) > 0 )
452- ? fallbackOutputTokens
453- : outputTokens
433+ const resultOutputTokens = msg . usage ?. output_tokens
434+ const fallbackUsage = {
435+ input_tokens : msg . usage ?. input_tokens ?? 0 ,
436+ cache_read_input_tokens : msg . usage ?. cache_read_input_tokens ?? 0 ,
437+ cache_creation_input_tokens : msg . usage ?. cache_creation_input_tokens ?? 0 ,
438+ output_tokens : resultOutputTokens ?? 0 ,
439+ }
440+
441+ // Prefer the last main assistant usage snapshot for context metrics.
442+ // Fall back to result usage when assistant usage is unavailable.
443+ const usage = lastMainAssistantUsage ?? fallbackUsage
454444
445+ const resolvedInputTokens = usage . input_tokens
446+ const resolvedOutputTokens = resultOutputTokens ?? usage . output_tokens
455447 const metadata : MessageMetadata = {
456448 sessionId : msg . session_id ,
457449 inputTokens : resolvedInputTokens ,
450+ cacheReadInputTokens : usage . cache_read_input_tokens ,
451+ cacheCreationInputTokens : usage . cache_creation_input_tokens ,
458452 outputTokens : resolvedOutputTokens ,
459453 totalTokens :
460454 resolvedInputTokens != null && resolvedOutputTokens != null
@@ -465,8 +459,6 @@ export function createTransformer(options?: { isUsingOllama?: boolean }) {
465459 resultSubtype : msg . subtype || "success" ,
466460 // Include finalTextId for collapsing tools when there's a final response
467461 finalTextId : lastTextId || undefined ,
468- // Per-model usage breakdown
469- modelUsage,
470462 }
471463 yield { type : "message-metadata" , messageMetadata : metadata }
472464 yield { type : "finish-step" }
0 commit comments