Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions pkg/session/sanitize_orphan_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package session

import (
"testing"

"github.com/docker/docker-agent/pkg/agent"
"github.com/docker/docker-agent/pkg/chat"
"github.com/docker/docker-agent/pkg/tools"
)

// assertToolPairingInvariant fails if any tool-result message lacks a matching
// tool_call ID in a preceding assistant message. This is exactly the invariant
// AWS Bedrock's ConverseStream enforces: "The number of toolResult blocks ...
// exceeds the number of toolUse blocks of previous turn." (issue #1676)
func assertToolPairingInvariant(t *testing.T, messages []chat.Message) {
t.Helper()
seen := map[string]bool{}
for i, m := range messages {
if m.Role == chat.MessageRoleAssistant {
for _, tc := range m.ToolCalls {
seen[tc.ID] = true
}
}
if m.Role == chat.MessageRoleTool {
if !seen[m.ToolCallID] {
t.Errorf("orphaned tool result at index %d: tool_call_id %q has no preceding toolUse (Bedrock would reject this)",
i, m.ToolCallID)
}
}
}
}

// TestSanitizeToolCalls_DropsOrphanedToolResult is a regression test for issue
// #1676 (session resume fails on AWS Bedrock with a toolResult/toolUse count
// mismatch).
//
// Scenario: a session is compacted, and the kept-tail boundary (startIndex /
// FirstKeptEntry) lands *after* the assistant message that issued a tool call
// but *before* its tool result. On resume, buildSessionSummaryMessages emits
// the conversation from that boundary, so the reconstructed history begins with
// a tool-result message whose assistant (toolUse) was left behind the summary.
//
// sanitizeToolCalls is the final guard before the request hits the provider and
// must drop that orphaned result so the request stays valid.
func TestSanitizeToolCalls_DropsOrphanedToolResult(t *testing.T) {
// History as reconstructed on resume after compaction: the toolUse-bearing
// assistant message is gone (folded into the summary), only its result and
// the following turn survive.
reconstructed := []chat.Message{
{Role: chat.MessageRoleUser, Content: "Session Summary: ..."},
// <-- assistant message with ToolCalls{ID:"tc1"} was dropped here
{Role: chat.MessageRoleTool, ToolCallID: "tc1", Content: "file contents"},
{Role: chat.MessageRoleAssistant, Content: "Here is the file."},
{Role: chat.MessageRoleUser, Content: "thanks, now resume"},
}

out := sanitizeToolCalls(reconstructed)

// The orphaned tool result must be gone, leaving a Bedrock-valid sequence.
assertToolPairingInvariant(t, out)
for i, m := range out {
if m.Role == chat.MessageRoleTool && m.ToolCallID == "tc1" {
t.Fatalf("orphaned tool result tc1 should have been dropped, still present at index %d", i)
}
}
}

// TestSanitizeToolCalls_KeepsMissingResultBalanced pins the opposite direction:
// a dangling toolUse (no result) gets a synthetic result and stays balanced.
func TestSanitizeToolCalls_KeepsMissingResultBalanced(t *testing.T) {
messages := []chat.Message{
{Role: chat.MessageRoleUser, Content: "hi"},
{Role: chat.MessageRoleAssistant, ToolCalls: []tools.ToolCall{{ID: "tc1"}}},
// result missing
{Role: chat.MessageRoleUser, Content: "next"},
}
out := sanitizeToolCalls(messages)
assertToolPairingInvariant(t, out) // synthetic result injected
}

// TestGetMessages_ResumeAfterCompaction_NoOrphanedToolResult exercises the full
// GetMessages reconstruction path that runs on the first turn after a session
// is resumed (runtime/loop.go calls sess.GetMessages before the loop starts).
//
// It reproduces the exact #1676 condition: a compaction summary whose
// FirstKeptEntry boundary lands between an assistant tool_use and its
// tool_result, so buildSessionSummaryMessages emits the conversation starting at
// the orphaned tool_result. GetMessages must return a provider-valid sequence.
func TestGetMessages_ResumeAfterCompaction_NoOrphanedToolResult(t *testing.T) {
testAgent := &agent.Agent{}
s := New()

// items[0]: old user turn (summarized away)
s.AddMessage(NewAgentMessage("", &chat.Message{Role: chat.MessageRoleUser, Content: "old question"}))
// items[1]: assistant that issued tool call tc1 (summarized away)
s.AddMessage(NewAgentMessage("", &chat.Message{
Role: chat.MessageRoleAssistant,
Content: "calling tool",
ToolCalls: []tools.ToolCall{{ID: "tc1"}},
}))
// items[2]: the tool result — KEPT, but its assistant (items[1]) is not
s.AddMessage(NewAgentMessage("", &chat.Message{
Role: chat.MessageRoleTool,
ToolCallID: "tc1",
Content: "tool output",
}))
// items[3]: assistant follow-up — kept
s.AddMessage(NewAgentMessage("", &chat.Message{Role: chat.MessageRoleAssistant, Content: "answer"}))
// items[4]: summary whose kept-tail boundary (index 2) splits the tc1 pair
s.Messages = append(s.Messages, Item{Summary: "summary", FirstKeptEntry: 2})

messages := s.GetMessages(testAgent)

// The reconstructed history must not contain the orphaned tc1 result.
assertToolPairingInvariant(t, messages)
}
40 changes: 31 additions & 9 deletions pkg/session/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -1250,17 +1250,29 @@ func normalizeMessageContent(messages []chat.Message) []chat.Message {
return out
}

// sanitizeToolCalls ensures every tool call in assistant messages has a
// corresponding tool-result message. It walks the message list tracking
// pending tool calls; when a tool-result message arrives its ID is marked
// fulfilled. When the next assistant or user message is encountered (or the
// end of the list is reached), any still-pending tool calls receive synthetic
// error results injected just before that boundary. This guarantees the
// provider always sees a valid request/response pair for every tool call.
// sanitizeToolCalls ensures the tool_use/tool_result blocks sent to the
// provider are always balanced, in both directions:
//
// - Every tool call in an assistant message gets a corresponding tool-result
// message. It walks the message list tracking pending tool calls; when a
// tool-result message arrives its ID is marked fulfilled. When the next
// assistant or user message is encountered (or the end of the list is
// reached), any still-pending tool calls receive synthetic error results
// injected just before that boundary.
// - Every tool-result message has a matching tool_use in the preceding
// assistant message. Orphaned tool results — a tool-result whose
// ToolCallID was never issued by the preceding assistant message — are
// dropped. This happens when compaction's kept-tail boundary lands between
// an assistant tool_use and its result, leaving the result at the head of
// the resumed history with no matching tool_use. Providers such as AWS
// Bedrock reject the request outright in that case ("The number of
// toolResult blocks ... exceeds the number of toolUse blocks of previous
// turn"), so we must strip these before the request goes out.
func sanitizeToolCalls(messages []chat.Message) []chat.Message {
var (
out []chat.Message
pendingToolCalls []tools.ToolCall
pendingIDs = make(map[string]bool)
resultIDs = make(map[string]bool)
)

Expand All @@ -1276,20 +1288,30 @@ func sanitizeToolCalls(messages []chat.Message) []chat.Message {
}
}
pendingToolCalls = nil
pendingIDs = make(map[string]bool)
resultIDs = make(map[string]bool)
}

for _, msg := range messages {
switch {
case msg.Role == chat.MessageRoleTool:
if msg.ToolCallID != "" {
resultIDs[msg.ToolCallID] = true
// Drop orphaned tool results: a tool_result with no matching
// tool_use in the preceding assistant message violates the
// provider contract.
if msg.ToolCallID == "" || !pendingIDs[msg.ToolCallID] {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[LOW] Behavioral change: tool results with empty ToolCallID are now silently dropped

Under the old logic the condition was if msg.ToolCallID != "" which only gated the resultIDs book-keeping — a tool-result with an empty ID still fell through to out = append(out, msg) and was forwarded to the provider. Under the new condition:

if msg.ToolCallID == "" || !pendingIDs[msg.ToolCallID] {
    continue   // drops the message
}

an empty-ID tool result is now silently dropped. In practice this is likely fine — a tool result with no ID cannot be matched to any tool_use and most providers mandate non-empty IDs — but it is a silent behavioral regression from the previous behavior. If any in-flight session persisted with empty-ID tool results they would be discarded on the next resume.

Consider documenting the intentional drop (e.g., // empty ToolCallID is malformed; drop it) so the behavior is explicit rather than a side-effect of the orphan check.

continue
}
resultIDs[msg.ToolCallID] = true

case msg.Role == chat.MessageRoleAssistant && len(msg.ToolCalls) > 0:
flushPending()
out = append(out, msg)
pendingToolCalls = msg.ToolCalls
for _, tc := range msg.ToolCalls {
if tc.ID != "" {
pendingIDs[tc.ID] = true
}
}
continue

case msg.Role == chat.MessageRoleUser || msg.Role == chat.MessageRoleAssistant:
Expand Down
Loading