Skip to content

Commit 824af68

Browse files
committed
Make ResponseComplete accept LLMResponse and update the encoding method of Messages in ChatCompletions.
1 parent 028974c commit 824af68

File tree

10 files changed

+228
-90
lines changed

10 files changed

+228
-90
lines changed

pkg/epp/handlers/server.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
304304
break
305305
}
306306

307+
reqCtx.Response.Body = body
307308
reqCtx, responseErr = s.HandleResponseBody(ctx, reqCtx, responseBody)
308309
if responseErr != nil {
309310
if logger.V(logutil.DEBUG).Enabled() {

pkg/epp/requestcontrol/director.go

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -288,13 +288,7 @@ func (d *Director) HandleResponseBodyComplete(ctx context.Context, reqCtx *handl
288288
logger.Error(err, "HandleResponseBodyComplete: failed to convert the response to LLMResponse.")
289289
return reqCtx, err
290290
}
291-
response := &Response{
292-
RequestId: requestID,
293-
Headers: reqCtx.Response.Headers,
294-
// Currently use the first choice as the response body to process.
295-
Body: llmResponse.GetFirstChoiceContent(),
296-
}
297-
d.runResponseCompletePlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod)
291+
d.runResponseCompletePlugins(ctx, reqCtx.SchedulingRequest, llmResponse, reqCtx.TargetPod)
298292

299293
logger.V(logutil.DEBUG).Info("Exiting HandleResponseBodyComplete")
300294
return reqCtx, nil
@@ -344,7 +338,7 @@ func (d *Director) runResponseStreamingPlugins(ctx context.Context, request *sch
344338
}
345339
}
346340

347-
func (d *Director) runResponseCompletePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) {
341+
func (d *Director) runResponseCompletePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *schedulingtypes.LLMResponse, targetPod *backend.Pod) {
348342
loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
349343
for _, plugin := range d.requestControlPlugins.responseCompletePlugins {
350344
loggerDebug.Info("Running ResponseComplete plugin", "plugin", plugin.TypedName())

pkg/epp/requestcontrol/director_test.go

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,10 @@ func TestDirector_HandleResponseComplete(t *testing.T) {
712712
"total_tokens": 3
713713
}
714714
}`
715+
wantLLMResponse, err := schedulingtypes.NewLLMResponseFromBytes([]byte(chatCompletionJSON))
716+
if err != nil {
717+
t.Fatalf("NewLLMResponseFromBytes failed with error: %v", err)
718+
}
715719

716720
reqCtx := &handlers.RequestContext{
717721
Request: &handlers.Request{
@@ -726,21 +730,15 @@ func TestDirector_HandleResponseComplete(t *testing.T) {
726730
TargetPod: &backend.Pod{NamespacedName: types.NamespacedName{Namespace: "namespace1", Name: "test-pod-name"}},
727731
}
728732

729-
_, err := director.HandleResponseBodyComplete(ctx, reqCtx)
733+
_, err = director.HandleResponseBodyComplete(ctx, reqCtx)
730734
if err != nil {
731735
t.Fatalf("HandleResponseBodyComplete() returned unexpected error: %v", err)
732736
}
733737

734-
if diff := cmp.Diff("test-req-id-for-complete", pc1.lastRespOnComplete.RequestId); diff != "" {
735-
t.Errorf("Scheduler.OnComplete RequestId mismatch (-want +got):\n%s", diff)
736-
}
737-
if diff := cmp.Diff(reqCtx.Response.Headers, pc1.lastRespOnComplete.Headers); diff != "" {
738-
t.Errorf("Scheduler.OnComplete response headers mismatch (-want +got):\n%s", diff)
739-
}
740738
if diff := cmp.Diff("namespace1/test-pod-name", pc1.lastTargetPodOnComplete); diff != "" {
741739
t.Errorf("Scheduler.OnComplete TargetPodName mismatch (-want +got):\n%s", diff)
742740
}
743-
if diff := cmp.Diff("Hello!", pc1.lastRespOnComplete.Body); diff != "" {
741+
if diff := cmp.Diff(wantLLMResponse, pc1.lastRespOnComplete); diff != "" {
744742
t.Errorf("Scheduler.OnComplete response body mismatch (-want +got):\n%s", diff)
745743
}
746744
}
@@ -765,7 +763,7 @@ type testResponseStreaming struct {
765763

766764
type testResponseComplete struct {
767765
tn plugins.TypedName
768-
lastRespOnComplete *Response
766+
lastRespOnComplete *schedulingtypes.LLMResponse
769767
lastTargetPodOnComplete string
770768
}
771769

@@ -809,7 +807,7 @@ func (p *testResponseStreaming) ResponseStreaming(_ context.Context, _ *scheduli
809807
p.lastTargetPodOnStreaming = targetPod.NamespacedName.String()
810808
}
811809

812-
func (p *testResponseComplete) ResponseComplete(_ context.Context, _ *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) {
810+
func (p *testResponseComplete) ResponseComplete(_ context.Context, _ *schedulingtypes.LLMRequest, response *schedulingtypes.LLMResponse, targetPod *backend.Pod) {
813811
p.lastRespOnComplete = response
814812
p.lastTargetPodOnComplete = targetPod.NamespacedName.String()
815813
}

pkg/epp/requestcontrol/plugins.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,5 +55,5 @@ type ResponseStreaming interface {
5555
// ResponseComplete is called by the director after the complete response is sent.
5656
type ResponseComplete interface {
5757
plugins.Plugin
58-
ResponseComplete(ctx context.Context, request *types.LLMRequest, response *Response, targetPod *backend.Pod)
58+
ResponseComplete(ctx context.Context, request *types.LLMRequest, response *types.LLMResponse, targetPod *backend.Pod)
5959
}

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ type SchedulingContextState struct {
123123
// If not empty, this will be used as the starting block for the following response that will
124124
// be added to the response as well. This happens especially at the multi-turn scenario.
125125
RestBytes []byte
126+
// BlockSize is the block size used to calculate the hash of the request/response.
127+
BlockSize int
126128
// A map of server to its longest prefix cache match length.
127129
PrefixCacheServers map[ServerID]int
128130
}
@@ -198,11 +200,13 @@ func (p *Plugin) WithName(name string) *Plugin {
198200

199201
// Score returns the scoring result for the given list of pods based on context.
200202
func (p *Plugin) Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
203+
blockSize := getBlockSize(pods, p.config.DefaultBlockSize)
201204
// pre score step, hashing prompt and find longest prefix match.
202-
hashes, restBytes := hashPrompt(ctx, request, getBlockSize(pods, p.config.DefaultBlockSize), p.config.MaxPrefixBlocksToMatch)
205+
hashes, restBytes := hashPrompt(ctx, request, blockSize, p.config.MaxPrefixBlocksToMatch)
203206
state := &SchedulingContextState{
204207
PrefixHashes: hashes,
205208
RestBytes: restBytes,
209+
BlockSize: blockSize,
206210
PrefixCacheServers: p.matchLongestPrefix(ctx, hashes),
207211
}
208212

@@ -233,7 +237,6 @@ func (p *Plugin) PreRequest(ctx context.Context, request *types.LLMRequest, sche
233237
targetPod := primaryProfileResult.TargetPods[0].GetPod() // get the first pod of the primary profile
234238

235239
state, err := plugins.ReadPluginStateKey[*SchedulingContextState](p.pluginState, request.RequestId, plugins.StateKey(p.TypedName().String()))
236-
p.pluginState.Delete(request.RequestId) // delete the state explicitly after completing using it
237240
if err != nil {
238241
log.FromContext(ctx).Error(err, "failed to read prefix plugin state", "requestID", request.RequestId)
239242
return
@@ -251,9 +254,7 @@ func (p *Plugin) PreRequest(ctx context.Context, request *types.LLMRequest, sche
251254

252255
total := len(state.PrefixHashes)
253256
matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)]
254-
255-
blockSize := getBlockSize(primaryProfileResult.TargetPods, p.config.DefaultBlockSize)
256-
metrics.RecordPrefixCacheMatch(matchLen*blockSize, total*blockSize)
257+
metrics.RecordPrefixCacheMatch(matchLen*state.BlockSize, total*state.BlockSize)
257258
}
258259

259260
// matchLongestPrefix returns a map of servers and length of prefix that each server caches.
@@ -375,19 +376,25 @@ func getUserInputBytes(request *types.LLMRequest) ([]byte, error) {
375376
}
376377

377378
// must be chat-completions request at this point, return bytes of entire messages
378-
return json.Marshal(request.Body.ChatCompletions.Messages)
379+
return types.MarshalMessagesToJSON(request.Body.ChatCompletions.Messages...)
379380
}
380381

381-
func (p *Plugin) ResponseComplete(ctx context.Context, request *types.LLMRequest, response *requestcontrol.Response, targetPod *backend.Pod) {
382+
func (p *Plugin) ResponseComplete(ctx context.Context, request *types.LLMRequest, response *types.LLMResponse, targetPod *backend.Pod) {
382383
state, err := plugins.ReadPluginStateKey[*SchedulingContextState](p.pluginState, request.RequestId, plugins.StateKey(p.TypedName().String()))
383384
if err != nil {
384385
log.FromContext(ctx).Error(err, "failed to read prefix plugin state", "requestID", request.RequestId)
385386
return
386387
}
387388
p.pluginState.Delete(request.RequestId) // delete the state explicitly after completing using it.
389+
390+
reponseForKVCache, err := response.FirstChoiceContent()
391+
if err != nil {
392+
log.FromContext(ctx).Error(err, "failed to get first choice content", "requestID", request.RequestId)
393+
return
394+
}
388395
var input bytes.Buffer
389396
input.Write(state.RestBytes)
390-
input.Write([]byte(response.Body))
397+
input.Write(reponseForKVCache)
391398

392399
server := ServerID(targetPod.NamespacedName)
393400
prevBlockHash := defaultPrevBlock(request)
@@ -396,8 +403,7 @@ func (p *Plugin) ResponseComplete(ctx context.Context, request *types.LLMRequest
396403
prevBlockHash = state.PrefixHashes[len(state.PrefixHashes)-1]
397404
prevBlockHashLength = len(state.PrefixHashes)
398405
}
399-
inputBytes := input.Bytes()
400-
hashBlocks, _ := hashInputWithPrevBlockHash(ctx, prevBlockHash, prevBlockHashLength, inputBytes, p.config.DefaultBlockSize, p.config.MaxPrefixBlocksToMatch)
406+
hashBlocks, _ := hashInputWithPrevBlockHash(ctx, prevBlockHash, prevBlockHashLength, input.Bytes(), state.BlockSize, p.config.MaxPrefixBlocksToMatch)
401407
p.wg.Add(1)
402408
go func() {
403409
p.indexer.Add(hashBlocks, server)

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ import (
3030
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
3131
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
3232
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
33-
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
3433
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
3534
)
3635

@@ -201,8 +200,9 @@ func TestPrefixPluginCompletion(t *testing.T) {
201200
}
202201

203202
func TestPrefixPluginCompletionWithResponse(t *testing.T) {
203+
const defaultBlockSize = 4
204204
config := Config{
205-
DefaultBlockSize: 4,
205+
DefaultBlockSize: defaultBlockSize,
206206
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
207207
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
208208
}
@@ -231,6 +231,9 @@ func TestPrefixPluginCompletionWithResponse(t *testing.T) {
231231
// Total hashes = 1 (for the "aaaa" block) + 1 (for the model prefix).
232232
assert.Equal(t, 1, len(state.PrefixHashes), "number of hashes is incorrect")
233233
assert.Equal(t, 0, len(state.PrefixCacheServers), "there shouldn't be any cached servers yet")
234+
// The last 2 characters are recorded in restBytes of the state.
235+
assert.Equal(t, 2, len(state.RestBytes), "number of restBytes is incorrect")
236+
assert.Equal(t, defaultBlockSize, state.BlockSize, "blockSize is incorrect")
234237
assert.Equal(t, float64(0), scores[pod1], "score for pod1 should be 0 on first request")
235238
assert.Equal(t, float64(0), scores[pod2], "score for pod2 should be 0 on first request")
236239

@@ -251,7 +254,16 @@ func TestPrefixPluginCompletionWithResponse(t *testing.T) {
251254
// - Response Body: "bb"
252255
// - Cached Sequence: "aaaaaabb" (length 8)
253256
// This sequence creates two 4-character blocks to be cached: "aaaa" and "aabb".
254-
plugin.ResponseComplete(context.Background(), req1, &requestcontrol.Response{Body: "bb"}, pod1.GetPod())
257+
resp1 := &types.LLMResponse{
258+
Completion: &types.CompletionResponse{
259+
Choices: []types.CompletionChoice{
260+
{
261+
Text: "bb",
262+
},
263+
},
264+
},
265+
}
266+
plugin.ResponseComplete(context.Background(), req1, resp1, pod1.GetPod())
255267
plugin.wg.Wait()
256268

257269
// -- Second Request: Multi-turn Follow-up --
@@ -278,6 +290,9 @@ func TestPrefixPluginCompletionWithResponse(t *testing.T) {
278290
assert.Equal(t, 2, len(state.PrefixHashes), "number of hashes is incorrect")
279291
// It should find a server (pod1) that has cached the prefixes.
280292
assert.Equal(t, 1, len(state.PrefixCacheServers), "a cached server should have been found")
293+
// The last 2 characters ("cc") are recorded in restBytes of the state.
294+
assert.Equal(t, 2, len(state.RestBytes), "number of restBytes is incorrect")
295+
assert.Equal(t, defaultBlockSize, state.BlockSize, "blockSize is incorrect")
281296
// The score for pod1 should be 1.0 because both prompt blocks ("aaaa" and "aabb") were found in its cache.
282297
assert.Equal(t, float64(1), scores[pod1], "score for pod1 should be a perfect match")
283298
assert.Equal(t, float64(0), scores[pod2], "score for pod2 should be 0")
@@ -362,6 +377,19 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
362377
plugin.PreRequest(context.Background(), req1, schedulingResult)
363378
plugin.wg.Wait()
364379

380+
resp1 := &types.LLMResponse{
381+
ChatCompletion: &types.ChatCompletionResponse{
382+
Choices: []types.ChatChoice{
383+
{
384+
Message: types.Message{Role: "assistant", Content: "I'm doing well, thank you! How can I help you today?"},
385+
},
386+
},
387+
},
388+
}
389+
// Trigger to simulate that resp1 is added to the kvCache recording.
390+
plugin.ResponseComplete(context.Background(), req1, resp1, pod1.GetPod())
391+
plugin.wg.Wait()
392+
365393
// Second request adds assistant response and new user message (conversation grows)
366394
req2 := &types.LLMRequest{
367395
RequestId: uuid.NewString(),
@@ -389,13 +417,27 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
389417
cachedBlocks := state.PrefixCacheServers[ServerID(pod1.GetPod().NamespacedName)]
390418
expectedScore := float64(cachedBlocks) / float64(extendedHashCount)
391419
assert.Equal(t, expectedScore, scores[pod1], "pod1 should have prefix cache hit")
420+
assert.Greater(t, scores[pod1], float64(0.5), "given the response is also prefix cached the cache hit should be well above 0.5")
392421
assert.Equal(t, float64(0), scores[pod2], "pod2 should have no cache hit")
393422

394423
// Simulate pod1 was picked again
395424
plugin.PreRequest(context.Background(), req2, schedulingResult)
396425
plugin.wg.Wait()
397426

398-
// Third request continues the conversation even further
427+
resp2 := &types.LLMResponse{
428+
ChatCompletion: &types.ChatCompletionResponse{
429+
Choices: []types.ChatChoice{
430+
{
431+
Message: types.Message{Role: "assistant", Content: "Prefix caching is a technique where..."},
432+
},
433+
},
434+
},
435+
}
436+
// Trigger to simulate that resp2 is added to the kvCache recording.
437+
plugin.ResponseComplete(context.Background(), req2, resp2, pod1.GetPod())
438+
plugin.wg.Wait()
439+
440+
// Third request replays the whole conversation above so the cache hit ratio reaches 1.0.
399441
req3 := &types.LLMRequest{
400442
RequestId: uuid.NewString(),
401443
TargetModel: "test-model1",
@@ -424,7 +466,7 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
424466
cachedBlocks = state.PrefixCacheServers[ServerID(pod1.GetPod().NamespacedName)]
425467
expectedScore = float64(cachedBlocks) / float64(longHashCount)
426468
assert.Equal(t, expectedScore, scores[pod1], "pod1 should have higher prefix cache hit")
427-
assert.Greater(t, scores[pod1], float64(0.5), "cache hit rate should be substantial for growing conversation")
469+
assert.Equal(t, scores[pod1], float64(1), "cache hit rate should be substantial for growing conversation")
428470
assert.Equal(t, float64(0), scores[pod2], "pod2 should still have no cache hit")
429471
}
430472

pkg/epp/scheduling/types/llmresponse.go

Lines changed: 23 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package types
1818

1919
import (
2020
"encoding/json"
21+
"errors"
2122
"fmt"
2223
)
2324

@@ -26,19 +27,19 @@ import (
2627
type LLMResponse struct {
2728
// ChatCompletion is the representation of the OpenAI /v1/chat/completions response body.
2829
ChatCompletion *ChatCompletionResponse `json:"chat_completion,omitempty"`
29-
// LegacyCompletion is the representation of the OpenAI /v1/completions response body.
30-
LegacyCompletion *LegacyCompletionResponse `json:"legacy_completion,omitempty"`
30+
// Completion is the representation of the OpenAI /v1/completions response body.
31+
Completion *CompletionResponse `json:"legacy_completion,omitempty"`
3132
}
3233

33-
// GetFirstChoiceContent extracts the primary text content from the first choice
34-
// in either a ChatCompletion or a LegacyCompletion response.
35-
func (res *LLMResponse) GetFirstChoiceContent() string {
34+
// FirstChoiceContent extracts the first choice of the response.
35+
func (res *LLMResponse) FirstChoiceContent() ([]byte, error) {
3636
if res.ChatCompletion != nil && len(res.ChatCompletion.Choices) > 0 {
37-
return res.ChatCompletion.Choices[0].Message.Content
38-
} else if res.LegacyCompletion != nil && len(res.LegacyCompletion.Choices) > 0 {
39-
return res.LegacyCompletion.Choices[0].Text
37+
return MarshalMessagesToJSON(res.ChatCompletion.Choices[0].Message)
4038
}
41-
return ""
39+
if res.Completion != nil && len(res.Completion.Choices) > 0 {
40+
return []byte(res.Completion.Choices[0].Text), nil
41+
}
42+
return nil, errors.New("no choices found in the LLM response")
4243
}
4344

4445
// ChatCompletionResponse represents the full response body for the chat completions API.
@@ -60,8 +61,8 @@ func (r *ChatCompletionResponse) String() string {
6061

6162
// ChatChoice represents a single choice in the chat completion response.
6263
type ChatChoice struct {
63-
Message ChatMessage `json:"message"`
64-
FinishReason string `json:"finish_reason"`
64+
Message Message `json:"message"`
65+
FinishReason string `json:"finish_reason"`
6566
}
6667

6768
// ChatMessage represents the message object within a choice.
@@ -70,13 +71,13 @@ type ChatMessage struct {
7071
Content string `json:"content"`
7172
}
7273

73-
// LegacyCompletionResponse represents the full response body for the legacy completions API.
74-
type LegacyCompletionResponse struct {
75-
Choices []LegacyChoice `json:"choices"`
76-
Usage *Usage `json:"usage,omitempty"`
74+
// CompletionResponse represents the full response body for the legacy completions API.
75+
type CompletionResponse struct {
76+
Choices []CompletionChoice `json:"choices"`
77+
Usage *Usage `json:"usage,omitempty"`
7778
}
7879

79-
func (r *LegacyCompletionResponse) String() string {
80+
func (r *CompletionResponse) String() string {
8081
if r == nil {
8182
return nilString
8283
}
@@ -87,8 +88,8 @@ func (r *LegacyCompletionResponse) String() string {
8788
return fmt.Sprintf("{TextLength: %d, Usage: %v}", textLen, r.Usage)
8889
}
8990

90-
// LegacyChoice represents a single choice in the legacy completion response.
91-
type LegacyChoice struct {
91+
// CompletionChoice represents a single choice in the legacy completion response.
92+
type CompletionChoice struct {
9293
Text string `json:"text"`
9394
FinishReason string `json:"finish_reason"`
9495
}
@@ -111,7 +112,7 @@ func (u *Usage) String() string {
111112
// as a chat completion and then as a legacy completion response.
112113
func NewLLMResponseFromBytes(body []byte) (*LLMResponse, error) {
113114
if len(body) == 0 {
114-
return nil, fmt.Errorf("input bytes are empty")
115+
return nil, errors.New("input bytes are empty")
115116
}
116117

117118
// Attempt to unmarshal as a ChatCompletionResponse first.
@@ -124,12 +125,12 @@ func NewLLMResponseFromBytes(body []byte) (*LLMResponse, error) {
124125
}
125126

126127
// Try to unmarshal as a LegacyCompletionResponse.
127-
var legacyResp LegacyCompletionResponse
128+
var legacyResp CompletionResponse
128129
if err := json.Unmarshal(body, &legacyResp); err == nil {
129130
if len(legacyResp.Choices) > 0 {
130-
return &LLMResponse{LegacyCompletion: &legacyResp}, nil
131+
return &LLMResponse{Completion: &legacyResp}, nil
131132
}
132133
}
133134

134-
return nil, fmt.Errorf("failed to unmarshal body into any known LLM response format")
135+
return nil, errors.New("failed to unmarshal body into any known LLM response format")
135136
}

0 commit comments

Comments
 (0)