Commit 7476ad3

Make ResponseComplete accept LLMResponse, and update the encoding of Messages in ChatCompletions.
1 parent: bb4fef9

10 files changed: +213 additions, -83 deletions

pkg/epp/handlers/server.go

Lines changed: 1 addition & 0 deletions
@@ -303,6 +303,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
 				break
 			}
 
+			reqCtx.Response.Body = body
 			reqCtx, responseErr = s.HandleResponseBody(ctx, reqCtx, responseBody)
 			if responseErr != nil {
 				if logger.V(logutil.DEBUG).Enabled() {

pkg/epp/requestcontrol/director.go

Lines changed: 2 additions & 8 deletions
@@ -297,13 +297,7 @@ func (d *Director) HandleResponseBodyComplete(ctx context.Context, reqCtx *handl
 		logger.Error(err, "HandleResponseBodyComplete: failed to convert the response to LLMResponse.")
 		return reqCtx, err
 	}
-	response := &Response{
-		RequestId: requestID,
-		Headers:   reqCtx.Response.Headers,
-		// Currently use the first choice as the response body to process.
-		Body: llmResponse.GetFirstChoiceContent(),
-	}
-	d.runResponseCompletePlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod)
+	d.runResponseCompletePlugins(ctx, reqCtx.SchedulingRequest, llmResponse, reqCtx.TargetPod)
 
 	logger.V(logutil.DEBUG).Info("Exiting HandleResponseBodyComplete")
 	return reqCtx, nil
@@ -353,7 +347,7 @@ func (d *Director) runResponseStreamingPlugins(ctx context.Context, request *sch
 	}
 }
 
-func (d *Director) runResponseCompletePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) {
+func (d *Director) runResponseCompletePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *schedulingtypes.LLMResponse, targetPod *backend.Pod) {
 	loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
 	for _, plugin := range d.requestControlPlugins.responseCompletePlugins {
 		loggerDebug.Info("Running ResponseComplete plugin", "plugin", plugin.TypedName())

pkg/epp/requestcontrol/director_test.go

Lines changed: 8 additions & 10 deletions
@@ -677,6 +677,10 @@ func TestDirector_HandleResponseComplete(t *testing.T) {
 		"total_tokens": 3
 	  }
 	}`
+	wantLLMResponse, err := schedulingtypes.NewLLMResponseFromBytes([]byte(chatCompletionJSON))
+	if err != nil {
+		t.Fatalf("NewLLMResponseFromBytes failed with error: %v", err)
+	}
 
 	reqCtx := &handlers.RequestContext{
 		Request: &handlers.Request{
@@ -691,21 +695,15 @@
 		TargetPod: &backend.Pod{NamespacedName: types.NamespacedName{Namespace: "namespace1", Name: "test-pod-name"}},
 	}
 
-	_, err := director.HandleResponseBodyComplete(ctx, reqCtx)
+	_, err = director.HandleResponseBodyComplete(ctx, reqCtx)
 	if err != nil {
 		t.Fatalf("HandleResponseBodyComplete() returned unexpected error: %v", err)
 	}
 
-	if diff := cmp.Diff("test-req-id-for-complete", pc1.lastRespOnComplete.RequestId); diff != "" {
-		t.Errorf("Scheduler.OnComplete RequestId mismatch (-want +got):\n%s", diff)
-	}
-	if diff := cmp.Diff(reqCtx.Response.Headers, pc1.lastRespOnComplete.Headers); diff != "" {
-		t.Errorf("Scheduler.OnComplete response headers mismatch (-want +got):\n%s", diff)
-	}
 	if diff := cmp.Diff("namespace1/test-pod-name", pc1.lastTargetPodOnComplete); diff != "" {
 		t.Errorf("Scheduler.OnComplete TargetPodName mismatch (-want +got):\n%s", diff)
 	}
-	if diff := cmp.Diff("Hello!", pc1.lastRespOnComplete.Body); diff != "" {
+	if diff := cmp.Diff(wantLLMResponse, pc1.lastRespOnComplete); diff != "" {
 		t.Errorf("Scheduler.OnComplete response body mismatch (-want +got):\n%s", diff)
 	}
 }
@@ -730,7 +728,7 @@ type testResponseStreaming struct {
 
 type testResponseComplete struct {
 	tn                      plugins.TypedName
-	lastRespOnComplete      *Response
+	lastRespOnComplete      *schedulingtypes.LLMResponse
 	lastTargetPodOnComplete string
 }
 
@@ -774,7 +772,7 @@ func (p *testResponseStreaming) ResponseStreaming(_ context.Context, _ *scheduli
 	p.lastTargetPodOnStreaming = targetPod.NamespacedName.String()
 }
 
-func (p *testResponseComplete) ResponseComplete(_ context.Context, _ *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) {
+func (p *testResponseComplete) ResponseComplete(_ context.Context, _ *schedulingtypes.LLMRequest, response *schedulingtypes.LLMResponse, targetPod *backend.Pod) {
 	p.lastRespOnComplete = response
 	p.lastTargetPodOnComplete = targetPod.NamespacedName.String()
 }

pkg/epp/requestcontrol/plugins.go

Lines changed: 1 addition & 1 deletion
@@ -55,5 +55,5 @@ type ResponseStreaming interface {
 // ResponseComplete is called by the director after the complete response is sent.
 type ResponseComplete interface {
 	plugins.Plugin
-	ResponseComplete(ctx context.Context, request *types.LLMRequest, response *Response, targetPod *backend.Pod)
+	ResponseComplete(ctx context.Context, request *types.LLMRequest, response *types.LLMResponse, targetPod *backend.Pod)
 }
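
For orientation, here is a minimal sketch of a plugin written against the updated interface. The ResponseComplete signature and the import paths are taken from the diffs on this page; the ResponseLogger type and its behavior are hypothetical, and a real plugin may need more of the plugins.Plugin interface than the TypedName method shown here.

// Package responselogger is a hypothetical example; only the
// ResponseComplete signature is taken from this commit.
package responselogger

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/log"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

type ResponseLogger struct {
	tn plugins.TypedName
}

func (p *ResponseLogger) TypedName() plugins.TypedName { return p.tn }

// ResponseComplete now receives the parsed *types.LLMResponse instead of the
// former requestcontrol.Response wrapper, so a plugin can inspect all choices
// and usage data rather than a single pre-extracted body string.
func (p *ResponseLogger) ResponseComplete(ctx context.Context, request *types.LLMRequest, response *types.LLMResponse, targetPod *backend.Pod) {
	content, err := response.FirstChoiceContent()
	if err != nil {
		log.FromContext(ctx).Error(err, "no choices in response", "requestID", request.RequestId)
		return
	}
	log.FromContext(ctx).Info("response complete",
		"requestID", request.RequestId,
		"pod", targetPod.NamespacedName.String(),
		"bytes", len(content))
}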

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 10 additions & 5 deletions
@@ -375,19 +375,25 @@ func getUserInputBytes(request *types.LLMRequest) ([]byte, error) {
 	}
 
 	// must be chat-completions request at this point, return bytes of entire messages
-	return json.Marshal(request.Body.ChatCompletions.Messages)
+	return types.MarshalMessagesToJSON(request.Body.ChatCompletions.Messages...)
 }
 
-func (p *Plugin) ResponseComplete(ctx context.Context, request *types.LLMRequest, response *requestcontrol.Response, targetPod *backend.Pod) {
+func (p *Plugin) ResponseComplete(ctx context.Context, request *types.LLMRequest, response *types.LLMResponse, targetPod *backend.Pod) {
 	state, err := plugins.ReadPluginStateKey[*SchedulingContextState](p.pluginState, request.RequestId, plugins.StateKey(p.TypedName().String()))
 	if err != nil {
 		log.FromContext(ctx).Error(err, "failed to read prefix plugin state", "requestID", request.RequestId)
 		return
 	}
 	p.pluginState.Delete(request.RequestId) // delete the state explicitly after completing using it.
+
+	responseForKVCache, err := response.FirstChoiceContent()
+	if err != nil {
+		log.FromContext(ctx).Error(err, "failed to get first choice content", "requestID", request.RequestId)
+		return
+	}
 	var input bytes.Buffer
 	input.Write(state.RestBytes)
-	input.Write([]byte(response.Body))
+	input.Write(responseForKVCache)
 
 	server := ServerID(targetPod.NamespacedName)
 	prevBlockHash := defaultPrevBlock(request)
@@ -396,8 +402,7 @@ func (p *Plugin) ResponseComplete(ctx context.Context, request *types.LLMRequest
 		prevBlockHash = state.PrefixHashes[len(state.PrefixHashes)-1]
 		prevBlockHashLength = len(state.PrefixHashes)
 	}
-	inputBytes := input.Bytes()
-	hashBlocks, _ := hashInputWithPrevBlockHash(ctx, prevBlockHash, prevBlockHashLength, inputBytes, p.config.DefaultBlockSize, p.config.MaxPrefixBlocksToMatch)
+	hashBlocks, _ := hashInputWithPrevBlockHash(ctx, prevBlockHash, prevBlockHashLength, input.Bytes(), p.config.DefaultBlockSize, p.config.MaxPrefixBlocksToMatch)
 	p.wg.Add(1)
 	go func() {
 		p.indexer.Add(hashBlocks, server)
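
The appended response bytes feed the same chained block hashing that scored the prompt, which is what lets a follow-up request (whose message history now contains the assistant reply) hit the cache. Below is a self-contained sketch of that chaining, using the worked example from the test comments on this page (prompt "aaaaaa" plus response "bb" with block size 4 caches the blocks "aaaa" and "aabb"). The FNV hash and the exact key derivation are assumptions for illustration; the real logic lives in hashInputWithPrevBlockHash.

// Illustrative sketch of chained block hashing: each block's key depends on
// its content and the previous block's hash, so a block is only reusable
// when its entire prefix matches.
package main

import (
	"fmt"
	"hash/fnv"
)

func hashBlocks(input []byte, blockSize int) []uint64 {
	var hashes []uint64
	var prev uint64
	for start := 0; start+blockSize <= len(input); start += blockSize {
		h := fnv.New64a()
		// Chain in the previous block hash so identical blocks at
		// different positions produce different keys.
		fmt.Fprintf(h, "%d:", prev)
		h.Write(input[start : start+blockSize])
		prev = h.Sum64()
		hashes = append(hashes, prev)
	}
	return hashes
}

func main() {
	// Prompt "aaaaaa" + response "bb" caches the sequence "aaaaaabb",
	// i.e. the two 4-byte blocks "aaaa" and "aabb".
	blocks := hashBlocks([]byte("aaaaaabb"), 4)
	fmt.Println(len(blocks), "blocks cached") // prints: 2 blocks cached
}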

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go

Lines changed: 39 additions & 5 deletions
@@ -30,7 +30,6 @@ import (
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 )
 
@@ -251,7 +250,16 @@ func TestPrefixPluginCompletionWithResponse(t *testing.T) {
 	// - Response Body: "bb"
 	// - Cached Sequence: "aaaaaabb" (length 8)
 	// This sequence creates two 4-character blocks to be cached: "aaaa" and "aabb".
-	plugin.ResponseComplete(context.Background(), req1, &requestcontrol.Response{Body: "bb"}, pod1.GetPod())
+	resp1 := &types.LLMResponse{
+		Completion: &types.CompletionResponse{
+			Choices: []types.CompletionChoice{
+				{
+					Text: "bb",
+				},
+			},
+		},
+	}
+	plugin.ResponseComplete(context.Background(), req1, resp1, pod1.GetPod())
 	plugin.wg.Wait()
 
 	// -- Second Request: Multi-turn Follow-up --
@@ -362,6 +370,19 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 	plugin.PreRequest(context.Background(), req1, schedulingResult, 0)
 	plugin.wg.Wait()
 
+	resp1 := &types.LLMResponse{
+		ChatCompletion: &types.ChatCompletionResponse{
+			Choices: []types.ChatChoice{
+				{
+					Message: types.Message{Role: "assistant", Content: "I'm doing well, thank you! How can I help you today?"},
+				},
+			},
+		},
+	}
+	// Trigger ResponseComplete to simulate resp1 being recorded in the KV cache.
+	plugin.ResponseComplete(context.Background(), req1, resp1, pod1.GetPod())
+	plugin.wg.Wait()
+
 	// Second request adds assistant response and new user message (conversation grows)
 	req2 := &types.LLMRequest{
 		RequestId: uuid.NewString(),
@@ -389,13 +410,27 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 	cachedBlocks := state.PrefixCacheServers[ServerID(pod1.GetPod().NamespacedName)]
 	expectedScore := float64(cachedBlocks) / float64(extendedHashCount)
 	assert.Equal(t, expectedScore, scores[pod1], "pod1 should have prefix cache hit")
+	assert.Greater(t, scores[pod1], float64(0.5), "given the response is also prefix cached, the cache hit should be well above 0.5")
 	assert.Equal(t, float64(0), scores[pod2], "pod2 should have no cache hit")
 
 	// Simulate pod1 was picked again
 	plugin.PreRequest(context.Background(), req2, schedulingResult, 0)
 	plugin.wg.Wait()
 
-	// Third request continues the conversation even further
+	resp2 := &types.LLMResponse{
+		ChatCompletion: &types.ChatCompletionResponse{
+			Choices: []types.ChatChoice{
+				{
+					Message: types.Message{Role: "assistant", Content: "Prefix caching is a technique where..."},
+				},
+			},
+		},
+	}
+	// Trigger ResponseComplete to simulate resp2 being recorded in the KV cache.
+	plugin.ResponseComplete(context.Background(), req2, resp2, pod1.GetPod())
+	plugin.wg.Wait()
+
+	// Third request replays the whole conversation above, driving the cache hit rate to 1.0.
 	req3 := &types.LLMRequest{
 		RequestId:   uuid.NewString(),
 		TargetModel: "test-model1",
@@ -407,7 +442,6 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 				{Role: "assistant", Content: "I'm doing well, thank you! How can I help you today?"},
 				{Role: "user", Content: "Can you explain how prefix caching works?"},
 				{Role: "assistant", Content: "Prefix caching is a technique where..."},
-				{Role: "user", Content: "That's very helpful, thank you!"},
 			},
 		},
 	},
@@ -424,7 +458,7 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 	cachedBlocks = state.PrefixCacheServers[ServerID(pod1.GetPod().NamespacedName)]
 	expectedScore = float64(cachedBlocks) / float64(longHashCount)
 	assert.Equal(t, expectedScore, scores[pod1], "pod1 should have higher prefix cache hit")
-	assert.Greater(t, scores[pod1], float64(0.5), "cache hit rate should be substantial for growing conversation")
+	assert.Equal(t, scores[pod1], float64(1), "cache hit rate should be 1.0 for a fully cached conversation")
 	assert.Equal(t, float64(0), scores[pod2], "pod2 should still have no cache hit")
 }

pkg/epp/scheduling/types/llmresponse.go

Lines changed: 20 additions & 20 deletions
@@ -26,19 +26,19 @@ import (
 type LLMResponse struct {
 	// ChatCompletion is the representation of the OpenAI /v1/chat/completions response body.
 	ChatCompletion *ChatCompletionResponse `json:"chat_completion,omitempty"`
-	// LegacyCompletion is the representation of the OpenAI /v1/completions response body.
-	LegacyCompletion *LegacyCompletionResponse `json:"legacy_completion,omitempty"`
+	// Completion is the representation of the OpenAI /v1/completions response body.
+	Completion *CompletionResponse `json:"legacy_completion,omitempty"`
 }
 
-// GetFirstChoiceContent extracts the primary text content from the first choice
-// in either a ChatCompletion or a LegacyCompletion response.
-func (res *LLMResponse) GetFirstChoiceContent() string {
+// FirstChoiceContent extracts the first choice of the response.
+func (res *LLMResponse) FirstChoiceContent() ([]byte, error) {
 	if res.ChatCompletion != nil && len(res.ChatCompletion.Choices) > 0 {
-		return res.ChatCompletion.Choices[0].Message.Content
-	} else if res.LegacyCompletion != nil && len(res.LegacyCompletion.Choices) > 0 {
-		return res.LegacyCompletion.Choices[0].Text
+		return MarshalMessagesToJSON(res.ChatCompletion.Choices[0].Message)
 	}
-	return ""
+	if res.Completion != nil && len(res.Completion.Choices) > 0 {
+		return []byte(res.Completion.Choices[0].Text), nil
+	}
+	return nil, fmt.Errorf("no choices found in the LLM response")
 }
 
 // ChatCompletionResponse represents the full response body for the chat completions API.
@@ -60,8 +60,8 @@ func (r *ChatCompletionResponse) String() string {
 
 // ChatChoice represents a single choice in the chat completion response.
 type ChatChoice struct {
-	Message      ChatMessage `json:"message"`
-	FinishReason string      `json:"finish_reason"`
+	Message      Message `json:"message"`
+	FinishReason string  `json:"finish_reason"`
 }
 
 // ChatMessage represents the message object within a choice.
@@ -70,13 +70,13 @@ type ChatMessage struct {
 	Content string `json:"content"`
 }
 
-// LegacyCompletionResponse represents the full response body for the legacy completions API.
-type LegacyCompletionResponse struct {
-	Choices []LegacyChoice `json:"choices"`
-	Usage   *Usage         `json:"usage,omitempty"`
+// CompletionResponse represents the full response body for the legacy completions API.
+type CompletionResponse struct {
+	Choices []CompletionChoice `json:"choices"`
+	Usage   *Usage             `json:"usage,omitempty"`
 }
 
-func (r *LegacyCompletionResponse) String() string {
+func (r *CompletionResponse) String() string {
 	if r == nil {
 		return nilString
 	}
@@ -87,8 +87,8 @@ func (r *LegacyCompletionResponse) String() string {
 	return fmt.Sprintf("{TextLength: %d, Usage: %v}", textLen, r.Usage)
 }
 
-// LegacyChoice represents a single choice in the legacy completion response.
-type LegacyChoice struct {
+// CompletionChoice represents a single choice in the legacy completion response.
+type CompletionChoice struct {
 	Text         string `json:"text"`
 	FinishReason string `json:"finish_reason"`
 }
@@ -124,10 +124,10 @@ func NewLLMResponseFromBytes(body []byte) (*LLMResponse, error) {
 	}
 
 	// Try to unmarshal as a CompletionResponse.
-	var legacyResp LegacyCompletionResponse
+	var legacyResp CompletionResponse
 	if err := json.Unmarshal(body, &legacyResp); err == nil {
 		if len(legacyResp.Choices) > 0 {
-			return &LLMResponse{LegacyCompletion: &legacyResp}, nil
+			return &LLMResponse{Completion: &legacyResp}, nil
 		}
 	}