Skip to content

Commit 028974c

Browse files
committed
Add response to prefix cache in non-streaming mode.
1 parent 1a7793a commit 028974c

File tree

7 files changed

+717
-24
lines changed

7 files changed

+717
-24
lines changed

pkg/epp/handlers/server.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ type Request struct {
115115
}
116116
type Response struct {
117117
Headers map[string]string
118+
Body []byte
118119
}
119120
type StreamRequestState int
120121

pkg/epp/requestcontrol/director.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,13 +280,20 @@ func (d *Director) HandleResponseBodyStreaming(ctx context.Context, reqCtx *hand
280280

281281
// HandleResponseBodyComplete is called when the response body is fully received.
282282
func (d *Director) HandleResponseBodyComplete(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) {
283-
logger := log.FromContext(ctx).WithValues("stage", "bodyChunk")
283+
requestID := reqCtx.Request.Headers[requtil.RequestIdHeaderKey]
284+
logger := log.FromContext(ctx).WithValues("stage", "bodyChunk", requtil.RequestIdHeaderKey, requestID)
284285
logger.V(logutil.DEBUG).Info("Entering HandleResponseBodyComplete")
286+
llmResponse, err := schedulingtypes.NewLLMResponseFromBytes(reqCtx.Response.Body)
287+
if err != nil {
288+
logger.Error(err, "HandleResponseBodyComplete: failed to convert the response to LLMResponse.")
289+
return reqCtx, err
290+
}
285291
response := &Response{
286-
RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
292+
RequestId: requestID,
287293
Headers: reqCtx.Response.Headers,
294+
// Currently use the first choice as the response body to process.
295+
Body: llmResponse.GetFirstChoiceContent(),
288296
}
289-
290297
d.runResponseCompletePlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod)
291298

292299
logger.V(logutil.DEBUG).Info("Exiting HandleResponseBodyComplete")

pkg/epp/requestcontrol/director_test.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,23 @@ func TestDirector_HandleResponseComplete(t *testing.T) {
696696
mockSched := &mockScheduler{}
697697
director := NewDirectorWithConfig(ds, mockSched, nil, NewConfig().WithResponseCompletePlugins(pc1))
698698

699+
chatCompletionJSON := `{
700+
"choices": [
701+
{
702+
"message": {
703+
"role": "assistant",
704+
"content": "Hello!"
705+
},
706+
"finish_reason": "stop"
707+
}
708+
],
709+
"usage": {
710+
"prompt_tokens": 1,
711+
"completion_tokens": 2,
712+
"total_tokens": 3
713+
}
714+
}`
715+
699716
reqCtx := &handlers.RequestContext{
700717
Request: &handlers.Request{
701718
Headers: map[string]string{
@@ -704,6 +721,7 @@ func TestDirector_HandleResponseComplete(t *testing.T) {
704721
},
705722
Response: &handlers.Response{
706723
Headers: map[string]string{"X-Test-Complete-Header": "CompleteValue"},
724+
Body: []byte(chatCompletionJSON),
707725
},
708726
TargetPod: &backend.Pod{NamespacedName: types.NamespacedName{Namespace: "namespace1", Name: "test-pod-name"}},
709727
}
@@ -717,11 +735,14 @@ func TestDirector_HandleResponseComplete(t *testing.T) {
717735
t.Errorf("Scheduler.OnComplete RequestId mismatch (-want +got):\n%s", diff)
718736
}
719737
if diff := cmp.Diff(reqCtx.Response.Headers, pc1.lastRespOnComplete.Headers); diff != "" {
720-
t.Errorf("Scheduler.OnComplete Headers mismatch (-want +got):\n%s", diff)
738+
t.Errorf("Scheduler.OnComplete response headers mismatch (-want +got):\n%s", diff)
721739
}
722740
if diff := cmp.Diff("namespace1/test-pod-name", pc1.lastTargetPodOnComplete); diff != "" {
723741
t.Errorf("Scheduler.OnComplete TargetPodName mismatch (-want +got):\n%s", diff)
724742
}
743+
if diff := cmp.Diff("Hello!", pc1.lastRespOnComplete.Body); diff != "" {
744+
t.Errorf("Scheduler.OnComplete response body mismatch (-want +got):\n%s", diff)
745+
}
725746
}
726747

727748
const (

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package prefix
1818

1919
import (
20+
"bytes"
2021
"context"
2122
"encoding/binary"
2223
"encoding/json"
@@ -28,6 +29,7 @@ import (
2829
k8stypes "k8s.io/apimachinery/pkg/types"
2930
"sigs.k8s.io/controller-runtime/pkg/log"
3031

32+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
3133
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
3234
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
3335
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
@@ -117,6 +119,10 @@ var _ plugins.StateData = &SchedulingContextState{}
117119
type SchedulingContextState struct {
118120
// PrefixHashes is a list of prefix hashes of the request prompt broken into blocks.
119121
PrefixHashes []BlockHash
122+
// RestBytes holds the trailing bytes that could not fill a complete block and were left over.
123+
// If non-empty, these bytes will be used as the start of the following response, which will
124+
// also be added to the cache. This happens especially in multi-turn scenarios.
125+
RestBytes []byte
120126
// A map of server to its longest prefix cache match length.
121127
PrefixCacheServers map[ServerID]int
122128
}
@@ -193,9 +199,10 @@ func (p *Plugin) WithName(name string) *Plugin {
193199
// Score returns the scoring result for the given list of pods based on context.
194200
func (p *Plugin) Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
195201
// pre score step, hashing prompt and find longest prefix match.
196-
hashes := hashPrompt(ctx, request, getBlockSize(pods, p.config.DefaultBlockSize), p.config.MaxPrefixBlocksToMatch)
202+
hashes, restBytes := hashPrompt(ctx, request, getBlockSize(pods, p.config.DefaultBlockSize), p.config.MaxPrefixBlocksToMatch)
197203
state := &SchedulingContextState{
198204
PrefixHashes: hashes,
205+
RestBytes: restBytes,
199206
PrefixCacheServers: p.matchLongestPrefix(ctx, hashes),
200207
}
201208

@@ -301,47 +308,59 @@ func (m *Plugin) CleanUpInactivePods(ctx context.Context, handle plugins.Handle)
301308
// hashPrompt divides the prompt into blocks and calculate the prefix cache for each block.
302309
// hash[0] is calculated including the model name and cache_salt(if provided), since different models generally don't share prefix cache.
303310
// For block i, hash(i) = hash(block i content, hash(i-1)).
304-
func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash {
311+
// It also returns the leftover bytes that did not fill a complete block.
312+
func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) ([]BlockHash, []byte) {
305313
loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
306314
if request == nil || request.Body == nil {
307315
loggerDebug.Info("Request or request data is nil, skipping hashing")
308-
return nil
316+
return nil, nil
309317
}
310318

311319
userInput, err := getUserInputBytes(request)
312320
if err != nil {
313321
loggerDebug.Error(err, "Failed to get user input bytes")
314-
return nil
322+
return nil, nil
315323
}
324+
prevBlockHash := defaultPrevBlock(request)
325+
return hashInputWithPrevBlockHash(ctx, prevBlockHash, 0, userInput, cacheBlockSize, maxPrefixBlocks)
326+
}
316327

317-
if len(userInput) < cacheBlockSize {
318-
loggerDebug.Info("Request body too small for prefix cache", "size", len(userInput), "block size", cacheBlockSize)
319-
return nil
320-
}
321-
if len(userInput) > cacheBlockSize*maxPrefixBlocks {
322-
loggerDebug.Info("Truncating input", "size", len(userInput), "max prefix blocks", maxPrefixBlocks, "block size", cacheBlockSize)
323-
userInput = userInput[:maxPrefixBlocks*cacheBlockSize]
324-
}
325-
// Split the body into blocks of size cacheBlockSize.
326-
// If the last block is smaller than cacheBlockSize, it will be ignored.
327-
res := make([]BlockHash, 0, len(userInput)/cacheBlockSize)
328-
// Add the model to the first block hash so that different models have different hashes even with the same body.
328+
func defaultPrevBlock(request *types.LLMRequest) BlockHash {
329329
h := xxhash.New()
330+
// Add the model to the first block hash so that different models have different hashes even with the same body.
330331
_, _ = h.Write([]byte(request.TargetModel))
331332
if cacheSalt := request.Body.CacheSalt(); cacheSalt != "" {
332333
_, _ = h.Write([]byte(cacheSalt))
333334
}
334335

335-
prevBlockHash := BlockHash(h.Sum64())
336-
for i := 0; i+cacheBlockSize <= len(userInput); i += cacheBlockSize {
336+
return BlockHash(h.Sum64())
337+
}
338+
339+
func hashInputWithPrevBlockHash(ctx context.Context, prevBlockHash BlockHash, prevBlockLength int, input []byte, cacheBlockSize int, maxPrefixBlocks int) ([]BlockHash, []byte) {
340+
loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
341+
if len(input)+prevBlockLength < cacheBlockSize {
342+
loggerDebug.Info("Request body too small for prefix cache", "size", len(input), "block size", cacheBlockSize)
343+
return nil, input
344+
}
345+
if len(input)+prevBlockLength > cacheBlockSize*maxPrefixBlocks {
346+
loggerDebug.Info("Truncating input", "size", len(input), "max prefix blocks", maxPrefixBlocks, "block size", cacheBlockSize)
347+
input = input[:(maxPrefixBlocks*cacheBlockSize - prevBlockLength)]
348+
}
349+
// Split the body into blocks of size cacheBlockSize.
350+
// If the last block is smaller than cacheBlockSize, it will be ignored.
351+
res := make([]BlockHash, 0, len(input)/cacheBlockSize)
352+
lastOffSet := 0
353+
h := xxhash.New()
354+
for i := 0; i+cacheBlockSize <= len(input); i += cacheBlockSize {
337355
h.Reset()
338-
_, _ = h.Write(userInput[i : i+cacheBlockSize])
356+
_, _ = h.Write(input[i : i+cacheBlockSize])
339357
_, _ = h.Write(toBytes(prevBlockHash))
340358
res = append(res, BlockHash(h.Sum64()))
341359

342360
prevBlockHash = res[len(res)-1]
361+
lastOffSet = i + cacheBlockSize
343362
}
344-
return res
363+
return res, input[lastOffSet:]
345364
}
346365

347366
func toBytes(i BlockHash) []byte {
@@ -359,6 +378,33 @@ func getUserInputBytes(request *types.LLMRequest) ([]byte, error) {
359378
return json.Marshal(request.Body.ChatCompletions.Messages)
360379
}
361380

381+
func (p *Plugin) ResponseComplete(ctx context.Context, request *types.LLMRequest, response *requestcontrol.Response, targetPod *backend.Pod) {
382+
state, err := plugins.ReadPluginStateKey[*SchedulingContextState](p.pluginState, request.RequestId, plugins.StateKey(p.TypedName().String()))
383+
if err != nil {
384+
log.FromContext(ctx).Error(err, "failed to read prefix plugin state", "requestID", request.RequestId)
385+
return
386+
}
387+
p.pluginState.Delete(request.RequestId) // delete the state explicitly after completing using it.
388+
var input bytes.Buffer
389+
input.Write(state.RestBytes)
390+
input.Write([]byte(response.Body))
391+
392+
server := ServerID(targetPod.NamespacedName)
393+
prevBlockHash := defaultPrevBlock(request)
394+
prevBlockHashLength := 0
395+
if len(state.PrefixHashes) > 0 {
396+
prevBlockHash = state.PrefixHashes[len(state.PrefixHashes)-1]
397+
prevBlockHashLength = len(state.PrefixHashes)
398+
}
399+
inputBytes := input.Bytes()
400+
hashBlocks, _ := hashInputWithPrevBlockHash(ctx, prevBlockHash, prevBlockHashLength, inputBytes, p.config.DefaultBlockSize, p.config.MaxPrefixBlocksToMatch)
401+
p.wg.Add(1)
402+
go func() {
403+
p.indexer.Add(hashBlocks, server)
404+
p.wg.Done()
405+
}()
406+
}
407+
362408
func getBlockSize(pods []types.Pod, defaultBlockSize int) int {
363409
if len(pods) == 0 {
364410
return defaultBlockSize

pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
3131
backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
3232
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
33+
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
3334
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
3435
)
3536

@@ -199,6 +200,89 @@ func TestPrefixPluginCompletion(t *testing.T) {
199200
plugin.wg.Wait()
200201
}
201202

203+
func TestPrefixPluginCompletionWithResponse(t *testing.T) {
204+
config := Config{
205+
DefaultBlockSize: 4,
206+
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
207+
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
208+
}
209+
plugin := New(context.Background(), config)
210+
211+
pod1 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}
212+
pod2 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}
213+
pods := []types.Pod{pod1, pod2}
214+
215+
// -- First Request --
216+
// This initial request will populate the cache.
217+
req1 := &types.LLMRequest{
218+
RequestId: uuid.NewString(),
219+
TargetModel: "test-model1",
220+
Body: &types.LLMRequestBody{
221+
Completions: &types.CompletionsRequest{
222+
Prompt: "aaaaaa",
223+
},
224+
},
225+
}
226+
scores := plugin.Score(context.Background(), types.NewCycleState(), req1, pods)
227+
state, err := plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req1.RequestId, plugins.StateKey(plugin.TypedName().String()))
228+
assert.NoError(t, err)
229+
t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
230+
// Input size is 6, hash block size is 4, so the last 2 characters are ignored.
231+
// Total hashes = 1 (for the "aaaa" block) + 1 (for the model prefix).
232+
assert.Equal(t, 1, len(state.PrefixHashes), "number of hashes is incorrect")
233+
assert.Equal(t, 0, len(state.PrefixCacheServers), "there shouldn't be any cached servers yet")
234+
assert.Equal(t, float64(0), scores[pod1], "score for pod1 should be 0 on first request")
235+
assert.Equal(t, float64(0), scores[pod2], "score for pod2 should be 0 on first request")
236+
237+
// Simulate that the scheduler picked pod1 for the first request.
238+
schedulingResult := &types.SchedulingResult{
239+
PrimaryProfileName: "default",
240+
ProfileResults: map[string]*types.ProfileRunResult{
241+
"default": {TargetPods: []types.Pod{pod1}},
242+
},
243+
}
244+
plugin.PreRequest(context.Background(), req1, schedulingResult, 0)
245+
plugin.wg.Wait()
246+
247+
// -- Simulate Response Completion --
248+
// The ResponseComplete hook is called. The plugin should update pod1's KV cache
249+
// with the full context of the completed interaction (prompt + response).
250+
// - Initial Prompt: "aaaaaa"
251+
// - Response Body: "bb"
252+
// - Cached Sequence: "aaaaaabb" (length 8)
253+
// This sequence creates two 4-character blocks to be cached: "aaaa" and "aabb".
254+
plugin.ResponseComplete(context.Background(), req1, &requestcontrol.Response{Body: "bb"}, pod1.GetPod())
255+
plugin.wg.Wait()
256+
257+
// -- Second Request: Multi-turn Follow-up --
258+
// This request simulates a follow-up message in a chat. The prompt contains the
259+
// entire conversation history ("aaaaaabb") plus new text ("cc").
260+
// The plugin should find that the first two blocks ("aaaa", "aabb") of this new
261+
// prompt are already cached on pod1, giving it a perfect match score of 1.0.
262+
// Pod2 has no matching cache entries and should score 0.
263+
req2 := &types.LLMRequest{
264+
RequestId: uuid.NewString(),
265+
TargetModel: "test-model1",
266+
Body: &types.LLMRequestBody{
267+
Completions: &types.CompletionsRequest{
268+
Prompt: "aaaaaabbcc",
269+
},
270+
},
271+
}
272+
scores = plugin.Score(context.Background(), types.NewCycleState(), req2, pods)
273+
state, err = plugins.ReadPluginStateKey[*SchedulingContextState](plugin.pluginState, req2.RequestId, plugins.StateKey(plugin.TypedName().String()))
274+
assert.NoError(t, err)
275+
t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
276+
// Input size is 10, hash block size is 4. The prompt "aaaaaabb" generates 2 hashes.
277+
// The last 2 characters ("cc") are ignored.
278+
assert.Equal(t, 2, len(state.PrefixHashes), "number of hashes is incorrect")
279+
// It should find a server (pod1) that has cached the prefixes.
280+
assert.Equal(t, 1, len(state.PrefixCacheServers), "a cached server should have been found")
281+
// The score for pod1 should be 1.0 because both prompt blocks ("aaaa" and "aabb") were found in its cache.
282+
assert.Equal(t, float64(1), scores[pod1], "score for pod1 should be a perfect match")
283+
assert.Equal(t, float64(0), scores[pod2], "score for pod2 should be 0")
284+
}
285+
202286
func TestPrefixPluginChatCompletions(t *testing.T) {
203287
config := Config{
204288
DefaultBlockSize: 4,

0 commit comments

Comments
 (0)