@@ -30,7 +30,6 @@ import (
3030 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
3131 backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
3232 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
33- "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
3433 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
3534)
3635
@@ -201,8 +200,9 @@ func TestPrefixPluginCompletion(t *testing.T) {
201200}
202201
203202func TestPrefixPluginCompletionWithResponse (t * testing.T ) {
203+ const defaultBlockSize = 4
204204 config := Config {
205- DefaultBlockSize : 4 ,
205+ DefaultBlockSize : defaultBlockSize ,
206206 MaxPrefixBlocksToMatch : DefaultMaxPrefixBlocks ,
207207 LRUCapacityPerServer : DefaultLRUCapacityPerServer ,
208208 }
@@ -231,6 +231,9 @@ func TestPrefixPluginCompletionWithResponse(t *testing.T) {
231231 // Total hashes = 1 (for the "aaaa" block) + 1 (for the model prefix).
232232 assert .Equal (t , 1 , len (state .PrefixHashes ), "number of hashes is incorrect" )
233233 assert .Equal (t , 0 , len (state .PrefixCacheServers ), "there shouldn't be any cached servers yet" )
234+ // The last 2 characters are recorded in restBytes of the state.
235+ assert .Equal (t , 2 , len (state .RestBytes ), "number of restBytes is incorrect" )
236+ assert .Equal (t , defaultBlockSize , state .BlockSize , "blockSize is incorrect" )
234237 assert .Equal (t , float64 (0 ), scores [pod1 ], "score for pod1 should be 0 on first request" )
235238 assert .Equal (t , float64 (0 ), scores [pod2 ], "score for pod2 should be 0 on first request" )
236239
@@ -251,7 +254,16 @@ func TestPrefixPluginCompletionWithResponse(t *testing.T) {
251254 // - Response Body: "bb"
252255 // - Cached Sequence: "aaaaaabb" (length 8)
253256 // This sequence creates two 4-character blocks to be cached: "aaaa" and "aabb".
254- plugin .ResponseComplete (context .Background (), req1 , & requestcontrol.Response {Body : "bb" }, pod1 .GetPod ())
257+ resp1 := & types.LLMResponse {
258+ Completion : & types.CompletionResponse {
259+ Choices : []types.CompletionChoice {
260+ {
261+ Text : "bb" ,
262+ },
263+ },
264+ },
265+ }
266+ plugin .ResponseComplete (context .Background (), req1 , resp1 , pod1 .GetPod ())
255267 plugin .wg .Wait ()
256268
257269 // -- Second Request: Multi-turn Follow-up --
@@ -278,6 +290,9 @@ func TestPrefixPluginCompletionWithResponse(t *testing.T) {
278290 assert .Equal (t , 2 , len (state .PrefixHashes ), "number of hashes is incorrect" )
279291 // It should find a server (pod1) that has cached the prefixes.
280292 assert .Equal (t , 1 , len (state .PrefixCacheServers ), "a cached server should have been found" )
293+ // The last 2 characters ("cc") are recorded in restBytes of the state.
294+ assert .Equal (t , 2 , len (state .RestBytes ), "number of restBytes is incorrect" )
295+ assert .Equal (t , defaultBlockSize , state .BlockSize , "blockSize is incorrect" )
281296 // The score for pod1 should be 1.0 because both prompt blocks ("aaaa" and "aabb") were found in its cache.
282297 assert .Equal (t , float64 (1 ), scores [pod1 ], "score for pod1 should be a perfect match" )
283298 assert .Equal (t , float64 (0 ), scores [pod2 ], "score for pod2 should be 0" )
@@ -362,6 +377,19 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
362377 plugin .PreRequest (context .Background (), req1 , schedulingResult )
363378 plugin .wg .Wait ()
364379
380+ resp1 := & types.LLMResponse {
381+ ChatCompletion : & types.ChatCompletionResponse {
382+ Choices : []types.ChatChoice {
383+ {
384+ Message : types.Message {Role : "assistant" , Content : "I'm doing well, thank you! How can I help you today?" },
385+ },
386+ },
387+ },
388+ }
389+ // Trigger ResponseComplete to simulate resp1 being added to the kvCache recording.
390+ plugin .ResponseComplete (context .Background (), req1 , resp1 , pod1 .GetPod ())
391+ plugin .wg .Wait ()
392+
365393 // Second request adds assistant response and new user message (conversation grows)
366394 req2 := & types.LLMRequest {
367395 RequestId : uuid .NewString (),
@@ -389,13 +417,27 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
389417 cachedBlocks := state .PrefixCacheServers [ServerID (pod1 .GetPod ().NamespacedName )]
390418 expectedScore := float64 (cachedBlocks ) / float64 (extendedHashCount )
391419 assert .Equal (t , expectedScore , scores [pod1 ], "pod1 should have prefix cache hit" )
420+ assert .Greater (t , scores [pod1 ], float64 (0.5 ), "given the response is also prefix cached the cache hit should be well above 0.5" )
392421 assert .Equal (t , float64 (0 ), scores [pod2 ], "pod2 should have no cache hit" )
393422
394423 // Simulate pod1 was picked again
395424 plugin .PreRequest (context .Background (), req2 , schedulingResult )
396425 plugin .wg .Wait ()
397426
398- // Third request continues the conversation even further
427+ resp2 := & types.LLMResponse {
428+ ChatCompletion : & types.ChatCompletionResponse {
429+ Choices : []types.ChatChoice {
430+ {
431+ Message : types.Message {Role : "assistant" , Content : "Prefix caching is a technique where..." },
432+ },
433+ },
434+ },
435+ }
436+ // Trigger ResponseComplete to simulate resp2 being added to the kvCache recording.
437+ plugin .ResponseComplete (context .Background (), req2 , resp2 , pod1 .GetPod ())
438+ plugin .wg .Wait ()
439+
440+ // Third request replays the whole conversation above, so the cache hit rate should reach 1.0.
399441 req3 := & types.LLMRequest {
400442 RequestId : uuid .NewString (),
401443 TargetModel : "test-model1" ,
@@ -424,7 +466,7 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
424466 cachedBlocks = state .PrefixCacheServers [ServerID (pod1 .GetPod ().NamespacedName )]
425467 expectedScore = float64 (cachedBlocks ) / float64 (longHashCount )
426468 assert .Equal (t , expectedScore , scores [pod1 ], "pod1 should have higher prefix cache hit" )
427- assert .Greater (t , scores [pod1 ], float64 (0.5 ), "cache hit rate should be substantial for growing conversation" )
469+ assert .Equal (t , scores [pod1 ], float64 (1 ), "cache hit rate should be substantial for growing conversation" )
428470 assert .Equal (t , float64 (0 ), scores [pod2 ], "pod2 should still have no cache hit" )
429471}
430472
0 commit comments