@@ -30,7 +30,6 @@ import (
3030 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
3131 backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
3232 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
33- "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
3433 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
3534)
3635
@@ -251,7 +250,16 @@ func TestPrefixPluginCompletionWithResponse(t *testing.T) {
 	// - Response Body: "bb"
 	// - Cached Sequence: "aaaaaabb" (length 8)
 	// This sequence creates two 4-character blocks to be cached: "aaaa" and "aabb".
-	plugin.ResponseComplete(context.Background(), req1, &requestcontrol.Response{Body: "bb"}, pod1.GetPod())
+	resp1 := &types.LLMResponse{
+		Completion: &types.CompletionResponse{
+			Choices: []types.CompletionChoice{
+				{
+					Text: "bb",
+				},
+			},
+		},
+	}
+	plugin.ResponseComplete(context.Background(), req1, resp1, pod1.GetPod())
 	plugin.wg.Wait()
 
 	// -- Second Request: Multi-turn Follow-up --
@@ -362,6 +370,19 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 	plugin.PreRequest(context.Background(), req1, schedulingResult, 0)
 	plugin.wg.Wait()
 
+	resp1 := &types.LLMResponse{
+		ChatCompletion: &types.ChatCompletionResponse{
+			Choices: []types.ChatChoice{
+				{
+					Message: types.Message{Role: "assistant", Content: "I'm doing well, thank you! How can I help you today?"},
+				},
+			},
+		},
+	}
+	// Trigger ResponseComplete to simulate resp1 being added to the kvCache records.
+	plugin.ResponseComplete(context.Background(), req1, resp1, pod1.GetPod())
+	plugin.wg.Wait()
+
 	// Second request adds assistant response and new user message (conversation grows)
 	req2 := &types.LLMRequest{
 		RequestId: uuid.NewString(),
@@ -389,13 +410,27 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 	cachedBlocks := state.PrefixCacheServers[ServerID(pod1.GetPod().NamespacedName)]
 	expectedScore := float64(cachedBlocks) / float64(extendedHashCount)
 	assert.Equal(t, expectedScore, scores[pod1], "pod1 should have prefix cache hit")
+	assert.Greater(t, scores[pod1], float64(0.5), "given the response is also prefix cached, the cache hit should be well above 0.5")
 	assert.Equal(t, float64(0), scores[pod2], "pod2 should have no cache hit")
 
 	// Simulate pod1 was picked again
 	plugin.PreRequest(context.Background(), req2, schedulingResult, 0)
 	plugin.wg.Wait()
 
-	// Third request continues the conversation even further
+	resp2 := &types.LLMResponse{
+		ChatCompletion: &types.ChatCompletionResponse{
+			Choices: []types.ChatChoice{
+				{
+					Message: types.Message{Role: "assistant", Content: "Prefix caching is a technique where..."},
+				},
+			},
+		},
+	}
+	// Trigger ResponseComplete to simulate resp2 being added to the kvCache records.
+	plugin.ResponseComplete(context.Background(), req2, resp2, pod1.GetPod())
+	plugin.wg.Wait()
+
+	// Third request replays the entire conversation above, so the cache hit should reach 1.0.
 	req3 := &types.LLMRequest{
 		RequestId: uuid.NewString(),
 		TargetModel: "test-model1",
@@ -407,7 +442,6 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 				{Role: "assistant", Content: "I'm doing well, thank you! How can I help you today?"},
 				{Role: "user", Content: "Can you explain how prefix caching works?"},
 				{Role: "assistant", Content: "Prefix caching is a technique where..."},
-				{Role: "user", Content: "That's very helpful, thank you!"},
 			},
 		},
 	},
@@ -424,7 +458,7 @@ func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
 	cachedBlocks = state.PrefixCacheServers[ServerID(pod1.GetPod().NamespacedName)]
 	expectedScore = float64(cachedBlocks) / float64(longHashCount)
 	assert.Equal(t, expectedScore, scores[pod1], "pod1 should have higher prefix cache hit")
-	assert.Greater(t, scores[pod1], float64(0.5), "cache hit rate should be substantial for growing conversation")
+	assert.Equal(t, float64(1), scores[pod1], "cache hit rate should be 1.0 since the full conversation is already cached")
 	assert.Equal(t, float64(0), scores[pod2], "pod2 should still have no cache hit")
 }
 
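The test comment above describes how the completion case composes the prompt and the response body ("aaaaaa" + "bb") into the cached sequence "aaaaaabb" and splits it into two 4-character blocks, "aaaa" and "aabb". The snippet below is a minimal, self-contained sketch of that block-splitting idea only; the 4-character block size, the chained FNV hash, and the function names are illustrative assumptions, not the plugin's actual implementation.

// Illustrative sketch: split a cached sequence into fixed-size blocks and hash
// each block chained with the previous hash, so a block only counts as a cache
// hit when its whole prefix also matches. Block size and hashing are assumptions.
package main

import (
	"fmt"
	"hash/fnv"
)

const blockSize = 4 // hypothetical block size matching the test's 4-character blocks

func blockHashes(seq string) []uint64 {
	var hashes []uint64
	var prev uint64
	for start := 0; start+blockSize <= len(seq); start += blockSize {
		h := fnv.New64a()
		// Chain the previous block hash into the current block.
		fmt.Fprintf(h, "%d:%s", prev, seq[start:start+blockSize])
		prev = h.Sum64()
		hashes = append(hashes, prev)
	}
	return hashes
}

func main() {
	prompt, response := "aaaaaa", "bb"
	// Two block hashes, corresponding to "aaaa" and "aabb" from the test comment.
	fmt.Println(blockHashes(prompt + response))
}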