@@ -17,16 +17,15 @@ limitations under the License.
1717package handlers
1818
1919import (
20+ "bytes"
2021 "context"
21- "encoding/json"
22- "fmt"
23- "strings"
2422
2523 configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
2624 extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
2725 "sigs.k8s.io/controller-runtime/pkg/log"
2826
2927 "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
28+ "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
3029 logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
3130)
3231
@@ -36,49 +35,56 @@ const (
3635)
3736
3837// HandleResponseBody always returns the requestContext even in the error case, as the request context is used in error handling.
//
// This span is a rendered diff; the notes below describe the added (+) lines.
// The raw body is parsed with types.NewLLMResponseFromBytes. On success the
// result is stored in reqCtx.SchedulingResponse and, when Usage() is non-nil,
// in reqCtx.Usage. The function then records len(body) as ResponseSize, marks
// the response complete, builds respBodyResp from the unmodified body, and
// returns whatever s.director.HandleResponseBodyComplete returns.
39- func (s * StreamingServer ) HandleResponseBody (ctx context.Context , reqCtx * RequestContext , response map [ string ] any ) (* RequestContext , error ) {
38+ func (s * StreamingServer ) HandleResponseBody (ctx context.Context , reqCtx * RequestContext , body [] byte ) (* RequestContext , error ) {
4039 logger := log .FromContext (ctx )
41- responseBytes , err := json . Marshal ( response )
40+ llmResponse , err := types . NewLLMResponseFromBytes ( body )
4241 if err != nil {
43- return reqCtx , fmt .Errorf ("error marshalling responseBody - %w" , err )
44- }
45- if response ["usage" ] != nil {
46- usg := response ["usage" ].(map [string ]any )
47- usage := Usage {
48- PromptTokens : int (usg ["prompt_tokens" ].(float64 )),
49- CompletionTokens : int (usg ["completion_tokens" ].(float64 )),
50- TotalTokens : int (usg ["total_tokens" ].(float64 )),
// In the added lines a parse failure is only logged, not returned: the
// size, completion flag and body response below are still populated.
42+ logger .Error (err , "failed to create LLMResponse from bytes" )
43+ } else {
44+ reqCtx .SchedulingResponse = llmResponse
45+ if usage := reqCtx .SchedulingResponse .Usage (); usage != nil {
46+ reqCtx .Usage = usage
47+ logger .V (logutil .VERBOSE ).Info ("Response generated" , "usage" , usage )
5148 }
52- reqCtx .Usage = usage
53- logger .V (logutil .VERBOSE ).Info ("Response generated" , "usage" , reqCtx .Usage )
5449 }
55- reqCtx .ResponseSize = len (responseBytes )
50+ reqCtx .ResponseSize = len (body )
5651 // ResponseComplete indicates that the response is complete. In the
5752 // non-streaming case it is set to true once the response is processed; in
5853 // the streaming case it is set to true once the last chunk is processed.
5954 // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178)
6055 // will add the processing for streaming case.
6156 reqCtx .ResponseComplete = true
6257
63- reqCtx .respBodyResp = generateResponseBodyResponses (responseBytes , true )
58+ reqCtx .respBodyResp = generateResponseBodyResponses (body , true )
6459
6560 return s .director .HandleResponseBodyComplete (ctx , reqCtx )
6661}
6762
6863// HandleResponseBodyModelStreaming handles a streaming response when the model server is streaming.
//
// This span is a rendered diff; in the added (+) version the function only
// calls s.director.HandleResponseBodyStreaming and logs any error it returns.
// streamBody is not read in the lines shown here, and nothing is returned to
// the caller.
69- func (s * StreamingServer ) HandleResponseBodyModelStreaming (ctx context.Context , reqCtx * RequestContext , responseText string ) {
64+ func (s * StreamingServer ) HandleResponseBodyModelStreaming (ctx context.Context , reqCtx * RequestContext , streamBody [] byte ) {
7065 logger := log .FromContext (ctx )
7166 _ , err := s .director .HandleResponseBodyStreaming (ctx , reqCtx )
7267 if err != nil {
7368 logger .Error (err , "error in HandleResponseBodyStreaming" )
7469 }
75- if strings .Contains (responseText , streamingEndMsg ) {
70+ }
71+
72+ func (s * StreamingServer ) HandleResponseBodyModelStreamingComplete (ctx context.Context , reqCtx * RequestContext , streamBody []byte ) {
73+ logger := log .FromContext (ctx )
74+ if bytes .Contains (streamBody , []byte (streamingEndMsg )) {
7675 reqCtx .ResponseComplete = true
77- resp := parseRespForUsage (ctx , responseText )
78- reqCtx .Usage = resp .Usage
79- metrics .RecordInputTokens (reqCtx .IncomingModelName , reqCtx .TargetModelName , resp .Usage .PromptTokens )
80- metrics .RecordOutputTokens (reqCtx .IncomingModelName , reqCtx .TargetModelName , resp .Usage .CompletionTokens )
81- _ , err := s .director .HandleResponseBodyComplete (ctx , reqCtx )
76+ resp , err := types .NewLLMResponseFromStream (streamBody )
77+ if err != nil {
78+ logger .Error (err , "error in converting stream response to LLMResponse." )
79+ } else {
80+ reqCtx .SchedulingResponse = resp
81+ if usage := resp .Usage (); usage != nil {
82+ reqCtx .Usage = usage
83+ metrics .RecordInputTokens (reqCtx .IncomingModelName , reqCtx .TargetModelName , usage .PromptTokens )
84+ metrics .RecordOutputTokens (reqCtx .IncomingModelName , reqCtx .TargetModelName , usage .CompletionTokens )
85+ }
86+ }
87+ _ , err = s .director .HandleResponseBodyComplete (ctx , reqCtx )
8288 if err != nil {
8389 logger .Error (err , "error in HandleResponseBodyComplete" )
8490 }
@@ -153,41 +159,6 @@ func (s *StreamingServer) generateResponseHeaders(reqCtx *RequestContext) []*con
153159 return headers
154160}
155161
156- // Example message if "stream_options": {"include_usage": "true"} is included in the request:
157- // data: {"id":"...","object":"text_completion","created":1739400043,"model":"food-review-0","choices":[],
158- // "usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}
159- //
160- // data: [DONE]
161- //
162- // Noticed that vLLM returns two entries in one response.
163- // We need to strip the `data:` prefix and next Data: [DONE] from the message to fetch response data.
164- //
165- // If include_usage is not included in the request, `data: [DONE]` is returned separately, which
166- // indicates end of streaming.
167- func parseRespForUsage (ctx context.Context , responseText string ) ResponseBody {
168- response := ResponseBody {}
169- logger := log .FromContext (ctx )
170-
171- lines := strings .Split (responseText , "\n " )
172- for _ , line := range lines {
173- if ! strings .HasPrefix (line , streamingRespPrefix ) {
174- continue
175- }
176- content := strings .TrimPrefix (line , streamingRespPrefix )
177- if content == "[DONE]" {
178- continue
179- }
180-
181- byteSlice := []byte (content )
182- if err := json .Unmarshal (byteSlice , & response ); err != nil {
183- logger .Error (err , "unmarshaling response body" )
184- continue
185- }
186- }
187-
188- return response
189- }
190-
// ResponseBody is a JSON-tagged wrapper whose only field is Usage, mapped to
// the "usage" key.
191162type ResponseBody struct {
192163 Usage Usage `json:"usage"`
193164}
0 commit comments