@@ -895,6 +895,7 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
895895 opts .Messages = protoMessages
896896 opts .UseTokenizerTemplate = config .TemplateConfig .UseTokenizerTemplate
897897
898+ // TODO: We can use the PredictStream method, but then can we stream the results of that to TTS?
898899 reply , err := session .ModelInterface .Predict (context .TODO (), & opts )
899900 if err != nil {
900901 sendError (c , "inference_failed" , fmt .Sprintf ("backend error: %v" , err ), "" , item .ID )
@@ -956,18 +957,34 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
956957 sendError (c , "tts_error" , fmt .Sprintf ("Failed to read TTS audio: %v" , err ), "" , item .ID )
957958 return
958959 }
960+ audioString := base64 .StdEncoding .EncodeToString (audioBytes )
959961
960- // For some reason OpenAI doesn't send the audio now according to the docs. The user must request it with conversation.item.retrieve.
961- // This (almost) makes sense when not using a real any-to-any model because we can send the transcript before the audio is ready.
962- // However we don't do that for now, to keep things simple we only send the done event once we have the audio ready
962+ sendEvent (c , types.ResponseAudioDeltaEvent {
963+ ServerEventBase : types.ServerEventBase {
964+ Type : types .ServerEventTypeResponseAudioDelta ,
965+ },
966+ ItemID : item .ID ,
967+ // TODO: OutputIndex and ContentIndex
968+ Delta : audioString ,
969+ })
970+ sendEvent (c , types.ResponseAudioDoneEvent {
971+ ServerEventBase : types.ServerEventBase {
972+ Type : types .ServerEventTypeResponseAudioDone ,
973+ },
974+ ItemID : item .ID ,
975+ // TODO: Indexes
976+ })
977+
978+ // OpenAI does not send the audio as part of the conversation.
979+ // It's sent as audio deltas or the user can request it with conversation.item.retrieve.
963980 conv .Lock .Lock ()
964981 doneEvent := types.ConversationItemDoneEvent {
965982 ServerEventBase : types.ServerEventBase {
966- Type : types .ServerEventTypeConversationItemAdded ,
983+ Type : types .ServerEventTypeConversationItemDone ,
967984 },
968985 Item : item ,
969986 }
970- item .Content [0 ].Audio = base64 . StdEncoding . EncodeToString ( audioBytes )
987+ item .Content [0 ].Audio = audioString
971988 conv .Lock .Unlock ()
972989
973990 sendEvent (c , doneEvent )
0 commit comments