Skip to content

Commit 2a5a7e5

Browse files
committed
send audio delta
1 parent c62040e commit 2a5a7e5

1 file changed

Lines changed: 22 additions & 5 deletions

File tree

core/http/endpoints/openai/realtime.go

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,7 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
895895
opts.Messages = protoMessages
896896
opts.UseTokenizerTemplate = config.TemplateConfig.UseTokenizerTemplate
897897

898+
// TODO: We can use the PredictStream method, but then can we stream the results of that to TTS?
898899
reply, err := session.ModelInterface.Predict(context.TODO(), &opts)
899900
if err != nil {
900901
sendError(c, "inference_failed", fmt.Sprintf("backend error: %v", err), "", item.ID)
@@ -956,18 +957,34 @@ func generateResponse(config *config.ModelConfig, evaluator *templates.Evaluator
956957
sendError(c, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.ID)
957958
return
958959
}
960+
audioString := base64.StdEncoding.EncodeToString(audioBytes)
959961

960-
// For some reason OpenAI doesn't send the audio now according to the docs. The user must request it with conversation.item.retrieve.
961-
// This (almost) makes sense when not using a real any-to-any model because we can send the transcript before the audio is ready.
962-
// However we don't do that for now, to keep things simple we only send the done event once we have the audio ready
962+
sendEvent(c, types.ResponseAudioDeltaEvent{
963+
ServerEventBase: types.ServerEventBase{
964+
Type: types.ServerEventTypeResponseAudioDelta,
965+
},
966+
ItemID: item.ID,
967+
// TODO: OutputIndex and ContentIndex
968+
Delta: audioString,
969+
})
970+
sendEvent(c, types.ResponseAudioDoneEvent{
971+
ServerEventBase: types.ServerEventBase{
972+
Type: types.ServerEventTypeResponseAudioDone,
973+
},
974+
ItemID: item.ID,
975+
// TODO: Indexes
976+
})
977+
978+
// OpenAI does not send the audio as part of the conversation.
979+
// It's sent as audio deltas or the user can request it with conversation.item.retrieve.
963980
conv.Lock.Lock()
964981
doneEvent := types.ConversationItemDoneEvent{
965982
ServerEventBase: types.ServerEventBase{
966-
Type: types.ServerEventTypeConversationItemAdded,
983+
Type: types.ServerEventTypeConversationItemDone,
967984
},
968985
Item: item,
969986
}
970-
item.Content[0].Audio = base64.StdEncoding.EncodeToString(audioBytes)
987+
item.Content[0].Audio = audioString
971988
conv.Lock.Unlock()
972989

973990
sendEvent(c, doneEvent)

0 commit comments

Comments
 (0)