Skip to content

Commit bcd0d32

Browse files
committed
fix(streaming): skip chat deltas for role-init elements to prevent first token duplication
When TASK_RESPONSE_TYPE_OAI_CHAT is used, the first streaming token can produce a JSON array with two elements: a role-init chunk and the actual content chunk. The grpc-server loop called attach_chat_deltas for both elements with the same raw_result pointer, stamping the first token's ChatDelta.Content on both replies. The Go side accumulated both, emitting the first content token twice to SSE clients. Fix: in the array-iteration loops in PredictStream, detect role-init elements (the delta has a "role" key) and skip attach_chat_deltas for them, so only content/reasoning elements get chat deltas attached. Reasoning models are unaffected because their first token goes into reasoning_content, not content.
1 parent 706cf5d commit bcd0d32

File tree

2 files changed

+54
-3
lines changed

2 files changed

+54
-3
lines changed

backend/cpp/llama-cpp/grpc-server.cpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1716,12 +1716,23 @@ class BackendServiceImpl final : public backend::Backend::Service {
17161716
}
17171717
};
17181718

1719-
// Process first result
1719+
// Process first result.
1720+
// When TASK_RESPONSE_TYPE_OAI_CHAT is used, the first token may
1721+
// produce a JSON array with a role-init element followed by the
1722+
// actual content element. We must only attach chat deltas to the
1723+
// content element — attaching to both would duplicate the first
1724+
// token since oaicompat_msg_diffs is the same for both.
17201725
json first_res_json = first_result->to_json();
17211726
if (first_res_json.is_array()) {
17221727
for (const auto & res : first_res_json) {
17231728
auto reply = build_reply_from_json(res, first_result.get());
1724-
attach_chat_deltas(reply, first_result.get());
1729+
// Skip chat deltas for role-init elements (have "role" in
1730+
// delta but no content/reasoning diffs of their own).
1731+
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
1732+
res["choices"][0].value("delta", json::object()).contains("role");
1733+
if (!is_role_init) {
1734+
attach_chat_deltas(reply, first_result.get());
1735+
}
17251736
writer->Write(reply);
17261737
}
17271738
} else {
@@ -1745,7 +1756,11 @@ class BackendServiceImpl final : public backend::Backend::Service {
17451756
if (res_json.is_array()) {
17461757
for (const auto & res : res_json) {
17471758
auto reply = build_reply_from_json(res, result.get());
1748-
attach_chat_deltas(reply, result.get());
1759+
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
1760+
res["choices"][0].value("delta", json::object()).contains("role");
1761+
if (!is_role_init) {
1762+
attach_chat_deltas(reply, result.get());
1763+
}
17491764
writer->Write(reply);
17501765
}
17511766
} else {

core/http/app_test.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -978,6 +978,42 @@ parameters:
978978
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
979979
})
980980

981+
It("does not duplicate the first content token in streaming chat completions", Label("llama-gguf-stream"), func() {
982+
if runtime.GOOS != "linux" {
983+
Skip("test supported only on linux")
984+
}
985+
stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{
986+
Model: "testmodel.ggml",
987+
Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}},
988+
})
989+
Expect(err).ToNot(HaveOccurred())
990+
defer stream.Close()
991+
992+
var contentParts []string
993+
for {
994+
chunk, err := stream.Recv()
995+
if err == io.EOF {
996+
break
997+
}
998+
Expect(err).ToNot(HaveOccurred())
999+
if len(chunk.Choices) > 0 {
1000+
delta := chunk.Choices[0].Delta.Content
1001+
if delta != "" {
1002+
contentParts = append(contentParts, delta)
1003+
}
1004+
}
1005+
}
1006+
1007+
Expect(contentParts).ToNot(BeEmpty(), "Expected streaming content tokens")
1008+
// The first content token should appear exactly once.
1009+
// A bug in grpc-server.cpp caused the role-init array element
1010+
// to get the same ChatDelta stamped, duplicating the first token.
1011+
if len(contentParts) >= 2 {
1012+
Expect(contentParts[0]).ToNot(Equal(contentParts[1]),
1013+
"First content token was duplicated: %v", contentParts[:2])
1014+
}
1015+
})
1016+
9811017
It("returns logprobs in chat completions when requested", func() {
9821018
if runtime.GOOS != "linux" {
9831019
Skip("test only on linux")

0 commit comments

Comments
 (0)