Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions backend/cpp/llama-cpp/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1716,12 +1716,23 @@ class BackendServiceImpl final : public backend::Backend::Service {
}
};

// Process first result
// Process first result.
// When TASK_RESPONSE_TYPE_OAI_CHAT is used, the first token may
// produce a JSON array with a role-init element followed by the
// actual content element. We must only attach chat deltas to the
// content element — attaching to both would duplicate the first
// token since oaicompat_msg_diffs is the same for both.
json first_res_json = first_result->to_json();
if (first_res_json.is_array()) {
for (const auto & res : first_res_json) {
auto reply = build_reply_from_json(res, first_result.get());
attach_chat_deltas(reply, first_result.get());
// Skip chat deltas for role-init elements (have "role" in
// delta but no content/reasoning diffs of their own).
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
res["choices"][0].value("delta", json::object()).contains("role");
if (!is_role_init) {
attach_chat_deltas(reply, first_result.get());
}
writer->Write(reply);
}
} else {
Expand All @@ -1745,7 +1756,11 @@ class BackendServiceImpl final : public backend::Backend::Service {
if (res_json.is_array()) {
for (const auto & res : res_json) {
auto reply = build_reply_from_json(res, result.get());
attach_chat_deltas(reply, result.get());
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
res["choices"][0].value("delta", json::object()).contains("role");
if (!is_role_init) {
attach_chat_deltas(reply, result.get());
}
writer->Write(reply);
}
} else {
Expand Down
36 changes: 36 additions & 0 deletions core/http/app_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,42 @@ parameters:
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
})

// Regression spec: the llama.cpp gRPC backend used to attach the same
// ChatDelta to every element of the role-init JSON array, so the first
// streamed token arrived twice. Stream a chat completion and verify
// consecutive content deltas are not identical.
It("does not duplicate the first content token in streaming chat completions", Label("llama-gguf", "llama-gguf-stream"), func() {
	if runtime.GOOS != "linux" {
		Skip("test supported only on linux")
	}

	stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{
		Model:    "testmodel.ggml",
		Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}},
	})
	Expect(err).ToNot(HaveOccurred())
	defer stream.Close()

	// Collect every non-empty content delta in arrival order.
	var received []string
	for {
		chunk, recvErr := stream.Recv()
		if recvErr == io.EOF {
			break
		}
		Expect(recvErr).ToNot(HaveOccurred())
		if len(chunk.Choices) == 0 {
			continue
		}
		if token := chunk.Choices[0].Delta.Content; token != "" {
			received = append(received, token)
		}
	}

	Expect(received).ToNot(BeEmpty(), "Expected streaming content tokens")
	// The first content token should appear exactly once.
	// A bug in grpc-server.cpp caused the role-init array element
	// to get the same ChatDelta stamped, duplicating the first token.
	// NOTE(review): a model legitimately emitting the same token twice
	// in a row would also trip this — acceptable for a regression guard.
	if len(received) >= 2 {
		Expect(received[0]).ToNot(Equal(received[1]),
			"First content token was duplicated: %v", received[:2])
	}
})

It("returns logprobs in chat completions when requested", func() {
if runtime.GOOS != "linux" {
Skip("test only on linux")
Expand Down
Loading