Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions backend/cpp/llama-cpp/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1716,12 +1716,23 @@ class BackendServiceImpl final : public backend::Backend::Service {
}
};

// Process first result
// Process first result.
// When TASK_RESPONSE_TYPE_OAI_CHAT is used, the first token may
// produce a JSON array with a role-init element followed by the
// actual content element. We must only attach chat deltas to the
// content element — attaching to both would duplicate the first
// token since oaicompat_msg_diffs is the same for both.
json first_res_json = first_result->to_json();
if (first_res_json.is_array()) {
for (const auto & res : first_res_json) {
auto reply = build_reply_from_json(res, first_result.get());
attach_chat_deltas(reply, first_result.get());
// Skip chat deltas for role-init elements (have "role" in
// delta but no content/reasoning diffs of their own).
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
res["choices"][0].value("delta", json::object()).contains("role");
if (!is_role_init) {
attach_chat_deltas(reply, first_result.get());
}
writer->Write(reply);
}
} else {
Expand All @@ -1745,7 +1756,11 @@ class BackendServiceImpl final : public backend::Backend::Service {
if (res_json.is_array()) {
for (const auto & res : res_json) {
auto reply = build_reply_from_json(res, result.get());
attach_chat_deltas(reply, result.get());
bool is_role_init = res.contains("choices") && !res["choices"].empty() &&
res["choices"][0].value("delta", json::object()).contains("role");
if (!is_role_init) {
attach_chat_deltas(reply, result.get());
}
writer->Write(reply);
}
} else {
Expand Down
36 changes: 36 additions & 0 deletions core/http/app_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,42 @@ parameters:
Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
})

// Regression spec: the llama.cpp gRPC backend used to attach the same
// ChatDelta to every element of the role-init JSON array, so the first
// streamed token arrived twice. Stream a chat completion and verify
// consecutive content deltas are not identical.
It("does not duplicate the first content token in streaming chat completions", Label("llama-gguf", "llama-gguf-stream"), func() {
	if runtime.GOOS != "linux" {
		Skip("test supported only on linux")
	}

	stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{
		Model:    "testmodel.ggml",
		Messages: []openai.ChatCompletionMessage{{Role: "user", Content: testPrompt}},
	})
	Expect(err).ToNot(HaveOccurred())
	defer stream.Close()

	// Collect every non-empty content delta in arrival order.
	var received []string
	for {
		chunk, recvErr := stream.Recv()
		if recvErr == io.EOF {
			break
		}
		Expect(recvErr).ToNot(HaveOccurred())
		if len(chunk.Choices) == 0 {
			continue
		}
		if token := chunk.Choices[0].Delta.Content; token != "" {
			received = append(received, token)
		}
	}

	Expect(received).ToNot(BeEmpty(), "Expected streaming content tokens")
	// The first content token should appear exactly once.
	// A bug in grpc-server.cpp caused the role-init array element
	// to get the same ChatDelta stamped, duplicating the first token.
	// NOTE(review): a model legitimately emitting the same token twice
	// in a row would also trip this — acceptable for a regression guard.
	if len(received) >= 2 {
		Expect(received[0]).ToNot(Equal(received[1]),
			"First content token was duplicated: %v", received[:2])
	}
})

It("returns logprobs in chat completions when requested", func() {
if runtime.GOOS != "linux" {
Skip("test only on linux")
Expand Down
Loading