Update Llama.cpp & Llama 3 Support #55

Merged (9 commits) on May 13, 2024
4 changes: 3 additions & 1 deletion .github/workflows/build-and-test.yml
@@ -136,4 +136,6 @@ jobs:
needs: [buildandtest_ios, buildandtest_visionos, buildandtest_macos, buildandtestuitests_ios, buildandtestuitests_ipad, buildandtestuitests_visionos]
uses: StanfordSpezi/.github/.github/workflows/create-and-upload-coverage-report.yml@v2
with:
coveragereports: 'SpeziLLM-iOS.xcresult SpeziLLM-visionOS.xcresult SpeziLLM-macOS.xcresult TestApp-iOS.xcresult TestApp-iPad.xcresult TestApp-visionOS.xcresult'
coveragereports: 'SpeziLLM-iOS.xcresult SpeziLLM-visionOS.xcresult SpeziLLM-macOS.xcresult TestApp-iOS.xcresult TestApp-iPad.xcresult TestApp-visionOS.xcresult'
secrets:
token: ${{ secrets.CODECOV_TOKEN }}
2 changes: 1 addition & 1 deletion Package.swift
@@ -28,7 +28,7 @@ let package = Package(
],
dependencies: [
.package(url: "https://github.com/StanfordBDHG/OpenAI", .upToNextMinor(from: "0.2.8")),
.package(url: "https://github.com/StanfordBDHG/llama.cpp", .upToNextMinor(from: "0.2.1")),
.package(url: "https://github.com/StanfordBDHG/llama.cpp", .upToNextMinor(from: "0.3.3")),
.package(url: "https://github.com/StanfordSpezi/Spezi", from: "1.2.1"),
.package(url: "https://github.com/StanfordSpezi/SpeziFoundation", from: "1.0.4"),
.package(url: "https://github.com/StanfordSpezi/SpeziStorage", from: "1.0.2"),
2 changes: 1 addition & 1 deletion README.md
@@ -47,7 +47,7 @@ Spezi LLM provides a number of targets to help developers integrate LLMs in thei
- [SpeziLLM](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillm): Base infrastructure of LLM execution in the Spezi ecosystem.
- [SpeziLLMLocal](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmlocal): Local LLM execution capabilities directly on-device. Enables running open-source LLMs like [Meta's Llama2 models](https://ai.meta.com/llama/).
- [SpeziLLMLocalDownload](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmlocaldownload): Download and storage manager of local Language Models, including onboarding views.
- [SpeziLLMOpenAI](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmopenai): Integration with [OpenAIs GPT models](https://openai.com/gpt-4) via using OpenAIs API service.
- [SpeziLLMOpenAI](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmopenai): Integration with OpenAI’s GPT models using OpenAI’s API service.
- [SpeziLLMFog](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmfog): Discover and dispatch LLM inference jobs to Fog node resources within the local network.

The section below highlights the setup and basic use of the [SpeziLLMLocal](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmlocal), [SpeziLLMOpenAI](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmopenai), and [SpeziLLMFog](https://swiftpackageindex.com/stanfordspezi/spezillm/documentation/spezillmfog) targets in order to integrate Language Models in a Spezi-based application.
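
As a rough sketch of how that local setup reads once this PR lands, mirroring the LLMLocalChatTestView change further down in this diff; the model path, the parameter values, and the import line are illustrative assumptions, not part of the README change itself:

import SpeziLLMLocal

// Minimal sketch: a local LLM schema configured with the new Llama 3 prompt format.
// The path and numeric values are placeholders taken from the test app below.
let schema = LLMLocalSchema(
    modelPath: .cachesDirectory.appending(path: "llm.gguf"),
    parameters: .init(maxOutputLength: 512),
    contextParameters: .init(contextWindowSize: 1024),
    formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3
)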
Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift
@@ -119,16 +119,6 @@
wrapped.rope_freq_scale = newValue
}
}

/// Set the usage of experimental `mul_mat_q` kernels
var useMulMatQKernels: Bool {
get {
wrapped.mul_mat_q
}
set {
wrapped.mul_mat_q = newValue
}
}

/// If `true`, offload the KQV ops (including the KV cache) to GPU
var offloadKQV: Bool {
@@ -173,10 +163,10 @@
/// If `true`, the mode is set to embeddings only
var embeddingsOnly: Bool {
get {
wrapped.embedding
wrapped.embeddings

Codecov / codecov/patch warning: added line #L166 in Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift was not covered by tests.
}
set {
wrapped.embedding = newValue
wrapped.embeddings = newValue

Codecov / codecov/patch warning: added line #L169 in Sources/SpeziLLMLocal/Configuration/LLMLocalContextParameters.swift was not covered by tests.
}
}

@@ -191,7 +181,6 @@
/// - threadCountBatch: Number of threads used by LLM for batch processing, defaults to the processor count of the device.
/// - ropeFreqBase: RoPE base frequency, defaults to `0` indicating the default from model.
/// - ropeFreqScale: RoPE frequency scaling factor, defaults to `0` indicating the default from model.
/// - useMulMatQKernels: Usage of experimental `mul_mat_q` kernels, defaults to `true`.
/// - offloadKQV: Offloads the KQV ops (including the KV cache) to GPU, defaults to `true`.
/// - kvKeyType: ``GGMLType`` of the key of the KV cache, defaults to ``GGMLType/f16``.
/// - kvValueType: ``GGMLType`` of the value of the KV cache, defaults to ``GGMLType/f16``.
@@ -205,7 +194,6 @@
threadCountBatch: UInt32 = .init(ProcessInfo.processInfo.processorCount),
ropeFreqBase: Float = 0.0,
ropeFreqScale: Float = 0.0,
useMulMatQKernels: Bool = true,
offloadKQV: Bool = true,
kvKeyType: GGMLType = .f16,
kvValueType: GGMLType = .f16,
@@ -221,7 +209,6 @@
self.threadCountBatch = threadCountBatch
self.ropeFreqBase = ropeFreqBase
self.ropeFreqScale = ropeFreqScale
self.useMulMatQKernels = useMulMatQKernels
self.offloadKQV = offloadKQV
self.kvKeyType = kvKeyType
self.kvValueType = kvValueType
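Call-site impact, as a hedged sketch: code that previously passed the removed `useMulMatQKernels` argument now simply drops it, and the remaining defaults are unchanged. The `contextWindowSize` value below is illustrative, taken from the test-app change later in this diff.

// Before this PR (no longer compiles): the experimental kernel toggle was an init argument.
// let contextParameters = LLMLocalContextParameters(contextWindowSize: 1024, useMulMatQKernels: true)

// After this PR: drop the argument; all other parameters keep their defaults.
let contextParameters = LLMLocalContextParameters(contextWindowSize: 1024)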
80 changes: 80 additions & 0 deletions Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift
PSchmiedmayer marked this conversation as resolved.
@@ -12,6 +12,86 @@
extension LLMLocalSchema {
/// Holds default prompt formatting strategies for [Llama2](https://ai.meta.com/llama/) as well as [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/) models.
public enum PromptFormattingDefaults {
/// Prompt formatting closure for the [Llama3](https://ai.meta.com/llama/) model
public static let llama3: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length
/// BOS token of the LLM, used at the start of each prompt passage.
let BEGINOFTEXT = "<|begin_of_text|>"
/// The system identifier.
let SYSTEM = "system"
/// The user identifier.
let USER = "user"
/// The assistant identifier.
let ASSISTANT = "assistant"
/// The start token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|>
let STARTHEADERID = "<|start_header_id|>"
/// The end token for enclosing the role of a particular message, e.g. <|start_header_id|>{role}<|end_header_id|>
let ENDHEADERID = "<|end_header_id|>"
/// The token that signifies the end of the message in a turn.
let EOTID = "<|eot_id|>"

guard chat.first?.role == .system else {
throw LLMLocalError.illegalContext
}

var systemPrompts: [String] = []
var initialUserPrompt: String = ""

for contextEntity in chat {
if contextEntity.role != .system {
if contextEntity.role == .user {
initialUserPrompt = contextEntity.content
break
} else {
throw LLMLocalError.illegalContext
}
}

systemPrompts.append(contextEntity.content)
}

/// Build the initial Llama3 prompt structure
///
/// Template of the prompt structure:
/// <|begin_of_text|>
/// <|start_header_id|>user<|end_header_id|>
/// {{ user_message }}<|eot_id|>
/// <|start_header_id|>assistant<|end_header_id|>
var prompt = """
\(BEGINOFTEXT)
\(STARTHEADERID)\(SYSTEM)\(ENDHEADERID)
\(systemPrompts.joined(separator: " "))\(EOTID)

\(STARTHEADERID)\(USER)\(ENDHEADERID)
\(initialUserPrompt)\(EOTID)

""" + " " // Add a spacer to the generated output from the model

for contextEntity in chat.dropFirst(2) {
if contextEntity.role == .assistant() {
/// Append response from assistant to the Llama3 prompt structure
prompt += """
\(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID)
\(contextEntity.content)
\(EOTID)
"""
} else if contextEntity.role == .user {
/// Append response from user to the Llama3 prompt structure
prompt += """
\(STARTHEADERID)\(USER)\(ENDHEADERID)
\(contextEntity.content)
\(EOTID)
""" + " " // Add a spacer to the generated output from the model
}
}

prompt +=
"""
\(STARTHEADERID)\(ASSISTANT)\(ENDHEADERID)
"""

return prompt
}

Codecov / codecov/patch warning: added lines #L16 - L93 in Sources/SpeziLLMLocal/LLMLocalSchema+PromptFormatting.swift were not covered by tests.

/// Prompt formatting closure for the [Llama2](https://ai.meta.com/llama/) model
public static let llama2: (@Sendable (LLMContext) throws -> String) = { chat in // swiftlint:disable:this closure_body_length
/// BOS token of the LLM, used at the start of each prompt passage.
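For orientation, the constant below shows roughly the string the `llama3` closure added above produces for a context holding one system message followed by one user message; the message texts are made up, and the extra spacer whitespace the closure appends between passages is omitted here.

// Illustrative only: approximate rendering of `PromptFormattingDefaults.llama3`
// for a two-entry context (system + user); spacer whitespace is not shown.
let renderedLlama3Prompt = """
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|>

<|start_header_id|>user<|end_header_id|>
What is Spezi LLM?<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
"""

The trailing assistant header leaves the final turn open, so the model continues the string with the assistant response.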
2 changes: 1 addition & 1 deletion Sources/SpeziLLMLocal/LLMLocalSession+Generation.swift
@@ -131,7 +131,7 @@
return
}

var nextStringPiece = String(llama_token_to_piece(self.modelContext, nextTokenId))
var nextStringPiece = String(llama_token_to_piece(self.modelContext, nextTokenId, true))

Codecov / codecov/patch warning: added line #L134 in Sources/SpeziLLMLocal/LLMLocalSession+Generation.swift was not covered by tests.
// As first character is sometimes randomly prefixed by a single space (even though prompt has an additional character)
if decodedTokens == 0 && nextStringPiece.starts(with: " ") {
nextStringPiece = String(nextStringPiece.dropFirst())
2 changes: 1 addition & 1 deletion Sources/SpeziLLMLocal/LLMLocalSession+Tokenization.swift
@@ -75,7 +75,7 @@
/// - Note: Used only for debug purposes
func detokenize(tokens: [LLMLocalToken]) -> [(LLMLocalToken, String)] {
tokens.reduce(into: [(LLMLocalToken, String)]()) { partialResult, token in
partialResult.append((token, String(llama_token_to_piece(self.modelContext, token))))
partialResult.append((token, String(llama_token_to_piece(self.modelContext, token, true))))

Codecov / codecov/patch warning: added line #L78 in Sources/SpeziLLMLocal/LLMLocalSession+Tokenization.swift was not covered by tests.
}
}
}
Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager+DefaultUrls.swift
@@ -12,6 +12,17 @@
extension LLMLocalDownloadManager {
/// Defaults of possible LLMs to download via the ``LLMLocalDownloadManager``.
public enum LLMUrlDefaults {
/// Llama 3 8B model with `Q4_K_M` quantization in its instruct variation (~5 GB)
public static var llama3InstructModelUrl: URL {
guard let url = URL(string: "https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q4_K_M.gguf") else {
preconditionFailure("""
SpeziLLM: Invalid LLMUrlDefaults LLM download URL.
""")
}

return url
}

Codecov / codecov/patch warning: added lines #L16 - L24 in Sources/SpeziLLMLocalDownload/LLMLocalDownloadManager+DefaultUrls.swift were not covered by tests.

/// LLama 2 7B model with `Q4_K_M` quantization in its chat variation (~3.5GB)
public static var llama2ChatModelUrl: URL {
guard let url = URL(string: "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf") else {
3 changes: 2 additions & 1 deletion Tests/UITests/TestApp/LLMLocal/LLMLocalChatTestView.swift
@@ -27,7 +27,8 @@ struct LLMLocalChatTestView: View {
with: LLMLocalSchema(
modelPath: .cachesDirectory.appending(path: "llm.gguf"),
parameters: .init(maxOutputLength: 512),
contextParameters: .init(contextWindowSize: 1024)
contextParameters: .init(contextWindowSize: 1024),
formatChat: LLMLocalSchema.PromptFormattingDefaults.llama3
)
)
}
@@ -20,7 +20,7 @@ struct LLMLocalOnboardingDownloadView: View {
var body: some View {
LLMLocalDownloadView(
downloadDescription: "LLM_DOWNLOAD_DESCRIPTION",
llmDownloadUrl: LLMLocalDownloadManager.LLMUrlDefaults.llama2ChatModelUrl /// By default, download the Llama2 model
llmDownloadUrl: LLMLocalDownloadManager.LLMUrlDefaults.llama3InstructModelUrl /// By default, download the Llama3 model
) {
onboardingNavigationPath.nextStep()
}
2 changes: 1 addition & 1 deletion Tests/UITests/TestApp/Resources/Localizable.xcstrings
@@ -56,7 +56,7 @@
"en" : {
"stringUnit" : {
"state" : "translated",
"value" : "By default, the application downloads the Llama 2 7B model in its chat variation. The size of the model is around 3.5GB."
"value" : "By default, the application downloads the Llama 3 8B model in its instruct variation. The size of the model is around 5GB."
}
}
}