
Commit 2dafb02

withDefaultDevice() (#15)
Expose SHLLM.withDefaultDevice(_:_:) to allow clients to run inference on the CPU or GPU.
1 parent 0811eb0 commit 2dafb02
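
In client code this looks roughly like the following sketch (runInference() is a hypothetical stand-in for the caller's own work):

import SHLLM

// Run the enclosed work with the CPU as MLX's default device;
// the previous default applies again once the closure returns.
let answer = try await SHLLM.withDefaultDevice(.cpu) {
    try await runInference()
}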

2 files changed (+93, −0)

Sources/SHLLM/SHLLM.swift

Lines changed: 26 additions & 0 deletions
@@ -40,6 +40,30 @@ public enum SHLLM {
         return true
     }
 
+    public static func withDefaultDevice<R>(
+        _ device: MLX.DeviceType,
+        _ body: () throws -> R
+    ) rethrows -> R {
+        switch device {
+        case .cpu:
+            try MLX.Device.withDefaultDevice(.cpu, body)
+        case .gpu:
+            try MLX.Device.withDefaultDevice(.gpu, body)
+        }
+    }
+
+    public static func withDefaultDevice<R>(
+        _ device: MLX.DeviceType,
+        _ body: () async throws -> R
+    ) async rethrows -> R {
+        switch device {
+        case .cpu:
+            try await MLX.Device.withDefaultDevice(.cpu, body)
+        case .gpu:
+            try await MLX.Device.withDefaultDevice(.gpu, body)
+        }
+    }
+
     static var assertSupportedDevice: Void {
         get throws {
             guard isSupportedDevice else {
@@ -60,6 +84,8 @@ public enum SHLLM {
 
 extension Chat.Message: @retroactive @unchecked Sendable {}
 
+@_exported import enum MLX.DeviceType
+
 @_exported import protocol MLXLMCommon.LanguageModel
 
 @_exported import class MLXLLM.Gemma2Model
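
Both overloads simply map the re-exported MLX.DeviceType onto a concrete MLX.Device before delegating to MLX. Because the synchronous overload is rethrows, no try is needed when the closure cannot throw. A minimal sketch of the synchronous form (the array values are illustrative only):

import MLX
import SHLLM

// Evaluate a small MLX computation with the CPU as the default device.
let sum = SHLLM.withDefaultDevice(.cpu) {
    MLXArray([1, 2, 3]) + MLXArray([4, 5, 6])
}

The async overload exists so callers can await work, such as streaming tokens, inside the device scope, which is what the tests below do.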

Tests/SHLLMTests/SHLLMTests.swift

Lines changed: 67 additions & 0 deletions
@@ -27,3 +27,70 @@ func recommendedMaxWorkingSetSize() async throws {
     let recommended = SHLLM.recommendedMaxWorkingSetSize
     #expect(recommended > 0)
 }
+
+// NOTE: Running inference on the CPU takes way too long.
+@Test(.enabled(if: false))
+func onCPU() async throws {
+    guard SHLLM.isSupportedDevice else {
+        Swift.print("⚠️ Metal GPU not available")
+        return
+    }
+
+    let input: UserInput = .init(messages: [
+        ["role": "system", "content": "You are a helpful assistant."],
+        ["role": "user", "content": "What is the meaning of life?"],
+    ])
+
+    try await SHLLM.withDefaultDevice(.cpu) {
+        guard let llm = try loadModel(
+            directory: LLM.gemma3_1B,
+            input: input,
+            customConfiguration: { config in
+                var config = config
+                config.extraEOSTokens = ["<end_of_turn>"]
+                return config
+            }
+        ) as LLM<Gemma3TextModel>? else { return }
+
+        var response = ""
+        for try await token in llm.text {
+            response += token
+        }
+
+        Swift.print(response)
+        #expect(!response.isEmpty)
+    }
+}
+
+@Test()
+func onGPU() async throws {
+    guard SHLLM.isSupportedDevice else {
+        Swift.print("⚠️ Metal GPU not available")
+        return
+    }
+
+    let input: UserInput = .init(messages: [
+        ["role": "system", "content": "You are a helpful assistant."],
+        ["role": "user", "content": "What is the meaning of life?"],
+    ])
+
+    try await SHLLM.withDefaultDevice(.gpu) {
+        guard let llm = try loadModel(
+            directory: LLM.gemma3_1B,
+            input: input,
+            customConfiguration: { config in
+                var config = config
+                config.extraEOSTokens = ["<end_of_turn>"]
+                return config
+            }
+        ) as LLM<Gemma3TextModel>? else { return }
+
+        var response = ""
+        for try await token in llm.text {
+            response += token
+        }
+
+        Swift.print(response)
+        #expect(!response.isEmpty)
+    }
+}
