
Commit 3c3f148: Abort generation on Ctrl-C

zcbenz committed Oct 2, 2024 (parent: 4e98dea)
Showing 2 changed files with 26 additions and 6 deletions.
src/base.ts (9 changes: 8 additions & 1 deletion)
@@ -180,6 +180,7 @@ export async function loadModel(dir: string): Promise<BaseModel> {
  * Options passed to step.
  */
 export interface StepOptions {
+  signal?: AbortSignal;
   kvCache?: BaseKVCache[];
   topP?: number;
   temperature?: number;
@@ -192,6 +193,7 @@ export async function* step(promptEmbeds: mx.array,
                             model: BaseModel,
                             eosToken: number,
                             {
+                              signal,
                               kvCache,
                               topP = 0.8,
                               temperature = 1,
@@ -209,10 +211,12 @@ export async function* step(promptEmbeds: mx.array,
 
   // Forward prompt by steps so we don't use too much RAM.
   // See also https://github.com/ml-explore/mlx-examples/pull/931
-  let nextToken: number;
+  let nextToken = eosToken;
   const prefillStepSize = 512;
   const embeddingsSize = promptEmbeds.shape[1];
   for (let offset = 0; offset < embeddingsSize;) {
+    if (signal?.aborted)
+      break;
     await mx.tidy(async () => {
       const size = Math.min(prefillStepSize, embeddingsSize - offset);
       const chunk = promptEmbeds.index(mx.Slice(), mx.Slice(offset, offset + size));
@@ -231,6 +235,9 @@ export async function* step(promptEmbeds: mx.array,
     // Quit after getting EOS.
     if (nextToken == eosToken)
       break;
+    // The generation is aborted.
+    if (signal?.aborted)
+      break;
     yield nextToken;
     // Forward the token to model and free intermediate tensors.
     [ nextToken ] = await mx.tidy(async () => {
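In base.ts, the change threads an optional AbortSignal through StepOptions and has the step() generator poll signal?.aborted in two places: at the top of the prefill loop and between tokens right after the EOS check, so cancellation takes effect between steps instead of interrupting a forward pass. nextToken now also starts as eosToken, presumably so that an abort during prefill leaves nothing further to generate. Below is a minimal, self-contained sketch of that polling pattern; the generateTokens and produceNextToken names are illustrative placeholders, not part of this repository.

```ts
// Sketch: cooperative cancellation in an async token generator.
// `produceNextToken` stands in for the model's forward step.
async function* generateTokens(
  produceNextToken: () => Promise<number>,
  eosToken: number,
  signal?: AbortSignal,
): AsyncGenerator<number> {
  while (true) {
    const token = await produceNextToken();
    // Stop on end-of-sequence.
    if (token === eosToken)
      break;
    // Stop when the caller aborted; checked between tokens, so the
    // current step is allowed to finish before generation ends.
    if (signal?.aborted)
      break;
    yield token;
  }
}
```

A consumer passes controller.signal when iterating, and the generator simply stops yielding after abort() is called rather than throwing.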
src/chat.ts (23 changes: 18 additions & 5 deletions)
@@ -3,7 +3,7 @@
 import readline from 'node:readline/promises';
 import {styleText} from 'node:util';
 import {core as mx} from '@frost-beta/mlx';
-import {LLM, parseArgs, loadLLM} from './llm.js';
+import {LLMGenerateOptions, LLM, parseArgs, loadLLM} from './llm.js';
 import {Message} from './tokenizer.js';
 
 const [ argv, options ] = parseArgs(process.argv.slice(2));
@@ -12,9 +12,9 @@ if (argv.length < 1) {
   process.exit(0);
 }
 
-main(argv[0]);
+main(argv[0], options);
 
-async function main(dir: string) {
+async function main(dir: string, options: LLMGenerateOptions) {
   const llm = await loadLLM(dir);
 
   // Records the messages.
@@ -34,17 +34,29 @@ async function main(dir: string) {
     output: process.stdout,
   });
   rl.once('close', () => process.stdout.write('\n'));
+
+  // Chat loop.
   while (!process.stdin.closed) {
     const question = await rl.question(youPrompt);
     messages.push({role: 'user', content: question});
     process.stdout.write(botPrompt);
-    const reply = await talk(llm, messages.at(-1), messages.length == 1);
+    const reply = await talk(rl, llm, messages.at(-1), messages.length == 1, options);
     messages.push({role: 'assistant', content: reply});
   }
 }
 
 // Send full messages history to model and get response.
-async function talk(llm: LLM, message: Message, firstMessage: boolean) {
+async function talk(rl: readline.Interface,
+                    llm: LLM,
+                    message: Message,
+                    firstMessage: boolean,
+                    options: LLMGenerateOptions) {
+  // Interrupt generation when Ctrl-C is pressed.
+  const controller = new AbortController();
+  options.signal = controller.signal;
+  const abort = () => controller.abort();
+  rl.on('SIGINT', abort);
+
   // Translate the messages to tokens.
   const promptEmbeds = await llm.applyChatTemplate([ message ], {
     // Some chat templates add a system prompt automatically and we need to trim
@@ -73,5 +85,6 @@ async function talk(llm: LLM, message: Message, firstMessage: boolean) {
     // it is good chance to just release all the memory cache.
     mx.metal.clearCache();
   }
+  rl.removeListener('SIGINT', abort);
   return result;
 }
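In chat.ts, each reply gets its own AbortController: readline's 'SIGINT' event, emitted when Ctrl-C is pressed while the prompt interface owns the terminal, is forwarded to controller.abort(), the signal is handed to generation through options.signal, and the listener is removed once the reply finishes so a later Ctrl-C is not tied to a generation that already ended. Here is a stripped-down sketch of that wiring, assuming a generate callback that honors an AbortSignal; replyOnce and generate are illustrative names, not part of this repository.

```ts
import readline from 'node:readline/promises';

// Sketch: run one generation and abort it when Ctrl-C is pressed.
// `generate` stands in for the model call that honors an AbortSignal.
async function replyOnce(
  rl: readline.Interface,
  generate: (signal: AbortSignal) => Promise<string>,
): Promise<string> {
  const controller = new AbortController();
  const abort = () => controller.abort();
  // While a prompt is active, readline surfaces Ctrl-C as a 'SIGINT' event on the interface.
  rl.on('SIGINT', abort);
  try {
    return await generate(controller.signal);
  } finally {
    // Detach so the next Ctrl-C only affects whatever generation is running then.
    rl.removeListener('SIGINT', abort);
  }
}
```

The commit removes the listener after the generation returns; the try/finally above is just one way to guarantee that cleanup in a standalone sketch.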
