
Commit 3c3f148: Abort generation on Ctrl-C

zcbenz committed Oct 2, 2024 (parent: 4e98dea)
Showing 2 changed files with 26 additions and 6 deletions.
src/base.ts (9 changes: 8 additions & 1 deletion)
@@ -180,6 +180,7 @@ export async function loadModel(dir: string): Promise<BaseModel> {
  * Options passed to step.
  */
 export interface StepOptions {
+  signal?: AbortSignal;
   kvCache?: BaseKVCache[];
   topP?: number;
   temperature?: number;
@@ -192,6 +193,7 @@ export async function* step(promptEmbeds: mx.array,
                             model: BaseModel,
                             eosToken: number,
                             {
+                              signal,
                               kvCache,
                               topP = 0.8,
                               temperature = 1,
@@ -209,10 +211,12 @@ export async function* step(promptEmbeds: mx.array,
 
   // Forward prompt by steps so we don't use too much RAM.
   // See also https://github.com/ml-explore/mlx-examples/pull/931
-  let nextToken: number;
+  let nextToken = eosToken;
   const prefillStepSize = 512;
   const embeddingsSize = promptEmbeds.shape[1];
   for (let offset = 0; offset < embeddingsSize;) {
+    if (signal?.aborted)
+      break;
     await mx.tidy(async () => {
       const size = Math.min(prefillStepSize, embeddingsSize - offset);
       const chunk = promptEmbeds.index(mx.Slice(), mx.Slice(offset, offset + size));
@@ -231,6 +235,9 @@ export async function* step(promptEmbeds: mx.array,
     // Quit after getting EOS.
     if (nextToken == eosToken)
       break;
+    // The generation is aborted.
+    if (signal?.aborted)
+      break;
     yield nextToken;
     // Forward the token to model and free intermediate tensors.
     [ nextToken ] = await mx.tidy(async () => {
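In base.ts, the change threads an optional AbortSignal through StepOptions and has the step() generator poll signal?.aborted in two places: at the top of the prefill loop and between tokens right after the EOS check, so cancellation takes effect between steps instead of interrupting a forward pass. nextToken now also starts as eosToken, presumably so that an abort during prefill leaves nothing further to generate. Below is a minimal, self-contained sketch of that polling pattern; the generateTokens and produceNextToken names are illustrative placeholders, not part of this repository.

```ts
// Sketch: cooperative cancellation in an async token generator.
// `produceNextToken` stands in for the model's forward step.
async function* generateTokens(
  produceNextToken: () => Promise<number>,
  eosToken: number,
  signal?: AbortSignal,
): AsyncGenerator<number> {
  while (true) {
    const token = await produceNextToken();
    // Stop on end-of-sequence.
    if (token === eosToken)
      break;
    // Stop when the caller aborted; checked between tokens, so the
    // current step is allowed to finish before generation ends.
    if (signal?.aborted)
      break;
    yield token;
  }
}
```

A consumer passes controller.signal when iterating, and the generator simply stops yielding after abort() is called rather than throwing.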
src/chat.ts (23 changes: 18 additions & 5 deletions)
@@ -3,7 +3,7 @@
 import readline from 'node:readline/promises';
 import {styleText} from 'node:util';
 import {core as mx} from '@frost-beta/mlx';
-import {LLM, parseArgs, loadLLM} from './llm.js';
+import {LLMGenerateOptions, LLM, parseArgs, loadLLM} from './llm.js';
 import {Message} from './tokenizer.js';
 
 const [ argv, options ] = parseArgs(process.argv.slice(2));
@@ -12,9 +12,9 @@ if (argv.length < 1) {
   process.exit(0);
 }
 
-main(argv[0]);
+main(argv[0], options);
 
-async function main(dir: string) {
+async function main(dir: string, options: LLMGenerateOptions) {
   const llm = await loadLLM(dir);
 
   // Records the messages.
@@ -34,17 +34,29 @@ async function main(dir: string) {
     output: process.stdout,
   });
   rl.once('close', () => process.stdout.write('\n'));
+
+  // Chat loop.
   while (!process.stdin.closed) {
     const question = await rl.question(youPrompt);
     messages.push({role: 'user', content: question});
     process.stdout.write(botPrompt);
-    const reply = await talk(llm, messages.at(-1), messages.length == 1);
+    const reply = await talk(rl, llm, messages.at(-1), messages.length == 1, options);
     messages.push({role: 'assistant', content: reply});
   }
 }
 
 // Send full messages history to model and get response.
-async function talk(llm: LLM, message: Message, firstMessage: boolean) {
+async function talk(rl: readline.Interface,
+                    llm: LLM,
+                    message: Message,
+                    firstMessage: boolean,
+                    options: LLMGenerateOptions) {
+  // Interrupt generation when Ctrl-C is pressed.
+  const controller = new AbortController();
+  options.signal = controller.signal;
+  const abort = () => controller.abort();
+  rl.on('SIGINT', abort);
+
   // Translate the messages to tokens.
   const promptEmbeds = await llm.applyChatTemplate([ message ], {
     // Some chat templates add a system prompt automatically and we need to trim
@@ -73,5 +85,6 @@ async function talk(llm: LLM, message: Message, firstMessage: boolean) {
     // it is good chance to just release all the memory cache.
     mx.metal.clearCache();
   }
+  rl.removeListener('SIGINT', abort);
   return result;
 }
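In chat.ts, each reply gets its own AbortController: readline's 'SIGINT' event, emitted when Ctrl-C is pressed while the prompt interface owns the terminal, is forwarded to controller.abort(), the signal is handed to generation through options.signal, and the listener is removed once the reply finishes so a later Ctrl-C is not tied to a generation that already ended. Here is a stripped-down sketch of that wiring, assuming a generate callback that honors an AbortSignal; replyOnce and generate are illustrative names, not part of this repository.

```ts
import readline from 'node:readline/promises';

// Sketch: run one generation and abort it when Ctrl-C is pressed.
// `generate` stands in for the model call that honors an AbortSignal.
async function replyOnce(
  rl: readline.Interface,
  generate: (signal: AbortSignal) => Promise<string>,
): Promise<string> {
  const controller = new AbortController();
  const abort = () => controller.abort();
  // While a prompt is active, readline surfaces Ctrl-C as a 'SIGINT' event on the interface.
  rl.on('SIGINT', abort);
  try {
    return await generate(controller.signal);
  } finally {
    // Detach so the next Ctrl-C only affects whatever generation is running then.
    rl.removeListener('SIGINT', abort);
  }
}
```

The commit removes the listener after the generation returns; the try/finally above is just one way to guarantee that cleanup in a standalone sketch.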
