From fb4bd92e1f61bda0aa6ea7f3be9a8d28106f93d1 Mon Sep 17 00:00:00 2001
From: Sawyer Hollenshead
Date: Thu, 29 Dec 2022 14:58:28 -0800
Subject: [PATCH] Recursive summarization

---
Review notes (text between the three-dash line and the first diff is not part of the commit):
 * src/completions.js: chunkTheContent() now flushes the trailing partial chunk
   (previously the end of every multi-chunk page was dropped) and no longer
   emits an empty chunk when the first sentence alone exceeds the size limit.
 * src/completions.js: filled in the createCompletion() JSDoc summary/@returns.
 * src/cli-prompts.js: removed the trailing ":" from the first combination
   prompt; getCompletion() already appends one.
 * Per-file line counts are unchanged, so the hunk headers and diffstat below
   still hold. The "index" blob ids of the two amended files were not recomputed.
 * Line breaks were restored from the +/- markers; indentation follows the
   Prettier / npm lockfile conventions and should be confirmed against the repo.

 index.js           |  81 -----------------------------------
 package-lock.json  |  22 +++++++++-
 package.json       |   5 ++-
 src/cli-prompts.js |  54 ++++++++++++++++++++++++
 src/completions.js | 102 +++++++++++++++++++++++++++++++++++++++++++++
 src/index.js       |  33 +++++++++++++++
 src/logger.js      |  12 ++++++
 src/parse-url.js   |  19 +++++++++
 tsconfig.json      |  12 ++++++
 9 files changed, 256 insertions(+), 84 deletions(-)
 delete mode 100644 index.js
 create mode 100644 src/cli-prompts.js
 create mode 100644 src/completions.js
 create mode 100644 src/index.js
 create mode 100644 src/logger.js
 create mode 100644 src/parse-url.js
 create mode 100644 tsconfig.json

diff --git a/index.js b/index.js
deleted file mode 100644
index 2eff2e6..0000000
--- a/index.js
+++ /dev/null
@@ -1,81 +0,0 @@
-import { Configuration, OpenAIApi } from "openai";
-import { JSDOM } from "jsdom";
-import { Readability } from "@mozilla/readability";
-import cliSpinners from "cli-spinners";
-import dotenv from "dotenv";
-import inquirer from "inquirer";
-import { oraPromise } from "ora";
-import * as colors from "yoctocolors";
-
-// Load .env file
-dotenv.config();
-
-const openai = new OpenAIApi(
-  new Configuration({
-    apiKey: process.env.OPENAI_KEY,
-  })
-);
-
-/**
- * Get the main content of the URL
- */
-const url = process.argv[2];
-if (!url) {
-  console.error("Pass a URL as the last argument");
-  process.exit(1);
-}
-const dom = await JSDOM.fromURL(url);
-const article = new Readability(dom.window.document).parse();
-if (!article) {
-  console.error("Couldn't parse the URL");
-  process.exit(1);
-}
-console.log(`${colors.bgCyan(colors.black(` ${article.title} `))}\n`);
-
-/**
- * Get the prompt from the user
- */
-const customPromptChoice = "[Custom prompt]";
-const answers = await inquirer.prompt([
-  {
-    type: "list",
-    name: "prompt",
-    message: "Select prompt:",
-    choices: [
-      "Summarize this",
-      "List 10 key takeaways",
-      "List all entities, grouped by type or category",
-      "Write an abstract for this",
-      customPromptChoice,
-    ],
-  },
-  {
-    type: "input",
-    name: "customPrompt",
-    message: "Custom prompt (the URL's content comes after this prompt):",
-    when: (answers) => answers.prompt === customPromptChoice,
-  },
-]);
-
-/**
- * Run the prompt and URL's content through OpenAI's API
- */
-const prompt = answers.customPrompt ?? answers.prompt;
-const content = article.textContent.replace(/\n/g, " ");
-const oraOptions = {
-  spinner: cliSpinners.earth,
-  text: "Generating response...",
-};
-const response = await oraPromise(
-  openai.createCompletion({
-    model: "text-davinci-003",
-    prompt: `${prompt}:\n\n${content}`,
-    // 0.1 provides more straightforward and consistent responses. Higher numbers provides more diverse responses.
-    temperature: 0.1,
-    max_tokens: 500,
-  }),
-  oraOptions
-);
-
-console.log(`\n${colors.bgGreen(colors.black("Response:"))}`);
-console.log(response.data.choices[0].text);
diff --git a/package-lock.json b/package-lock.json
index 939f970..e44ca09 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,7 +19,8 @@
       },
       "devDependencies": {
         "@types/jsdom": "^20.0.1",
-        "@types/node": "^18.11.18"
+        "@types/node": "^18.11.18",
+        "typescript": "^4.9.4"
       },
       "engines": {
         "node": ">=18.0.0"
@@ -1088,6 +1089,19 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/typescript": {
+      "version": "4.9.4",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.4.tgz",
+      "integrity": "sha512-Uz+dTXYzxXXbsFpM86Wh3dKCxrQqUcVMxwU54orwlJjOpO3ao8L7j5lH+dWfTwgCwIuM9GQ2kvVotzYJMXTBZg==",
+      "dev": true,
+      "bin": {
+        "tsc": "bin/tsc",
+        "tsserver": "bin/tsserver"
+      },
+      "engines": {
+        "node": ">=4.2.0"
+      }
+    },
     "node_modules/universalify": {
       "version": "0.2.0",
       "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
@@ -1966,6 +1980,12 @@
       "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-3.5.0.tgz",
       "integrity": "sha512-bI3zRmZC8K0tUz1HjbIOAGQwR2CoPQG68N5IF7gm0LBl8QSNXzkmaWnkWccCUL5uG9mCsp4sBwC8SBrNSISWew=="
     },
+    "typescript": {
+      "version": "4.9.4",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.4.tgz",
+      "integrity": "sha512-Uz+dTXYzxXXbsFpM86Wh3dKCxrQqUcVMxwU54orwlJjOpO3ao8L7j5lH+dWfTwgCwIuM9GQ2kvVotzYJMXTBZg==",
+      "dev": true
+    },
     "universalify": {
       "version": "0.2.0",
       "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
diff --git a/package.json b/package.json
index fa51679..b19699a 100644
--- a/package.json
+++ b/package.json
@@ -3,7 +3,7 @@
   "version": "1.0.0",
   "type": "module",
   "scripts": {
-    "x": "node index.js"
+    "x": "node src/index.js"
   },
   "engines": {
     "node": ">=18.0.0"
@@ -20,6 +20,7 @@
   },
   "devDependencies": {
     "@types/jsdom": "^20.0.1",
-    "@types/node": "^18.11.18"
+    "@types/node": "^18.11.18",
+    "typescript": "^4.9.4"
   }
 }
diff --git a/src/cli-prompts.js b/src/cli-prompts.js
new file mode 100644
index 0000000..74e040e
--- /dev/null
+++ b/src/cli-prompts.js
@@ -0,0 +1,54 @@
+import inquirer from "inquirer";
+
+const customPromptChoice = "[Custom prompt]";
+const prompts = [
+  {
+    prompt: "Summarize this",
+    combinationPrompt: "Combine these summaries into an overall summary",
+  },
+  {
+    prompt: "List 10 key takeaways",
+    combinationPrompt:
+      "Combine these takeaways into an overall list of 10 key takeaways",
+  },
+  {
+    prompt: "List all entities, grouped by type or category",
+    combinationPrompt:
+      "Combine these lists of entities, but preserve the grouping",
+  },
+  {
+    prompt: "Write an abstract for this",
+    combinationPrompt: "Write an abstract for this",
+  },
+];
+
+export async function runCliPrompts() {
+  const answers = await inquirer.prompt([
+    {
+      type: "list",
+      name: "prompt",
+      message: "Select prompt:",
+      choices: [...prompts.map((p) => p.prompt), customPromptChoice],
+    },
+    {
+      type: "input",
+      name: "customPrompt",
+      message: "Custom prompt (e.g 'Summarize this')",
+      when: (answers) => answers.prompt === customPromptChoice,
+    },
+    {
+      type: "input",
+      name: "customCombinationPrompt",
+      message:
+        "Custom combination prompt (e.g 'Combine these summaries into an overall summary')",
+      when: (answers) => answers.prompt === customPromptChoice,
+    },
+  ]);
+
+  const prompt = answers.customPrompt ?? answers.prompt;
+  const combinationPrompt =
+    answers.customCombinationPrompt ??
+    prompts.find((p) => p.prompt === prompt)?.combinationPrompt;
+
+  return { prompt, combinationPrompt };
+}
diff --git a/src/completions.js b/src/completions.js
new file mode 100644
index 0000000..28c7c5c
--- /dev/null
+++ b/src/completions.js
@@ -0,0 +1,102 @@
+import { Configuration, OpenAIApi } from "openai";
+import cliSpinners from "cli-spinners";
+import { oraPromise } from "ora";
+import { logger } from "./logger.js";
+
+/**
+ * Request a single completion from the OpenAI API for the given prompt
+ * @param {string} prompt
+ * @param {"text-davinci-003"|"text-curie-001"} model - Davinci is the most powerful model, but it's also the most expensive
+ * @returns the API response; callers read the generated text from `response.data.choices[0].text`
+ */
+async function createCompletion(prompt, model = "text-davinci-003") {
+  const openai = new OpenAIApi(
+    new Configuration({
+      apiKey: process.env.OPENAI_KEY,
+    })
+  );
+
+  const response = await openai.createCompletion({
+    prompt,
+    model,
+    // 0.1 provides more straightforward and consistent responses. Higher numbers provide more diverse responses.
+    temperature: 0.1,
+    max_tokens: 500,
+  });
+
+  return response;
+}
+
+/**
+ * Break the page's content into roughly equally distributed
+ * chunks while preserving sentences, so that we don't exceed
+ * the API's max token limit
+ */
+function chunkTheContent(content) {
+  const maxChunkSize = 3500 * 4; // ~1 token = 4 characters
+  const chunks = [];
+  let chunk = "";
+  for (const sentence of content.split(/(?<=[.?!])\s+/)) {
+    if (chunk && chunk.length + sentence.length > maxChunkSize) {
+      chunks.push(chunk);
+      chunk = "";
+    }
+    chunk += sentence + " ";
+  }
+  // Flush the trailing partial chunk so the end of the page isn't dropped
+  if (chunk.trim()) chunks.push(chunk);
+
+  return chunks.length > 1 ? chunks : [content];
+}
+
+export async function getCompletion({ content, prompt, combinationPrompt }) {
+  const chunks = chunkTheContent(content);
+
+  const chunkRequests =
+    // limit to 40 chunks to avoid excessive API usage
+    chunks.slice(0, 40).map((chunk, index) =>
+      oraPromise(
+        async () => {
+          const response = await createCompletion(
+            `${prompt}:\n\n###${chunk}\n\n###`
+          );
+          return { index, response };
+        },
+        {
+          spinner: cliSpinners.earth,
+          text: "Generating response...",
+        }
+      )
+    );
+
+  const resolvedRequests = await Promise.all(chunkRequests);
+  // Preserve the order of the content completions
+  const responses = resolvedRequests
+    .sort((a, b) => a.index - b.index)
+    .map((r) => r.response);
+
+  if (chunks.length === 1) return responses[0].data.choices[0].text;
+
+  /**
+   * Do one final completion against the combination of all the completions
+   */
+  const combinedCompletions = responses
+    .map((r) => r.data.choices[0].text)
+    .join("\n----\n");
+
+  const finalCompletion = await oraPromise(
+    createCompletion(
+      `${combinationPrompt}:\n\n###${combinedCompletions}\n\n###`
+    ),
+    {
+      spinner: cliSpinners.moon,
+      text: "Combining responses...",
+    }
+  );
+
+  logger.warn(
+    `Since the page's content was so long, the following response is formed by running a combination prompt against a series of responses (${chunks.length}) to smaller chunks of the content.`
+  );
+
+  return finalCompletion.data.choices[0].text;
+}
diff --git a/src/index.js b/src/index.js
new file mode 100644
index 0000000..fc3530c
--- /dev/null
+++ b/src/index.js
@@ -0,0 +1,33 @@
+import { config } from "dotenv";
+import { runCliPrompts } from "./cli-prompts.js";
+import { parseUrl } from "./parse-url.js";
+import { logger } from "./logger.js";
+import { getCompletion } from "./completions.js";
+
+// Load .env file
+config();
+
+/**
+ * 1. Get the main content of the URL
+ */
+const article = await parseUrl();
+const content = article.textContent.replace(/\n/g, " ");
+logger.info(article.title);
+
+/**
+ * 2. Get the prompt from the user
+ */
+const { prompt, combinationPrompt } = await runCliPrompts();
+
+/**
+ * 3. Run the prompt against the URL's content
+ */
+const completion = await getCompletion({
+  content,
+  prompt,
+  combinationPrompt,
+});
+
+logger.success("Response ⤵️ ");
+
+logger.log(completion);
diff --git a/src/logger.js b/src/logger.js
new file mode 100644
index 0000000..4fb2d44
--- /dev/null
+++ b/src/logger.js
@@ -0,0 +1,12 @@
+import * as colors from "yoctocolors";
+
+export const logger = {
+  log: (message) => console.log(message),
+  info: (message) =>
+    console.log(`${colors.bgCyan(colors.black(` ${message} `))}\n`),
+  error: (message) =>
+    console.log(`${colors.bgRed(colors.white(` ${message} `))}\n`),
+  success: (message) =>
+    console.log(`${colors.bgGreen(colors.black(` ${message} `))}\n`),
+  warn: (message) => console.log(`${colors.yellow(message)}\n`),
+};
diff --git a/src/parse-url.js b/src/parse-url.js
new file mode 100644
index 0000000..460d9d4
--- /dev/null
+++ b/src/parse-url.js
@@ -0,0 +1,19 @@
+import { JSDOM } from "jsdom";
+import { Readability } from "@mozilla/readability";
+
+export async function parseUrl() {
+  const url = process.argv[2];
+  if (!url) {
+    console.error("Pass a URL as the last argument");
+    process.exit(1);
+  }
+  const dom = await JSDOM.fromURL(url);
+  const article = new Readability(dom.window.document).parse();
+
+  if (!article) {
+    console.error("Couldn't parse the URL");
+    process.exit(1);
+  }
+
+  return article;
+}
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..5d7b561
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,12 @@
+{
+  "compilerOptions": {
+    "outDir": "./lib",
+    "allowJs": true,
+    "noImplicitAny": false,
+    "checkJs": true,
+    "forceConsistentCasingInFileNames": true,
+    "module": "NodeNext",
+    "strict": true,
+    "target": "ES2022"
+  }
+}