From fb4bd92e1f61bda0aa6ea7f3be9a8d28106f93d1 Mon Sep 17 00:00:00 2001
From: Sawyer Hollenshead <git@sawyerh.com>
Date: Thu, 29 Dec 2022 14:58:28 -0800
Subject: [PATCH] Recursive summarization

---
 index.js           |  81 -----------------------------------
 package-lock.json  |  22 +++++++++-
 package.json       |   5 ++-
 src/cli-prompts.js |  54 ++++++++++++++++++++++++
 src/completions.js | 102 +++++++++++++++++++++++++++++++++++++++++++++
 src/index.js       |  33 +++++++++++++++
 src/logger.js      |  12 ++++++
 src/parse-url.js   |  19 +++++++++
 tsconfig.json      |  12 ++++++
 9 files changed, 256 insertions(+), 84 deletions(-)
 delete mode 100644 index.js
 create mode 100644 src/cli-prompts.js
 create mode 100644 src/completions.js
 create mode 100644 src/index.js
 create mode 100644 src/logger.js
 create mode 100644 src/parse-url.js
 create mode 100644 tsconfig.json

diff --git a/index.js b/index.js
deleted file mode 100644
index 2eff2e6..0000000
--- a/index.js
+++ /dev/null
@@ -1,81 +0,0 @@
-import { Configuration, OpenAIApi } from "openai";
-import { JSDOM } from "jsdom";
-import { Readability } from "@mozilla/readability";
-import cliSpinners from "cli-spinners";
-import dotenv from "dotenv";
-import inquirer from "inquirer";
-import { oraPromise } from "ora";
-import * as colors from "yoctocolors";
-
-// Load .env file
-dotenv.config();
-
-const openai = new OpenAIApi(
-  new Configuration({
-    apiKey: process.env.OPENAI_KEY,
-  })
-);
-
-/**
- * Get the main content of the URL
- */
-const url = process.argv[2];
-if (!url) {
-  console.error("Pass a URL as the last argument");
-  process.exit(1);
-}
-const dom = await JSDOM.fromURL(url);
-const article = new Readability(dom.window.document).parse();
-if (!article) {
-  console.error("Couldn't parse the URL");
-  process.exit(1);
-}
-console.log(`${colors.bgCyan(colors.black(` ${article.title} `))}\n`);
-
-/**
- * Get the prompt from the user
- */
-const customPromptChoice = "[Custom prompt]";
-const answers = await inquirer.prompt([
-  {
-    type: "list",
-    name: "prompt",
-    message: "Select prompt:",
-    choices: [
-      "Summarize this",
-      "List 10 key takeaways",
-      "List all entities, grouped by type or category",
-      "Write an abstract for this",
-      customPromptChoice,
-    ],
-  },
-  {
-    type: "input",
-    name: "customPrompt",
-    message: "Custom prompt (the URL's content comes after this prompt):",
-    when: (answers) => answers.prompt === customPromptChoice,
-  },
-]);
-
-/**
- * Run the prompt and URL's content through OpenAI's API
- */
-const prompt = answers.customPrompt ?? answers.prompt;
-const content = article.textContent.replace(/\n/g, " ");
-const oraOptions = {
-  spinner: cliSpinners.earth,
-  text: "Generating response...",
-};
-const response = await oraPromise(
-  openai.createCompletion({
-    model: "text-davinci-003",
-    prompt: `${prompt}:\n\n${content}`,
-    // 0.1 provides more straightforward and consistent responses. Higher numbers provides more diverse responses.
-    temperature: 0.1,
-    max_tokens: 500,
-  }),
-  oraOptions
-);
-
-console.log(`\n${colors.bgGreen(colors.black("Response:"))}`);
-console.log(response.data.choices[0].text);
diff --git a/package-lock.json b/package-lock.json
index 939f970..e44ca09 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,7 +19,8 @@
       },
       "devDependencies": {
         "@types/jsdom": "^20.0.1",
-        "@types/node": "^18.11.18"
+        "@types/node": "^18.11.18",
+        "typescript": "^4.9.4"
       },
       "engines": {
         "node": ">=18.0.0"
@@ -1088,6 +1089,19 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/typescript": {
+      "version": "4.9.4",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.4.tgz",
+      "integrity": "sha512-Uz+dTXYzxXXbsFpM86Wh3dKCxrQqUcVMxwU54orwlJjOpO3ao8L7j5lH+dWfTwgCwIuM9GQ2kvVotzYJMXTBZg==",
+      "dev": true,
+      "bin": {
+        "tsc": "bin/tsc",
+        "tsserver": "bin/tsserver"
+      },
+      "engines": {
+        "node": ">=4.2.0"
+      }
+    },
     "node_modules/universalify": {
       "version": "0.2.0",
       "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
@@ -1966,6 +1980,12 @@
       "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-3.5.0.tgz",
       "integrity": "sha512-bI3zRmZC8K0tUz1HjbIOAGQwR2CoPQG68N5IF7gm0LBl8QSNXzkmaWnkWccCUL5uG9mCsp4sBwC8SBrNSISWew=="
     },
+    "typescript": {
+      "version": "4.9.4",
+      "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.9.4.tgz",
+      "integrity": "sha512-Uz+dTXYzxXXbsFpM86Wh3dKCxrQqUcVMxwU54orwlJjOpO3ao8L7j5lH+dWfTwgCwIuM9GQ2kvVotzYJMXTBZg==",
+      "dev": true
+    },
     "universalify": {
       "version": "0.2.0",
       "resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
diff --git a/package.json b/package.json
index fa51679..b19699a 100644
--- a/package.json
+++ b/package.json
@@ -3,7 +3,7 @@
   "version": "1.0.0",
   "type": "module",
   "scripts": {
-    "x": "node index.js"
+    "x": "node src/index.js"
   },
   "engines": {
     "node": ">=18.0.0"
@@ -20,6 +20,7 @@
   },
   "devDependencies": {
     "@types/jsdom": "^20.0.1",
-    "@types/node": "^18.11.18"
+    "@types/node": "^18.11.18",
+    "typescript": "^4.9.4"
   }
 }
diff --git a/src/cli-prompts.js b/src/cli-prompts.js
new file mode 100644
index 0000000..74e040e
--- /dev/null
+++ b/src/cli-prompts.js
@@ -0,0 +1,54 @@
+import inquirer from "inquirer";
+
+// Label of the list entry that switches the CLI to a free-form prompt.
+const customPromptChoice = "[Custom prompt]";
+
+/**
+ * Built-in prompts, written as [prompt, combinationPrompt] pairs and expanded
+ * into `{ prompt, combinationPrompt }` objects. The combination prompt merges
+ * the per-chunk responses when a page is too long for one request. Neither
+ * string ends in ":" because the completion template appends the colon itself.
+ */
+const prompts = [
+  ["Summarize this", "Combine these summaries into an overall summary"],
+  [
+    "List 10 key takeaways",
+    "Combine these takeaways into an overall list of 10 key takeaways",
+  ],
+  [
+    "List all entities, grouped by type or category",
+    "Combine these lists of entities, but preserve the grouping",
+  ],
+  ["Write an abstract for this", "Write an abstract for this"],
+].map(([prompt, combinationPrompt]) => ({ prompt, combinationPrompt }));
+
+/**
+ * Ask which prompt to run; the custom entry also asks for the prompt text and
+ * a combination prompt. Resolves to `{ prompt, combinationPrompt }`.
+ */
+export async function runCliPrompts() {
+  const isCustom = (given) => given.prompt === customPromptChoice;
+  const freeText = (name, message) => ({
+    type: "input",
+    name,
+    message,
+    when: isCustom,
+  });
+  const selected = await inquirer.prompt([
+    {
+      type: "list",
+      name: "prompt",
+      message: "Select prompt:",
+      choices: prompts.map((preset) => preset.prompt).concat(customPromptChoice),
+    },
+    freeText("customPrompt", "Custom prompt (e.g 'Summarize this')"),
+    freeText(
+      "customCombinationPrompt",
+      "Custom combination prompt (e.g 'Combine these summaries into an overall summary')"
+    ),
+  ]);
+  const prompt = selected.customPrompt ?? selected.prompt;
+  const preset = prompts.find((candidate) => candidate.prompt === prompt);
+  const combinationPrompt = selected.customCombinationPrompt ?? preset?.combinationPrompt;
+  return { prompt, combinationPrompt };
+}
diff --git a/src/completions.js b/src/completions.js
new file mode 100644
index 0000000..28c7c5c
--- /dev/null
+++ b/src/completions.js
@@ -0,0 +1,102 @@
+import { Configuration, OpenAIApi } from "openai";
+import cliSpinners from "cli-spinners";
+import { oraPromise } from "ora";
+import { logger } from "./logger.js";
+
+/**
+ * Send one prompt to OpenAI's completions endpoint.
+ *
+ * The client is built on every call from the OPENAI_KEY environment variable.
+ * @param {string} prompt
+ * @param {"text-davinci-003"|"text-curie-001"} model - Davinci is the most powerful model, but it's also the most expensive
+ * @returns The resolved value of `openai.createCompletion`
+ */
+async function createCompletion(prompt, model = "text-davinci-003") {
+  const configuration = new Configuration({ apiKey: process.env.OPENAI_KEY });
+  const client = new OpenAIApi(configuration);
+
+  const request = {
+    prompt,
+    model,
+    // A low temperature keeps responses straightforward and consistent;
+    // higher numbers provide more diverse responses.
+    temperature: 0.1,
+    max_tokens: 500,
+  };
+
+  return await client.createCompletion(request);
+}
+
+/**
+ * Break the page's content into chunks of roughly `maxChunkSize` characters
+ * (default ~3500 tokens at ~4 characters each) while preserving sentences, so
+ * that we don't exceed the API's max token limit. Content that fits in one
+ * chunk is returned unmodified.
+ */
+function chunkTheContent(content, maxChunkSize = 3500 * 4) {
+  const chunks = [];
+  let chunk = "";
+  for (const sentence of content.split(/(?<=[.?!])\s+/)) {
+    // Never flush an empty chunk, even when one sentence is over budget
+    if (chunk && chunk.length + sentence.length > maxChunkSize) {
+      chunks.push(chunk);
+      chunk = "";
+    }
+    chunk += sentence + " ";
+  }
+  if (chunks.length === 0) return [content];
+  chunks.push(chunk); // flush the tail the loop never pushed
+  return chunks;
+}
+
+/**
+ * Run `prompt` against `content`. Content too long for one request is split
+ * into chunks that are completed in parallel, and `combinationPrompt` is then
+ * run against the joined chunk responses to produce the final text.
+ * @param {{content: string, prompt: string, combinationPrompt?: string}} options
+ * @returns Text of the first choice of the final completion
+ */
+export async function getCompletion({ content, prompt, combinationPrompt }) {
+  const maxChunks = 40; // limit the chunk count to avoid excessive API usage
+  const allChunks = chunkTheContent(content);
+  const chunks = allChunks.slice(0, maxChunks);
+  if (allChunks.length > maxChunks) {
+    logger.warn(
+      `The page's content was split into ${allChunks.length} chunks, but only the first ${maxChunks} are used.`
+    );
+  }
+
+  // Promise.all resolves in input order, so responses line up with chunks
+  const responses = await Promise.all(
+    chunks.map((chunk) =>
+      oraPromise(createCompletion(`${prompt}:\n\n###${chunk}\n\n###`), {
+        spinner: cliSpinners.earth,
+        text: "Generating response...",
+      })
+    )
+  );
+
+  if (chunks.length === 1) return responses[0].data.choices[0].text;
+
+  /**
+   * Do one final completion against the combination of all the completions
+   */
+  const combinedCompletions = responses
+    .map((r) => r.data.choices[0].text)
+    .join("\n----\n");
+
+  const finalCompletion = await oraPromise(
+    createCompletion(
+      `${combinationPrompt}:\n\n###${combinedCompletions}\n\n###`
+    ),
+    {
+      spinner: cliSpinners.moon,
+      text: "Combining responses...",
+    }
+  );
+  logger.warn(
+    `Since the page's content was so long, the following response is formed by running a combination prompt against a series of responses (${chunks.length}) to smaller chunks of the content.`
+  );
+
+  return finalCompletion.data.choices[0].text;
+}
diff --git a/src/index.js b/src/index.js
new file mode 100644
index 0000000..fc3530c
--- /dev/null
+++ b/src/index.js
@@ -0,0 +1,33 @@
+import { config } from "dotenv";
+import { runCliPrompts } from "./cli-prompts.js";
+import { parseUrl } from "./parse-url.js";
+import { logger } from "./logger.js";
+import { getCompletion } from "./completions.js";
+
+// Load the .env file
+config();
+
+/**
+ * 1. Get the main content of the URL
+ */
+const { textContent, title } = await parseUrl();
+const content = textContent.replaceAll("\n", " ");
+logger.info(title);
+
+/**
+ * 2. Get the prompt from the user
+ */
+const cliAnswers = await runCliPrompts();
+
+/**
+ * 3. Run the prompt against the URL's content
+ */
+const completion = await getCompletion({
+  content,
+  prompt: cliAnswers.prompt,
+  combinationPrompt: cliAnswers.combinationPrompt,
+});
+
+logger.success("Response ⤵️ ");
+
+logger.log(completion);
diff --git a/src/logger.js b/src/logger.js
new file mode 100644
index 0000000..4fb2d44
--- /dev/null
+++ b/src/logger.js
@@ -0,0 +1,12 @@
+import * as colors from "yoctocolors";
+
+/** Console logger; `error` writes to stderr, every other level to stdout. */
+export const logger = {
+  log: (message) => console.log(message),
+  info: (message) => console.log(`${colors.bgCyan(colors.black(` ${message} `))}\n`),
+  error: (message) =>
+    console.error(`${colors.bgRed(colors.white(` ${message} `))}\n`),
+  success: (message) =>
+    console.log(`${colors.bgGreen(colors.black(` ${message} `))}\n`),
+  warn: (message) => console.log(`${colors.yellow(message)}\n`),
+};
diff --git a/src/parse-url.js b/src/parse-url.js
new file mode 100644
index 0000000..460d9d4
--- /dev/null
+++ b/src/parse-url.js
@@ -0,0 +1,19 @@
+import { JSDOM } from "jsdom";
+import { Readability } from "@mozilla/readability";
+
+// Fetch the URL passed as the first CLI argument and return the article that
+// Readability extracts; prints an error and exits with code 1 otherwise.
+export async function parseUrl() {
+  const [, , url] = process.argv;
+  if (!url) {
+    console.error("Pass a URL as the last argument");
+    process.exit(1);
+  }
+
+  const { window } = await JSDOM.fromURL(url);
+  const parsed = new Readability(window.document).parse();
+  if (parsed) return parsed;
+
+  console.error("Couldn't parse the URL");
+  process.exit(1);
+}
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..5d7b561
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,12 @@
+{
+  "compilerOptions": {
+    "outDir": "./lib",
+    "allowJs": true,
+    "noImplicitAny": false,
+    "checkJs": true,
+    "forceConsistentCasingInFileNames": true,
+    "module": "NodeNext",
+    "strict": true,
+    "target": "ES2022"
+  }
+}