Commit

feat: add scripts/crawler.mjs
sigoden committed Jun 26, 2024
1 parent 8c1f169 commit 1577f4e
Showing 3 changed files with 222 additions and 2 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -8,4 +8,9 @@ functions.json
 /tools/test.*
 /.env
 *.cmd
-__pycache__
+__pycache__
+/venv
+node_modules
+package.json
+package-lock.json
+*.lock
2 changes: 1 addition & 1 deletion README.md
@@ -130,7 +130,7 @@ The agent has the following folder structure:
 ```
 └── agents
     └── myagent
-        ├── embeddings/ # Contains RAG files for knownledge
+        ├── embeddings/ # Contains RAG files for knowledge
         ├── functions.json # Function declarations file (Auto-generated)
         ├── index.yaml # Agent definition file
         └── tools.{sh,js,py} # Agent tools script
215 changes: 215 additions & 0 deletions scripts/crawler.mjs
@@ -0,0 +1,215 @@
#!/usr/bin/env node

/**
* Crawl a documentation website.
*
* The script can be used in the following scenarios:
* 1. Generate knowledge.json for an agent
* > node scripts/crawler.mjs https://github.com/reactjs/react.dev/tree/main/src/content/reference tmp/knowledge.json
* 2. Serve as a `recursive_url` document loader for AIChat
* > recursive_url: 'node <path-to-llm-functions>/scripts/crawler.mjs $1 $2'
*/
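
// The generated knowledge file is a JSON array of `{ path, markdown }` objects, one per crawled page, e.g.
// [{ "path": "https://example.com/docs/intro", "markdown": "# Intro ..." }]
// (the URL and content above are illustrative).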

// DEPS: npm i @octokit/rest cheerio html-to-text node-fetch https-proxy-agent

import { Octokit } from "@octokit/rest";
import * as cheerio from "cheerio";
import { URL } from "node:url";
import { writeFileSync } from "node:fs";
import { compile } from "html-to-text";
import fetch from "node-fetch";
import { HttpsProxyAgent } from "https-proxy-agent";

const compiledConvert = compile({ wordwrap: false, selectors: [{ selector: 'a', options: { ignoreHref: true } }] });

const MAX_DEPTH = parseInt(process.env.CRAWLER_MAX_DEPTH) || 3;

const MAX_CONCURRENT = parseInt(process.env.CRAWLER_MAX_CONCURRENT) || 5;

const IGNORE_LINKS = new Set();

const IGNORE_PATHS_ENDING_IN = [
"search.html",
"search",
"changelog",
"changelog.html",
];

let fetchOptions = {
headers: { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" },
};

async function main() {
const [startUrlRaw, outfile] = process.argv.slice(2);
if (!startUrlRaw || !outfile) {
console.log("Usage: ./crawler.mjs <url> <outfile>");
process.exit(1);
}
if (startUrlRaw.startsWith("https://") && process.env["HTTPS_PROXY"]) {
fetchOptions["agent"] = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
}
let pages = [];
for await (const page of crawlPage(startUrlRaw, MAX_DEPTH)) {
pages.push(page);
}
const output = JSON.stringify(pages, null, 2);
writeFileSync(outfile, output);
}

/**
* Crawl a site breadth-first, yielding a `{ path, markdown }` object per page.
*
* @param {String} startUrlRaw
* @param {number} maxDepth
*/
async function* crawlPage(startUrlRaw, maxDepth = 3) {
if (!startUrlRaw.endsWith("/")) {
startUrlRaw += "/"
}
console.log("Starting crawl from: ", startUrlRaw, " - Max Depth: ", maxDepth);
const startUrl = new URL(startUrlRaw);
let paths = [{ path: startUrl.pathname, depth: 0 }];

if (startUrl.hostname === "github.com") {
const githubLinks = await crawlGithubRepo(startUrl);
paths = githubLinks.map((link) => ({
path: link,
depth: 1,
}));
}

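// Breadth-first crawl: fetch pages in batches of MAX_CONCURRENT, appending newly
// discovered same-site links to `paths` until everything up to maxDepth is visited.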
let index = 0;
while (index < paths.length) {
const batch = paths.slice(index, index + MAX_CONCURRENT);

const promises = batch.map(({ path, depth }) =>
getLinksFromUrl(startUrlRaw, path).then((links) => ({
links,
path,
depth,
})),
);

const results = await Promise.all(promises);
for (const {
links: { markdown, links: linksArray },
path,
depth,
} of results) {
if (markdown !== "" && depth <= maxDepth) {
yield {
path: new URL(path, startUrl).toString(),
markdown,
};
}

if (depth < maxDepth) {
for (let link of linksArray) {
if (!paths.some((p) => p.path === link)) {
paths.push({ path: link, depth: depth + 1 });
}
}
}
}

index += batch.length;
}
console.log("Crawl completed");
}

/**
* List raw.githubusercontent.com URLs for the Markdown files under a GitHub tree URL.
*
* @param {URL} startUrl
* @returns {Promise<String[]>}
*/
async function crawlGithubRepo(startUrl) {
const octokit = new Octokit({
auth: undefined,
});

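// A GitHub tree URL has the form https://github.com/<owner>/<repo>/tree/<branch>/<path...>.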
const [_, owner, repo, scope, branch, ...pathParts] = startUrl.pathname.split("/");
if (scope !== "tree" || !branch) {
throw new Error("Invalid GitHub URL. It must follow the format: https://github.com/<owner>/<repo>/tree/<branch>/<path>");
}
const rootPath = pathParts.join("/");

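// Fetch the repository tree recursively, then keep only the Markdown files under rootPath.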
const tree = await octokit.request(
"GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
{
owner,
repo,
tree_sha: branch,
headers: {
"X-GitHub-Api-Version": "2022-11-28",
},
recursive: "true",
},
);

const paths = tree.data.tree
.filter((file) => file.type === "blob" && file.path?.endsWith(".md") && file.path.startsWith(rootPath))
.map(
(file) =>
`https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file.path}`,
);

return paths;
}

/**
* Fetch a page and return its Markdown plus the same-site links found on it.
*
* @param {String} startUrlRaw
* @param {String} path
* @returns {Promise<{ markdown: String, links: String[] }>}
*/
async function getLinksFromUrl(startUrlRaw, path) {
const location = new URL(path, startUrlRaw).toString();

console.log(`Crawl ${location}`)

const response = await fetch(location, fetchOptions);
const html = await response.text();

let links = [];

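// GitHub crawls fetch raw.githubusercontent.com files, which are already Markdown;
// return the body as-is and skip HTML parsing and link extraction.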
if (startUrlRaw.includes("github.com")) {
return {
markdown: html,
links,
};
}

const $ = cheerio.load(html);

IGNORE_LINKS.add(path);
if (path.endsWith("/")) {
IGNORE_LINKS.add(`${path}index.html`);
}

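// Collect links that stay under the start URL, skipping anchors and ignored paths.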
$("a").each((_, element) => {
const href = $(element).attr("href");
if (!href) {
return;
}

const parsedUrl = new URL(href, startUrlRaw);
if (parsedUrl.toString().startsWith(startUrlRaw)) {
const link = parsedUrl.pathname;
if (
!IGNORE_LINKS.has(link) &&
!link.includes("#") &&
!IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
) {
links.push(link);
}
}
});

links = [...new Set(links)];

return {
markdown: compiledConvert(html),
links,
};
}

main();
