diff --git a/scripts/crawler.mjs b/scripts/crawler.mjs
deleted file mode 100755
index 4db2e00..0000000
--- a/scripts/crawler.mjs
+++ /dev/null
@@ -1,215 +0,0 @@
-#!/usr/bin/env node
-
-/**
- * Crawl a documentation website.
- *
- * The script can be used in the following scenarios:
- * 1. Generate knowledge.json for the agent
- * > node scripts/crawler.mjs https://github.com/reactjs/react.dev/tree/main/src/content/reference tmp/knowledge.json
- * 2. As a `recursive_url` document loader for AIChat
- * > recursive_url: 'node /scripts/crawler.mjs $1 $2'
- */
-
-// DEPS: npm i @octokit/rest cheerio html-to-text node-fetch https-proxy-agent
-
-import { Octokit } from "@octokit/rest";
-import * as cheerio from "cheerio";
-import { URL } from "node:url";
-import { writeFileSync } from "node:fs";
-import { compile } from "html-to-text";
-import fetch from "node-fetch";
-import { HttpsProxyAgent } from "https-proxy-agent";
-
-const compiledConvert = compile({ wordwrap: false, selectors: [{ selector: 'a', options: { ignoreHref: true } }] });
-
-const MAX_DEPTH = parseInt(process.env.CRAWLER_MAX_DEPTH) || 3;
-
-const MAX_CONCURRENT = parseInt(process.env.CRAWLER_MAX_CONCURRENT) || 5;
-
-const IGNORE_LINKS = new Set();
-
-const IGNORE_PATHS_ENDING_IN = [
-  "search.html",
-  "search",
-  "changelog",
-  "changelog.html",
-];
-
-let fetchOptions = {
-  headers: { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" },
-};
-
-async function main() {
-  const [startUrlRaw, outfile] = process.argv.slice(2);
-  if (!startUrlRaw || !outfile) {
-    console.log("Usage: ./crawler.mjs <startUrl> <outfile>");
-    process.exit(1);
-  }
-  if (startUrlRaw.startsWith("https://") && process.env["HTTPS_PROXY"]) {
-    fetchOptions["agent"] = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
-  }
-  let pages = [];
-  for await (const page of crawlPage(startUrlRaw, MAX_DEPTH)) {
-    pages.push(page);
-  }
-  const output = JSON.stringify(pages, null, 2);
-  writeFileSync(outfile, output);
-}
-
-/**
- * Breadth-first crawl that yields one { path, markdown } record per page.
- * @param {string} startUrlRaw
- * @param {number} maxDepth
- */
-async function* crawlPage(startUrlRaw, maxDepth = 3) {
-  if (!startUrlRaw.endsWith("/")) {
-    startUrlRaw += "/";
-  }
-  console.log("Starting crawl from: ", startUrlRaw, " - Max Depth: ", maxDepth);
-  const startUrl = new URL(startUrlRaw);
-  let paths = [{ path: startUrl.pathname, depth: 0 }];
-
-  if (startUrl.hostname === "github.com") {
-    const githubLinks = await crawlGithubRepo(startUrl);
-    paths = githubLinks.map((link) => ({
-      path: link,
-      depth: 1,
-    }));
-  }
-
-  let index = 0;
-  while (index < paths.length) {
-    const batch = paths.slice(index, index + MAX_CONCURRENT);
-
-    const promises = batch.map(({ path, depth }) =>
-      getLinksFromUrl(startUrlRaw, path).then((links) => ({
-        links,
-        path,
-        depth,
-      })),
-    );
-
-    const results = await Promise.all(promises);
-    for (const {
-      links: { markdown, links: linksArray },
-      path,
-      depth,
-    } of results) {
-      if (markdown !== "" && depth <= maxDepth) {
-        yield {
-          path: new URL(path, startUrl).toString(),
-          markdown,
-        };
-      }
-
-      if (depth < maxDepth) {
-        for (let link of linksArray) {
-          if (!paths.some((p) => p.path === link)) {
-            paths.push({ path: link, depth: depth + 1 });
-          }
-        }
-      }
-    }
-
-    index += batch.length;
-  }
-  console.log("Crawl completed");
-}
-
-/**
- * List raw.githubusercontent.com URLs for every Markdown file under a GitHub tree URL.
- * @param {URL} startUrl
- * @returns {Promise<string[]>}
- */
-async function crawlGithubRepo(startUrl) {
-  const octokit = new Octokit({
-    auth: undefined,
-  });
-
-  const [_, owner, repo, scope, branch, ...pathParts] = startUrl.pathname.split("/");
-  if (scope !== "tree" || !branch) {
-    throw new Error("Invalid GitHub URL. It must follow the format: https://github.com/<owner>/<repo>/tree/<branch>/<path>");
-  }
-  const rootPath = pathParts.join("/");
-
-  const tree = await octokit.request(
-    "GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
-    {
-      owner,
-      repo,
-      tree_sha: branch,
-      headers: {
-        "X-GitHub-Api-Version": "2022-11-28",
-      },
-      recursive: "true",
-    },
-  );
-
-  const paths = tree.data.tree
-    .filter((file) => file.type === "blob" && file.path?.endsWith(".md") && file.path.startsWith(rootPath))
-    .map(
-      (file) =>
-        `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file.path}`,
-    );
-
-  return paths;
-}
-
-/**
- * Fetch a page, convert it to markdown, and collect same-site links to crawl next.
- * @param {string} startUrlRaw
- * @param {string} path
- * @returns {Promise<{markdown: string, links: string[]}>}
- */
-async function getLinksFromUrl(startUrlRaw, path) {
-  const location = new URL(path, startUrlRaw).toString();
-
-  console.log(`Crawl ${location}`);
-
-  const response = await fetch(location, fetchOptions);
-  const html = await response.text();
-
-  let links = [];
-
-  if (startUrlRaw.includes("github.com")) {
-    return {
-      markdown: html,
-      links,
-    };
-  }
-
-  const $ = cheerio.load(html);
-
-  IGNORE_LINKS.add(path);
-  if (path.endsWith("/")) {
-    IGNORE_LINKS.add(`${path}index.html`);
-  }
-
-  $("a").each((_, element) => {
-    const href = $(element).attr("href");
-    if (!href) {
-      return;
-    }
-
-    const parsedUrl = new URL(href, startUrlRaw);
-    if (parsedUrl.toString().startsWith(startUrlRaw)) {
-      const link = parsedUrl.pathname;
-      if (
-        !IGNORE_LINKS.has(link) &&
-        !link.includes("#") &&
-        !IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
-      ) {
-        links.push(link);
-      }
-    }
-  });
-
-  links = [...new Set(links)];
-
-  return {
-    markdown: compiledConvert(html),
-    links,
-  };
-}
-
-main();
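
Note for anyone still consuming the removed script's output: `main()` wrote `knowledge.json` as a JSON array of `{ path, markdown }` records. A minimal sketch of reading that file back, assuming the `tmp/knowledge.json` path from the usage example above; the consuming code itself is illustrative and not part of this repo:

```js
// Sketch: load the knowledge.json produced by the removed crawler and list
// each crawled page's URL with the size of its extracted markdown.
import { readFileSync } from "node:fs";

const pages = JSON.parse(readFileSync("tmp/knowledge.json", "utf8"));

for (const { path, markdown } of pages) {
  console.log(`${path} (${markdown.length} chars of markdown)`);
}
```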