Commit

feat: add scripts/crawler.mjs
sigoden committed Jun 26, 2024
1 parent 8c1f169 commit 1577f4e
Showing 3 changed files with 222 additions and 2 deletions.
7 changes: 6 additions & 1 deletion .gitignore
@@ -8,4 +8,9 @@ functions.json
 /tools/test.*
 /.env
 *.cmd
-__pycache__
+__pycache__
+/venv
+node_modules
+package.json
+package-lock.json
+*.lock
2 changes: 1 addition & 1 deletion README.md
@@ -130,7 +130,7 @@ The agent has the following folder structure:
 ```
 └── agents
     └── myagent
-        ├── embeddings/ # Contains RAG files for knownledge
+        ├── embeddings/ # Contains RAG files for knowledge
         ├── functions.json # Function declarations file (Auto-generated)
         ├── index.yaml # Agent definition file
         └── tools.{sh,js,py} # Agent tools script
215 changes: 215 additions & 0 deletions scripts/crawler.mjs
@@ -0,0 +1,215 @@
#!/usr/bin/env node

/**
* Crawl a documentation website.
*
* The script can be used in the following scenarios:
* 1. Generate knowledge.json for an agent
* > node scripts/crawler.mjs https://github.com/reactjs/react.dev/tree/main/src/content/reference tmp/knowledge.json
* 2. Serve as a `recursive_url` document loader for AIChat
* > recursive_url: 'node <path-to-llm-functions>/scripts/crawler.mjs $1 $2'
*/
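
// The generated knowledge file is a JSON array of `{ path, markdown }` objects, one per crawled page, e.g.
// [{ "path": "https://example.com/docs/intro", "markdown": "# Intro ..." }]
// (the URL and content above are illustrative).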

// DEPS: npm i @octokit/rest cheerio html-to-text node-fetch https-proxy-agent

import { Octokit } from "@octokit/rest";
import * as cheerio from "cheerio";
import { URL } from "node:url";
import { writeFileSync } from "node:fs";
import { compile } from "html-to-text";
import fetch from "node-fetch";
import { HttpsProxyAgent } from "https-proxy-agent";

const compiledConvert = compile({ wordwrap: false, selectors: [{ selector: 'a', options: { ignoreHref: true } }] });

const MAX_DEPTH = parseInt(process.env.CRAWLER_MAX_DEPTH) || 3;

const MAX_CONCURRENT = parseInt(process.env.CRAWLER_MAX_CONCURRENT) || 5;

const IGNORE_LINKS = new Set();

const IGNORE_PATHS_ENDING_IN = [
"search.html",
"search",
"changelog",
"changelog.html",
];

let fetchOptions = {
headers: { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" },
};

async function main() {
const [startUrlRaw, outfile] = process.argv.slice(2);
if (!startUrlRaw || !outfile) {
console.log("Usage: ./crawler.mjs <url> <outfile>");
process.exit(1);
}
if (startUrlRaw.startsWith("https://") && process.env["HTTPS_PROXY"]) {
fetchOptions["agent"] = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
}
let pages = [];
for await (const page of crawlPage(startUrlRaw, MAX_DEPTH)) {
pages.push(page);
}
const output = JSON.stringify(pages, null, 2);
writeFileSync(outfile, output);
}

/**
* Crawl a site breadth-first, yielding a `{ path, markdown }` object per page.
*
* @param {String} startUrlRaw
* @param {number} maxDepth
*/
async function* crawlPage(startUrlRaw, maxDepth = 3) {
if (!startUrlRaw.endsWith("/")) {
startUrlRaw += "/"
}
console.log("Starting crawl from: ", startUrlRaw, " - Max Depth: ", maxDepth);
const startUrl = new URL(startUrlRaw);
let paths = [{ path: startUrl.pathname, depth: 0 }];

if (startUrl.hostname === "github.com") {
const githubLinks = await crawlGithubRepo(startUrl);
paths = githubLinks.map((link) => ({
path: link,
depth: 1,
}));
}

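// Breadth-first crawl: fetch pages in batches of MAX_CONCURRENT, appending newly
// discovered same-site links to `paths` until everything up to maxDepth is visited.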
let index = 0;
while (index < paths.length) {
const batch = paths.slice(index, index + MAX_CONCURRENT);

const promises = batch.map(({ path, depth }) =>
getLinksFromUrl(startUrlRaw, path).then((links) => ({
links,
path,
depth,
})),
);

const results = await Promise.all(promises);
for (const {
links: { markdown, links: linksArray },
path,
depth,
} of results) {
if (markdown !== "" && depth <= maxDepth) {
yield {
path: new URL(path, startUrl).toString(),
markdown,
};
}

if (depth < maxDepth) {
for (let link of linksArray) {
if (!paths.some((p) => p.path === link)) {
paths.push({ path: link, depth: depth + 1 });
}
}
}
}

index += batch.length;
}
console.log("Crawl completed");
}

/**
* List raw.githubusercontent.com URLs for the Markdown files under a GitHub tree URL.
*
* @param {URL} startUrl
* @returns {Promise<String[]>}
*/
async function crawlGithubRepo(startUrl) {
const octokit = new Octokit({
auth: undefined,
});

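// A GitHub tree URL has the form https://github.com/<owner>/<repo>/tree/<branch>/<path...>.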
const [_, owner, repo, scope, branch, ...pathParts] = startUrl.pathname.split("/");
if (scope !== "tree" || !branch) {
throw new Error("Invalid GitHub URL. It must follow the format: https://github.com/<owner>/<repo>/tree/<branch>/<path>");
}
const rootPath = pathParts.join("/");

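// Fetch the repository tree recursively, then keep only the Markdown files under rootPath.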
const tree = await octokit.request(
"GET /repos/{owner}/{repo}/git/trees/{tree_sha}",
{
owner,
repo,
tree_sha: branch,
headers: {
"X-GitHub-Api-Version": "2022-11-28",
},
recursive: "true",
},
);

const paths = tree.data.tree
.filter((file) => file.type === "blob" && file.path?.endsWith(".md") && file.path.startsWith(rootPath))
.map(
(file) =>
`https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file.path}`,
);

return paths;
}

/**
* Fetch a page and return its Markdown plus the same-site links found on it.
*
* @param {String} startUrlRaw
* @param {String} path
* @returns {Promise<{ markdown: String, links: String[] }>}
*/
async function getLinksFromUrl(startUrlRaw, path) {
const location = new URL(path, startUrlRaw).toString();

console.log(`Crawl ${location}`)

const response = await fetch(location, fetchOptions);
const html = await response.text();

let links = [];

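// GitHub crawls fetch raw.githubusercontent.com files, which are already Markdown;
// return the body as-is and skip HTML parsing and link extraction.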
if (startUrlRaw.includes("github.com")) {
return {
markdown: html,
links,
};
}

const $ = cheerio.load(html);

IGNORE_LINKS.add(path);
if (path.endsWith("/")) {
IGNORE_LINKS.add(`${path}index.html`);
}

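// Collect links that stay under the start URL, skipping anchors and ignored paths.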
$("a").each((_, element) => {
const href = $(element).attr("href");
if (!href) {
return;
}

const parsedUrl = new URL(href, startUrlRaw);
if (parsedUrl.toString().startsWith(startUrlRaw)) {
const link = parsedUrl.pathname;
if (
!IGNORE_LINKS.has(link) &&
!link.includes("#") &&
!IGNORE_PATHS_ENDING_IN.some((ending) => link.endsWith(ending))
) {
links.push(link);
}
}
});

links = [...new Set(links)];

return {
markdown: compiledConvert(html),
links,
};
}

main();
