Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(docs): move algolia data ingestion to docs-tools #2239

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion packages/apps/docs/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
"private": true,
"scripts": {
"7d:ingest": "npx 7d ingest --files 'src/pages/**/*.md' --files 'src/pages/**/*.mdx' --files 'src/specs/**/*.json' --namespace kda-docs",
"7d:ingest-algolia-dev": "pnpm run build:scripts && pnpm run algolia-dev",
"7d:ingest-algolia-prod": "pnpm run build:scripts && pnpm run algolia-prod",
"7d:query": "npx 7d",
"algolia-dev": "ALGOLIA_INDEX_NAME=docs_website_dev OPENAI_API_KEY=dummy-key121 npx 7d ingest --files 'src/pages/**/*.md' --files 'src/pages/**/*.mdx' --files 'src/specs/**/*.json' --db algolia --skip-embeddings",
"algolia-prod": "ALGOLIA_INDEX_NAME=docs_website_prod OPENAI_API_KEY=dummy-key121 npx 7d ingest --files 'src/pages/**/*.md' --files 'src/pages/**/*.mdx' --files 'src/specs/**/*.json' --db algolia --skip-embeddings",
"algoliadev:scripts": "tsx ./src/scripts/algoliaIngestion.ts",
"build": "pnpm run build:scripts && pnpm run build:next",
"build:changelogs": "tsx ./src/scripts/importChangelogs/index.ts",
"build:e2e": "pnpm run build:scripts && NODE_ENV=test pnpm run build:next",
Expand All @@ -20,6 +20,7 @@
"format": "pnpm run --sequential /^format:.*/",
"format:lint": "pnpm run lint:src --fix",
"format:src": "prettier . --cache --write",
"ingest-algolia-dev": "pnpm run build:scripts && pnpm run algoliadev:scripts",
"lint": "pnpm run /^lint:.*/",
"lint:fmt": "prettier . --cache --check",
"lint:pkg": "lint-package",
Expand Down
32 changes: 32 additions & 0 deletions packages/apps/docs/src/scripts/algoliaIngestion.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { ingest } from '@kadena/docs-tools';
import type { IScriptResult } from './types';
import { initFunc } from './utils/build';
// eslint-disable-next-line @typescript-eslint/no-floating-promises
(async function (): Promise<void> {
  /**
   * Ingests the docs markdown pages and JSON specs into Algolia through the
   * `ingest` helper from `@kadena/docs-tools`.
   *
   * @returns the `{ errors, success }` message lists consumed by `initFunc`.
   *   A failed ingestion is reported through `errors` instead of throwing.
   */
  async function ingestion(): Promise<IScriptResult> {
    const errors: string[] = [];
    const success: string[] = [];

    // TODO (from PR review): this is hardcoded for now to test locally. It
    // needs to become a parameter / CLI argument so dev and prod are both
    // supported.
    // NOTE(review): the Algolia client in this PR reads its index name from
    // ALGOLIA_INDEX_NAME, so this value may only affect the log messages
    // below — confirm before relying on it.
    const namespace = 'docs_playground';

    console.log(`\nINGESTING DATA TO "${namespace}" INDEX\n`);
    try {
      await ingest({
        sourceIdentifiers: ['src/pages/**/*.md', 'src/specs/**/*.json'],
        source: 'fs',
        db: 'Algolia',
        isDryRun: false,
        ignore: [],
        namespace,
      });
      success.push(`Ingestion successful for ${namespace} index`);
    } catch (e) {
      // Report the failure reason as text; the caught value is `unknown` and
      // is not guaranteed to be an Error.
      errors.push(e instanceof Error ? e.message : String(e));
      errors.push(`Ingestion failed for ${namespace} index`);
    }

    return { errors, success };
  }

  await initFunc(ingestion, 'Ingesting data to Algolia');
})();
2 changes: 1 addition & 1 deletion packages/apps/docs/src/scripts/build.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import { validateLinks } from './validateLinks';

// eslint-disable-next-line @typescript-eslint/no-floating-promises
(async function (): Promise<void> {
//starting with a cleanslate, removing the tempdir.
// starting with a cleanslate, removing the tempdir.
deleteTempDir();
await initFunc(movePages, 'Move all pages from docs with config.yaml');
await initFunc(fixLocalLinks, 'fix local links from the config.yaml');
Expand Down
19 changes: 18 additions & 1 deletion packages/tools/docs-tools/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,25 @@
},
"dependencies": {
"acorn": "~8.11.2",
"algoliasearch": "^4.20.0",
"cheerio": "1.0.0-rc.12",
"dotenv": "~16.4.5",
"fast-glob": "^3.3.1",
"isomorphic-unfetch": "^4.0.2",
"mdast-util-from-markdown": "~1.3.0",
"mdast-util-frontmatter": "~1.0.1",
"mdast-util-gfm": "3.0.0",
"mdast-util-to-markdown": "~1.5.0",
"mdast-util-to-string": "~3.2.0",
"micromark-extension-frontmatter": "~1.1.0",
"remark-gfm": "^4.0.0"
"ora": "7.0.1",
"remark": "~14.0.3",
"remark-frontmatter": "~5.0.0",
"remark-gfm": "^4.0.0",
"remark-inline-links": "^6.0.1",
"remark-mdx": "^2.3.0",
"unist-builder": "^4.0.0",
"yaml": "~2.1.1"
},
"devDependencies": {
"@kadena-dev/eslint-config": "workspace:*",
Expand All @@ -62,9 +76,12 @@
"mdast": "^3.0.0",
"prettier": "~3.2.5",
"react": "^18.2.0",
"remark-parse": "^11.0.0",
"remark-stringify": "^11.0.0",
"rimraf": "~5.0.1",
"tsc-alias": "~1.8.7",
"typescript": "5.4.5",
"unified": "11.0.3",
"vitest": "^1.6.0",
"vitest-dom": "^0.1.1"
}
Expand Down
2 changes: 2 additions & 0 deletions packages/tools/docs-tools/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { ingest } from './ingestion/cli/command/ingest';
import remarkAdmonitions from './remarkAdmonition';
import remarkCheckForCodeTitle from './remarkCheckForCodeTitle';
import remarkFigureOutOfParagraph from './remarkFigureOutOfParagraph';
Expand Down Expand Up @@ -46,6 +47,7 @@ export {
getPathName,
getReadTime,
getUrlNameOfPageFile,
ingest,
isMarkDownFile,
remarkAdmonitions,
remarkCheckForCodeTitle,
Expand Down
72 changes: 72 additions & 0 deletions packages/tools/docs-tools/src/ingestion/cli/client/algolia.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import type { SearchClient } from 'algoliasearch';
import algoliasearch from 'algoliasearch';
import type { IMetaData } from '../../shared/index.js';
import { ALGOLIA_API_KEY, ALGOLIA_APP_ID, ALGOLIA_INDEX_NAME } from '../env.js';
import type {
IQueryOptions,
IUpsertVectorOptions,
VectorDatabase,
} from '../types.js';

/** Search hit shape: the document metadata plus a nested `metadata` bag. */
interface IMetaDataHit extends IMetaData {
  metadata: Record<string, unknown>;
}

/**
 * `VectorDatabase` implementation backed by an Algolia index.
 *
 * Credentials and the index name come from the `ALGOLIA_APP_ID`,
 * `ALGOLIA_API_KEY` and `ALGOLIA_INDEX_NAME` values exported by `../env.js`.
 */
export class Algolia implements VectorDatabase {
  private _appId: string;
  private _apiKey: string;
  private _indexName: string;
  private _client?: SearchClient;

  /**
   * @throws Error when any of the three Algolia settings is missing.
   */
  public constructor() {
    if (!ALGOLIA_APP_ID) {
      throw new Error('Missing ALGOLIA_APP_ID environment variable');
    }
    if (!ALGOLIA_API_KEY) {
      throw new Error('Missing ALGOLIA_API_KEY environment variable');
    }
    if (!ALGOLIA_INDEX_NAME) {
      throw new Error('Missing ALGOLIA_INDEX_NAME environment variable');
    }

    this._appId = ALGOLIA_APP_ID;
    this._apiKey = ALGOLIA_API_KEY;
    this._indexName = ALGOLIA_INDEX_NAME;
  }

  /** Creates a new search client from the stored credentials and keeps it. */
  public setClient(): void {
    // @ts-ignore This expression is not callable, ts(2349)
    this._client = algoliasearch(this._appId, this._apiKey);
  }

  /** Returns the stored client, creating it on first use. */
  public getClient(): SearchClient | undefined {
    if (this._client) return this._client;

    this.setClient();
    return this._client;
  }

  /** Returns a handle on the configured index, if a client is available. */
  public getIndex(): ReturnType<SearchClient['initIndex']> | undefined {
    return this.getClient()?.initIndex(this._indexName);
  }

  /**
   * Saves one Algolia object per vector, using the vector id as `objectID`
   * and spreading the vector metadata onto the record.
   *
   * @returns the number of object ids reported back by Algolia, or 0 when no
   *   index is available.
   */
  public async upsertVectors({
    vectors,
  }: IUpsertVectorOptions): Promise<number> {
    const index = this.getIndex();
    if (!index) return 0;

    const records = vectors.map(({ id, metadata }) => ({
      objectID: id,
      ...metadata,
    }));
    const response = await index.saveObjects(records);
    return response.objectIDs.length;
  }

  /**
   * Searches the index using the embedding values joined by commas as the
   * query string, and maps every hit back to document metadata.
   *
   * @returns the mapped hits, or an empty list when no index is available.
   */
  public async query({ embedding }: IQueryOptions): Promise<IMetaData[]> {
    const index = this.getIndex();
    if (!index) return [];

    const response = await index.search<IMetaDataHit>(embedding.join(','));
    return response.hits.map(
      ({ filePath, url, content, title, metadata }) => ({
        filePath,
        url,
        content,
        title,
        ...metadata,
      }),
    );
  }
}
127 changes: 127 additions & 0 deletions packages/tools/docs-tools/src/ingestion/cli/command/ingest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import * as fs from 'fs';
import path from 'node:path';
import type { IMetaData } from '../../shared/index.js';
import { CHUNK_SIZE } from '../../shared/index.js';
import { Algolia } from '../client/algolia.js';
import { fetchDocuments, sources } from '../fetcher/index.js';
import { parseDocument } from '../parser/index.js';
import { generateId } from '../util/crypto.js';
import ora from '../util/ora.js';
import { getInitUsage } from '../util/usage.js';

/** Ingestion targets selectable through the `db` option, keyed by name. */
const targets = {
  Algolia: Algolia,
};

/** Options accepted by `ingest`. */
interface IOptions {
  /** Patterns or URLs handed to the selected fetcher. */
  sourceIdentifiers: string[];
  /** Patterns forwarded to the fetcher as its `ignore` option. */
  ignore: string[];
  /** Namespace forwarded to the target's `upsertVectors` call. */
  namespace: string;
  /** When true, documents are fetched and parsed but nothing is upserted. */
  isDryRun: boolean;
  /** Name of the fetcher to use; must be a key of `sources`. */
  source?: string;
  /** Name of the target to write to; must be a key of `targets`. */
  db?: string;
}

/**
 * Type guard: true when `source` names one of the registered fetchers.
 *
 * Uses an own-property check rather than `in`, because `in` also matches
 * inherited keys such as "constructor" or "toString", which would pass
 * validation and then fail when the fetcher is looked up.
 */
const isValidSource = (source?: string): source is keyof typeof sources =>
  source !== undefined &&
  Object.prototype.hasOwnProperty.call(sources, source);

/**
 * Type guard: true when `target` names one of the registered ingestion
 * targets. Own-property check for the same reason as `isValidSource`.
 */
const isValidTarget = (target?: string): target is keyof typeof targets =>
  target !== undefined &&
  Object.prototype.hasOwnProperty.call(targets, target);

/**
 * Fetches documents from `source`, splits each one into sections with
 * `parseDocument`, and upserts one record per section into the `db` target.
 *
 * A summary line (files fetched / vectors upserted) is always printed once
 * processing has started, whether it succeeded or failed.
 *
 * @param options - see `IOptions`; with `isDryRun` set, documents are parsed
 *   and logged but nothing is written or upserted.
 * @throws Error when `source` or `db` is not a registered name, when no files
 *   match, or when parsing/upserting fails. In the last case the spinner is
 *   marked as failed first and the original error is rethrown, so callers
 *   never mistake a failed run for a successful one.
 */
export const ingest = async (options: IOptions): Promise<void> => {
  const { source, sourceIdentifiers, ignore, db, namespace, isDryRun } =
    options;

  if (!isValidSource(source)) throw new Error(`Invalid --source: ${source}`);
  if (!isValidTarget(db)) throw new Error(`Invalid --db: ${db}`);

  const fetchSpinner = ora(`Fetching files`).start();
  const files = await fetchDocuments(source, sourceIdentifiers, { ignore });
  fetchSpinner.succeed();

  if (files.length === 0) {
    throw new Error(
      `Unable to find files to ingest (source: ${source}, patterns: ${sourceIdentifiers.join(',')})`,
    );
  }

  const upsertSpinner = ora('Creating and upserting vector embeddings').start();

  const counters = {
    files: files.length,
    vectors: 0,
    usage: getInitUsage(),
  };

  try {
    // Constructed inside the try so a configuration error (e.g. missing
    // credentials) also stops the spinner and prints the summary.
    const DB = new targets[db]();

    for (const file of files) {
      const { content, url, filePath } = file;

      if (!content) continue;

      upsertSpinner.text = `Creating and upserting vector embedding for: ${filePath}`;

      const { title, sections } = await parseDocument(
        filePath,
        content,
        CHUNK_SIZE,
      );

      // NOTE(review): looks like debug output, but it is also the only
      // visible result of a dry run — confirm before removing.
      console.log({
        title,
        sections,
      });

      if (isDryRun) continue;

      const vectors = sections.map((section) => {
        // The id is derived from the file path plus the trimmed section text.
        const id = generateId(`${filePath}\n${section.content.trim()}`);
        const metadata: IMetaData = {
          title: title || '',
          url,
          filePath,
          content: section.content,
          header: section.header,
          tags: section.tags,
        };
        // NOTE(review): debug logging left in place — confirm and remove.
        console.log('metadata: ', metadata);
        return { id, metadata };
      });

      // NOTE(review): debug logging left in place — confirm and remove.
      console.log('vectors: ', vectors);

      // NOTE(review): every file's records are also written to ./dump under
      // the current working directory; presumably a debugging aid — confirm
      // whether it should survive.
      const fileDest = path.join('./dump', `${filePath}.json`);
      fs.mkdirSync(path.dirname(fileDest), { recursive: true });
      fs.writeFileSync(fileDest, JSON.stringify(vectors), 'utf-8');

      const insertedVectorCount = await DB.upsertVectors({
        namespace,
        vectors,
      });

      counters.vectors += insertedVectorCount;
    }

    upsertSpinner.succeed('Creating and upserting vectors');
  } catch (error) {
    upsertSpinner.fail(error instanceof Error ? error.message : String(error));
    // Propagate the failure: swallowing it here made callers report success.
    throw error;
  } finally {
    const messages = [
      `Fetched ${counters.files} file(s) from ${source}`,
      `upserted ${counters.vectors} vectors to ${db}`,
    ];
    ora(messages.join(', ')).info();
  }
};
11 changes: 11 additions & 0 deletions packages/tools/docs-tools/src/ingestion/cli/env.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import * as dotenv from 'dotenv';
import { get } from './util/storage.js';

// Load a local `.env` file (if any) into `process.env` before reading values.
dotenv.config();

const { env } = process;

// Each setting prefers the process environment and otherwise falls back to
// whatever the storage helper returns for the same key in its 'env' section.

/** Algolia application id. */
export const ALGOLIA_APP_ID =
  env.ALGOLIA_APP_ID ?? get('env', 'ALGOLIA_APP_ID');

/** Algolia API key. */
export const ALGOLIA_API_KEY =
  env.ALGOLIA_API_KEY ?? get('env', 'ALGOLIA_API_KEY');

/** Name of the Algolia index to read from and write to. */
export const ALGOLIA_INDEX_NAME =
  env.ALGOLIA_INDEX_NAME ?? get('env', 'ALGOLIA_INDEX_NAME');
19 changes: 19 additions & 0 deletions packages/tools/docs-tools/src/ingestion/cli/fetcher/fs.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import fg from 'fast-glob';
import fs from 'node:fs/promises';
import type { FetchFiles } from '../types.js';

/** One local file: its path, an (empty) URL, and its raw bytes. */
interface IFileData {
  filePath: string;
  url: string;
  content: Buffer;
}

/**
 * Reads a single file from disk.
 *
 * @param filePath - path of the file to read; it is also logged.
 * @returns the path, an empty `url`, and the file contents as a Buffer.
 */
async function getFileData(filePath: string): Promise<IFileData> {
  console.log('files: ', filePath);
  const content = await fs.readFile(filePath);
  return { filePath, url: '', content };
}

/**
 * Expands the glob `patterns` (minus the `ignore` patterns) and reads every
 * matching file, all reads being started together.
 */
export const fetchFiles: FetchFiles = async (patterns, { ignore }) => {
  const matchedPaths = await fg(patterns, { ignore });
  const pendingReads = matchedPaths.map((matchedPath) =>
    getFileData(matchedPath),
  );
  return Promise.all(pendingReads);
};
21 changes: 21 additions & 0 deletions packages/tools/docs-tools/src/ingestion/cli/fetcher/http.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import type { FetchFiles } from '../types.js';

/**
 * Downloads every URL in `urls` (all requests started together) and returns
 * one entry per URL with the response body as a Buffer.
 *
 * `filePath` is the URL's pathname, forced to end in `.html`: a trailing
 * `.htm`/`.html` is replaced, otherwise `.html` is appended.
 *
 * NOTE(review): a non-OK response is only logged; its body is still returned
 * as content — confirm this is intended.
 */
export const fetchFiles: FetchFiles = async (urls) => {
  const downloads = urls.map(async (url) => {
    const parsed = new URL(url);
    const response = await fetch(parsed);

    if (!response.ok) {
      console.error(`${response.status} ${response.statusText}: ${url}`);
    }

    const body = await response.arrayBuffer();

    return {
      filePath: parsed.pathname.replace(/(\.html?)?$/, '.html'),
      url,
      content: Buffer.from(body),
    };
  });

  return Promise.all(downloads);
};
16 changes: 16 additions & 0 deletions packages/tools/docs-tools/src/ingestion/cli/fetcher/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import type { IFile } from '../types.js';
import * as fs from './fs.js';
import * as http from './http.js';

/** Available document fetchers, keyed by the name used to select them. */
export const sources = {
  fs: fs,
  http: http,
} as const;

/**
 * Fetches documents using the fetcher registered under `source`.
 *
 * @param source - key of `sources` selecting the fetcher module.
 * @param identifiers - patterns or URLs passed through to the fetcher.
 * @param options - fetcher options; currently only `ignore`.
 * @returns the files produced by the selected fetcher's `fetchFiles`.
 */
export const fetchDocuments = async (
  source: keyof typeof sources,
  identifiers: string[],
  options: { ignore: string[] },
): Promise<IFile[]> => {
  const { fetchFiles } = sources[source];
  return fetchFiles(identifiers, options);
};
Loading
Loading