Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] feat: Voyage embeddings #1574

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/tender-candles-shop.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@llamaindex/voyage-ai": major
---

Adding VoyageAI embedding package
8 changes: 8 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@ pnpm install

### Build the packages

You'll need Turbo to build the packages. If you don't have it, you can install it by running:

```shell
pnpm install turbo --global
```

To build all packages, run:

```shell
# Build all packages
turbo build --filter "./packages/*"
Expand Down
17 changes: 17 additions & 0 deletions examples/voyage-ai/embedding.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { VoyageAIEmbedding } from "llamaindex";

/**
 * Example entry point: embeds two sample strings with a VoyageAI embedding
 * model and prints how many embeddings were returned.
 */
async function main() {
  // The token is taken from the VOYAGE_API_TOKEN environment variable when it
  // is set; otherwise the placeholder string below is used.
  const embedModel = new VoyageAIEmbedding({
    model: "voyage-3-lite",
    apiKey: process.env.VOYAGE_API_TOKEN ?? "YOUR_API_TOKEN",
  });

  const embeddings = await embedModel.getTextEmbeddingsBatch([
    "hello",
    "world",
  ]);
  console.log(`\nWe have ${embeddings.length} embeddings`);
}

main().catch(console.error);
40 changes: 40 additions & 0 deletions packages/providers/voyage-ai/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{
"name": "@llamaindex/voyage-ai",
"description": "VoyageAI Adapter for LlamaIndex",
"version": "0.0.1",
"type": "module",
"main": "./dist/index.cjs",
"module": "./dist/index.js",
"exports": {
".": {
"require": {
"types": "./dist/index.d.cts",
"default": "./dist/index.cjs"
},
"import": {
"types": "./dist/index.d.ts",
"default": "./dist/index.js"
}
}
},
"files": [
"dist"
],
"repository": {
"type": "git",
"url": "https://github.com/run-llama/LlamaIndexTS.git",
"directory": "packages/providers/voyage-ai"
},
"scripts": {
"build": "bunchee",
"dev": "bunchee --watch"
},
"devDependencies": {
"bunchee": "6.0.3"
},
"dependencies": {
"@llamaindex/core": "workspace:*",
"@llamaindex/env": "workspace:*",
"voyageai": "0.0.3-1"
}
}
137 changes: 137 additions & 0 deletions packages/providers/voyage-ai/src/embedding.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import { BaseEmbedding } from "@llamaindex/core/embeddings";
import type { MessageContentDetail } from "@llamaindex/core/llms";
import { extractSingleText } from "@llamaindex/core/utils";
import { getEnv } from "@llamaindex/env";
import { VoyageAI, VoyageAIClient } from "voyageai";

// Model used when the constructor is not given one.
const DEFAULT_MODEL = "voyage-3";
// Environment variable read for the API token when no `apiKey` is passed to the constructor.
const API_TOKEN_ENV_VARIABLE_NAME = "VOYAGE_API_TOKEN";
// const API_ROOT = "https://api.voyageai.com/v1/embeddings";
// Default for the `timeout` option; 60 * 1000 reads as 60 seconds in milliseconds — NOTE(review): confirm the unit.
const DEFAULT_TIMEOUT = 60 * 1000;
// Default for the `maxRetries` option.
const DEFAULT_MAX_RETRIES = 5;

/**
 * Embedding model that wraps the VoyageAI client and exposes it through the
 * LlamaIndex `BaseEmbedding` interface.
 */
export class VoyageAIEmbedding extends BaseEmbedding {
  /**
   * VoyageAI model to use
   * @default "voyage-3"
   * @see https://docs.voyageai.com/docs/embeddings
   */
  model: string;

  /**
   * VoyageAI API token
   * @see https://docs.voyageai.com/docs/api-key-and-installation
   * If not provided, it is read from the environment variable `VOYAGE_API_TOKEN`.
   * Falls back to an empty string when neither is available.
   */
  apiKey: string;

  /**
   * Maximum number of retries, forwarded to the VoyageAI client on every request
   * @default 5
   */
  maxRetries: number;

  /**
   * Request timeout in milliseconds. It is converted to seconds before being
   * forwarded to the VoyageAI client on every request.
   * @default 60000
   */
  timeout: number;

  /**
   * Whether to truncate the input texts to fit within the context length. Defaults to `true`.
   * If `true`, over-length input texts will be truncated to fit within the context length, before vectorized by the embedding model.
   * If `false`, an error will be raised if any given text exceeds the context length.
   */
  truncation: boolean;

  /**
   * VoyageAI supports `document` and `query` as input types, or it can be left undefined. Using an input type prepends the input with a prompt before embedding.
   * Example from their docs: using "query" adds "Represent the query for retrieving supporting documents:"
   * VoyageAI says these types improve performance, but it will add to token usage. Embeddings with input types are compatible with those that don't use them.
   * Setting this to `query` will use the `query` input type for getQueryEmbedding(s).
   * Setting this to `document` will use the `document` input type for getTextEmbedding(s).
   * Setting this to `both` will do both of the above.
   * By default, this is undefined, which means no input types are used.
   * @see https://docs.voyageai.com/docs/embeddings
   * @default undefined
   */
  useInputTypes: "query" | "document" | "both" | undefined;

  /**
   * VoyageAI client
   */
  client: VoyageAIClient;

  /**
   * @param init Optional overrides; any field left out falls back to its default.
   */
  constructor(init?: Partial<VoyageAIEmbedding>) {
    super();

    this.model = init?.model ?? DEFAULT_MODEL;
    this.apiKey = init?.apiKey ?? getEnv(API_TOKEN_ENV_VARIABLE_NAME) ?? "";
    this.maxRetries = init?.maxRetries ?? DEFAULT_MAX_RETRIES;
    this.timeout = init?.timeout ?? DEFAULT_TIMEOUT;
    this.truncation = init?.truncation ?? true;
    this.useInputTypes = init?.useInputTypes;
    this.client = new VoyageAIClient({
      apiKey: this.apiKey,
    });
  }

  /**
   * Embeds a single text as a document.
   * @throws Error if the API response contains no embedding.
   */
  async getTextEmbedding(text: string): Promise<number[]> {
    return this.getSingleEmbedding(text, "document");
  }

  /**
   * Embeds a single query.
   * @returns The embedding, or `null` when no text can be extracted from `query`.
   * @throws Error if the API response contains no embedding.
   */
  async getQueryEmbedding(
    query: MessageContentDetail,
  ): Promise<number[] | null> {
    const text = extractSingleText(query);
    if (!text) {
      return null;
    }
    return this.getSingleEmbedding(text, "query");
  }

  /** Embeds several texts as documents in one request. */
  getTextEmbeddings = async (texts: string[]): Promise<number[][]> => {
    return this.getVoyageAIEmbedding(texts, "document");
  };

  /** Embeds several queries in one request. */
  async getQueryEmbeddings(queries: string[]): Promise<number[][]> {
    return this.getVoyageAIEmbedding(queries, "query");
  }

  /**
   * Returns `requestType` when `useInputTypes` enables it ("both" or an exact
   * match), otherwise `undefined` so that no input type is sent.
   */
  private getInputType(requestType: "query" | "document") {
    const enabled =
      this.useInputTypes === "both" || this.useInputTypes === requestType;
    return enabled ? requestType : undefined;
  }

  /**
   * Embeds one text and returns its vector, failing loudly instead of handing
   * `undefined` back to the caller when the response is empty.
   */
  private async getSingleEmbedding(
    text: string,
    requestType: "query" | "document",
  ): Promise<number[]> {
    const [embedding] = await this.getVoyageAIEmbedding([text], requestType);
    if (!embedding) {
      throw new Error("Failed to get embeddings from VoyageAI");
    }
    return embedding;
  }

  /**
   * Sends one embed request to VoyageAI.
   * @param inputs Text(s) to embed.
   * @param inputType Kind of request; only sent when `useInputTypes` enables it.
   * @returns One vector per returned item (an empty array for an item without an embedding).
   * @throws Error if the response carries no `data`.
   */
  private async getVoyageAIEmbedding(
    inputs: VoyageAI.EmbedRequestInput,
    inputType: VoyageAI.EmbedRequestInputType,
  ): Promise<number[][]> {
    const request: VoyageAI.EmbedRequest = {
      model: this.model,
      input: inputs,
      truncation: this.truncation,
    };
    const preferredInputType = this.getInputType(inputType);
    if (preferredInputType) {
      request.inputType = preferredInputType;
    }
    // Forward the configured timeout and retry budget; the client expects the
    // timeout in seconds while `this.timeout` is kept in milliseconds.
    const response = await this.client.embed(request, {
      timeoutInSeconds: this.timeout / 1000,
      maxRetries: this.maxRetries,
    });
    if (!response.data) {
      throw new Error("Failed to get embeddings from VoyageAI");
    }
    return response.data.map((item) => item.embedding ?? []);
  }
}
1 change: 1 addition & 0 deletions packages/providers/voyage-ai/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export { VoyageAIEmbedding } from "./embedding";
19 changes: 19 additions & 0 deletions packages/providers/voyage-ai/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
  "extends": "../../../tsconfig.json",
  "compilerOptions": {
    "target": "ESNext",
    "module": "ESNext",
    "moduleResolution": "bundler",
    "outDir": "./lib",
    "tsBuildInfoFile": "./lib/.tsbuildinfo"
  },
  "include": ["./src"],
  "references": [
    {
      "path": "../../core/tsconfig.json"
    },
    {
      "path": "../../env/tsconfig.json"
    }
  ]
}
33 changes: 33 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.