diff --git a/chromadb/test/ef/test_voyage_multimodal.py b/chromadb/test/ef/test_voyage_multimodal.py new file mode 100644 index 00000000000..467ea111a32 --- /dev/null +++ b/chromadb/test/ef/test_voyage_multimodal.py @@ -0,0 +1,156 @@ +import os +from typing import Generator +import numpy as np +import pytest +import chromadb +from chromadb.api.types import ( + Embeddable, + EmbeddingFunction, +) +from chromadb.test.property.invariants import _exact_distances +from chromadb.config import Settings +from chromadb.utils.embedding_functions import VoyageAIEmbeddingFunction +from chromadb.test.ef.test_multimodal_ef import random_image, random_document + + +@pytest.fixture +def multimodal_collection() -> Generator[chromadb.Collection, None, None]: + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + + ef: EmbeddingFunction[Embeddable] = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-multimodal-3", + ) + + settings = Settings() + if os.environ.get("CHROMA_INTEGRATION_TEST_ONLY"): + host = os.environ.get("CHROMA_SERVER_HOST", "localhost") + port = int(os.environ.get("CHROMA_SERVER_HTTP_PORT", 0)) + settings.chroma_api_impl = "chromadb.api.fastapi.FastAPI" + settings.chroma_server_http_port = port + settings.chroma_server_host = host + + client = chromadb.Client(settings=settings) + collection = client.create_collection( + name="multimodal_collection", embedding_function=ef + ) + yield collection + client.clear_system_cache() + + +# Test adding and querying of a multimodal collection consisting of images and documents +def test_multimodal( + multimodal_collection: chromadb.Collection, + n_examples: int = 10, + n_query_results: int = 3, +) -> None: + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + + ef: EmbeddingFunction[Embeddable] = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-multimodal-3", + ) + # Fix numpy's random seed for reproducibility + random_state = np.random.get_state() + np.random.seed(0) + + image_ids = [str(i) for i in range(n_examples)] + images = [random_image() for _ in range(n_examples)] + image_embeddings = ef(images) + + document_ids = [str(i) for i in range(n_examples, 2 * n_examples)] + documents = [random_document() for _ in range(n_examples)] + document_embeddings = ef(documents) + + # Trying to add a document and an image at the same time should fail + with pytest.raises( + ValueError, + # This error string may be in any order + match=r"Exactly one of (images|documents|uris)(?:, (images|documents|uris))?(?:, (images|documents|uris))? must be provided in add\.", + ): + multimodal_collection.add( + ids=image_ids[0], documents=documents[0], images=images[0] + ) + + # Add some documents + multimodal_collection.add(ids=document_ids, documents=documents) + # Add some images + multimodal_collection.add(ids=image_ids, images=images) + + # get() should return all the documents and images + # ids corresponding to images should not have documents + get_result = multimodal_collection.get(include=["documents"]) + assert len(get_result["ids"]) == len(document_ids) + len(image_ids) + for i, id in enumerate(get_result["ids"]): + assert id in document_ids or id in image_ids + assert get_result["documents"] is not None + if id in document_ids: + assert get_result["documents"][i] == documents[document_ids.index(id)] + if id in image_ids: + assert get_result["documents"][i] is None + + # Generate a random query image + query_image = random_image() + query_image_embedding = ef([query_image]) + + image_neighbor_indices, _ = _exact_distances( + query_image_embedding, image_embeddings + document_embeddings + ) + # Get the ids of the nearest neighbors + nearest_image_neighbor_ids = [ + image_ids[i] if i < n_examples else document_ids[i % n_examples] + for i in image_neighbor_indices[0][:n_query_results] + ] + + # Generate a random query document + query_document = random_document() + query_document_embedding = ef([query_document]) + document_neighbor_indices, _ = _exact_distances( + query_document_embedding, image_embeddings + document_embeddings + ) + nearest_document_neighbor_ids = [ + image_ids[i] if i < n_examples else document_ids[i % n_examples] + for i in document_neighbor_indices[0][:n_query_results] + ] + + # Querying with both images and documents should fail + with pytest.raises(ValueError): + multimodal_collection.query( + query_images=[query_image], query_texts=[query_document] + ) + + # Query with images + query_result = multimodal_collection.query( + query_images=[query_image], n_results=n_query_results, include=["documents"] + ) + + assert query_result["ids"][0] == nearest_image_neighbor_ids + + # Query with documents + query_result = multimodal_collection.query( + query_texts=[query_document], n_results=n_query_results, include=["documents"] + ) + + assert query_result["ids"][0] == nearest_document_neighbor_ids + np.random.set_state(random_state) + + +@pytest.mark.xfail +def test_multimodal_update_with_image( + multimodal_collection: chromadb.Collection, +) -> None: + # Updating an entry with an existing document should remove the documents + + document = random_document() + image = random_image() + id = "0" + + multimodal_collection.add(ids=id, documents=document) + + multimodal_collection.update(ids=id, images=image) + + get_result = multimodal_collection.get(ids=id, include=["documents"]) + assert get_result["documents"] is not None + assert get_result["documents"][0] is None diff --git a/chromadb/test/ef/test_voyageai_ef.py b/chromadb/test/ef/test_voyageai_ef.py index bd0976aeee3..15cbbc79fb0 100644 --- a/chromadb/test/ef/test_voyageai_ef.py +++ b/chromadb/test/ef/test_voyageai_ef.py @@ -11,9 +11,285 @@ def test_with_embedding_dimensions() -> None: if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: pytest.skip("CHROMA_VOYAGE_API_KEY not set") ef = VoyageAIEmbeddingFunction( - api_key=os.environ["CHROMA_VOYAGE_API_KEY"] + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-3.5", + dimensions=2048, ) embeddings = ef(["hello world"]) assert embeddings is not None assert len(embeddings) == 1 - assert len(embeddings[0]) == 1536 + assert len(embeddings[0]) == 2048 + + +def test_with_multimodal_embeddings() -> None: + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-multimodal-3", + ) + embeddings = ef(["hello world"]) + assert embeddings is not None + assert len(embeddings) == 1 + assert len(embeddings[0]) == 1024 + + +def test_with_multimodal_image_embeddings() -> None: + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-multimodal-3", + ) + embeddings = ef(["hello world"]) + assert embeddings is not None + assert len(embeddings) == 1 + assert len(embeddings[0]) == 1024 + + +def test_with_multimodal_mixed_embeddings() -> None: + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-multimodal-3", + ) + embeddings = ef(["hello world"]) + assert embeddings is not None + assert len(embeddings) == 1 + assert len(embeddings[0]) == 1024 + + +def test_with_contextual_embedding() -> None: + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-context-3", + dimensions=2048, + ) + embeddings = ef(["hello world", "in chroma"]) + assert embeddings is not None + assert len(embeddings) == 2 + assert len(embeddings[0]) == 2048 + + +def test_count_tokens() -> None: + """Test token counting functionality.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-3", + ) + texts = ["hello world", "this is a longer text with more tokens"] + token_counts = ef.count_tokens(texts) + assert len(token_counts) == 2 + assert token_counts[0] > 0 + assert token_counts[1] > token_counts[0] # Longer text should have more tokens + + +def test_count_tokens_empty_list() -> None: + """Test token counting with empty list.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-3", + ) + token_counts = ef.count_tokens([]) + assert token_counts == [] + + +def test_count_tokens_single_text() -> None: + """Test token counting with single text.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-2", + ) + token_counts = ef.count_tokens(["hello"]) + assert len(token_counts) == 1 + assert token_counts[0] > 0 + + +def test_get_token_limit() -> None: + """Test getting token limit for different models.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + + # Test voyage-2 model + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-2", + ) + assert ef.get_token_limit() == 320_000 + + # Test context model + ef_context = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-context-3", + ) + assert ef_context.get_token_limit() == 32_000 + + # Test voyage-3-large model + ef_large = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-3-large", + ) + assert ef_large.get_token_limit() == 120_000 + + +def test_token_counting_with_multimodal() -> None: + """Test that token counting works with multimodal model.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-multimodal-3", + ) + texts = ["hello world", "test text"] + token_counts = ef.count_tokens(texts) + assert len(token_counts) == 2 + assert all(count > 0 for count in token_counts) + + +def test_batching_with_batch_size() -> None: + """Test that batching works with explicit batch_size parameter.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-3", + batch_size=2, + ) + texts = ["text1", "text2", "text3", "text4", "text5"] + embeddings = ef(texts) + assert len(embeddings) == 5 + assert all(len(emb) > 0 for emb in embeddings) + + +def test_build_batches() -> None: + """Test the _build_batches method.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-2", + batch_size=2, + ) + texts = ["short", "text", "here", "now"] + batches = list(ef._build_batches(texts)) + # Should create 2 batches of 2 texts each + assert len(batches) == 2 + assert len(batches[0]) == 2 + assert len(batches[1]) == 2 + + +def test_batching_with_large_texts() -> None: + """Test batching with texts that exceed token limits.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-3", + ) + # Create long texts + long_text = "This is a long text with many words. " * 100 + texts = [long_text, long_text, long_text] + embeddings = ef(texts) + assert len(embeddings) == 3 + assert all(len(emb) > 0 for emb in embeddings) + + +def test_config_includes_batch_size() -> None: + """Test that config includes batch_size parameter.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-3", + batch_size=10, + ) + config = ef.get_config() + assert "batch_size" in config + assert config["batch_size"] == 10 + + +def test_contextual_batching() -> None: + """Test that contextual models support batching.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-context-3", + batch_size=2, + ) + texts = ["text1", "text2", "text3", "text4"] + embeddings = ef(texts) + assert len(embeddings) == 4 + assert all(len(emb) > 0 for emb in embeddings) + + +def test_contextual_build_batches() -> None: + """Test that contextual models use _build_batches correctly.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-context-3", + batch_size=3, + ) + texts = ["short", "text", "here", "now", "more"] + batches = list(ef._build_batches(texts)) + # Should create batches respecting batch_size=3 + assert len(batches) >= 2 + # First batch should have at most 3 items + assert len(batches[0]) <= 3 + + +def test_multimodal_text_only_batching() -> None: + """Test that multimodal models support batching for text-only inputs.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-multimodal-3", + batch_size=2, + ) + texts = ["text1", "text2", "text3", "text4", "text5"] + embeddings = ef(texts) + assert len(embeddings) == 5 + assert all(len(emb) > 0 for emb in embeddings) + + +def test_contextual_with_large_batch() -> None: + """Test contextual model with large batch that should be split.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-context-3", + batch_size=5, + ) + # Create many texts + texts = [f"Document number {i} with some content" for i in range(15)] + embeddings = ef(texts) + assert len(embeddings) == 15 + assert all(len(emb) > 0 for emb in embeddings) + + +def test_multimodal_text_with_large_batch() -> None: + """Test multimodal model with large text batch that should be split.""" + if os.environ.get("CHROMA_VOYAGE_API_KEY") is None: + pytest.skip("CHROMA_VOYAGE_API_KEY not set") + ef = VoyageAIEmbeddingFunction( + api_key=os.environ["CHROMA_VOYAGE_API_KEY"], + model_name="voyage-multimodal-3", + batch_size=3, + ) + texts = [f"Text content {i}" for i in range(10)] + embeddings = ef(texts) + assert len(embeddings) == 10 + assert all(len(emb) > 0 for emb in embeddings) diff --git a/chromadb/utils/embedding_functions/voyageai_embedding_function.py b/chromadb/utils/embedding_functions/voyageai_embedding_function.py index 94a7051e46e..c925d1a0c27 100644 --- a/chromadb/utils/embedding_functions/voyageai_embedding_function.py +++ b/chromadb/utils/embedding_functions/voyageai_embedding_function.py @@ -1,37 +1,76 @@ -from chromadb.api.types import EmbeddingFunction, Space, Embeddings, Documents +from chromadb.api.types import ( + EmbeddingFunction, + Space, + Embeddings, + Embeddable, + Image, + Document, + is_image, + is_document, +) + from chromadb.utils.embedding_functions.schemas import validate_config_schema -from typing import List, Dict, Any, Optional +from typing import List, Dict, Any, Optional, Union, Generator, Callable import os import numpy as np import warnings +import importlib + +# Token limits for different VoyageAI models +VOYAGE_TOTAL_TOKEN_LIMITS = { + "voyage-context-3": 32_000, + "voyage-3.5-lite": 1_000_000, + "voyage-3.5": 320_000, + "voyage-2": 320_000, + "voyage-3-large": 120_000, + "voyage-code-3": 120_000, + "voyage-large-2-instruct": 120_000, + "voyage-finance-2": 120_000, + "voyage-multilingual-2": 120_000, + "voyage-law-2": 120_000, + "voyage-large-2": 120_000, + "voyage-3": 120_000, + "voyage-3-lite": 120_000, + "voyage-code-2": 120_000, + "voyage-3-m-exp": 120_000, + "voyage-multimodal-3": 120_000, +} -class VoyageAIEmbeddingFunction(EmbeddingFunction[Documents]): +class VoyageAIEmbeddingFunction(EmbeddingFunction[Embeddable]): """ This class is used to generate embeddings for a list of texts using the VoyageAI API. """ def __init__( self, + model_name: str, api_key: Optional[str] = None, - model_name: str = "voyage-large-2", api_key_env_var: str = "CHROMA_VOYAGE_API_KEY", input_type: Optional[str] = None, truncation: bool = True, + dimensions: Optional[int] = None, + embedding_type: Optional[str] = None, + batch_size: Optional[int] = None, ): """ Initialize the VoyageAIEmbeddingFunction. Args: + model_name (str): The name of the model to use for text embeddings. api_key_env_var (str, optional): Environment variable name that contains your API key for the VoyageAI API. Defaults to "CHROMA_VOYAGE_API_KEY". - model_name (str, optional): The name of the model to use for text embeddings. - Defaults to "voyage-large-2". api_key (str, optional): API key for the VoyageAI API. If not provided, will look for it in the environment variable. input_type (str, optional): The type of input to use for the VoyageAI API. Defaults to None. truncation (bool): Whether to truncate the input text. Defaults to True. + dimensions (int, optional): The output dimension for embeddings. + Defaults to None. + embedding_type (str, optional): The embedding type. + Defaults to None. + batch_size (int, optional): Maximum number of texts to embed in a single batch. + Defaults to None (no limit). """ try: import voyageai @@ -40,6 +79,13 @@ def __init__( "The voyageai python package is not installed. Please install it with `pip install voyageai`" ) + try: + self._PILImage = importlib.import_module("PIL.Image") + except ImportError: + raise ValueError( + "The PIL python package is not installed. Please install it with `pip install pillow`" + ) + if api_key is not None: warnings.warn( "Direct api_key configuration will not be persisted. " @@ -52,32 +98,214 @@ def __init__( if not self.api_key: raise ValueError(f"The {api_key_env_var} environment variable is not set.") + # Validate model_name + if not model_name or not model_name.strip(): + raise ValueError("model_name cannot be None or empty") + self.model_name = model_name self.input_type = input_type self.truncation = truncation + self.dimensions = dimensions + self.embedding_type = embedding_type + self.batch_size = batch_size self._client = voyageai.Client(api_key=self.api_key) - def __call__(self, input: Documents) -> Embeddings: + def __call__(self, input: Embeddable) -> Embeddings: """ - Generate embeddings for the given documents. + Generate embeddings for the given documents or images. Args: - input: Documents to generate embeddings for. + input: Documents or images to generate embeddings for. Returns: - Embeddings for the documents. + Embeddings for the documents or images. """ - embeddings = self._client.embed( - texts=input, - model=self.model_name, - input_type=self.input_type, - truncation=self.truncation, - ) + # Early return for empty input + if not input: + return [] + + # Handle multimodal mixed inputs (images and text) separately - no batching + if self._is_multimodal_model() and not all(isinstance(i, str) for i in input): + embeddings = self._client.multimodal_embed( + inputs=[[self.convert(i)] for i in input], + model=self.model_name, + input_type=self.input_type, + truncation=self.truncation, + ).embeddings + else: + # Use unified batching for all text inputs (regular, context, multimodal text-only) + # Cast to List[str] since we know all inputs are strings at this point + embeddings = self._embed_with_batching(list(input)) # type: ignore[arg-type] # Convert to numpy arrays - return [ - np.array(embedding, dtype=np.float32) for embedding in embeddings.embeddings - ] + return [np.array(embedding, dtype=np.float32) for embedding in embeddings] + + def _build_batches(self, texts: List[str]) -> Generator[List[str], None, None]: + """ + Generate batches of texts based on token limits using a generator. + + Args: + texts: List of texts to batch. + + Yields: + Batches of texts as lists. + """ + if not texts: + return + + max_tokens_per_batch = self.get_token_limit() + current_batch: List[str] = [] + current_batch_tokens = 0 + + # Tokenize all texts in one API call + all_token_lists = self._client.tokenize(texts, model=self.model_name) + token_counts = [len(tokens) for tokens in all_token_lists] + + for i, text in enumerate(texts): + n_tokens = token_counts[i] + + # Check if adding this text would exceed limits + if current_batch and ( + (self.batch_size and len(current_batch) >= self.batch_size) + or (current_batch_tokens + n_tokens > max_tokens_per_batch) + ): + # Yield the current batch and start a new one + yield current_batch + current_batch = [] + current_batch_tokens = 0 + + current_batch.append(text) + current_batch_tokens += n_tokens + + # Yield the last batch (always has at least one text) + yield current_batch + + def _get_embed_function(self) -> Callable[[List[str]], List[List[float]]]: + """ + Get the appropriate embedding function based on model type. + + Returns: + A callable that takes a batch of texts and returns embeddings. + """ + if self._is_context_model(): + + def embed_batch(batch: List[str]) -> List[List[float]]: + result = self._client.contextualized_embed( + inputs=[batch], + model=self.model_name, + input_type=self.input_type, + output_dimension=self.dimensions, + ) + return list(result.results[0].embeddings) + + return embed_batch + + elif self._is_multimodal_model(): + + def embed_batch(batch: List[str]) -> List[List[float]]: + result = self._client.multimodal_embed( + inputs=[[text] for text in batch], + model=self.model_name, + input_type=self.input_type, + truncation=self.truncation, + ) + return list(result.embeddings) + + return embed_batch + + else: + + def embed_batch(batch: List[str]) -> List[List[float]]: + result = self._client.embed( + texts=batch, + model=self.model_name, + input_type=self.input_type, + truncation=self.truncation, + output_dimension=self.dimensions, + ) + return list(result.embeddings) + + return embed_batch + + def _embed_with_batching(self, texts: List[str]) -> List[List[float]]: + """ + Unified method to embed texts with automatic batching based on token limits. + Works for regular, contextual, and multimodal (text-only) models. + + Args: + texts: List of texts to embed. + + Returns: + List of embeddings. + """ + if not texts: + return [] + + # Get the appropriate embedding function for this model type + embed_fn = self._get_embed_function() + + # Process each batch + all_embeddings = [] + for batch in self._build_batches(texts): + batch_embeddings = embed_fn(batch) + all_embeddings.extend(batch_embeddings) + + return all_embeddings + + def count_tokens(self, texts: List[str]) -> List[int]: + """ + Count tokens for the given texts. + + Args: + texts: List of texts to count tokens for. + + Returns: + List of token counts for each text. + """ + if not texts: + return [] + + # Use the VoyageAI tokenize API to get token counts + token_lists = self._client.tokenize(texts, model=self.model_name) + return [len(token_list) for token_list in token_lists] + + def get_token_limit(self) -> int: + """ + Get the token limit for the current model. + + Returns: + Token limit for the model, or default of 120_000 if not found. + """ + return VOYAGE_TOTAL_TOKEN_LIMITS.get(self.model_name, 120_000) + + def convert(self, embeddable: Union[Image, Document]) -> Any: + if is_document(embeddable): + return embeddable + elif is_image(embeddable): + # Convert to numpy array and ensure proper dtype for PIL + image_array = np.array(embeddable) + + # Convert to uint8 if not already, clipping values to valid range + if image_array.dtype != np.uint8: + # Normalize to 0-255 range if values are outside uint8 range + if image_array.max() > 255 or image_array.min() < 0: + image_array = np.clip(image_array, 0, 255) + image_array = image_array.astype(np.uint8) + + return self._PILImage.fromarray(image_array) + else: + raise ValueError( + f"Unsupported input type: {type(embeddable)}. " + "Expected Document (str) or Image (numpy array)" + ) + + def _is_context_model(self) -> bool: + """Check if the model is a contextualized embedding model.""" + return "context" in self.model_name + + def _is_multimodal_model(self) -> bool: + """Check if the model is a multimodal embedding model.""" + return "multimodal" in self.model_name @staticmethod def name() -> str: @@ -90,11 +318,14 @@ def supported_spaces(self) -> List[Space]: return ["cosine", "l2", "ip"] @staticmethod - def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]": + def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Embeddable]": api_key_env_var = config.get("api_key_env_var") model_name = config.get("model_name") input_type = config.get("input_type") truncation = config.get("truncation") + dimensions = config.get("dimensions") + embedding_type = config.get("embedding_type") + batch_size = config.get("batch_size") if api_key_env_var is None or model_name is None: assert False, "This code should not be reached" @@ -104,6 +335,9 @@ def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]": model_name=model_name, input_type=input_type, truncation=truncation if truncation is not None else True, + dimensions=dimensions, + embedding_type=embedding_type, + batch_size=batch_size, ) def get_config(self) -> Dict[str, Any]: @@ -112,6 +346,9 @@ def get_config(self) -> Dict[str, Any]: "model_name": self.model_name, "input_type": self.input_type, "truncation": self.truncation, + "dimensions": self.dimensions, + "embedding_type": self.embedding_type, + "batch_size": self.batch_size, } def validate_config_update( diff --git a/docs/docs.trychroma.com/markdoc/content/integrations/embedding-models/voyageai.md b/docs/docs.trychroma.com/markdoc/content/integrations/embedding-models/voyageai.md index f93db255b62..32ee27ea529 100644 --- a/docs/docs.trychroma.com/markdoc/content/integrations/embedding-models/voyageai.md +++ b/docs/docs.trychroma.com/markdoc/content/integrations/embedding-models/voyageai.md @@ -5,7 +5,15 @@ name: VoyageAI # VoyageAI -Chroma also provides a convenient wrapper around VoyageAI's embedding API. This embedding function runs remotely on VoyageAI’s servers, and requires an API key. You can get an API key by signing up for an account at [VoyageAI](https://dash.voyageai.com/). +Chroma provides a convenient wrapper around VoyageAI's embedding API. This embedding function runs remotely on VoyageAI's servers, and requires an API key. You can get an API key by signing up for an account at [VoyageAI](https://dash.voyageai.com/). + +VoyageAI offers various embedding models including: +- **General-purpose models** (e.g., `voyage-3-large`, `voyage-3`, `voyage-2`) +- **Contextual embedding models** (e.g., `voyage-context-3`) +- **Multimodal models** (e.g., `voyage-multimodal-3`) +- **Domain-specific models** (e.g., `voyage-code-3`, `voyage-finance-2`, `voyage-law-2`) + +## Basic Usage {% Tabs %} {% Tab label="python" %} @@ -14,8 +22,20 @@ This embedding function relies on the `voyageai` python package, which you can i ```python import chromadb.utils.embedding_functions as embedding_functions -voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction(api_key="YOUR_API_KEY", model_name="voyage-3-large") -voyageai_ef(input=["document1","document2"]) + +# Basic usage with text embeddings +voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction( + api_key="YOUR_API_KEY", # Or use api_key_env_var (default: "CHROMA_VOYAGE_API_KEY") + model_name="voyage-3.5", # Required: model to use + input_type=None, # Optional: input type for the model + truncation=True, # Whether to truncate inputs (default: True) + dimensions=None, # Optional: output dimension for embeddings (e.g., 2048) + embedding_type=None, # Optional: embedding type + batch_size=None # Optional: max batch size for batching (e.g., 10) +) + +# Generate embeddings (supports multilingual text) +embeddings = voyageai_ef(input=["document1", "document2"]) ``` {% /Tab %} @@ -29,7 +49,7 @@ import { VoyageAIEmbeddingFunction } from "@chroma-core/voyageai"; const embedder = new VoyageAIEmbeddingFunction({ apiKey: "apiKey", - modelName: "model_name", + modelName: "voyage-3.5", }); // use directly @@ -50,49 +70,148 @@ const collectionGet = await client.getCollection({ {% /Tabs %} -### Multilingual model example +## Multimodal Embeddings + +VoyageAI's multimodal models (e.g., `voyage-multimodal-3`) can embed both text and images into the same vector space. {% TabbedCodeBlock %} {% Tab label="python" %} +For multimodal embeddings, you'll need to install Pillow: `pip install pillow` + ```python -voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction( - api_key="YOUR_API_KEY", - model_name="voyage-3-large") +import chromadb +import chromadb.utils.embedding_functions as embedding_functions +import numpy as np +from PIL import Image + +# Create multimodal embedding function +voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction( + api_key="YOUR_API_KEY", + model_name="voyage-multimodal-3" +) + +# Embed text documents +text_embeddings = voyageai_ef(["A photo of a cat", "A photo of a dog"]) + +# Embed images (as numpy arrays) +image = np.array(Image.open("path/to/image.jpg")) +image_embeddings = voyageai_ef([image]) + +# You can query images with text or vice versa +client = chromadb.Client() +collection = client.create_collection( + name="multimodal_collection", + embedding_function=voyageai_ef +) + +# Add text documents +collection.add( + ids=["doc1", "doc2"], + documents=["A photo of a cat", "A photo of a dog"] +) + +# Add images +collection.add( + ids=["img1"], + images=[image] +) + +# Query with text to find similar images or documents +results = collection.query( + query_texts=["feline animal"], + n_results=2 +) +``` + +{% /Tab %} + +{% /TabbedCodeBlock %} + +## Contextual Embeddings -multilingual_texts = [ 'Hello from VoyageAI!', 'مرحباً من VoyageAI!!', - 'Hallo von VoyageAI!', 'Bonjour de VoyageAI!', - '¡Hola desde VoyageAI!', 'Olá do VoyageAI!', - 'Ciao da VoyageAI!', '您好,来自 VoyageAI!', - 'कोहिअर से VoyageAI!' ] +VoyageAI's contextual models (e.g., `voyage-context-3`) generate embeddings that take into account the context of the entire batch. This is particularly useful when embedding related documents where understanding the relationships between them improves semantic search quality. -voyageai_ef(input=multilingual_texts) +{% TabbedCodeBlock %} + +{% Tab label="python" %} +```python +import chromadb.utils.embedding_functions as embedding_functions +import chromadb + +# Create contextual embedding function +voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction( + api_key="YOUR_API_KEY", + model_name="voyage-context-3", + dimensions=2048 # voyage-context-3 supports custom dimensions +) + +# Example: Using contextual embeddings with a collection +client = chromadb.Client() +collection = client.create_collection( + name="contextual_docs", + embedding_function=voyageai_ef +) + +# Add related documents - they will be embedded with contextual awareness +documents = [ + "Python is a high-level programming language.", + "Python is also a type of snake found in tropical regions.", + "Java is an island in Indonesia.", + "Java is a popular object-oriented programming language.", + "The Great Barrier Reef is located off the coast of Australia." +] + +collection.add( + ids=[f"doc{i}" for i in range(len(documents))], + documents=documents +) + +# Query for programming-related content +results = collection.query( + query_texts=["programming languages"], + n_results=3 +) + +# The contextual embeddings help distinguish between different meanings +# of "Python" and "Java" based on surrounding context ``` +**Use Cases for Contextual Embeddings:** +- Embedding chapters from the same book +- Processing related articles or research papers +- Handling documents with ambiguous terms that need context +- Creating embeddings for conversations or threaded discussions + {% /Tab %} -{% Tab label="typescript" %} +{% /TabbedCodeBlock %} -```typescript -import { VoyageAIEmbeddingFunction } from "chromadb"; - -const embedder = new VoyageAIEmbeddingFunction("apiKey", "voyage-3-large"); - -multilingual_texts = [ - "Hello from VoyageAI!", - "مرحباً من VoyageAI!!", - "Hallo von VoyageAI!", - "Bonjour de VoyageAI!", - "¡Hola desde VoyageAI!", - "Olá do VoyageAI!", - "Ciao da VoyageAI!", - "您好,来自 VoyageAI!", - "कोहिअर से VoyageAI!", -]; - -const embeddings = embedder.generate(multilingual_texts); +## Token Counting + +The VoyageAI embedding function provides a method to count tokens in your texts: + +{% TabbedCodeBlock %} + +{% Tab label="python" %} + +```python +import chromadb.utils.embedding_functions as embedding_functions + +voyageai_ef = embedding_functions.VoyageAIEmbeddingFunction( + api_key="YOUR_API_KEY", + model_name="voyage-3.5" +) + +texts = ["Short text", "This is a much longer text with more tokens"] +token_counts = voyageai_ef.count_tokens(texts) +# Returns: [2, 9] (example counts) + +# Get the token limit for the current model +token_limit = voyageai_ef.get_token_limit() +# Returns: 320000 for voyage-3.5 ``` {% /Tab %}