Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dimension support #72

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion pinecone_text/dense/base_dense_ecoder.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,18 @@
from typing import List, Union
from typing import List, Union, Any, Optional
from abc import ABC, abstractmethod
from functools import cached_property


class BaseDenseEncoder(ABC):
def __init__(self, *, dimension: Optional[int] = None, **kwargs: Any):
self._dimension = dimension
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that if we put dimension as optional param for the base class, we should either allow it in all the child classes or not in the base at all. Right now it seems we only going to allow it for OpenAI, so maybe better to not put it in the base class? WDYT?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to have dimension accessible to all the child classes since it will be weird if only OpenAI exposes. You are right in the fact that in the base init we should not receive this parameter. I create a private property with a default now and override in OpenAI now. I only define as an initialization parameter in OpenAI one.


@cached_property
def dimension(self) -> int:
if self._dimension is None:
return len(self.encode_documents("hello"))
return self._dimension

@abstractmethod
def encode_documents(
self, texts: Union[str, List[str]]
Expand Down
1 change: 1 addition & 0 deletions pinecone_text/dense/cohere_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def __init__(
+ "\n"
+ "\n".join([f"- {x}" for x in CohereEncoderName.list_models()])
)
super().__init__()
self._model_name = model_name
self._client = cohere.Client(api_key=api_key, **kwargs)

Expand Down
1 change: 1 addition & 0 deletions pinecone_text/dense/jina_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def __init__(
raise ValueError(
"JinaEncoder requires an API key to work. Please provide `api_key` argument or set `JINA_API_KEY` environment variable"
)
super().__init__()
self._model_name = model_name
self._session = requests.Session()
self._session.headers.update(
Expand Down
14 changes: 11 additions & 3 deletions pinecone_text/dense/openai_encoder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from typing import Union, List, Any, Optional
from typing import Union, List, Any, Optional, Dict
from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder

try:
Expand Down Expand Up @@ -38,6 +38,8 @@ class OpenAIEncoder(BaseDenseEncoder):
def __init__(
self,
model_name: str = "text-embedding-ada-002",
*,
dimension: Optional[int] = None,
**kwargs: Any,
):
if not _openai_installed:
Expand All @@ -46,7 +48,9 @@ def __init__(
"dependencies by running: "
"`pip install pinecone-text[openai]"
)
super().__init__(dimension=dimension)
self._model_name = model_name
self._dimension = dimension
self._client = self._create_client(**kwargs)

@staticmethod
Expand Down Expand Up @@ -76,9 +80,13 @@ def _encode(
)

try:
response = self._client.embeddings.create(
input=texts_input, model=self._model_name
params: Dict[str, Any] = dict(
input=texts_input,
model=self._model_name,
)
if self._dimension:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

either we should have here self._dimension is not None or better just assert on input that dimension > 0, right now if someone pass 0 it will work with OpenAI default which is kind of weird

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right fixing

params["dimensions"] = self._dimension
response = self._client.embeddings.create(**params)
except OpenAIError as e:
# TODO: consider wrapping external provider errors
raise e
Expand Down
2 changes: 1 addition & 1 deletion pinecone_text/dense/sentence_transformer_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def __init__(
"Failed to import sentence transformers. Make sure you install dense "
"extra dependencies by running: `pip install pinecone-text[dense]`"
)

super().__init__()
device = device or ("cuda" if torch.cuda.is_available() else "cpu")
self.document_encoder = SentenceTransformer(
document_encoder_name, device=device
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ numpy = [
]
requests = "^2.25.0"
types-requests = "^2.25.0"
python-dotenv = "^1.0.1"


[tool.poetry.extras]
Expand All @@ -39,6 +40,7 @@ mypy = "^1.0.1"
black = "^23.1.0"
pytest-cov = "^4.0.0"
pdoc = "^13.0.0"
pytest-dotenv = "^0.5.2"

[build-system]
requires = ["poetry-core"]
Expand Down
13 changes: 12 additions & 1 deletion tests/system/test_openai_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from pinecone_text.dense import OpenAIEncoder, AzureOpenAIEncoder
from openai import BadRequestError, AuthenticationError


DEFAULT_DIMENSION = 1536


Expand Down Expand Up @@ -87,3 +86,15 @@ def test_encode_invalid_input(openai_encoder, encoding_function):
def test_encode_error_handling(openai_encoder, encoding_function):
with pytest.raises(BadRequestError):
encode_by_type(openai_encoder, encoding_function, "text is too long" * 10000)


def test_dimension(openai_encoder):
assert openai_encoder.dimension == DEFAULT_DIMENSION


def test_openai_encoder_with_dimension():
dimension = 10
encoder = OpenAIEncoder(model_name="text-embedding-3-small", dimension=dimension)
assert encoder.dimension == dimension
result = encoder.encode_documents("test text")
assert len(result) == dimension
Loading