-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: get rid of Langchain dependency for document chunking and querying the Vector Database (#9)
* feat: add document loader * refactor: splits returns documents * feat: add text splitter * refactor: move to unstructured * chore: comment * refactor: refactor references * chore: update README.md * chore: update README.md * refactor: get rid of langchain fully * chore: update the README.md * refactor: refactored embedder and chroma client * refactor: refactored chroma client and text splitter * chore: updated todo * refactor: move vector database to memory * refactor: move vector database to memory * refactor: add Chroma unit tests * refactor: drop vector memory class * chore: update README * chore: reformat * chore: reformat * chore: bump version
- Loading branch information
1 parent
58a3e5a
commit f91e37a
Showing
28 changed files
with
1,205 additions
and
1,197 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
class Embedder:
    """Thin wrapper around a ``sentence_transformers.SentenceTransformer``
    model that embeds documents and queries into float vectors.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", cache_folder: str | None = None, **kwargs: Any):
        """
        Initialize the Embedder class with the specified parameters.

        Args:
            model_name (str): Name of the sentence-transformers model to load.
            cache_folder (str | None): Folder where the downloaded model is
                cached; ``None`` uses the library default.
            **kwargs (Any): Additional keyword arguments to pass to the
                SentenceTransformer model.
        """
        self.client = sentence_transformers.SentenceTransformer(model_name, cache_folder=cache_folder, **kwargs)

    def embed_documents(self, texts: list[str], multi_process: bool = False, **encode_kwargs: Any) -> list[list[float]]:
        """
        Compute document embeddings using a transformer model.

        Args:
            texts (list[str]): The list of texts to embed.
            multi_process (bool): If True, use multiple processes to compute embeddings.
            **encode_kwargs (Any): Additional keyword arguments to pass when
                calling the ``encode`` method of the model (ignored by the
                multi-process path, which has its own fixed signature).

        Returns:
            list[list[float]]: A list of embeddings, one for each text.
        """
        # Newlines can skew sentence-level embeddings; flatten them to spaces.
        texts = [text.replace("\n", " ") for text in texts]

        if multi_process:
            pool = self.client.start_multi_process_pool()
            try:
                embeddings = self.client.encode_multi_process(texts, pool)
            finally:
                # Always release worker processes, even if encoding fails.
                sentence_transformers.SentenceTransformer.stop_multi_process_pool(pool)
        else:
            # Default to a progress bar, but let callers override it instead of
            # crashing with a duplicate-keyword TypeError.
            encode_kwargs.setdefault("show_progress_bar", True)
            embeddings = self.client.encode(texts, **encode_kwargs)

        return embeddings.tolist()

    def embed_query(self, text: str) -> list[float]:
        """
        Compute query embeddings using a transformer model.

        Args:
            text (str): The text to embed.

        Returns:
            list[float]: Embeddings for the text.
        """
        return self.embed_documents([text])[0]
Oops, something went wrong.