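Update settings: fix the EMBEDDING_DIMS setting name, add EMBEDDING_MAX_TOKENS, and use it as the splitter chunk size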

pingcap · Aug 27, 2024 · f7a1d57 · f7a1d57
1 parent fc29ec8
commit f7a1d57
Show file tree

Hide file tree

Showing 6 changed files with 41 additions and 12 deletions.
diff --git a/.env.example b/.env.example
@@ -18,5 +18,11 @@ TIDB_PASSWORD=
 TIDB_DATABASE=
 TIDB_SSL=true
 
-# *** DO NOT CHANGE BELOW CONFIGURATIONS UNLESS YOU KNOW WHAT YOU ARE DOING
-DSP_CACHEBOOL=false
+# CAUTION: Do not change EMBEDDING_DIMS after initializing the database.
+# Changing the embedding dimensions requires recreating the database and tables.
+# The default EMBEDDING_DIMS and EMBEDDING_MAX_TOKENS are set for the OpenAI text-embedding-3-small model.
+# If using a different embedding model, adjust these values according to the model's specifications.
+# For example:
+#   maidalun1020/bce-embedding-base_v1: EMBEDDING_DIMS=768   EMBEDDING_MAX_TOKENS=512
+EMBEDDING_DIMS=1536
+EMBEDDING_MAX_TOKENS=8191
diff --git a/backend/app/core/config.py b/backend/app/core/config.py
@@ -79,8 +79,14 @@ def server_host(self) -> str:
 
     COMPLIED_INTENT_ANALYSIS_PROGRAM_PATH: str | None = None
 
-    # Currently, we only support 1536 dims for the embedding model
-    EMBEDDOMG_DIMS: int = 1536
+    # CAUTION: Do not change EMBEDDING_DIMS after initializing the database.
+    # Changing the embedding dimensions requires recreating the database and tables.
+    # The default EMBEDDING_DIMS and EMBEDDING_MAX_TOKENS are set for the OpenAI text-embedding-3-small model.
+    # If using a different embedding model, adjust these values according to the model's specifications.
+    # For example:
+    #   maidalun1020/bce-embedding-base_v1: EMBEDDING_DIMS=768   EMBEDDING_MAX_TOKENS=512
+    EMBEDDING_DIMS: int = 1536
+    EMBEDDING_MAX_TOKENS: int = 8191
 
     @computed_field  # type: ignore[misc]
     @property

diff --git a/backend/app/models/chunk.py b/backend/app/models/chunk.py
@@ -28,7 +28,9 @@ class Chunk(UUIDBaseModel, UpdatableBaseModel, table=True):
     text: str = Field(sa_column=Column(Text))
     meta: dict | list = Field(default={}, sa_column=Column(JSON))
     embedding: Any = Field(
-        sa_column=Column(VectorType(settings.EMBEDDOMG_DIMS), comment="hnsw(distance=cosine)")
+        sa_column=Column(
+            VectorType(settings.EMBEDDING_DIMS), comment="hnsw(distance=cosine)"
+        )
     )
     document_id: int = Field(foreign_key="documents.id", nullable=True)
     document: "Document" = SQLRelationship(

diff --git a/backend/app/models/knowledge_graph.py b/backend/app/models/knowledge_graph.py
@@ -36,10 +36,14 @@ class EntityBase(SQLModel):
 class Entity(EntityBase, table=True):
     id: Optional[int] = Field(default=None, primary_key=True)
     description_vec: Any = Field(
-        sa_column=Column(VectorType(settings.EMBEDDOMG_DIMS), comment="hnsw(distance=cosine)")
+        sa_column=Column(
+            VectorType(settings.EMBEDDING_DIMS), comment="hnsw(distance=cosine)"
+        )
     )
     meta_vec: Any = Field(
-        sa_column=Column(VectorType(settings.EMBEDDOMG_DIMS), comment="hnsw(distance=cosine)")
+        sa_column=Column(
+            VectorType(settings.EMBEDDING_DIMS), comment="hnsw(distance=cosine)"
+        )
     )
 
     __tablename__ = "entities"
@@ -72,7 +76,9 @@ class RelationshipBase(SQLModel):
 class Relationship(RelationshipBase, table=True):
     id: Optional[int] = Field(default=None, primary_key=True)
     description_vec: Any = Field(
-        sa_column=Column(VectorType(settings.EMBEDDOMG_DIMS), comment="hnsw(distance=cosine)")
+        sa_column=Column(
+            VectorType(settings.EMBEDDING_DIMS), comment="hnsw(distance=cosine)"
+        )
     )
     source_entity: Entity = SQLModelRelationship(
         sa_relationship_kwargs={

diff --git a/backend/app/models/semantic_cache.py b/backend/app/models/semantic_cache.py
@@ -19,11 +19,15 @@ class SemanticCache(SQLModel, table=True):
     id: Optional[int] = Field(default=None, primary_key=True)
     query: str = Field(sa_column=Column(Text))
     query_vec: Any = Field(
-        sa_column=Column(VectorType(settings.EMBEDDOMG_DIMS), comment="hnsw(distance=cosine)")
+        sa_column=Column(
+            VectorType(settings.EMBEDDING_DIMS), comment="hnsw(distance=cosine)"
+        )
     )
     value: str = Field(sa_column=Column(Text))
     value_vec: Any = Field(
-        sa_column=Column(VectorType(settings.EMBEDDOMG_DIMS), comment="hnsw(distance=cosine)")
+        sa_column=Column(
+            VectorType(settings.EMBEDDING_DIMS), comment="hnsw(distance=cosine)"
+        )
     )
     meta: List | Dict = Field(default={}, sa_column=Column(JSON))
     created_at: datetime = Field(

diff --git a/backend/app/rag/build.py b/backend/app/rag/build.py
@@ -16,6 +16,7 @@
 from app.rag.node_parser import MarkdownNodeParser
 from app.rag.vector_store.tidb_vector_store import TiDBVectorStore
 from app.rag.chat_config import get_default_embedding_model
+from app.core.config import settings
 from app.models import (
     Document as DBDocument,
     Chunk as DBChunk,
@@ -45,9 +46,13 @@ def build_vector_index_from_document(
         if db_document.mime_type.lower() == "text/markdown":
             # spliter = MarkdownNodeParser()
             # TODO: FIX MarkdownNodeParser
-            spliter = SentenceSplitter()
+            spliter = SentenceSplitter(
+                chunk_size=settings.EMBEDDING_MAX_TOKENS,
+            )
         else:
-            spliter = SentenceSplitter()
+            spliter = SentenceSplitter(
+                chunk_size=settings.EMBEDDING_MAX_TOKENS,
+            )
 
         _transformations = [
             spliter,