
Commit

feat: add quickstart.ipynb
Mini256 committed Feb 28, 2025
1 parent ca305a6 commit 6a4565b
Showing 15 changed files with 432 additions and 401 deletions.
2 changes: 1 addition & 1 deletion core/README.md
@@ -1,4 +1,4 @@
# Autoflow

-Yet another RAG framework.
+Yet another RAG framework. (WIP)

5 changes: 5 additions & 0 deletions core/autoflow/__init__.py
@@ -0,0 +1,5 @@
+from .main import Autoflow
+from .knowledge_base import KnowledgeBase
+from .llms import LLMManager
+
+__all__ = ["Autoflow", "KnowledgeBase", "LLMManager"]
57 changes: 33 additions & 24 deletions core/autoflow/knowledge_base/base.py
@@ -14,14 +14,13 @@
from autoflow.indices.knowledge_graph.base import KnowledgeGraphIndex
from autoflow.indices.knowledge_graph.extractor import KnowledgeGraphExtractor
from autoflow.indices.vector_search.base import VectorSearchIndex
+from autoflow.storage.doc_store.base import DocumentSearchMethod
from autoflow.transformers.markdown import MarkdownNodeParser
-from autoflow.schema import DataSourceKind, IndexMethod, BaseComponent
+from autoflow.schema import DataSourceType, IndexMethod, BaseComponent
from autoflow.models.chunk import get_chunk_model
from autoflow.models.entity import get_entity_model
from autoflow.models.relationship import get_relationship_model
from autoflow.knowledge_base.config import (
-    ChatModelConfig,
-    EmbeddingModelConfig,
    ChunkingMode,
    GeneralChunkingConfig,
    ChunkSplitterConfig,
@@ -31,12 +30,11 @@
    ChunkingConfig,
)
from autoflow.models.document import Document
-from autoflow.knowledge_base.datasource import get_datasource_by_kind
-from autoflow.llms import default_llm_manager, LLMManager
+from autoflow.knowledge_base.datasource import get_datasource_by_type
+from autoflow.llms import default_llm_manager, LLMManager, ChatModel, EmbeddingModel
from autoflow.storage import TiDBDocumentStore, TiDBKnowledgeGraphStore
from autoflow.storage.doc_store import (
    DocumentSearchResult,
-    DocumentSearchQuery,
)
from autoflow.storage.graph_store.base import GraphSearchAlgorithm
from autoflow.storage.schema import QueryBundle
@@ -53,8 +51,6 @@ class KnowledgeBase(BaseComponent):
    chunking_config: Optional[ChunkingConfig] = Field(
        default_factory=GeneralChunkingConfig
    )
-    chat_model_config: ChatModelConfig = Field()
-    embedding_model_config: EmbeddingModelConfig = Field()
    data_sources: List[DataSource] = Field(default_factory=list)

    def __init__(
@@ -63,29 +59,25 @@ def __init__(
        description: Optional[str] = None,
        index_methods: Optional[List[IndexMethod]] = None,
        chunking_config: Optional[ChunkingConfig] = None,
-        chat_model: ChatModelConfig = None,
-        embedding_model: EmbeddingModelConfig = None,
+        chat_model: Optional[ChatModel] = None,
+        embedding_model: Optional[EmbeddingModel] = None,
        db_engine: Engine = None,
        llm_manager: Optional[LLMManager] = None,
-        kb_id: Optional[uuid.UUID] = None,
+        id: Optional[uuid.UUID] = None,
    ):
        super().__init__(
-            id=kb_id or uuid.uuid4(),
+            id=id or uuid.uuid4(),
            name=name,
            description=description,
            index_methods=index_methods or [IndexMethod.VECTOR_SEARCH],
            chunking_config=chunking_config or GeneralChunkingConfig(),
-            chat_model_config=chat_model,
-            embedding_model_config=embedding_model,
        )
        self._db_engine = db_engine
        self._model_manager = llm_manager or default_llm_manager
-        self._chat_model = self._model_manager.resolve_chat_model(chat_model)
+        self._chat_model = chat_model
        self._dspy_lm = get_dspy_lm_by_chat_model(self._chat_model)
        self._graph_extractor = KnowledgeGraphExtractor(dspy_lm=self._dspy_lm)
-        self._embedding_model = self._model_manager.resolve_embedding_model(
-            embedding_model
-        )
+        self._embedding_model = embedding_model
        self._init_stores()
        self._vector_search_index = VectorSearchIndex(
            doc_store=self._doc_store,
@@ -149,11 +141,11 @@ def _init_stores(self):

    def import_documents_from_datasource(
        self,
-        kind: DataSourceKind,
+        type: DataSourceType,
        config: Dict[str, Any] = None,
        # TODO: Metadata Extractor
    ) -> DataSource:
-        datasource = get_datasource_by_kind(kind, config)
+        datasource = get_datasource_by_type(type, config)
        for doc in datasource.load_documents():
            doc.data_source_id = datasource.id
            doc.knowledge_base_id = self.id
@@ -162,8 +154,8 @@ def import_documents_from_datasource(
        return datasource

    def import_documents_from_files(self, files: List[Path]) -> List[Document]:
-        datasource = get_datasource_by_kind(
-            DataSourceKind.FILE, {"files": [{"path": file.as_uri()} for file in files]}
+        datasource = get_datasource_by_type(
+            DataSourceType.FILE, {"files": [{"path": file.as_uri()} for file in files]}
        )
        documents = []
        for doc in datasource.load_documents():
@@ -253,8 +245,25 @@ def _get_text_splitter(self, db_document: Document) -> TransformComponent:
            case _:
                raise ValueError(f"Unsupported chunking splitter type: {rule.splitter}")

-    def search_documents(self, query: DocumentSearchQuery) -> DocumentSearchResult:
-        return self._doc_store.search(query)
+    def search_documents(
+        self,
+        query: str,
+        search_method: Optional[List[DocumentSearchMethod]] = None,
+        top_k: Optional[int] = None,
+        similarity_threshold: Optional[float] = None,
+        similarity_nprobe: Optional[int] = None,
+        similarity_top_k: Optional[int] = 5,
+        **kwargs: Any,
+    ) -> DocumentSearchResult:
+        return self._doc_store.search(
+            query=query,
+            search_method=search_method or [DocumentSearchMethod.VECTOR_SEARCH],
+            top_k=top_k,
+            similarity_threshold=similarity_threshold,
+            similarity_nprobe=similarity_nprobe,
+            similarity_top_k=similarity_top_k,
+            **kwargs,
+        )

    def search_knowledge_graph(
        self,
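Taken together, these changes mean a knowledge base is now constructed from already-resolved model objects and queried with flat keyword arguments instead of a DocumentSearchQuery. A rough usage sketch: the constructor and search keywords come from the diff above, while the ChatModel and EmbeddingModel instances (and the connection string) are placeholders obtained elsewhere:

from pathlib import Path

from sqlalchemy import create_engine

from autoflow.knowledge_base.base import KnowledgeBase
from autoflow.storage.doc_store.base import DocumentSearchMethod

engine = create_engine("mysql+pymysql://user:pass@localhost:4000/test")  # placeholder DSN

kb = KnowledgeBase(
    name="quickstart",
    chat_model=chat_model,            # a ChatModel instance, resolved elsewhere
    embedding_model=embedding_model,  # an EmbeddingModel instance, resolved elsewhere
    db_engine=engine,
)
kb.import_documents_from_files([Path("docs/intro.md")])
result = kb.search_documents(
    query="What is Autoflow?",
    search_method=[DocumentSearchMethod.VECTOR_SEARCH],
    similarity_top_k=5,
)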
12 changes: 6 additions & 6 deletions core/autoflow/knowledge_base/datasource.py
@@ -6,15 +6,15 @@
    WebSitemapDataSource,
    WebSinglePageDataSource,
)
-from autoflow.schema import DataSourceKind
+from autoflow.schema import DataSourceType


-def get_datasource_by_kind(kind: DataSourceKind, config: Any) -> DataSource:
-    if kind == DataSourceKind.FILE:
+def get_datasource_by_type(type: DataSourceType, config: Any) -> DataSource:
+    if type == DataSourceType.FILE:
        return FileDataSource(config)
-    elif kind == DataSourceKind.WEB_SITEMAP:
+    elif type == DataSourceType.WEB_SITEMAP:
        return WebSitemapDataSource(config)
-    elif kind == DataSourceKind.WEB_SINGLE_PAGE:
+    elif type == DataSourceType.WEB_SINGLE_PAGE:
        return WebSinglePageDataSource(config)
    else:
-        raise ValueError(f"Unknown datasource kind: {kind}")
+        raise ValueError(f"Unknown datasource type: {type}")
15 changes: 9 additions & 6 deletions core/autoflow/main.py
@@ -5,7 +5,10 @@
from sqlmodel import SQLModel

from autoflow.knowledge_base import KnowledgeBase
-from autoflow.llms import EmbeddingModelConfig, ChatModelConfig
+from autoflow.llms import (
+    ChatModel,
+    EmbeddingModel,
+)
from autoflow.schema import IndexMethod
from autoflow.llms import LLMManager, default_llm_manager

@@ -29,16 +32,16 @@ def db_engine(self) -> Engine:
        return self._db_engine

    @property
-    def model_manager(self) -> LLMManager:
+    def llm_manager(self) -> LLMManager:
        return self._model_manager

-    def crate_knowledge_base(
+    def create_knowledge_base(
        self,
        name: str,
+        chat_model: ChatModel,
+        embedding_model: EmbeddingModel,
        description: Optional[str] = None,
        index_methods: Optional[List[IndexMethod]] = None,
-        chat_model: ChatModelConfig = None,
-        embedding_model: EmbeddingModelConfig = None,
        kb_id: Optional[uuid.UUID] = None,
    ) -> KnowledgeBase:
        return KnowledgeBase(
@@ -47,6 +50,6 @@ def crate_knowledge_base(
            index_methods=index_methods,
            chat_model=chat_model,
            embedding_model=embedding_model,
-            kb_id=kb_id,
+            id=kb_id,
            db_engine=self._db_engine,
        )
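After the rename, chat_model and embedding_model are required parameters rather than optional config objects. A sketch of the corrected call; only create_knowledge_base()'s signature is taken from the diff, the surrounding setup is assumed:

af = Autoflow(db_engine=engine)  # constructor arguments assumed
kb = af.create_knowledge_base(
    name="quickstart",
    chat_model=chat_model,            # resolved ChatModel, obtained elsewhere
    embedding_model=embedding_model,  # resolved EmbeddingModel, obtained elsewhere
)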
2 changes: 1 addition & 1 deletion core/autoflow/schema.py
@@ -3,7 +3,7 @@
from llama_index.core.schema import BaseComponent


-class DataSourceKind(str, Enum):
+class DataSourceType(str, Enum):
    FILE = "file"
    WEB_SITEMAP = "web_sitemap"
    WEB_SINGLE_PAGE = "web_single_page"
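Since the renamed enum still subclasses str, its members compare equal to their wire values and can be parsed from plain strings:

from autoflow.schema import DataSourceType

assert DataSourceType("web_sitemap") is DataSourceType.WEB_SITEMAP
assert DataSourceType.FILE == "file"  # str subclass compares equal to its value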
21 changes: 16 additions & 5 deletions core/autoflow/storage/doc_store/tidb/tidb_doc_store.py
@@ -18,11 +18,11 @@
from autoflow.models.document import Document
from autoflow.storage.doc_store.base import (
    DocumentStore,
-    DocumentSearchQuery,
    DocumentSearchResult,
    ChunkWithScore,
    D,
    C,
+    DocumentSearchMethod,
)


@@ -176,14 +176,25 @@ def add_doc_chunks(self, chunks: List[C]) -> List[C]:
        return chunks

    # TODO: call the low-level database API.
-    def search(self, query: DocumentSearchQuery, **kwargs: Any) -> DocumentSearchResult:
+    def search(
+        self,
+        query: str,
+        search_method: List[DocumentSearchMethod] = [
+            DocumentSearchMethod.VECTOR_SEARCH
+        ],
+        top_k: Optional[int] = None,
+        similarity_threshold: Optional[float] = None,
+        similarity_nprobe: Optional[int] = None,
+        similarity_top_k: Optional[int] = 5,
+        **kwargs: Any,
+    ) -> DocumentSearchResult:
        # TODO: Support Hybrid search.
        with self._session_scope() as db_session:
            chunks_with_score = self._vector_search(
-                query=query.query_str,
+                query=query,
                # metadata_filters=query.metadata_filters,
-                nprobe=query.similarity_nprobe,
-                similarity_top_k=query.similarity_top_k,
+                nprobe=similarity_nprobe,
+                similarity_top_k=similarity_top_k,
                db_session=db_session,
            )
            # chunks_with_score = self._rerank_chunks(chunks_with_score)
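The store-level search now mirrors the knowledge-base wrapper one-to-one. A direct-call sketch against an already-constructed store; only the keyword names come from this diff, and the nprobe value is an arbitrary illustration:

# doc_store: an initialized TiDBDocumentStore (setup elided)
result = doc_store.search(
    query="vector search in TiDB",
    search_method=[DocumentSearchMethod.VECTOR_SEARCH],
    similarity_top_k=5,
    similarity_nprobe=16,  # illustrative; tunes how many index partitions are probed
)
print(result)  # a DocumentSearchResult; its fields are not shown in this diff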
4 changes: 2 additions & 2 deletions core/autoflow/storage/graph_store/tidb/tidb_graph_store.py
@@ -929,14 +929,14 @@ def search_similar_relationships(
        ).params(path=json_path, value=json.dumps(value))

        # Debug: Print the SQL query
-        # """
+        """
        from sqlalchemy.dialects import mysql
        compiled_query = query.compile(
            dialect=mysql.dialect(),
        )
        logger.info(f"Debug - SQL Query: \n{compiled_query}")
-        # """
+        """

        with self._session_scope(db_session) as session:
            rows = session.exec(query).all()
2 changes: 1 addition & 1 deletion core/autoflow/utils/dspy_lm.py
@@ -7,5 +7,5 @@ def get_dspy_lm_by_chat_model(chat_model: ChatModel) -> dspy.LM:
    return dspy.LM(
        model=chat_model.model,
        max_tokens=chat_model.max_tokens,
-        api_key=chat_model.additional_kwargs["api_key"],
+        **chat_model.additional_kwargs,
    )
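Forwarding the whole additional_kwargs dict lets provider settings beyond api_key (api_base, organization, and so on) reach dspy.LM. A sketch, assuming ChatModel exposes exactly the fields the helper reads:

from autoflow.llms import ChatModel
from autoflow.utils.dspy_lm import get_dspy_lm_by_chat_model

# The ChatModel construction below is an assumption; only the attribute
# names (model, max_tokens, additional_kwargs) appear in the diff.
chat_model = ChatModel(
    model="openai/gpt-4o-mini",
    max_tokens=2048,
    additional_kwargs={"api_key": "sk-...", "api_base": "https://api.example.com/v1"},
)
lm = get_dspy_lm_by_chat_model(chat_model)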
12 changes: 2 additions & 10 deletions core/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "autoflow-ai"
version = "0.0.1"
version = "0.0.1.dev4"
description = "Yet another RAG framework."
authors = [
{ name = "Mini256", email = "[email protected]" }
@@ -21,23 +21,15 @@ dependencies = [
    "dspy>=2.6.6",
    "pytest>=8.3.4",
    "pymysql>=1.1.1",
-    "fastapi-pagination>=0.12.34",
-    "deepdiff>=8.2.0",
-    "tokenizers>=0.21.0",
-    "langchain>=0.3.19",
-    "langchain-community>=0.3.18",
-    "langchain-openai>=0.3.7",
    "unstructured[md]>=0.16.23",
    "nltk>=3.9.1",
    "libmagic>=1.0",
    "playwright>=1.50.0",
    "pydantic-extra-types>=2.10.2",
]
readme = "README.md"
requires-python = ">= 3.8"

[build-system]
-requires = ["hatchling"]
+requires = ["hatchling==1.26.3"]
build-backend = "hatchling.build"
