
Commit

feat: add quickstart.ipynb
Mini256 committed Feb 28, 2025
1 parent ca305a6 commit 6a4565b
Showing 15 changed files with 432 additions and 401 deletions.
2 changes: 1 addition & 1 deletion core/README.md
@@ -1,4 +1,4 @@
# Autoflow

-Yet another RAG framework.
+Yet another RAG framework. (WIP)

5 changes: 5 additions & 0 deletions core/autoflow/__init__.py
@@ -0,0 +1,5 @@
+from .main import Autoflow
+from .knowledge_base import KnowledgeBase
+from .llms import LLMManager
+
+__all__ = ["Autoflow", "KnowledgeBase", "LLMManager"]
57 changes: 33 additions & 24 deletions core/autoflow/knowledge_base/base.py
@@ -14,14 +14,13 @@
from autoflow.indices.knowledge_graph.base import KnowledgeGraphIndex
from autoflow.indices.knowledge_graph.extractor import KnowledgeGraphExtractor
from autoflow.indices.vector_search.base import VectorSearchIndex
+from autoflow.storage.doc_store.base import DocumentSearchMethod
from autoflow.transformers.markdown import MarkdownNodeParser
-from autoflow.schema import DataSourceKind, IndexMethod, BaseComponent
+from autoflow.schema import DataSourceType, IndexMethod, BaseComponent
from autoflow.models.chunk import get_chunk_model
from autoflow.models.entity import get_entity_model
from autoflow.models.relationship import get_relationship_model
from autoflow.knowledge_base.config import (
-    ChatModelConfig,
-    EmbeddingModelConfig,
    ChunkingMode,
    GeneralChunkingConfig,
    ChunkSplitterConfig,
@@ -31,12 +30,11 @@
    ChunkingConfig,
)
from autoflow.models.document import Document
-from autoflow.knowledge_base.datasource import get_datasource_by_kind
-from autoflow.llms import default_llm_manager, LLMManager
+from autoflow.knowledge_base.datasource import get_datasource_by_type
+from autoflow.llms import default_llm_manager, LLMManager, ChatModel, EmbeddingModel
from autoflow.storage import TiDBDocumentStore, TiDBKnowledgeGraphStore
from autoflow.storage.doc_store import (
    DocumentSearchResult,
-    DocumentSearchQuery,
)
from autoflow.storage.graph_store.base import GraphSearchAlgorithm
from autoflow.storage.schema import QueryBundle
@@ -53,8 +51,6 @@ class KnowledgeBase(BaseComponent):
    chunking_config: Optional[ChunkingConfig] = Field(
        default_factory=GeneralChunkingConfig
    )
-    chat_model_config: ChatModelConfig = Field()
-    embedding_model_config: EmbeddingModelConfig = Field()
    data_sources: List[DataSource] = Field(default_factory=list)

    def __init__(
@@ -63,29 +59,25 @@ def __init__(
        description: Optional[str] = None,
        index_methods: Optional[List[IndexMethod]] = None,
        chunking_config: Optional[ChunkingConfig] = None,
-        chat_model: ChatModelConfig = None,
-        embedding_model: EmbeddingModelConfig = None,
+        chat_model: Optional[ChatModel] = None,
+        embedding_model: Optional[EmbeddingModel] = None,
        db_engine: Engine = None,
        llm_manager: Optional[LLMManager] = None,
-        kb_id: Optional[uuid.UUID] = None,
+        id: Optional[uuid.UUID] = None,
    ):
        super().__init__(
-            id=kb_id or uuid.uuid4(),
+            id=id or uuid.uuid4(),
            name=name,
            description=description,
            index_methods=index_methods or [IndexMethod.VECTOR_SEARCH],
            chunking_config=chunking_config or GeneralChunkingConfig(),
-            chat_model_config=chat_model,
-            embedding_model_config=embedding_model,
        )
        self._db_engine = db_engine
        self._model_manager = llm_manager or default_llm_manager
-        self._chat_model = self._model_manager.resolve_chat_model(chat_model)
+        self._chat_model = chat_model
        self._dspy_lm = get_dspy_lm_by_chat_model(self._chat_model)
        self._graph_extractor = KnowledgeGraphExtractor(dspy_lm=self._dspy_lm)
-        self._embedding_model = self._model_manager.resolve_embedding_model(
-            embedding_model
-        )
+        self._embedding_model = embedding_model
        self._init_stores()
        self._vector_search_index = VectorSearchIndex(
            doc_store=self._doc_store,
@@ -149,11 +141,11 @@ def _init_stores(self):

    def import_documents_from_datasource(
        self,
-        kind: DataSourceKind,
+        type: DataSourceType,
        config: Dict[str, Any] = None,
        # TODO: Metadata Extractor
    ) -> DataSource:
-        datasource = get_datasource_by_kind(kind, config)
+        datasource = get_datasource_by_type(type, config)
        for doc in datasource.load_documents():
            doc.data_source_id = datasource.id
            doc.knowledge_base_id = self.id
@@ -162,8 +154,8 @@ def import_documents_from_datasource(
        return datasource

    def import_documents_from_files(self, files: List[Path]) -> List[Document]:
-        datasource = get_datasource_by_kind(
-            DataSourceKind.FILE, {"files": [{"path": file.as_uri()} for file in files]}
+        datasource = get_datasource_by_type(
+            DataSourceType.FILE, {"files": [{"path": file.as_uri()} for file in files]}
        )
        documents = []
        for doc in datasource.load_documents():
@@ -253,8 +245,25 @@ def _get_text_splitter(self, db_document: Document) -> TransformComponent:
            case _:
                raise ValueError(f"Unsupported chunking splitter type: {rule.splitter}")

-    def search_documents(self, query: DocumentSearchQuery) -> DocumentSearchResult:
-        return self._doc_store.search(query)
+    def search_documents(
+        self,
+        query: str,
+        search_method: Optional[List[DocumentSearchMethod]] = None,
+        top_k: Optional[int] = None,
+        similarity_threshold: Optional[float] = None,
+        similarity_nprobe: Optional[int] = None,
+        similarity_top_k: Optional[int] = 5,
+        **kwargs: Any,
+    ) -> DocumentSearchResult:
+        return self._doc_store.search(
+            query=query,
+            search_method=search_method or [DocumentSearchMethod.VECTOR_SEARCH],
+            top_k=top_k,
+            similarity_threshold=similarity_threshold,
+            similarity_nprobe=similarity_nprobe,
+            similarity_top_k=similarity_top_k,
+            **kwargs,
+        )

    def search_knowledge_graph(
        self,
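Taken together, these changes mean a knowledge base is now constructed from already-resolved model objects and queried with flat keyword arguments instead of a DocumentSearchQuery. A rough usage sketch: the constructor and search keywords come from the diff above, while the ChatModel and EmbeddingModel instances (and the connection string) are placeholders obtained elsewhere:

from pathlib import Path

from sqlalchemy import create_engine

from autoflow.knowledge_base.base import KnowledgeBase
from autoflow.storage.doc_store.base import DocumentSearchMethod

engine = create_engine("mysql+pymysql://user:pass@localhost:4000/test")  # placeholder DSN

kb = KnowledgeBase(
    name="quickstart",
    chat_model=chat_model,            # a ChatModel instance, resolved elsewhere
    embedding_model=embedding_model,  # an EmbeddingModel instance, resolved elsewhere
    db_engine=engine,
)
kb.import_documents_from_files([Path("docs/intro.md")])
result = kb.search_documents(
    query="What is Autoflow?",
    search_method=[DocumentSearchMethod.VECTOR_SEARCH],
    similarity_top_k=5,
)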
12 changes: 6 additions & 6 deletions core/autoflow/knowledge_base/datasource.py
@@ -6,15 +6,15 @@
    WebSitemapDataSource,
    WebSinglePageDataSource,
)
-from autoflow.schema import DataSourceKind
+from autoflow.schema import DataSourceType


-def get_datasource_by_kind(kind: DataSourceKind, config: Any) -> DataSource:
-    if kind == DataSourceKind.FILE:
+def get_datasource_by_type(type: DataSourceType, config: Any) -> DataSource:
+    if type == DataSourceType.FILE:
        return FileDataSource(config)
-    elif kind == DataSourceKind.WEB_SITEMAP:
+    elif type == DataSourceType.WEB_SITEMAP:
        return WebSitemapDataSource(config)
-    elif kind == DataSourceKind.WEB_SINGLE_PAGE:
+    elif type == DataSourceType.WEB_SINGLE_PAGE:
        return WebSinglePageDataSource(config)
    else:
-        raise ValueError(f"Unknown datasource kind: {kind}")
+        raise ValueError(f"Unknown datasource type: {type}")
15 changes: 9 additions & 6 deletions core/autoflow/main.py
@@ -5,7 +5,10 @@
from sqlmodel import SQLModel

from autoflow.knowledge_base import KnowledgeBase
-from autoflow.llms import EmbeddingModelConfig, ChatModelConfig
+from autoflow.llms import (
+    ChatModel,
+    EmbeddingModel,
+)
from autoflow.schema import IndexMethod
from autoflow.llms import LLMManager, default_llm_manager

@@ -29,16 +32,16 @@ def db_engine(self) -> Engine:
        return self._db_engine

    @property
-    def model_manager(self) -> LLMManager:
+    def llm_manager(self) -> LLMManager:
        return self._model_manager

-    def crate_knowledge_base(
+    def create_knowledge_base(
        self,
        name: str,
+        chat_model: ChatModel,
+        embedding_model: EmbeddingModel,
        description: Optional[str] = None,
        index_methods: Optional[List[IndexMethod]] = None,
-        chat_model: ChatModelConfig = None,
-        embedding_model: EmbeddingModelConfig = None,
        kb_id: Optional[uuid.UUID] = None,
    ) -> KnowledgeBase:
        return KnowledgeBase(
@@ -47,6 +50,6 @@ def crate_knowledge_base(
            index_methods=index_methods,
            chat_model=chat_model,
            embedding_model=embedding_model,
-            kb_id=kb_id,
+            id=kb_id,
            db_engine=self._db_engine,
        )
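After the rename, chat_model and embedding_model are required parameters rather than optional config objects. A sketch of the corrected call; only create_knowledge_base()'s signature is taken from the diff, the surrounding setup is assumed:

af = Autoflow(db_engine=engine)  # constructor arguments assumed
kb = af.create_knowledge_base(
    name="quickstart",
    chat_model=chat_model,            # resolved ChatModel, obtained elsewhere
    embedding_model=embedding_model,  # resolved EmbeddingModel, obtained elsewhere
)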
2 changes: 1 addition & 1 deletion core/autoflow/schema.py
@@ -3,7 +3,7 @@
from llama_index.core.schema import BaseComponent


-class DataSourceKind(str, Enum):
+class DataSourceType(str, Enum):
    FILE = "file"
    WEB_SITEMAP = "web_sitemap"
    WEB_SINGLE_PAGE = "web_single_page"
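Since the renamed enum still subclasses str, its members compare equal to their wire values and can be parsed from plain strings:

from autoflow.schema import DataSourceType

assert DataSourceType("web_sitemap") is DataSourceType.WEB_SITEMAP
assert DataSourceType.FILE == "file"  # str subclass compares equal to its value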
21 changes: 16 additions & 5 deletions core/autoflow/storage/doc_store/tidb/tidb_doc_store.py
@@ -18,11 +18,11 @@
from autoflow.models.document import Document
from autoflow.storage.doc_store.base import (
    DocumentStore,
-    DocumentSearchQuery,
    DocumentSearchResult,
    ChunkWithScore,
    D,
    C,
+    DocumentSearchMethod,
)


@@ -176,14 +176,25 @@ def add_doc_chunks(self, chunks: List[C]) -> List[C]:
        return chunks

    # TODO: call the low-level database API.
-    def search(self, query: DocumentSearchQuery, **kwargs: Any) -> DocumentSearchResult:
+    def search(
+        self,
+        query: str,
+        search_method: List[DocumentSearchMethod] = [
+            DocumentSearchMethod.VECTOR_SEARCH
+        ],
+        top_k: Optional[int] = None,
+        similarity_threshold: Optional[float] = None,
+        similarity_nprobe: Optional[int] = None,
+        similarity_top_k: Optional[int] = 5,
+        **kwargs: Any,
+    ) -> DocumentSearchResult:
        # TODO: Support Hybrid search.
        with self._session_scope() as db_session:
            chunks_with_score = self._vector_search(
-                query=query.query_str,
+                query=query,
                # metadata_filters=query.metadata_filters,
-                nprobe=query.similarity_nprobe,
-                similarity_top_k=query.similarity_top_k,
+                nprobe=similarity_nprobe,
+                similarity_top_k=similarity_top_k,
                db_session=db_session,
            )
            # chunks_with_score = self._rerank_chunks(chunks_with_score)
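The store-level search now mirrors the knowledge-base wrapper one-to-one. A direct-call sketch against an already-constructed store; only the keyword names come from this diff, and the nprobe value is an arbitrary illustration:

# doc_store: an initialized TiDBDocumentStore (setup elided)
result = doc_store.search(
    query="vector search in TiDB",
    search_method=[DocumentSearchMethod.VECTOR_SEARCH],
    similarity_top_k=5,
    similarity_nprobe=16,  # illustrative; tunes how many index partitions are probed
)
print(result)  # a DocumentSearchResult; its fields are not shown in this diff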
4 changes: 2 additions & 2 deletions core/autoflow/storage/graph_store/tidb/tidb_graph_store.py
@@ -929,14 +929,14 @@ def search_similar_relationships(
        ).params(path=json_path, value=json.dumps(value))

        # Debug: Print the SQL query
-        # """
+        """
        from sqlalchemy.dialects import mysql
        compiled_query = query.compile(
            dialect=mysql.dialect(),
        )
        logger.info(f"Debug - SQL Query: \n{compiled_query}")
-        # """
+        """

        with self._session_scope(db_session) as session:
            rows = session.exec(query).all()
2 changes: 1 addition & 1 deletion core/autoflow/utils/dspy_lm.py
@@ -7,5 +7,5 @@ def get_dspy_lm_by_chat_model(chat_model: ChatModel) -> dspy.LM:
    return dspy.LM(
        model=chat_model.model,
        max_tokens=chat_model.max_tokens,
-        api_key=chat_model.additional_kwargs["api_key"],
+        **chat_model.additional_kwargs,
    )
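Forwarding the whole additional_kwargs dict lets provider settings beyond api_key (api_base, organization, and so on) reach dspy.LM. A sketch, assuming ChatModel exposes exactly the fields the helper reads:

from autoflow.llms import ChatModel
from autoflow.utils.dspy_lm import get_dspy_lm_by_chat_model

# The ChatModel construction below is an assumption; only the attribute
# names (model, max_tokens, additional_kwargs) appear in the diff.
chat_model = ChatModel(
    model="openai/gpt-4o-mini",
    max_tokens=2048,
    additional_kwargs={"api_key": "sk-...", "api_base": "https://api.example.com/v1"},
)
lm = get_dspy_lm_by_chat_model(chat_model)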
12 changes: 2 additions & 10 deletions core/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "autoflow-ai"
version = "0.0.1"
version = "0.0.1.dev4"
description = "Yet another RAG framework."
authors = [
{ name = "Mini256", email = "[email protected]" }
@@ -21,23 +21,15 @@ dependencies = [
    "dspy>=2.6.6",
    "pytest>=8.3.4",
    "pymysql>=1.1.1",
-    "fastapi-pagination>=0.12.34",
-    "deepdiff>=8.2.0",
-    "tokenizers>=0.21.0",
-    "langchain>=0.3.19",
-    "langchain-community>=0.3.18",
-    "langchain-openai>=0.3.7",
    "unstructured[md]>=0.16.23",
    "nltk>=3.9.1",
    "libmagic>=1.0",
    "playwright>=1.50.0",
    "pydantic-extra-types>=2.10.2",
]
readme = "README.md"
requires-python = ">= 3.8"

[build-system]
-requires = ["hatchling"]
+requires = ["hatchling==1.26.3"]
build-backend = "hatchling.build"
