Commit eca97a9

refactor(cli): create subdir for helpers
1 parent 270bce6 commit eca97a9

File tree

11 files changed: +175 -31 lines changed

.github/workflows/test.yml

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,8 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+      - name: Install git
+        run: apt-get update && apt-get install -y git
       - name: Run pytest
         run: uv run poe test
       - name: Upload test coverage

grimoire.yaml

Lines changed: 1 addition & 3 deletions
@@ -10,8 +10,6 @@ llm:
   text_chunk_overlap: 128
   code_chunk_size: 512
   code_chunk_overlap: 128
-docs:
+sources:
   - url: https://github.com/numpy/numpy
   - url: https://github.com/pandas-dev/pandas
-code:
-  - url: https://github.com/pandas-dev/pandas
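With the separate docs: and code: sections folded into a single sources: list, every repository is declared once while the chunking settings stay under llm:. A minimal sketch of what the new shape parses to, assuming PyYAML is importable in the project environment (an assumption; the diff does not show it as a direct dependency):

import yaml  # assumption: PyYAML is present in the environment

raw = yaml.safe_load(
    """
sources:
  - url: https://github.com/numpy/numpy
  - url: https://github.com/pandas-dev/pandas
"""
)

# Each entry is a plain mapping with a single `url` key; there is no longer
# a docs-versus-code distinction at this level.
for entry in raw["sources"]:
    print(entry["url"])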

grimoire/ask.py

Lines changed: 3 additions & 9 deletions
@@ -1,10 +1,10 @@
 import os
 
 import typer
-from langchain.chat_models import init_chat_model
 from langchain.chat_models.base import BaseChatModel
 
-from grimoire.helpers import blue_text, red_text
+from grimoire.helpers.rag import setup_llm
+from grimoire.helpers.typer import blue_text, red_text
 
 ask_cli = typer.Typer()
 
@@ -20,13 +20,7 @@ def get_llm_client() -> BaseChatModel:
         raise typer.Exit(code=1)
 
     # TODO: add configuration options for other models and options for model temperature, etc.
-    return init_chat_model(
-        "google_genai:gemini-2.0-flash",
-        api_key=os.getenv("LLM_API_KEY"),
-        configurable_fields=None,
-        max_tokens=512,
-        temperature=0,
-    )
+    return setup_llm()
 
 
 @ask_cli.command("ask", help="Ask a question with project context")
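get_llm_client now delegates model construction to the shared setup_llm() helper instead of calling init_chat_model inline, so the ask command and any future commands build the same Gemini client. A minimal usage sketch, assuming LLM_API_KEY is exported and the Google GenAI extras from pyproject.toml are installed:

from grimoire.helpers.rag import setup_llm

# setup_llm() reads LLM_API_KEY from the environment and returns a LangChain
# BaseChatModel, so the standard chat interface applies.
llm = setup_llm()
response = llm.invoke("In one sentence, what is pandas?")
print(response.content)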

grimoire/configuration.py

Lines changed: 2 additions & 7 deletions
@@ -35,20 +35,15 @@ class DBConfiguration(BaseModel):
     password: str
 
 
-class CodeSource(BaseModel):
-    url: str
-
-
-class DocumentSource(BaseModel):
+class Source(BaseModel):
     url: str
 
 
 class ProjectConfiguration(BaseModel):
     name: str
     db: DBConfiguration
     llm: LLMConfiguration
-    docs: list[DocumentSource] | None = None
-    code: list[CodeSource] | None = None
+    sources: list[Source] | None = None
 
     @classmethod
     def load_from_yaml(cls, file_path: Path) -> "ProjectConfiguration":  # noqa: B008
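Collapsing CodeSource and DocumentSource into one Source model means callers deal with a single optional sources list. A minimal sketch of reading it back, assuming the working directory holds a grimoire.yaml and that CONFIG_FILE_NAME still resolves to that file:

from pathlib import Path

from grimoire.configuration import CONFIG_FILE_NAME, ProjectConfiguration

config = ProjectConfiguration.load_from_yaml(Path.cwd() / CONFIG_FILE_NAME)

# `sources` defaults to None, so guard before iterating.
for source in config.sources or []:
    print(source.url)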

grimoire/helpers/__init__.py

Whitespace-only changes.

grimoire/helpers/rag.py

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+import os
+
+import torch
+from langchain.chat_models import init_chat_model
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.vectorstores import VectorStore
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_postgres import PGVector
+
+from grimoire.configuration import DBConfiguration
+
+
+def vectorstore_connection(db: DBConfiguration) -> str:
+    return PGVector.connection_string_from_db_params(
+        driver="psycopg",
+        host=db.host,
+        port=db.port,
+        database="postgres",  # TODO: make this configurable
+        user=db.user,
+        password=db.password,
+    )
+
+
+def embeddings() -> HuggingFaceEmbeddings:
+    device = (
+        "cuda"
+        if torch.cuda.is_available()
+        else "mps"
+        if torch.backends.mps.is_available()
+        else "cpu"
+    )
+
+    # https://huggingface.co/BAAI/bge-m3?library=sentence-transformers
+    # https://python.langchain.com/api_reference/huggingface/embeddings/langchain_huggingface.embeddings.huggingface.HuggingFaceEmbeddings.html
+    return HuggingFaceEmbeddings(
+        model_name="BAAI/bge-m3",
+        model_kwargs={"device": device},
+        encode_kwargs={"normalize_embeddings": True},
+    )
+
+
+def setup_vectorstore(collection: str, connection: str) -> VectorStore | None:
+    try:
+        return PGVector(
+            collection_name=collection,
+            connection=connection,
+            embeddings=embeddings(),
+        )
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        print(f"Error: {e}")
+        return None
+
+
+def clear_collection(collection: str, connection: str) -> None:
+    try:
+        PGVector(
+            connection=connection,
+            embeddings=embeddings(),
+            collection_name=collection,
+        ).delete_collection()
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        print(f"Error: {e}")
+
+
+def delete_vectorstore(connection: str) -> None:
+    try:
+        PGVector(
+            connection=connection,
+            embeddings=embeddings(),
+        ).drop_tables()
+    except Exception as e:  # pylint: disable=broad-exception-caught
+        print(f"Error: {e}")
+
+
+def setup_llm() -> BaseChatModel:
+    return init_chat_model(
+        "google_genai:gemini-2.0-flash",
+        api_key=os.getenv("LLM_API_KEY"),
+        configurable_fields=None,
+        max_tokens=512,
+        temperature=0,
+    )
+
+
+def text_ingestion(
+    collection: str, text_chunk_size: int, text_chunk_overlap: int
+) -> None:
+    pass
+
+
+def code_ingestion(
+    collection: str, code_chunk_size: int, code_chunk_overlap: int
+) -> None:
+    pass
File renamed without changes.
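
The new grimoire.helpers.rag module above gathers the PGVector and LLM plumbing in one place: vectorstore_connection builds a psycopg connection string from DBConfiguration, embeddings picks a device and loads BAAI/bge-m3, and setup_vectorstore / setup_llm hand back ready-to-use LangChain objects (or print the error and return None on failure). A rough wiring sketch; the DBConfiguration field values and the collection name are hypothetical, and the exact field set is inferred from how vectorstore_connection reads it:

from grimoire.configuration import DBConfiguration
from grimoire.helpers.rag import setup_llm, setup_vectorstore, vectorstore_connection

# Hypothetical local credentials; the real values come from grimoire.yaml.
db = DBConfiguration(host="localhost", port=5432, user="postgres", password="postgres")

connection = vectorstore_connection(db)
store = setup_vectorstore("grimoire", connection)  # may be None if setup fails

if store is not None:
    hits = store.similarity_search("how are text chunks sized?", k=4)
    llm = setup_llm()  # requires LLM_API_KEY in the environment
    print(len(hits), type(llm).__name__)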

grimoire/init.py

Lines changed: 5 additions & 11 deletions
@@ -4,22 +4,17 @@
 
 from grimoire.configuration import (
     CONFIG_FILE_NAME,
-    CodeSource,
     DBConfiguration,
-    DocumentSource,
     LLMConfiguration,
     ProjectConfiguration,
+    Source,
 )
 
 init_cli = typer.Typer()
 
-DUMMY_DOCS = [
-    DocumentSource(url="https://github.com/numpy/numpy"),
-    DocumentSource(url="https://github.com/pandas-dev/pandas"),
-]
-
-DUMMY_CODE = [
-    CodeSource(url="https://github.com/pandas-dev/pandas"),
+DUMMY_SOURCES = [
+    Source(url="https://github.com/numpy/numpy"),
+    Source(url="https://github.com/pandas-dev/pandas"),
 ]
 
 
@@ -65,8 +60,7 @@ def get_project_config(path: Path) -> ProjectConfiguration:
         name=project_name,
         llm=ingestion_config,
         db=db_config,
-        docs=DUMMY_DOCS,
-        code=DUMMY_CODE,
+        sources=DUMMY_SOURCES,
     )
 
 
grimoire/sync.py

Lines changed: 29 additions & 1 deletion
@@ -1,8 +1,36 @@
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
 import typer
+from git import Repo
+from rich.progress import track
+
+from grimoire.configuration import CONFIG_FILE_NAME, ProjectConfiguration
+from grimoire.helpers.rag import text_ingestion
 
 sync_cli = typer.Typer()
 
 
 @sync_cli.command("sync", help="Sync the grimoire project with existing configuration")
-def sync() -> None:
+def sync(
+    path: Path = typer.Argument(  # noqa: B008
+        Path.cwd(),  # noqa: B008
+        help="Path to the grimoire project",
+    ),
+) -> None:
     typer.echo("Syncing grimoire project")
+    config = ProjectConfiguration.load_from_yaml(path / CONFIG_FILE_NAME)
+
+    if not config.sources:
+        typer.echo("No sources found in configuration file")
+        raise typer.Exit()
+
+    for repo in track(config.sources, description="Processing ..."):
+        with TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+            Repo.clone_from(repo.url, to_path=temp_path)
+            text_ingestion(
+                config.llm.collection,
+                config.llm.text_chunk_size,
+                config.llm.text_chunk_overlap,
+            )
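
sync now loads the project configuration, bails out if sources is empty, and clones each configured repository into a temporary directory before handing off to text_ingestion, which is still a stub (pass) in helpers/rag.py. The clone step in isolation, using one of the URLs from grimoire.yaml; it requires the git binary on PATH, which is presumably why the CI workflow above now installs it:

from pathlib import Path
from tempfile import TemporaryDirectory

from git import Repo

# Clone into a throwaway directory, as the sync loop does, then let the
# directory (and the checkout) disappear when the context manager exits.
with TemporaryDirectory() as temp_dir:
    temp_path = Path(temp_dir)
    Repo.clone_from("https://github.com/pandas-dev/pandas", to_path=temp_path)
    print(sum(1 for _ in temp_path.rglob("*.md")), "markdown files checked out")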

pyproject.toml

Lines changed: 2 additions & 0 deletions
@@ -9,12 +9,14 @@ requires-python = "<3.13,>=3.12"
 dependencies = [
     "datasets>=3.4.1",
     "esprima>=4.0.1",
+    "gitpython>=3.1.44",
     "google-genai>=1.4.0",
     "grpcio==1.60.1",
     "langchain-postgres>=0.0.13",
     "langchain[community,huggingface,google-genai]>=0.3.20",
     "psycopg[binary]>=3.2.6",
     "pydantic>=2.10.6",
+    "rich>=13.9.4",
     "sentence-transformers>=3.4.1",
     "torch>=2.6.0",
     "tree-sitter>=0.24.0",
