Skip to content

Commit

Permalink
Refactor ingest_from_query to run_ingest_query, add process_notebook …
Browse files Browse the repository at this point in the history
…function, and add tests for notebook processing
  • Loading branch information
filipchristiansen committed Jan 6, 2025
1 parent 193c3c9 commit 1908391
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 7 deletions.
4 changes: 2 additions & 2 deletions src/gitingest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from gitingest.clone import clone_repo
from gitingest.ingest import ingest
from gitingest.ingest_from_query import ingest_from_query
from gitingest.ingest_from_query import run_ingest_query
from gitingest.parse_query import parse_query

__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"]
__all__ = ["run_ingest_query", "clone_repo", "parse_query", "ingest"]
4 changes: 2 additions & 2 deletions src/gitingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import shutil

from gitingest.clone import CloneConfig, clone_repo
from gitingest.ingest_from_query import ingest_from_query
from gitingest.ingest_from_query import run_ingest_query
from gitingest.parse_query import parse_query


Expand Down Expand Up @@ -74,7 +74,7 @@ def ingest(
else:
raise TypeError("clone_repo did not return a coroutine as expected.")

summary, tree, content = ingest_from_query(query)
summary, tree, content = run_ingest_query(query)

if output is not None:
with open(output, "w", encoding="utf-8") as f:
Expand Down
6 changes: 5 additions & 1 deletion src/gitingest/ingest_from_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import tiktoken

from gitingest.exceptions import AlreadyVisitedError, MaxFileSizeReachedError, MaxFilesReachedError
from gitingest.notebook_utils import process_notebook

MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB
MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal
Expand Down Expand Up @@ -146,6 +147,9 @@ def _read_file_content(file_path: str) -> str:
The content of the file, or an error message if the file could not be read.
"""
try:
if file_path.endswith(".ipynb"):
return process_notebook(file_path)

with open(file_path, encoding="utf-8", errors="ignore") as f:
return f.read()
except OSError as e:
Expand Down Expand Up @@ -814,7 +818,7 @@ def _ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]:
return summary, tree, files_content


def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]:
def run_ingest_query(query: dict[str, Any]) -> tuple[str, str, str]:
"""
Main entry point for analyzing a codebase directory or single file.
Expand Down
47 changes: 47 additions & 0 deletions src/gitingest/notebook_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
""" Utilities for processing Jupyter notebooks. """

import json


def process_notebook(file: str) -> str:
    """
    Process a Jupyter notebook file and return a Python script as a string.

    Code cells are emitted verbatim. Markdown and raw cells are wrapped in triple-quoted string
    literals so that they act as multi-line comments. Cells whose source is empty are skipped,
    and the remaining cells are joined with one blank line between them.

    Parameters
    ----------
    file : str
        The path to the Jupyter notebook file.

    Returns
    -------
    str
        The Python script as a string.

    Raises
    ------
    ValueError
        If an unexpected cell type is encountered.
    KeyError
        If the notebook JSON has no top-level ``"cells"`` entry.
    """
    with open(file, encoding="utf-8") as f:
        notebook = json.load(f)

    result = []

    for cell in notebook["cells"]:
        cell_type = cell.get("cell_type")

        # Validate cell type and handle unexpected types
        if cell_type not in ("markdown", "code", "raw"):
            raise ValueError(f"Unknown cell type: {cell_type}")

        # "source" may be a list of line strings or one string; "".join handles both.
        str_ = "".join(cell.get("source", []))
        if not str_:
            continue

        # Convert Markdown and raw cells to multi-line comments
        if cell_type in ("markdown", "raw"):
            # Escape embedded triple quotes; otherwise they would close the wrapping literal
            # early and the generated script would no longer parse.
            escaped = str_.replace('"""', '\\"\\"\\"')
            str_ = f'"""\n{escaped}\n"""'

        result.append(str_)

    return "\n\n".join(result)
4 changes: 2 additions & 2 deletions src/process_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE
from gitingest.clone import CloneConfig, clone_repo
from gitingest.ingest_from_query import ingest_from_query
from gitingest.ingest_from_query import run_ingest_query
from gitingest.parse_query import parse_query
from server_utils import Colors, log_slider_to_size

Expand Down Expand Up @@ -91,7 +91,7 @@ async def process_query(
branch=query.get("branch"),
)
await clone_repo(clone_config)
summary, tree, content = ingest_from_query(query)
summary, tree, content = run_ingest_query(query)
with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f:
f.write(tree + "\n" + content)
except Exception as e:
Expand Down
16 changes: 16 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" This module contains fixtures for the tests. """

import json
from pathlib import Path
from typing import Any

Expand Down Expand Up @@ -70,3 +71,18 @@ def temp_directory(tmp_path: Path) -> Path:
(dir2 / "file_dir2.txt").write_text("Hello from dir2")

return test_dir


@pytest.fixture
def write_notebook(tmp_path: Path):
    """
    Provide a factory that serializes notebook content to JSON in a file under ``tmp_path``.

    The returned callable takes a file name and a dictionary, writes the dictionary as
    UTF-8 encoded JSON to ``tmp_path / name``, and returns the path that was written.
    """

    def _write_notebook(name: str, content: dict) -> Path:
        target = tmp_path / name
        target.write_text(json.dumps(content), encoding="utf-8")
        return target

    return _write_notebook

0 comments on commit 1908391

Please sign in to comment.