Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for improved handling of jupyter notebooks #105

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/gitingest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from gitingest.clone import clone_repo
from gitingest.ingest import ingest
from gitingest.ingest_from_query import ingest_from_query
from gitingest.ingest_from_query import run_ingest_query
from gitingest.parse_query import parse_query

__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"]
__all__ = ["run_ingest_query", "clone_repo", "parse_query", "ingest"]
4 changes: 2 additions & 2 deletions src/gitingest/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from config import TMP_BASE_PATH
from gitingest.clone import CloneConfig, clone_repo
from gitingest.ingest_from_query import ingest_from_query
from gitingest.ingest_from_query import run_ingest_query
from gitingest.parse_query import parse_query


Expand Down Expand Up @@ -75,7 +75,7 @@ def ingest(
else:
raise TypeError("clone_repo did not return a coroutine as expected.")

summary, tree, content = ingest_from_query(query)
summary, tree, content = run_ingest_query(query)

if output is not None:
with open(output, "w", encoding="utf-8") as f:
Expand Down
8 changes: 6 additions & 2 deletions src/gitingest/ingest_from_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import tiktoken

from gitingest.exceptions import AlreadyVisitedError, MaxFileSizeReachedError, MaxFilesReachedError
from gitingest.notebook_utils import process_notebook

MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB
MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal
Expand Down Expand Up @@ -158,7 +159,10 @@ def _read_file_content(file_path: Path) -> str:
The content of the file, or an error message if the file could not be read.
"""
try:
with file_path.open(encoding="utf-8", errors="ignore") as f:
if file_path.suffix == ".ipynb":
return process_notebook(file_path)

with open(file_path, encoding="utf-8", errors="ignore") as f:
return f.read()
except OSError as e:
return f"Error reading file: {e}"
Expand Down Expand Up @@ -819,7 +823,7 @@ def _ingest_directory(path: Path, query: dict[str, Any]) -> tuple[str, str, str]
return summary, tree, files_content


def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]:
def run_ingest_query(query: dict[str, Any]) -> tuple[str, str, str]:
"""
Main entry point for analyzing a codebase directory or single file.

Expand Down
66 changes: 66 additions & 0 deletions src/gitingest/notebook_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
""" Utilities for processing Jupyter notebooks. """

import json
import warnings
from pathlib import Path
from typing import Any


def process_notebook(file: Path) -> str:
    """
    Process a Jupyter notebook file and return an executable Python script as a string.

    Code cells are emitted verbatim. Markdown, raw and (legacy) heading cells are wrapped in a
    triple-quoted string so that they read as multi-line comments. Cells without any source text
    are skipped, and the remaining cells are separated by one blank line.

    Parameters
    ----------
    file : Path
        The path to the Jupyter notebook file.

    Returns
    -------
    str
        The executable Python script as a string. An empty string is returned when the notebook
        contains no cells.

    Raises
    ------
    ValueError
        If an unexpected cell type is encountered.
    """
    with file.open(encoding="utf-8") as f:
        notebook: dict[str, Any] = json.load(f)

    # Check if the notebook contains worksheets
    if worksheets := notebook.get("worksheets"):
        # https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets
        #   "The `worksheets` field is a list, but we have no UI to support multiple worksheets.
        #   Our design has since shifted to heading-cell based structure, so we never intend to
        #   support the multiple worksheet model. The worksheets list of lists shall be replaced
        #   with a single list, called `cells`."
        # stacklevel=2 attributes the warning to the caller instead of this helper.
        warnings.warn("Worksheets are deprecated as of IPEP-17.", DeprecationWarning, stacklevel=2)

        if len(worksheets) > 1:
            warnings.warn(
                "Multiple worksheets are not supported. Only the first worksheet will be processed.",
                UserWarning,
                stacklevel=2,
            )

        notebook = worksheets[0]

    result = []

    # A notebook (or worksheet) without a "cells" entry is treated as empty rather than
    # failing with a KeyError.
    for cell in notebook.get("cells") or []:
        cell_type = cell.get("cell_type")

        # Validate cell type and handle unexpected types.
        # "heading" only exists in the legacy (nbformat <= 3) worksheet layout.
        if cell_type not in ("markdown", "code", "raw", "heading"):
            raise ValueError(f"Unknown cell type: {cell_type}")

        source = cell.get("source")
        if not source and cell_type == "code":
            # Legacy (nbformat <= 3) code cells keep their code under "input".
            source = cell.get("input")

        # The source is either a single string or a list of line strings; join handles both.
        str_ = "".join(source or [])
        if not str_:
            continue

        # Convert Markdown, raw and heading cells to multi-line comments
        if cell_type != "code":
            str_ = f'"""\n{str_}\n"""'

        result.append(str_)

    return "\n\n".join(result)
4 changes: 2 additions & 2 deletions src/process_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE
from gitingest.clone import CloneConfig, clone_repo
from gitingest.ingest_from_query import ingest_from_query
from gitingest.ingest_from_query import run_ingest_query
from gitingest.parse_query import parse_query
from server_utils import Colors, log_slider_to_size

Expand Down Expand Up @@ -91,7 +91,7 @@ async def process_query(
branch=query.get("branch"),
)
await clone_repo(clone_config)
summary, tree, content = ingest_from_query(query)
summary, tree, content = run_ingest_query(query)
with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f:
f.write(tree + "\n" + content)
except Exception as e:
Expand Down
16 changes: 16 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
""" This module contains fixtures for the tests. """

import json
from pathlib import Path
from typing import Any

Expand Down Expand Up @@ -72,3 +73,18 @@ def temp_directory(tmp_path: Path) -> Path:
(dir2 / "file_dir2.txt").write_text("Hello from dir2")

return test_dir


@pytest.fixture
def write_notebook(tmp_path: Path):
    """
    Provide a factory that serializes a notebook dictionary to a file inside ``tmp_path``.

    The returned callable takes the file name and the notebook content, writes the content as
    UTF-8 encoded JSON and returns the path of the file it created.
    """

    def _write_notebook(name: str, content: dict[str, Any]) -> Path:
        target = tmp_path / name
        target.write_text(json.dumps(content), encoding="utf-8")
        return target

    return _write_notebook
22 changes: 21 additions & 1 deletion tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

from pathlib import Path
from typing import Any
from unittest.mock import patch

from gitingest.ingest_from_query import _extract_files_content, _scan_directory
from gitingest.ingest_from_query import _extract_files_content, _read_file_content, _scan_directory


def test_scan_directory(temp_directory: Path, sample_query: dict[str, Any]) -> None:
Expand Down Expand Up @@ -37,6 +38,25 @@ def test_extract_files_content(temp_directory: Path, sample_query: dict[str, Any
assert any("file_dir2.txt" in p for p in paths)


def test_read_file_content_with_notebook(tmp_path: Path):
    """A file with the ``.ipynb`` suffix must be handed to ``process_notebook``."""
    ipynb_file = tmp_path / "dummy_notebook.ipynb"
    ipynb_file.write_text("{}", encoding="utf-8")  # minimal JSON

    # Patch the name where `ingest_from_query` looks it up, not where it is defined.
    notebook_patcher = patch("gitingest.ingest_from_query.process_notebook")
    with notebook_patcher as fake_process:
        _read_file_content(ipynb_file)

    fake_process.assert_called_once_with(ipynb_file)


def test_read_file_content_with_non_notebook(tmp_path: Path):
    """A file without the ``.ipynb`` suffix must never reach ``process_notebook``."""
    script_file = tmp_path / "dummy_file.py"
    script_file.write_text("print('Hello')", encoding="utf-8")

    notebook_patcher = patch("gitingest.ingest_from_query.process_notebook")
    with notebook_patcher as fake_process:
        _read_file_content(script_file)

    fake_process.assert_not_called()


# TODO: test with include patterns: ['*.txt']
# TODO: test with wrong include patterns: ['*.qwerty']

Expand Down
Loading
Loading