forked from cyclotruc/gitingest
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add support for improved handling of jupyter notebooks (cyclotr…
- Loading branch information
1 parent
e071c8e
commit 472be9e
Showing
8 changed files
with
321 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
""" Gitingest: A package for ingesting data from git repositories. """ | ||
|
||
from gitingest.query_ingestion import ingest_from_query | ||
from gitingest.query_ingestion import run_ingest_query | ||
from gitingest.query_parser import parse_query | ||
from gitingest.repository_clone import clone_repo | ||
from gitingest.repository_ingest import ingest | ||
|
||
__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] | ||
__all__ = ["run_ingest_query", "clone_repo", "parse_query", "ingest"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
""" Utilities for processing Jupyter notebooks. """ | ||
|
||
import json | ||
import warnings | ||
from pathlib import Path | ||
from typing import Any | ||
|
||
|
||
def process_notebook(file: Path) -> str:
    """
    Process a Jupyter notebook file and return an executable Python script as a string.

    Code cells are emitted verbatim. Markdown, raw and legacy heading cells are wrapped in
    triple-quoted blocks. Cells with an empty source are skipped, and the remaining cells are
    joined with one blank line between them.

    Parameters
    ----------
    file : Path
        The path to the Jupyter notebook file.

    Returns
    -------
    str
        The executable Python script as a string. Empty if the notebook has no non-empty cells.

    Raises
    ------
    ValueError
        If an unexpected cell type is encountered.
    """
    with file.open(encoding="utf-8") as f:
        notebook: dict[str, Any] = json.load(f)

    # Check if the notebook contains worksheets
    if worksheets := notebook.get("worksheets"):
        # https://github.com/ipython/ipython/wiki/IPEP-17:-Notebook-Format-4#remove-multiple-worksheets
        # "The `worksheets` field is a list, but we have no UI to support multiple worksheets.
        # Our design has since shifted to heading-cell based structure, so we never intend to
        # support the multiple worksheet model. The worksheets list of lists shall be replaced
        # with a single list, called `cells`."
        # stacklevel=2 attributes the warning to the caller rather than to this module.
        warnings.warn("Worksheets are deprecated as of IPEP-17.", DeprecationWarning, stacklevel=2)

        if len(worksheets) > 1:
            warnings.warn(
                "Multiple worksheets are not supported. Only the first worksheet will be processed.",
                UserWarning,
                stacklevel=2,
            )

        notebook = worksheets[0]

    result = []

    # A notebook (or worksheet) without a "cells" entry simply contributes nothing.
    for cell in notebook.get("cells") or []:
        cell_type = cell.get("cell_type")

        # Validate cell type and handle unexpected types.
        # "heading" only exists in the legacy worksheet-era format.
        if cell_type not in ("markdown", "code", "raw", "heading"):
            raise ValueError(f"Unknown cell type: {cell_type}")

        source = cell.get("source")
        if source is None and cell_type == "code":
            # Legacy (worksheet-era) code cells keep their text under "input" instead of "source".
            source = cell.get("input")

        # "source" may be a list of lines or a single string; join handles both.
        text = "".join(source or [])
        if not text:
            continue

        if cell_type == "heading":
            # Render legacy heading cells the way they would look as markdown headings.
            level = cell.get("level", 1)
            if not isinstance(level, int) or level < 1:
                level = 1
            text = f"{'#' * level} {text}"

        # Convert non-code cells to multi-line comments
        if cell_type != "code":
            text = f'"""\n{text}\n"""'

        result.append(text)

    return "\n\n".join(result)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
""" Tests for the notebook_utils module. """ | ||
|
||
import pytest | ||
|
||
from gitingest.notebook_utils import process_notebook | ||
|
||
|
||
def test_process_notebook_all_cells(write_notebook):
    """
    Exercise a notebook that mixes one markdown, one code, and one raw cell.

    - The markdown and raw cells must each be wrapped in a triple-quoted block,
      giving four triple-quote occurrences in total.
    - The code cell must appear as plain code, never wrapped.
    """
    cells = [
        {"cell_type": "markdown", "source": ["# Markdown cell"]},
        {"cell_type": "code", "source": ['print("Hello Code")']},
        {"cell_type": "raw", "source": ["<raw content>"]},
    ]
    output = process_notebook(write_notebook("all_cells.ipynb", {"cells": cells}))

    assert output.count('"""') == 4, "Expected 4 triple-quote occurrences for 2 blocks."

    # Every cell's text must survive the conversion.
    for expected in ("# Markdown cell", "<raw content>", 'print("Hello Code")'):
        assert expected in output

    # The code cell must not have been turned into a triple-quoted block.
    assert '"""\nprint("Hello Code")\n"""' not in output
|
||
|
||
def test_process_notebook_with_worksheets(write_notebook):
    """
    Test a notebook containing the 'worksheets' key (deprecated as of IPEP-17).

    - Processing it should raise a DeprecationWarning.
    - Only the first (and only) worksheet's cells are processed.
    - The equivalent notebook with "cells" at top level must emit no warning at all
      and produce identical content.
    """
    # Local import so this test is self-contained regardless of the module's import block.
    import warnings

    with_worksheets = {
        "worksheets": [
            {
                "cells": [
                    {"cell_type": "markdown", "source": ["# Markdown cell"]},
                    {"cell_type": "code", "source": ['print("Hello Code")']},
                    {"cell_type": "raw", "source": ["<raw content>"]},
                ]
            }
        ]
    }
    without_worksheets = with_worksheets["worksheets"][0]  # same, but no 'worksheets' key at top

    nb_with = write_notebook("with_worksheets.ipynb", with_worksheets)
    nb_without = write_notebook("without_worksheets.ipynb", without_worksheets)

    with pytest.warns(DeprecationWarning, match="Worksheets are deprecated as of IPEP-17."):
        result_with = process_notebook(nb_with)

    # Escalate every warning to an error so the "no warnings" expectation is actually enforced.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        result_without = process_notebook(nb_without)

    assert result_with == result_without, "Both notebooks should produce identical content."
|
||
|
||
def test_process_notebook_multiple_worksheets(write_notebook):
    """
    Check the handling of a notebook that carries more than one worksheet.

    Expected behaviour:
    - a DeprecationWarning because worksheets are used at all,
    - a UserWarning because the extra worksheets are ignored,
    - output identical to a notebook holding only the first worksheet.
    """
    first_sheet = {"cells": [{"cell_type": "markdown", "source": ["# First Worksheet"]}]}
    ignored_sheet = {"cells": [{"cell_type": "code", "source": ['print("Ignored Worksheet")']}]}

    multi_path = write_notebook("multiple_worksheets.ipynb", {"worksheets": [first_sheet, ignored_sheet]})
    single_path = write_notebook("single_worksheet.ipynb", {"worksheets": [first_sheet]})

    deprecation_msg = "Worksheets are deprecated as of IPEP-17."

    # Both warnings must fire for the multi-worksheet notebook.
    with pytest.warns(DeprecationWarning, match=deprecation_msg), pytest.warns(
        UserWarning, match="Multiple worksheets are not supported."
    ):
        multi_output = process_notebook(multi_path)

    with pytest.warns(DeprecationWarning, match=deprecation_msg):
        single_output = process_notebook(single_path)

    # Dropping the second (code) worksheet must make both outputs equal.
    assert multi_output == single_output, "Second worksheet was ignored, results match."
|
||
|
||
def test_process_notebook_code_only(write_notebook):
    """
    Check a notebook made up solely of code cells.

    Code is emitted as-is, so the output must not contain any triple quotes.
    """
    snippets = ["print('Code Cell 1')", "x = 42"]
    payload = {"cells": [{"cell_type": "code", "source": [snippet]} for snippet in snippets]}

    output = process_notebook(write_notebook("code_only.ipynb", payload))

    # Nothing was wrapped ...
    assert '"""' not in output
    # ... and every snippet made it through.
    for snippet in snippets:
        assert snippet in output
|
||
|
||
def test_process_notebook_markdown_only(write_notebook):
    """
    Check a notebook holding two markdown cells and nothing else.

    Each markdown cell becomes one triple-quoted block, so four triple quotes are expected.
    """
    texts = ["# Markdown Header", "Some more markdown."]
    payload = {"cells": [{"cell_type": "markdown", "source": [text]} for text in texts]}

    output = process_notebook(write_notebook("markdown_only.ipynb", payload))

    assert output.count('"""') == 4, "Two markdown cells => two triple-quoted blocks => 4 triple quotes total."
    for text in texts:
        assert text in output
|
||
|
||
def test_process_notebook_raw_only(write_notebook):
    """
    Check a notebook holding two raw cells and nothing else.

    Each raw cell becomes one triple-quoted block, so four triple quotes are expected.
    """
    texts = ["Raw content line 1", "Raw content line 2"]
    payload = {"cells": [{"cell_type": "raw", "source": [text]} for text in texts]}

    output = process_notebook(write_notebook("raw_only.ipynb", payload))

    # One opening and one closing delimiter per raw cell.
    assert output.count('"""') == 4
    for text in texts:
        assert text in output
|
||
|
||
def test_process_notebook_empty_cells(write_notebook):
    """
    Check that cells whose 'source' is empty leave no trace in the output.

    Three of the four cells are empty, so only the last markdown cell is rendered:
    one triple-quoted block, i.e. two triple quotes.
    """
    empty_cells = [{"cell_type": kind, "source": []} for kind in ("markdown", "code", "raw")]
    filled_cell = {"cell_type": "markdown", "source": ["# Non-empty markdown"]}
    payload = {"cells": [*empty_cells, filled_cell]}

    output = process_notebook(write_notebook("empty_cells.ipynb", payload))

    # A single rendered block contributes exactly one pair of delimiters.
    assert output.count('"""') == 2
    assert "# Non-empty markdown" in output
|
||
|
||
def test_process_notebook_invalid_cell_type(write_notebook):
    """
    Check that an unrecognised cell type aborts processing.

    A ValueError naming the offending cell type is expected, even when valid cells precede it.
    """
    valid_cell = {"cell_type": "markdown", "source": ["# Valid markdown"]}
    bogus_cell = {"cell_type": "unknown", "source": ["Unrecognized cell type"]}
    notebook_path = write_notebook("invalid_cell_type.ipynb", {"cells": [valid_cell, bogus_cell]})

    with pytest.raises(ValueError, match="Unknown cell type: unknown"):
        process_notebook(notebook_path)
Oops, something went wrong.