diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 77f488f..95dfece 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -18,7 +18,7 @@ jobs: - name: Set up Python 3.8 uses: actions/setup-python@v1 with: - python-version: 3.8 + python-version: "3.8" - uses: pre-commit/action@v2.0.0 tests: @@ -26,10 +26,10 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: ["3.7", "3.8", "3.9", "3.10"] include: - os: windows-latest - python-version: 3.7 + python-version: "3.8" runs-on: ${{ matrix.os }} @@ -67,14 +67,14 @@ jobs: steps: - name: Checkout source uses: actions/checkout@v2 - - name: Set up Python 3.7 + - name: Set up Python 3.8 uses: actions/setup-python@v1 with: - python-version: 3.7 + python-version: "3.8" - name: Build package run: | - pip install wheel - python setup.py sdist bdist_wheel + pip install build + python -m build - name: Publish uses: pypa/gh-action-pypi-publish@v1.1.0 with: diff --git a/.gitignore b/.gitignore index 27d37f3..7bbaf03 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,6 @@ _archive/ *_old* .DS_Store .vscode/ +~$* +_*.ipynb +final_notebook.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c9e0415..f5763a8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,8 +8,8 @@ exclude: > repos: - - repo: git://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.1.0 hooks: - id: check-json - id: check-yaml @@ -23,13 +23,19 @@ repos: args: [--no-build-isolation] additional_dependencies: [setuptools>=46.4.0] + - repo: https://github.com/asottile/pyupgrade + rev: v2.31.0 + hooks: + - id: pyupgrade + args: [--py37-plus] + - repo: https://github.com/pycqa/isort - rev: 5.9.3 + rev: 5.10.1 hooks: - id: isort - repo: https://github.com/psf/black - rev: 21.9b0 + rev: 21.12b0 hooks: - id: black diff --git a/.readthedocs.yml b/.readthedocs.yml index ccb61ea..2213067 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -6,6 +6,7 @@ python: - method: pip path: . extra_requirements: + - cli - rtd sphinx: diff --git a/README.md b/README.md index aee0aa3..84e8b11 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -[Install](#install) | [Example](#example-cli-usage) | [Contributing](#contributing) - # jupyter-cache [![Github-CI][github-ci]][github-link] @@ -10,400 +8,47 @@ A defined interface for working with a cache of jupyter notebooks. -Some desired requirements (not yet all implemented): +## Why use jupyter-cache? -- Persistent -- Separates out "edits to content" from "edits to code cells". Cell - rearranges and code cell changes should require a re-execution. Content changes should not. -- Allow parallel access to notebooks (for execution) -- Store execution statistics/reports -- Store external assets: Notebooks being executed often require external assets: importing scripts/data/etc. These are prepared by the users. -- Store execution artifacts: created during exeution -- A transparent and robust cache invalidation: imagine the user updating an external dependency or a Python module, or checking out a different git branch. +If you have a number of notebooks whose execution outputs you want to ensure are kept up to date, without having to re-execute them every time (particularly for long running code, or text-based formats that do not store the outputs). + +The notebooks must have deterministic execution outputs: + +- You use the same environment to run them (e.g. 
the same installed packages) +- They run no non-deterministic code (e.g. random numbers) +- They do not depend on external resources (e.g. files or network connections) that change over time + +For example, it is utilised by [jupyter-book](https://jupyterbook.org/content/execute.html#caching-the-notebook-execution), to allow for fast document re-builds. ## Install ```bash -pip install jupyter-cache[cli] +pip install jupyter-cache ``` For development: ```bash -git clone https://github.com/ExecutableBookProject/jupyter-cache +git clone https://github.com/executablebooks/jupyter-cache cd jupyter-cache git checkout develop pip install -e .[cli,code_style,testing] ``` -## Example API usage - -to come ... - -## Example CLI usage - - - -From the checked-out repository folder: - -```console -$ jcache --help -Usage: jcache [OPTIONS] COMMAND [ARGS]... - - The command line interface of jupyter-cache. - -Options: - -v, --version Show the version and exit. - -p, --cache-path Print the current cache path and exit. - -a, --autocomplete Print the autocompletion command and exit. - -h, --help Show this message and exit. - -Commands: - cache Commands for adding to and inspecting the cache. - clear Clear the cache completely. - config Commands for configuring the cache. - execute Execute staged notebooks that are outdated. - stage Commands for staging notebooks to be executed. -``` - -**Important**: Execute this in the terminal for auto-completion: - -```console -eval "$(_JCACHE_COMPLETE=source jcache)" -``` - -### Caching Executed Notebooks - -```console -$ jcache cache --help -Usage: cache [OPTIONS] COMMAND [ARGS]... - - Commands for adding to and inspecting the cache. - -Options: - --help Show this message and exit. - -Commands: - add Cache notebook(s) that have already been executed. - add-with-artefacts Cache a notebook, with possible artefact files. - cat-artifact Print the contents of a cached artefact. - diff-nb Print a diff of a notebook to one stored in the cache. - list List cached notebook records in the cache. - remove Remove notebooks stored in the cache. - show Show details of a cached notebook in the cache. -``` - -The first time the cache is required, it will be lazily created: - -```console -$ jcache cache list -Cache path: ../.jupyter_cache -The cache does not yet exist, do you want to create it? [y/N]: y -No Cached Notebooks - -``` - -You can add notebooks straight into the cache. -When caching, a check will be made that the notebooks look to have been executed -correctly, i.e. the cell execution counts go sequentially up from 1. - -```console -$ jcache cache add tests/notebooks/basic.ipynb -Caching: ../tests/notebooks/basic.ipynb -Validity Error: Expected cell 1 to have execution_count 1 not 2 -The notebook may not have been executed, continue caching? [y/N]: y -Success! -``` - -Or to skip validation: - -```console -$ jcache cache add --no-validate tests/notebooks/basic.ipynb tests/notebooks/basic_failing.ipynb tests/notebooks/basic_unrun.ipynb tests/notebooks/complex_outputs.ipynb tests/notebooks/external_output.ipynb -Caching: ../tests/notebooks/basic.ipynb -Caching: ../tests/notebooks/basic_failing.ipynb -Caching: ../tests/notebooks/basic_unrun.ipynb -Caching: ../tests/notebooks/complex_outputs.ipynb -Caching: ../tests/notebooks/external_output.ipynb -Success! -``` - -Once you've cached some notebooks, you can look at the 'cache records' -for what has been cached. - -Each notebook is hashed (code cells and kernel spec only), -which is used to compare against 'staged' notebooks. 
-Multiple hashes for the same URI can be added -(the URI is just there for inspetion) and the size of the cache is limited -(current default 1000) so that, at this size, -the last accessed records begin to be deleted. -You can remove cached records by their ID. - -```console -$ jcache cache list - ID Origin URI Created Accessed ----- ------------------------------------- ---------------- ---------------- - 5 tests/notebooks/external_output.ipynb 2020-03-12 17:31 2020-03-12 17:31 - 4 tests/notebooks/complex_outputs.ipynb 2020-03-12 17:31 2020-03-12 17:31 - 3 tests/notebooks/basic_unrun.ipynb 2020-03-12 17:31 2020-03-12 17:31 - 2 tests/notebooks/basic_failing.ipynb 2020-03-12 17:31 2020-03-12 17:31 -``` - -Tip: Use the `--latest-only` option, to only show the latest versions of cached notebooks. - -You can also cache notebooks with artefacts -(external outputs of the notebook execution). - -```console -$ jcache cache add-with-artefacts -nb tests/notebooks/basic.ipynb tests/notebooks/artifact_folder/artifact.txt -Caching: ../tests/notebooks/basic.ipynb -Validity Error: Expected cell 1 to have execution_count 1 not 2 -The notebook may not have been executed, continue caching? [y/N]: y -Success! -``` - -Show a full description of a cached notebook by referring to its ID - -```console -$ jcache cache show 6 -ID: 6 -Origin URI: ../tests/notebooks/basic.ipynb -Created: 2020-03-12 17:31 -Accessed: 2020-03-12 17:31 -Hashkey: 818f3412b998fcf4fe9ca3cca11a3fc3 -Artifacts: -- artifact_folder/artifact.txt -``` - -Note artefact paths must be 'upstream' of the notebook folder: - -```console -$ jcache cache add-with-artefacts -nb tests/notebooks/basic.ipynb tests/test_db.py -Caching: ../tests/notebooks/basic.ipynb -Artifact Error: Path '../tests/test_db.py' is not in folder '../tests/notebooks'' -``` - -To view the contents of an execution artefact: - -```console -$ jcache cache cat-artifact 6 artifact_folder/artifact.txt -An artifact - -``` +See the documentation for usage. -You can directly remove a cached notebook by its ID: +## Development -```console -$ jcache cache remove 4 -Removing Cache ID = 4 -Success! -``` - -You can also diff any of the cached notebooks with any (external) notebook: - -```console -$ jcache cache diff-nb 2 tests/notebooks/basic.ipynb -nbdiff ---- cached pk=2 -+++ other: ../tests/notebooks/basic.ipynb -## inserted before nb/cells/0: -+ code cell: -+ execution_count: 2 -+ source: -+ a=1 -+ print(a) -+ outputs: -+ output 0: -+ output_type: stream -+ name: stdout -+ text: -+ 1 - -## deleted nb/cells/0: -- code cell: -- source: -- raise Exception('oopsie!') - - -Success! -``` - -### Staging Notebooks for execution - -```console -$ jcache stage --help -Usage: stage [OPTIONS] COMMAND [ARGS]... - - Commands for staging notebooks to be executed. - -Options: - --help Show this message and exit. - -Commands: - add Stage notebook(s) for execution. - add-with-assets Stage a notebook, with possible asset files. - list List notebooks staged for possible execution. - remove-ids Un-stage notebook(s), by ID. - remove-uris Un-stage notebook(s), by URI. - show Show details of a staged notebook. -``` - -Staged notebooks are recorded as pointers to their URI, -i.e. no physical copying takes place until execution time. 
- -If you stage some notebooks for execution, then -you can list them to see which have existing records in the cache (by hash), -and which will require execution: - -```console -$ jcache stage add tests/notebooks/basic.ipynb tests/notebooks/basic_failing.ipynb tests/notebooks/basic_unrun.ipynb tests/notebooks/complex_outputs.ipynb tests/notebooks/external_output.ipynb -Staging: ../tests/notebooks/basic.ipynb -Staging: ../tests/notebooks/basic_failing.ipynb -Staging: ../tests/notebooks/basic_unrun.ipynb -Staging: ../tests/notebooks/complex_outputs.ipynb -Staging: ../tests/notebooks/external_output.ipynb -Success! -``` - -```console -$ jcache stage list - ID URI Created Assets Cache ID ----- ------------------------------------- ---------------- -------- ---------- - 5 tests/notebooks/external_output.ipynb 2020-03-12 17:31 0 5 - 4 tests/notebooks/complex_outputs.ipynb 2020-03-12 17:31 0 - 3 tests/notebooks/basic_unrun.ipynb 2020-03-12 17:31 0 6 - 2 tests/notebooks/basic_failing.ipynb 2020-03-12 17:31 0 2 - 1 tests/notebooks/basic.ipynb 2020-03-12 17:31 0 6 -``` - -You can remove a staged notebook by its URI or ID: - -```console -$ jcache stage remove-ids 4 -Unstaging ID: 4 -Success! -``` - -You can then run a basic execution of the required notebooks: - -```console -$ jcache cache remove 6 2 -Removing Cache ID = 6 -Removing Cache ID = 2 -Success! -``` - -```console -$ jcache execute -Executing: ../tests/notebooks/basic.ipynb -Execution Succeeded: ../tests/notebooks/basic.ipynb -Executing: ../tests/notebooks/basic_failing.ipynb -error: Execution Failed: ../tests/notebooks/basic_failing.ipynb -Executing: ../tests/notebooks/basic_unrun.ipynb -Execution Succeeded: ../tests/notebooks/basic_unrun.ipynb -Finished! Successfully executed notebooks have been cached. -succeeded: -- ../tests/notebooks/basic.ipynb -- ../tests/notebooks/basic_unrun.ipynb -excepted: -- ../tests/notebooks/basic_failing.ipynb -errored: [] - -``` - -Successfully executed notebooks will be cached to the cache, -along with any 'artefacts' created by the execution, -that are inside the notebook folder, and data supplied by the executor. - -```console -$ jcache stage list - ID URI Created Assets Cache ID ----- ------------------------------------- ---------------- -------- ---------- - 5 tests/notebooks/external_output.ipynb 2020-03-12 17:31 0 5 - 3 tests/notebooks/basic_unrun.ipynb 2020-03-12 17:31 0 6 - 2 tests/notebooks/basic_failing.ipynb 2020-03-12 17:31 0 - 1 tests/notebooks/basic.ipynb 2020-03-12 17:31 0 6 -``` - -Execution data (such as execution time) will be stored in the cache record: - -```console -$ jcache cache show 6 -ID: 6 -Origin URI: ../tests/notebooks/basic_unrun.ipynb -Created: 2020-03-12 17:31 -Accessed: 2020-03-12 17:31 -Hashkey: 818f3412b998fcf4fe9ca3cca11a3fc3 -Data: - execution_seconds: 1.0559415130000005 - -``` - -Failed notebooks will not be cached, but the exception traceback will be added to the stage record: - -```console -$ jcache stage show 2 -ID: 2 -URI: ../tests/notebooks/basic_failing.ipynb -Created: 2020-03-12 17:31 -Failed Last Execution! 
-Traceback (most recent call last): - File "../jupyter_cache/executors/basic.py", line 152, in execute - executenb(nb_bundle.nb, cwd=tmpdirname) - File "/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py", line 737, in executenb - return ep.preprocess(nb, resources, km=km)[0] - File "/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py", line 405, in preprocess - nb, resources = super(ExecutePreprocessor, self).preprocess(nb, resources) - File "/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/base.py", line 69, in preprocess - nb.cells[index], resources = self.preprocess_cell(cell, resources, index) - File "/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py", line 448, in preprocess_cell - raise CellExecutionError.from_cell_and_msg(cell, out) -nbconvert.preprocessors.execute.CellExecutionError: An error occurred while executing the following cell: ------------------- -raise Exception('oopsie!') ------------------- - ---------------------------------------------------------------------------- -Exception Traceback (most recent call last) - in -----> 1 raise Exception('oopsie!') - -Exception: oopsie! -Exception: oopsie! - - -``` - -Once executed you may leave staged notebooks, for later re-execution, or remove them: - -```console -$ jcache stage remove-ids --all -Are you sure you want to remove all? [y/N]: y -Unstaging ID: 1 -Unstaging ID: 2 -Unstaging ID: 3 -Unstaging ID: 5 -Success! -``` - -You can also stage notebooks with assets; -external files that are required by the notebook during execution. -As with artefacts, these files must be in the same folder as the notebook, -or a sub-folder. - -```console -$ jcache stage add-with-assets -nb tests/notebooks/basic.ipynb tests/notebooks/artifact_folder/artifact.txt -Success! -``` +Some desired requirements (not yet all implemented): -```console -$ jcache stage show 1 -ID: 1 -URI: ../tests/notebooks/basic.ipynb -Created: 2020-03-12 17:31 -Cache ID: 6 -Assets: -- ../tests/notebooks/artifact_folder/artifact.txt -``` +- Persistent +- Separates out "edits to content" from "edits to code cells". Cell + rearranges and code cell changes should require a re-execution. Content changes should not. +- Allow parallel access to notebooks (for execution) +- Store execution statistics/reports +- Store external assets: Notebooks being executed often require external assets: importing scripts/data/etc. These are prepared by the users. +- Store execution artefacts: created during execution +- A transparent and robust cache invalidation: imagine the user updating an external dependency or a Python module, or checking out a different git branch. 
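For a flavour of the Python API described in the documentation, here is a minimal sketch (the paths are placeholders, and `run_and_cache` is assumed from the executor interface, so check the API documentation for your installed version):

```python
from jupyter_cache import get_cache
from jupyter_cache.executors import load_executor

# open (or lazily create) the cache in the given folder
cache = get_cache(".jupyter_cache")

# register a source notebook with the project
cache.add_nb_to_project("notebook.ipynb")

# execute all project notebooks that have no up-to-date cache record,
# caching the outputs of those that succeed
executor = load_executor("local-serial", cache=cache)
result = executor.run_and_cache()
```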
## Contributing diff --git a/codecov.yml b/codecov.yml index cf28904..01c8765 100644 --- a/codecov.yml +++ b/codecov.yml @@ -6,5 +6,5 @@ coverage: threshold: 0.5% patch: default: - target: 80% + target: 75% threshold: 0.5% diff --git a/docs/_static/logo.jpg b/docs/_static/logo.jpg deleted file mode 100644 index fd9f578..0000000 Binary files a/docs/_static/logo.jpg and /dev/null differ diff --git a/docs/_static/logo_small.jpg b/docs/_static/logo_small.jpg deleted file mode 100644 index 773412b..0000000 Binary files a/docs/_static/logo_small.jpg and /dev/null differ diff --git a/docs/_static/logo_square.svg b/docs/_static/logo_square.svg new file mode 100644 index 0000000..00448c2 --- /dev/null +++ b/docs/_static/logo_square.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/docs/_static/logo_wide.svg b/docs/_static/logo_wide.svg new file mode 100644 index 0000000..f0f3f9a --- /dev/null +++ b/docs/_static/logo_wide.svg @@ -0,0 +1,6 @@ + + + + + + diff --git a/docs/conf.py b/docs/conf.py index 6d740f3..c9617d5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,7 +25,18 @@ # "sphinx.ext.autodoc", # "sphinx.ext.viewcode", ] +myst_enable_extensions = ["colon_fence", "deflist"] jupyter_execute_notebooks = "off" +html_theme_options = { + "repository_url": "https://github.com/executablebooks/jupyter-cache", + "use_repository_button": True, + "use_edit_page_button": True, + "use_issues_button": True, + "repository_branch": "master", + "path_to_docs": "docs", + "home_page_in_toc": True, + "logo_only": True, +} # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -49,7 +60,9 @@ # a list of builtin themes. # html_theme = "sphinx_book_theme" -html_logo = "_static/logo_small.jpg" +html_title = "Jupyter Cache" +html_logo = "_static/logo_wide.svg" +html_favicon = "_static/logo_square.svg" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -67,3 +80,103 @@ ("py:class", "ForwardRef"), ("py:class", "NoneType"), ] + + +def setup(app): + import importlib + import os + import shutil + import traceback + + import click + from click.testing import CliRunner + from docutils import nodes + from docutils.parsers.rst import directives + from sphinx.util.docutils import SphinxDirective + + class JcacheClear(SphinxDirective): + """A directive to clear the jupyter cache.""" + + def run(self): + path = os.path.join(os.path.dirname(self.env.app.srcdir), ".jupyter_cache") + if os.path.exists(path): + shutil.rmtree(path) + return [] + + class JcacheCli(SphinxDirective): + """A directive to run a CLI command, + and output a nicely formatted representation of the input command and its output. 
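    For example, ``docs/using/cli.md`` below invokes it as:

        ```{jcache-cli} jupyter_cache.cli.commands.cmd_main:jcache
        :args: --help
        ```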
+ """ + + required_arguments = 1 # command + final_argument_whitespace = False + has_content = False + option_spec = { + "prog": directives.unchanged_required, + "command": directives.unchanged_required, + "args": directives.unchanged_required, + "input": directives.unchanged_required, + "allow-exception": directives.flag, + } + + def run(self): + modpath = self.arguments[0] + + try: + module_name, attr_name = modpath.split(":", 1) + except ValueError: + raise self.error(f'"{modpath}" is not of format "module:command"') + + try: + module = importlib.import_module(module_name) + except Exception: + raise self.error( + f"Failed to import '{module_name}': {traceback.format_exc()}" + ) + + if not hasattr(module, attr_name): + raise self.error( + f'Module "{module_name}" has no attribute "{attr_name}"' + ) + command = getattr(module, attr_name) + if not isinstance(command, click.Group): + raise self.error( + f'"{modpath}" of type {type(command)}"" is not derived from "click.Group"' + ) + + cmd_string = [self.options.get("prog", "jcache")] + if command.name != cmd_string[0]: + cmd_string.append(command.name) + if "command" in self.options: + cmd_string.append(self.options["command"]) + command = command.commands[self.options["command"]] + + args = self.options.get("args", "") + + runner = CliRunner() + root_path = os.path.dirname(self.env.app.srcdir) + try: + old_cwd = os.getcwd() + os.chdir(root_path) + result = runner.invoke( + command, args.split(), input=self.options.get("input", None), env={} + ) + finally: + os.chdir(old_cwd) + + if result.exception and "allow-exception" not in self.options: + raise self.error( + f"CLI raised exception: {result.exception}\n---\n{result.output}\n---\n" + ) + if result.exit_code != 0 and "allow-exception" not in self.options: + raise self.error( + f"CLI non-zero exit code: {result.exit_code}\n---\n{result.output}\n---\n" + ) + + text = f"$ {' '.join(cmd_string)} {args}\n{result.output}" + text = text.replace(root_path + os.sep, "../") + node = nodes.literal_block(text, text, language="console") + return [node] + + app.add_directive("jcache-clear", JcacheClear) + app.add_directive("jcache-cli", JcacheCli) diff --git a/docs/develop/contributing.md b/docs/develop/contributing.md index 3814a5f..e6ed0e2 100644 --- a/docs/develop/contributing.md +++ b/docs/develop/contributing.md @@ -6,6 +6,17 @@ [![Code style: black][black-badge]][black-link] [![PyPI][pypi-badge]][pypi-link] +## Installation + +For package development: + +```bash +git clone https://github.com/executablebooks/jupyter-cache +cd jupyter-cache +git checkout develop +pip install -e .[cli,code_style,testing,rtd] +``` + ## Code Style Code style is tested using [flake8](http://flake8.pycqa.org), diff --git a/docs/index.md b/docs/index.md index f026632..32a483e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,13 +1,110 @@ # Jupyter Cache -A defined interface for working with a cache of jupyter notebooks. +Execute and cache multiple Jupyter Notebook-like files via an [API](use/api) and [CLI](use/cli). -This packages provides a clear [API](use/api) and [CLI](use/cli) for staging, executing and cacheing -Jupyter Notebooks. Although there are certainly other use cases, -the principle use case this was written for is generating books / websites, -created from multiple notebooks (and other text documents), -during which it is desired that notebooks can be *auto-executed* **only** -if the notebook had been modified in a way that may alter its code cell outputs. 
+🤓 Smart re-execution +: Notebooks will only be re-executed when **code cells** have changed (or code related metadata), not Markdown/Raw cells. + +🧩 Pluggable execution modes +: Select the executor for notebooks, including serial and parallel execution + +📈 Execution reports +: Timing statistics and exception tracebacks are stored for analysis + +📖 [jupytext](https://jupytext.readthedocs.io) integration +: Read and execute notebooks written in multiple formats + +## Why use jupyter-cache? + +If you have a number of notebooks whose execution outputs you want to ensure are kept up to date, without having to re-execute them every time (particularly for long running code, or text-based formats that do not store the outputs). + +The notebooks must have deterministic execution outputs: + +- You use the same environment to run them (e.g. the same installed packages) +- They run no non-deterministic code (e.g. random numbers) +- They do not depend on external resources (e.g. files or network connections) that change over time + +For example, it is utilised by [jupyter-book](https://jupyterbook.org/content/execute.html#caching-the-notebook-execution), to allow for fast document re-builds. + +## Installation + +Install `jupyter-cache`, via pip or Conda: + +```bash +pip install jupyter-cache +``` + +```bash +conda install jupyter-cache +``` + +## Quick-start + +```{jcache-clear} +``` + +Add one or more source notebook files to the "project" (a folder containing a database and a cache of executed notebooks): + +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: add +:args: tests/notebooks/basic_unrun.ipynb tests/notebooks/basic_failing.ipynb +:input: y +``` + +These files are now ready for execution: + +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: list +``` + +Now run the execution: + +```{jcache-cli} jupyter_cache.cli.commands.cmd_project:cmnd_project +:command: execute +``` + +Successfully executed files will now be associated with a record in the cache: + +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: list +``` + +The cache record includes execution statistics: + +```{jcache-cli} jupyter_cache.cli.commands.cmd_cache:cmnd_cache +:command: info +:args: 1 +``` + +Next time we execute, jupyter-cache will check which files require re-execution: + +```{jcache-cli} jupyter_cache.cli.commands.cmd_project:cmnd_project +:command: execute +``` + +The source files themselves will not be modified during/after execution. +You can create a new "final" notebook, with the cached outputs merged into the source notebook with: + +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: merge +:args: 1 final_notebook.ipynb +``` + +You can also add notebooks with custom formats, such as those read by [jupytext](https://jupytext.readthedocs.io): + +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: add +:args: --reader jupytext tests/notebooks/basic.md +``` + +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: list +``` + +## Design considerations + +Although there are certainly other use cases, the principle use case this was written for is generating books / websites, created from multiple notebooks (and other text documents). +It is desired that notebooks can be *auto-executed* **only** if the notebook had been modified in a way that may alter its code cell outputs. 
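To illustrate this model, here is a sketch using the [API](use/api) (the notebook path and cell layout are hypothetical, and we assume the notebook has already been executed and cached):

```python
import nbformat as nbf
from jupyter_cache import get_cache

cache = get_cache(".jupyter_cache")
nb = nbf.read("notebook.ipynb", nbf.NO_CONVERT)

# only code cells (and the kernel spec) feed the hash, so editing a
# Markdown cell leaves the notebook matched to its cached record
nb.cells[0].source = "# A new title"
cache.match_cache_notebook(nb)  # returns the existing cache record

# editing a code cell changes the hash, so re-execution is required
nb.cells[1].source = "a = 2\nprint(a)"
cache.match_cache_notebook(nb)  # now raises KeyError
```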
Some desired requirements (not yet all implemented): @@ -19,34 +116,18 @@ Some desired requirements (not yet all implemented): - Allow parallel access to notebooks (for execution) - Store execution statistics/reports. - Store external assets: Notebooks being executed often require external assets: importing scripts/data/etc. These are prepared by the users. -- Store execution artifacts: created during execution +- Store execution artefacts: created during execution - A transparent and robust cache invalidation: imagine the user updating an external dependency or a Python module, or checking out a different git branch. -## Installation - -To install `jupytes-cache`, do the following: - -```bash -pip install jupyter-cache[cli] -``` - -For package development: - -```bash -git clone https://github.com/ExecutableBookProject/jupyter-cache -cd jupyter-cache -git checkout develop -pip install -e .[cli,code_style,testing,rtd] -``` - -Here are the site contents: +## Contents ```{toctree} ---- -maxdepth: 2 -caption: Contents ---- +:caption: Tutorials using/cli using/api +``` + +```{toctree} +:caption: Development develop/contributing ``` diff --git a/docs/using/api.ipynb b/docs/using/api.ipynb index 3b4a69f..0bdf14e 100644 --- a/docs/using/api.ipynb +++ b/docs/using/api.ipynb @@ -15,7 +15,7 @@ "source": [ "This page outlines how to utilise the cache programatically.\n", "We step throught the three aspects illustrated in the diagram below:\n", - "[cacheing](use/api/cache), [staging](use/api/stage) and [executing](use/api/execute).\n", + "[cacheing](use/api/cache), [staging](use/api/project) and [executing](use/api/execute).\n", "\n", "```{figure} images/execution_process.svg\n", ":width: 500 px\n", @@ -29,7 +29,7 @@ "metadata": {}, "source": [ "```{note}\n", - "The full Jupyter notebook for this page can accessed here; {nb-download:notebook.ipynb}`api`.\n", + "The full Jupyter notebook for this page can accessed here; {nb-download}`api.ipynb`.\n", "Try it for yourself!\n", "```" ] @@ -50,11 +50,11 @@ "from pathlib import Path\n", "import nbformat as nbf\n", "from jupyter_cache import get_cache\n", - "from jupyter_cache.base import NbBundleIn\n", + "from jupyter_cache.base import CacheBundleIn\n", "from jupyter_cache.executors import load_executor, list_executors\n", "from jupyter_cache.utils import (\n", " tabulate_cache_records, \n", - " tabulate_stage_records\n", + " tabulate_project_records\n", ")" ] }, @@ -77,7 +77,7 @@ { "data": { "text/plain": [ - "JupyterCacheBase('/Users/cjs14/GitHub/jupyter-cache/docs/using/.jupyter_cache')" + "JupyterCacheBase('/Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/.jupyter_cache')" ] }, "execution_count": 2, @@ -107,7 +107,7 @@ ], "source": [ "print(cache.list_cache_records())\n", - "print(cache.list_staged_records())" + "print(cache.list_project_records())" ] }, { @@ -170,13 +170,13 @@ { "data": { "text/plain": [ - "{'data': {},\n", - " 'pk': 1,\n", + "{'description': '',\n", + " 'hashkey': '94c17138f782c75df59e989fffa64e3a',\n", + " 'created': datetime.datetime(2022, 1, 12, 15, 15, 27, 255299),\n", + " 'accessed': datetime.datetime(2022, 1, 12, 15, 15, 27, 255312),\n", + " 'data': {},\n", " 'uri': 'example_nbs/basic.ipynb',\n", - " 'accessed': datetime.datetime(2020, 3, 13, 14, 21, 46, 271953),\n", - " 'description': '',\n", - " 'hashkey': '818f3412b998fcf4fe9ca3cca11a3fc3',\n", - " 'created': datetime.datetime(2020, 3, 13, 14, 21, 46, 271943)}" + " 'pk': 1}" ] }, "execution_count": 5, @@ -259,7 +259,7 @@ { "data": { "text/plain": [ - 
"NbBundleOut(nb=Notebook(cells=1), record=NbCacheRecord(pk=1), artifacts=NbArtifacts(paths=0))" + "CacheBundleOut(nb=Notebook(cells=1), record=NbCacheRecord(pk=1), artifacts=NbArtifacts(paths=0))" ] }, "execution_count": 8, @@ -332,9 +332,9 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mCachingError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m record = cache.cache_notebook_file(\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"example_nbs\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"basic.ipynb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m )\n", - "\u001b[0;32m~/GitHub/jupyter-cache/jupyter_cache/cache/main.py\u001b[0m in \u001b[0;36mcache_notebook_file\u001b[0;34m(self, path, uri, artifacts, data, check_validity, overwrite)\u001b[0m\n\u001b[1;32m 271\u001b[0m ),\n\u001b[1;32m 272\u001b[0m \u001b[0mcheck_validity\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcheck_validity\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 273\u001b[0;31m \u001b[0moverwrite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moverwrite\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 274\u001b[0m )\n\u001b[1;32m 275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/GitHub/jupyter-cache/jupyter_cache/cache/main.py\u001b[0m in \u001b[0;36mcache_notebook_bundle\u001b[0;34m(self, bundle, check_validity, overwrite, description)\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0moverwrite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 209\u001b[0m raise CachingError(\n\u001b[0;32m--> 210\u001b[0;31m \u001b[0;34m\"Notebook already exists in cache and overwrite=False.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 211\u001b[0m )\n\u001b[1;32m 212\u001b[0m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrmtree\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/var/folders/t2/xbl15_3n4tsb1vr_ccmmtmbr0000gn/T/ipykernel_99993/3576020660.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m record = cache.cache_notebook_file(\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"example_nbs\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"basic.ipynb\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m )\n", + "\u001b[0;32m~/Documents/GitHub/jupyter-cache/jupyter_cache/cache/main.py\u001b[0m in \u001b[0;36mcache_notebook_file\u001b[0;34m(self, path, uri, artifacts, data, check_validity, overwrite)\u001b[0m\n\u001b[1;32m 268\u001b[0m \"\"\"\n\u001b[1;32m 269\u001b[0m \u001b[0mnotebook\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnbf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnbf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNO_CONVERT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 270\u001b[0;31m return self.cache_notebook_bundle(\n\u001b[0m\u001b[1;32m 271\u001b[0m CacheBundleIn(\n\u001b[1;32m 272\u001b[0m \u001b[0mnotebook\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Documents/GitHub/jupyter-cache/jupyter_cache/cache/main.py\u001b[0m in \u001b[0;36mcache_notebook_bundle\u001b[0;34m(self, bundle, check_validity, overwrite, description)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0moverwrite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 215\u001b[0;31m raise CachingError(\n\u001b[0m\u001b[1;32m 216\u001b[0m \u001b[0;34m\"Notebook already exists in cache and overwrite=False.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 217\u001b[0m )\n", "\u001b[0;31mCachingError\u001b[0m: Notebook already exists in cache and overwrite=False." ] } @@ -477,15 +477,15 @@ "outputs": [ { "ename": "KeyError", - "evalue": "'Cache record not found for NB with hashkey: 74933d8a93d1df9caad87b2e6efcdc69'", + "evalue": "'Cache record not found for NB with hashkey: 07e6a47c8c180cb7851ede6dbb088769'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatch_cache_notebook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnotebook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/GitHub/jupyter-cache/jupyter_cache/cache/main.py\u001b[0m in \u001b[0;36mmatch_cache_notebook\u001b[0;34m(self, nb)\u001b[0m\n\u001b[1;32m 328\u001b[0m \"\"\"\n\u001b[1;32m 329\u001b[0m \u001b[0mhashkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_hash_notebook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 330\u001b[0;31m \u001b[0mcache_record\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNbCacheRecord\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecord_from_hashkey\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhashkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 331\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcache_record\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/GitHub/jupyter-cache/jupyter_cache/cache/db.py\u001b[0m in \u001b[0;36mrecord_from_hashkey\u001b[0;34m(hashkey, db)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 151\u001b[0m raise 
KeyError(\n\u001b[0;32m--> 152\u001b[0;31m \u001b[0;34m\"Cache record not found for NB with hashkey: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhashkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 153\u001b[0m )\n\u001b[1;32m 154\u001b[0m \u001b[0msession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpunge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyError\u001b[0m: 'Cache record not found for NB with hashkey: 74933d8a93d1df9caad87b2e6efcdc69'" + "\u001b[0;32m/var/folders/t2/xbl15_3n4tsb1vr_ccmmtmbr0000gn/T/ipykernel_99993/941642554.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatch_cache_notebook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnotebook\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Documents/GitHub/jupyter-cache/jupyter_cache/cache/main.py\u001b[0m in \u001b[0;36mmatch_cache_notebook\u001b[0;34m(self, nb)\u001b[0m\n\u001b[1;32m 333\u001b[0m \"\"\"\n\u001b[1;32m 334\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhashkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_hashed_notebook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 335\u001b[0;31m \u001b[0mcache_record\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNbCacheRecord\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecord_from_hashkey\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhashkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 336\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcache_record\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Documents/GitHub/jupyter-cache/jupyter_cache/cache/db.py\u001b[0m in \u001b[0;36mrecord_from_hashkey\u001b[0;34m(hashkey, db)\u001b[0m\n\u001b[1;32m 158\u001b[0m )\n\u001b[1;32m 159\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m raise KeyError(\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;34m\"Cache record not found for NB with hashkey: {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhashkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m )\n", + "\u001b[0;31mKeyError\u001b[0m: 'Cache record not found for NB with hashkey: 07e6a47c8c180cb7851ede6dbb088769'" ] } ], @@ -569,7 +569,7 @@ } ], "source": [ - "nb_bundle = NbBundleIn(\n", + "nb_bundle = CacheBundleIn(\n", " nb=notebook,\n", " uri=Path(\"example_nbs\", \"basic.ipynb\"),\n", " data={\"tag\": \"mytag\"}\n", @@ -588,8 +588,8 @@ "text": [ " ID Origin URI Created Accessed Hashkey\n", "---- ------------ ---------------- ---------------- --------------------------------\n", - " 2 basic.ipynb 2020-03-13 14:21 2020-03-13 14:21 74933d8a93d1df9caad87b2e6efcdc69\n", - " 1 basic.ipynb 2020-03-13 14:21 2020-03-13 14:21 818f3412b998fcf4fe9ca3cca11a3fc3\n" + " 2 
basic.ipynb 2022-01-12 15:16 2022-01-12 15:16 07e6a47c8c180cb7851ede6dbb088769\n", + " 1 basic.ipynb 2022-01-12 15:15 2022-01-12 15:16 94c17138f782c75df59e989fffa64e3a\n" ] } ], @@ -640,7 +640,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "(use/api/stage)=\n", + "(use/api/project)=\n", "\n", "## Staging Notebooks for Execution" ] @@ -665,7 +665,7 @@ { "data": { "text/plain": [ - "NbStageRecord(pk=1)" + "NbProjectRecord(pk=1)" ] }, "execution_count": 22, @@ -674,7 +674,7 @@ } ], "source": [ - "record = cache.stage_notebook_file(Path(\"example_nbs\", \"basic.ipynb\"))\n", + "record = cache.add_nb_to_project(Path(\"example_nbs\", \"basic.ipynb\"))\n", "record" ] }, @@ -686,10 +686,11 @@ { "data": { "text/plain": [ - "{'uri': '/Users/cjs14/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb',\n", - " 'traceback': '',\n", - " 'created': datetime.datetime(2020, 3, 13, 14, 21, 47, 304914),\n", + "{'uri': '/Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb',\n", " 'assets': [],\n", + " 'created': datetime.datetime(2022, 1, 12, 15, 16, 27, 64960),\n", + " 'traceback': '',\n", + " 'read_data': {'name': 'nbformat', 'type': 'plugin'},\n", " 'pk': 1}" ] }, @@ -726,7 +727,7 @@ } ], "source": [ - "cache.get_cache_record_of_staged(1)" + "cache.get_cached_project_nb(1)" ] }, { @@ -738,15 +739,15 @@ "name": "stdout", "output_type": "stream", "text": [ - " ID URI Created Assets Cache ID\n", - "---- ----------------------- ---------------- -------- ----------\n", - " 1 example_nbs/basic.ipynb 2020-03-13 14:21 0 1\n" + " ID URI Reader Added Status\n", + "---- ----------------------- -------- ---------------- --------\n", + " 1 example_nbs/basic.ipynb nbformat 2022-01-12 15:16 ✅ [1]\n" ] } ], "source": [ - "print(tabulate_stage_records(\n", - " cache.list_staged_records(), path_length=2, cache=cache\n", + "print(tabulate_project_records(\n", + " cache.list_project_records(), path_length=2, cache=cache\n", "))" ] }, @@ -803,7 +804,7 @@ ], "source": [ "cache.merge_match_into_file(\n", - " cache.get_staged_record(1).uri,\n", + " cache.get_project_record(1).uri,\n", " nb_meta=('kernelspec', 'language_info', 'widgets'),\n", " cell_meta=None\n", ")" @@ -824,7 +825,7 @@ { "data": { "text/plain": [ - "NbStageRecord(pk=2)" + "NbProjectRecord(pk=2)" ] }, "execution_count": 27, @@ -833,7 +834,7 @@ } ], "source": [ - "record = cache.stage_notebook_file(Path(\"example_nbs\", \"basic_failing.ipynb\"))\n", + "record = cache.add_nb_to_project(Path(\"example_nbs\", \"basic_failing.ipynb\"))\n", "record" ] }, @@ -843,7 +844,7 @@ "metadata": {}, "outputs": [], "source": [ - "cache.get_cache_record_of_staged(2) # returns None" + "cache.get_cached_project_nb(2) # returns None" ] }, { @@ -854,7 +855,7 @@ { "data": { "text/plain": [ - "[NbStageRecord(pk=2)]" + "[NbProjectRecord(pk=2)]" ] }, "execution_count": 29, @@ -863,7 +864,7 @@ } ], "source": [ - "cache.list_staged_unexecuted()" + "cache.list_unexecuted()" ] }, { @@ -875,16 +876,16 @@ "name": "stdout", "output_type": "stream", "text": [ - " ID URI Created Assets Cache ID\n", - "---- ------------------------------- ---------------- -------- ----------\n", - " 2 example_nbs/basic_failing.ipynb 2020-03-13 14:21 0\n", - " 1 example_nbs/basic.ipynb 2020-03-13 14:21 0 1\n" + " ID URI Reader Added Status\n", + "---- ------------------------------- -------- ---------------- --------\n", + " 1 example_nbs/basic.ipynb nbformat 2022-01-12 15:16 ✅ [1]\n", + " 2 example_nbs/basic_failing.ipynb nbformat 2022-01-12 15:17 -\n" ] } ], 
"source": [ - "print(tabulate_stage_records(\n", - " cache.list_staged_records(), path_length=2, cache=cache\n", + "print(tabulate_project_records(\n", + " cache.list_project_records(), path_length=2, cache=cache\n", "))" ] }, @@ -901,7 +902,7 @@ "metadata": {}, "outputs": [], "source": [ - "cache.discard_staged_notebook(1)" + "cache.remove_nb_from_project(1)" ] }, { @@ -913,15 +914,15 @@ "name": "stdout", "output_type": "stream", "text": [ - " ID URI Created Assets\n", - "---- ------------------------------- ---------------- --------\n", - " 2 example_nbs/basic_failing.ipynb 2020-03-13 14:21 0\n" + " ID URI Reader Added Status\n", + "---- ------------------------------- -------- ---------------- --------\n", + " 2 example_nbs/basic_failing.ipynb nbformat 2022-01-12 15:17 -\n" ] } ], "source": [ - "print(tabulate_stage_records(\n", - " cache.list_staged_records(), path_length=2, cache=cache\n", + "print(tabulate_project_records(\n", + " cache.list_project_records(), path_length=2, cache=cache\n", "))" ] }, @@ -949,7 +950,7 @@ { "data": { "text/plain": [ - "NbStageRecord(pk=2)" + "NbProjectRecord(pk=2)" ] }, "execution_count": 33, @@ -959,8 +960,8 @@ ], "source": [ "cache.clear_cache()\n", - "cache.stage_notebook_file(Path(\"example_nbs\", \"basic.ipynb\"))\n", - "cache.stage_notebook_file(Path(\"example_nbs\", \"basic_failing.ipynb\"))" + "cache.add_nb_to_project(Path(\"example_nbs\", \"basic.ipynb\"))\n", + "cache.add_nb_to_project(Path(\"example_nbs\", \"basic_failing.ipynb\"))" ] }, { @@ -972,16 +973,16 @@ "name": "stdout", "output_type": "stream", "text": [ - " ID URI Created Assets\n", - "---- ------------------------------- ---------------- --------\n", - " 2 example_nbs/basic_failing.ipynb 2020-03-13 14:21 0\n", - " 1 example_nbs/basic.ipynb 2020-03-13 14:21 0\n" + " ID URI Reader Added Status\n", + "---- ------------------------------- -------- ---------------- --------\n", + " 1 example_nbs/basic.ipynb nbformat 2022-01-12 15:17 -\n", + " 2 example_nbs/basic_failing.ipynb nbformat 2022-01-12 15:17 -\n" ] } ], "source": [ - "print(tabulate_stage_records(\n", - " cache.list_staged_records(), path_length=2, cache=cache\n", + "print(tabulate_project_records(\n", + " cache.list_project_records(), path_length=2, cache=cache\n", "))" ] }, @@ -1005,7 +1006,7 @@ { "data": { "text/plain": [ - "[EntryPoint.parse('basic = jupyter_cache.executors.basic:JupyterExecutorBasic')]" + "{'local-parallel', 'local-serial', 'temp-parallel', 'temp-serial'}" ] }, "execution_count": 35, @@ -1025,7 +1026,7 @@ { "data": { "text/plain": [ - "JupyterExecutorBasic(cache=JupyterCacheBase('/Users/cjs14/GitHub/jupyter-cache/docs/using/.jupyter_cache'))" + "JupyterExecutorLocalSerial(cache=JupyterCacheBase('/Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/.jupyter_cache'))" ] }, "execution_count": 36, @@ -1037,7 +1038,7 @@ "from logging import basicConfig, INFO\n", "basicConfig(level=INFO)\n", "\n", - "executor = load_executor(\"basic\", cache=cache)\n", + "executor = load_executor(\"local-serial\", cache=cache)\n", "executor" ] }, @@ -1073,18 +1074,30 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:jupyter_cache.executors.base:Executing: /Users/cjs14/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb\n", - "INFO:jupyter_cache.executors.base:Execution Succeeded: /Users/cjs14/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb\n", - "INFO:jupyter_cache.executors.base:Executing: /Users/cjs14/GitHub/jupyter-cache/docs/using/example_nbs/basic_failing.ipynb\n", - 
"ERROR:jupyter_cache.executors.base:Execution Failed: /Users/cjs14/GitHub/jupyter-cache/docs/using/example_nbs/basic_failing.ipynb\n" + "INFO:jupyter_cache.executors.base:Executing 2 notebook(s) in serial\n", + "INFO:jupyter_cache.executors.base:Executing: /Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb\n", + "INFO:jupyter_cache.executors.base:Execution Successful: /Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb\n", + "INFO:jupyter_cache.executors.base:Executing: /Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/example_nbs/basic_failing.ipynb\n", + "WARNING:jupyter_cache.executors.base:Execution Excepted: /Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/example_nbs/basic_failing.ipynb\n", + "CellExecutionError: An error occurred while executing the following cell:\n", + "------------------\n", + "raise Exception('oopsie!')\n", + "------------------\n", + "\n", + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n", + "\u001b[0;31mException\u001b[0m Traceback (most recent call last)\n", + "\u001b[0;32m/var/folders/t2/xbl15_3n4tsb1vr_ccmmtmbr0000gn/T/ipykernel_1308/340246212.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", + "\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'oopsie!'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n", + "\u001b[0;31mException\u001b[0m: oopsie!\n", + "Exception: oopsie!\n", + "\n" ] }, { "data": { "text/plain": [ - "{'succeeded': ['/Users/cjs14/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb'],\n", - " 'excepted': ['/Users/cjs14/GitHub/jupyter-cache/docs/using/example_nbs/basic_failing.ipynb'],\n", - " 'errored': []}" + "ExecutorRunResult(succeeded=['/Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb'], excepted=['/Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/example_nbs/basic_failing.ipynb'], errored=[])" ] }, "execution_count": 37, @@ -1132,13 +1145,13 @@ { "data": { "text/plain": [ - "{'data': {'execution_seconds': 1.7455324890000004},\n", - " 'pk': 1,\n", - " 'uri': '/Users/cjs14/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb',\n", - " 'accessed': datetime.datetime(2020, 3, 13, 14, 21, 50, 803042),\n", - " 'description': '',\n", - " 'hashkey': '818f3412b998fcf4fe9ca3cca11a3fc3',\n", - " 'created': datetime.datetime(2020, 3, 13, 14, 21, 50, 803031)}" + "{'description': '',\n", + " 'hashkey': '94c17138f782c75df59e989fffa64e3a',\n", + " 'created': datetime.datetime(2022, 1, 12, 15, 17, 45, 471862),\n", + " 'accessed': datetime.datetime(2022, 1, 12, 15, 17, 45, 471871),\n", + " 'data': {'execution_seconds': 1.8344826350000005},\n", + " 'uri': '/Users/chrisjsewell/Documents/GitHub/jupyter-cache/docs/using/example_nbs/basic.ipynb',\n", + " 'pk': 1}" ] }, "execution_count": 39, @@ -1170,24 +1183,34 @@ "output_type": "stream", "text": [ "Traceback (most recent call last):\n", - " File \"/Users/cjs14/GitHub/jupyter-cache/jupyter_cache/executors/basic.py\", line 152, in execute\n", - " executenb(nb_bundle.nb, cwd=tmpdirname)\n", - " File \"/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py\", line 737, in executenb\n", - " return ep.preprocess(nb, resources, km=km)[0]\n", - " File \"/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py\", line 405, in 
preprocess\n", - " nb, resources = super(ExecutePreprocessor, self).preprocess(nb, resources)\n", - " File \"/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/base.py\", line 69, in preprocess\n", - " nb.cells[index], resources = self.preprocess_cell(cell, resources, index)\n", - " File \"/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py\", line 448, in preprocess_cell\n", - " raise CellExecutionError.from_cell_and_msg(cell, out)\n", - "nbconvert.preprocessors.execute.CellExecutionError: An error occurred while executing the following cell:\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/jupyter_cache/executors/utils.py\", line 58, in single_nb_execution\n", + " executenb(\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/site-packages/nbclient/client.py\", line 1093, in execute\n", + " return NotebookClient(nb=nb, resources=resources, km=km, **kwargs).execute()\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/site-packages/nbclient/util.py\", line 84, in wrapped\n", + " return just_run(coro(*args, **kwargs))\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/site-packages/nbclient/util.py\", line 62, in just_run\n", + " return loop.run_until_complete(coro)\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/site-packages/nest_asyncio.py\", line 81, in run_until_complete\n", + " return f.result()\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/asyncio/futures.py\", line 178, in result\n", + " raise self._exception\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/asyncio/tasks.py\", line 280, in __step\n", + " result = coro.send(None)\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/site-packages/nbclient/client.py\", line 559, in async_execute\n", + " await self.async_execute_cell(\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/site-packages/nbclient/client.py\", line 854, in async_execute_cell\n", + " self._check_raise_for_error(cell, exec_reply)\n", + " File \"/Users/chrisjsewell/Documents/GitHub/jupyter-cache/.tox/py38/lib/python3.8/site-packages/nbclient/client.py\", line 756, in _check_raise_for_error\n", + " raise CellExecutionError.from_cell_and_msg(cell, exec_reply_content)\n", + "nbclient.exceptions.CellExecutionError: An error occurred while executing the following cell:\n", "------------------\n", "raise Exception('oopsie!')\n", "------------------\n", "\n", "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n", "\u001b[0;31mException\u001b[0m Traceback (most recent call last)\n", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", + "\u001b[0;32m/var/folders/t2/xbl15_3n4tsb1vr_ccmmtmbr0000gn/T/ipykernel_1308/340246212.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", "\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'oopsie!'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0m\n", "\u001b[0;31mException\u001b[0m: oopsie!\n", @@ -1198,7 +1221,7 @@ } ], "source": [ - "record = cache.get_staged_record(2)\n", + "record = cache.get_project_record(2)\n", "print(record.traceback)" ] }, @@ -1218,16 +1241,16 @@ 
"name": "stdout", "output_type": "stream", "text": [ - " ID URI Created Assets Cache ID\n", - "---- ------------------------------- ---------------- -------- ----------\n", - " 2 example_nbs/basic_failing.ipynb 2020-03-13 14:21 0\n", - " 1 example_nbs/basic.ipynb 2020-03-13 14:21 0 1\n" + " ID URI Reader Added Status\n", + "---- ------------------------------- -------- ---------------- --------\n", + " 1 example_nbs/basic.ipynb nbformat 2022-01-12 15:17 ✅ [1]\n", + " 2 example_nbs/basic_failing.ipynb nbformat 2022-01-12 15:17 ❌\n" ] } ], "source": [ - "print(tabulate_stage_records(\n", - " cache.list_staged_records(), path_length=2, cache=cache\n", + "print(tabulate_project_records(\n", + " cache.list_project_records(), path_length=2, cache=cache\n", "))" ] }, @@ -1242,7 +1265,7 @@ "text": [ " ID Origin URI Created Accessed Hashkey\n", "---- ------------ ---------------- ---------------- --------------------------------\n", - " 1 basic.ipynb 2020-03-13 14:21 2020-03-13 14:21 818f3412b998fcf4fe9ca3cca11a3fc3\n" + " 1 basic.ipynb 2022-01-12 15:17 2022-01-12 15:17 94c17138f782c75df59e989fffa64e3a\n" ] } ], @@ -1273,10 +1296,13 @@ ], "metadata": { "celltoolbar": "Tags", + "interpreter": { + "hash": "8398b65b1e6feb38b0506d5ab1aedf8bf63748a9844a9c81ed9242850234e24f" + }, "kernelspec": { - "display_name": "Python 3.7.6 64-bit ('mistune': conda)", + "display_name": "Coconut", "language": "python", - "name": "python37664bitmistuneconda77ae93e05d9c4c1eab3d7fc3f8312065" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1288,7 +1314,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/docs/using/cli.md b/docs/using/cli.md index 4e3c412..4313285 100644 --- a/docs/using/cli.md +++ b/docs/using/cli.md @@ -2,28 +2,35 @@ # Command-Line - +Note, you can follow this tutorial by cloning , and running these commands inside it.: +tox +```{jcache-clear} +``` -From the checked-out repository folder: +```{jcache-cli} jupyter_cache.cli.commands.cmd_main:jcache +:args: --help +``` -```console -$ jcache --help -Usage: jcache [OPTIONS] COMMAND [ARGS]... +The first time the cache is required, it will be lazily created: - The command line interface of jupyter-cache. +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: list +:input: y +``` -Options: - -v, --version Show the version and exit. - -p, --cache-path Print the current cache path and exit. - -a, --autocomplete Print the autocompletion command and exit. - -h, --help Show this message and exit. +You can specify the path to the cache, with the `--cache-path` option, +or set the `JUPYTERCACHE` environment variable. -Commands: - cache Commands for adding to and inspecting the cache. - clear Clear the cache completely. - config Commands for configuring the cache. - execute Execute staged notebooks that are outdated. - stage Commands for staging notebooks to be executed. +You can also clear it at any time: + +```{jcache-cli} jupyter_cache.cli.commands.cmd_project:cmnd_project +:command: clear +:input: y +``` + +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: list +:input: y ``` ````{tip} @@ -34,346 +41,238 @@ eval "$(_JCACHE_COMPLETE=source jcache)" ``` ```` -## Caching Executed Notebooks +## Adding notebooks to the project -```console -$ jcache cache --help -Usage: cache [OPTIONS] COMMAND [ARGS]... 
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:args: --help
+```

- Commands for adding to and inspecting the cache.
+A project consists of a set of notebooks to be executed.

-Options:
- --help Show this message and exit.
+When adding notebooks to the project, they are recorded by their URI (e.g. file path),
+i.e. no physical copying takes place until execution time.

-Commands:
- add Cache notebook(s) that have already been executed.
- add-with-artefacts Cache a notebook, with possible artefact files.
- cat-artifact Print the contents of a cached artefact.
- diff-nb Print a diff of a notebook to one stored in the cache.
- list List cached notebook records in the cache.
- remove Remove notebooks stored in the cache.
- show Show details of a cached notebook in the cache.
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: add
+:args: tests/notebooks/basic.ipynb tests/notebooks/basic_failing.ipynb tests/notebooks/basic_unrun.ipynb tests/notebooks/complex_outputs.ipynb tests/notebooks/external_output.ipynb
 ```

-The first time the cache is required, it will be lazily created:
-
-```console
-$ jcache cache list
-Cache path: ../.jupyter_cache
-The cache does not yet exist, do you want to create it? [y/N]: y
-No Cached Notebooks
+You can list the notebooks in the project; at present, none have an existing execution record in the cache:

+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: list
 ```

-You can add notebooks straight into the cache.
-When caching, a check will be made that the notebooks look to have been executed
-correctly, i.e. the cell execution counts go sequentially up from 1.
+You can remove a notebook from the project by its URI or ID:

-```console
-$ jcache cache add tests/notebooks/basic.ipynb
-Caching: ../tests/notebooks/basic.ipynb
-Validity Error: Expected cell 1 to have execution_count 1 not 2
-The notebook may not have been executed, continue caching? [y/N]: y
-Success!
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: remove
+:args: 4
 ```

-Or to skip validation:
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: list
+```

-```console
-$ jcache cache add --no-validate tests/notebooks/basic.ipynb tests/notebooks/basic_failing.ipynb tests/notebooks/basic_unrun.ipynb tests/notebooks/complex_outputs.ipynb tests/notebooks/external_output.ipynb
-Caching: ../tests/notebooks/basic.ipynb
-Caching: ../tests/notebooks/basic_failing.ipynb
-Caching: ../tests/notebooks/basic_unrun.ipynb
-Caching: ../tests/notebooks/complex_outputs.ipynb
-Caching: ../tests/notebooks/external_output.ipynb
-Success!
-```
-
-Once you've cached some notebooks, you can look at the 'cache records'
-for what has been cached.
-
-Each notebook is hashed (code cells and kernel spec only),
-which is used to compare against 'staged' notebooks.
-Multiple hashes for the same URI can be added
-(the URI is just there for inspetion) and the size of the cache is limited
-(current default 1000) so that, at this size,
-the last accessed records begin to be deleted.
-You can remove cached records by their ID.
+or clear all notebooks from the project: -```console -$ jcache cache list - ID Origin URI Created Accessed ----- ------------------------------------- ---------------- ---------------- - 5 tests/notebooks/external_output.ipynb 2020-03-12 17:31 2020-03-12 17:31 - 4 tests/notebooks/complex_outputs.ipynb 2020-03-12 17:31 2020-03-12 17:31 - 3 tests/notebooks/basic_unrun.ipynb 2020-03-12 17:31 2020-03-12 17:31 - 2 tests/notebooks/basic_failing.ipynb 2020-03-12 17:31 2020-03-12 17:31 +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: clear +:input: y ``` -````{tip} -To only show the latest versions of cached notebooks. +## Add a custom reader to read notebook files -```console -$ jcache cache list --latest-only +By default, notebook files are read using the [nbformat reader](https://nbformat.readthedocs.io/en/latest/api.html#nbformat.read). +However, you can also specify a custom reader, defined by an entry point in the `jcache.readers` group. +Included with jupyter_cache is the [jupytext](https://jupytext.readthedocs.io) reader, for formats like MyST Markdown: + +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: add +:args: --reader nbformat tests/notebooks/basic.ipynb tests/notebooks/basic_failing.ipynb ``` -```` -You can also cache notebooks with artefacts -(external outputs of the notebook execution). +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: add +:args: --reader jupytext tests/notebooks/basic.md +``` -```console -$ jcache cache add-with-artefacts -nb tests/notebooks/basic.ipynb tests/notebooks/artifact_folder/artifact.txt -Caching: ../tests/notebooks/basic.ipynb -Validity Error: Expected cell 1 to have execution_count 1 not 2 -The notebook may not have been executed, continue caching? [y/N]: y -Success! +```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook +:command: list ``` -Show a full description of a cached notebook by referring to its ID +:::{important} +To use the `jupytext` reader, you must have the `jupytext` package installed. +::: -```console -$ jcache cache show 6 -ID: 6 -Origin URI: ../tests/notebooks/basic.ipynb -Created: 2020-03-12 17:31 -Accessed: 2020-03-12 17:31 -Hashkey: 818f3412b998fcf4fe9ca3cca11a3fc3 -Artifacts: -- artifact_folder/artifact.txt +## Executing the notebooks + +Simply call the `execute` command, to execute all notebooks in the project that do not have an existing record in the cache. + +Executors are defined by entry points in the `jcache.executors` group. +jupyter-cache includes these executors: + +- `local-serial`: execute notebooks with the working directory set to their path, in serial mode (using a single process). +- `local-parallel`: execute notebooks with the working directory set to their path, in parallel mode (using multiple processes). +- `temp-serial`: execute notebooks with a temporary working directory, in serial mode (using a single process). +- `temp-parallel`: execute notebooks with a temporary working directory, in parallel mode (using multiple processes). 
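For API users, an equivalent programmatic flow might look like the sketch below. Treat it as indicative only: `get_cache` and `load_executor` both appear in this diff, but the call signature for the new executor entry-point names is an assumption extrapolated from the removed `cmd_exec.py`:

```python
import logging

from jupyter_cache import get_cache
from jupyter_cache.executors import load_executor

logger = logging.getLogger(__name__)

# hypothetical cache path; the CLI defaults to ./.jupyter_cache
db = get_cache(".jupyter_cache")

# "local-serial" is one of the executor entry-point names listed above;
# load_executor(...) and run_and_cache(...) follow the removed cmd_exec.py
executor = load_executor("local-serial", db, logger=logger)
result = executor.run_and_cache()  # executes project notebooks missing from the cache
```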
+
+```{jcache-cli} jupyter_cache.cli.commands.cmd_project:cmnd_project
+:command: execute
+:args: --executor local-serial
 ```

-Note artefact paths must be 'upstream' of the notebook folder:
+Successfully executed notebooks will now have a record in the cache, uniquely identified by a hash of their code and metadata content:

-```console
-$ jcache cache add-with-artefacts -nb tests/notebooks/basic.ipynb tests/test_db.py
-Caching: ../tests/notebooks/basic.ipynb
-Artifact Error: Path '../tests/test_db.py' is not in folder '../tests/notebooks''
+```{jcache-cli} jupyter_cache.cli.commands.cmd_cache:cmnd_cache
+:command: list
+:args: --hashkeys
 ```

-To view the contents of an execution artefact:
+These records are then compared to the hashes of notebooks in the project, to find which have up-to-date executions.
+Note here both notebooks share the same cached notebook (denoted by `[1]` in the status):

-```console
-$ jcache cache cat-artifact 6 artifact_folder/artifact.txt
-An artifact
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: list
+```
+
+Next time you execute the project, only notebooks which don't match a cached record will be executed:
+
+```{jcache-cli} jupyter_cache.cli.commands.cmd_project:cmnd_project
+:command: execute
+:args: --executor local-serial -v CRITICAL
 ```

-You can directly remove a cached notebook by its ID:
+You can also `force` all notebooks to be re-executed:

-```console
-$ jcache cache remove 4
-Removing Cache ID = 4
-Success!
+```{jcache-cli} jupyter_cache.cli.commands.cmd_project:cmnd_project
+:command: execute
+:args: --force
 ```

-You can also diff any of the cached notebooks with any (external) notebook:
+If you modify a code cell, the notebook will no longer match a cached notebook. Alternatively, if you wish to re-execute unchanged notebook(s) (for example if the runtime environment has changed), you can remove their records from the cache (keeping the project records):

-```console
-$ jcache cache diff-nb 2 tests/notebooks/basic.ipynb
-nbdiff
---- cached pk=2
-+++ other: ../tests/notebooks/basic.ipynb
-## inserted before nb/cells/0:
-+ code cell:
-+ execution_count: 2
-+ source:
-+ a=1
-+ print(a)
-+ outputs:
-+ output 0:
-+ output_type: stream
-+ name: stdout
-+ text:
-+ 1
-
-## deleted nb/cells/0:
-- code cell:
-- source:
-- raise Exception('oopsie!')
-
-
-Success!
-```
-
-## Staging Notebooks for execution
+```{jcache-cli} jupyter_cache.cli.commands.cmd_cache:cmnd_cache
+:command: clear
+:input: n
+:allow-exception:
+```

-```console
-$ jcache stage --help
-Usage: stage [OPTIONS] COMMAND [ARGS]...
+:::{note}
+The number of notebooks in the cache is limited
+(current default 1000).
+Once this limit is reached, the oldest (last accessed) notebooks begin to be deleted.
+Change this default with `jcache config cache-limit`.
+:::

- Commands for staging notebooks to be executed.
+## Analysing executed/excepted notebooks

-Options:
- --help Show this message and exit.
+You can see the elapsed execution time of a notebook via its ID in the cache:

-Commands:
- add Stage notebook(s) for execution.
- add-with-assets Stage a notebook, with possible asset files.
- list List notebooks staged for possible execution.
- remove-ids Un-stage notebook(s), by ID.
- remove-uris Un-stage notebook(s), by URI.
- show Show details of a staged notebook.
+```{jcache-cli} jupyter_cache.cli.commands.cmd_cache:cmnd_cache
+:command: info
+:args: 1
 ```
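The execution data surfaced by `info` is stored on the cache record itself (the `data` column added in `db.py` below is documented as holding extra data such as the execution time). A minimal sketch, assuming the default cache path:

```python
from jupyter_cache import get_cache

cache = get_cache(".jupyter_cache")  # hypothetical cache path
record = cache.get_cache_record(1)   # ID, as shown by `jcache cache list`
print(record.data)                   # e.g. {"execution_seconds": ...}
```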
-Staged notebooks are recorded as pointers to their URI,
-i.e. no physical copying takes place until execution time.
-
-If you stage some notebooks for execution, then
-you can list them to see which have existing records in the cache (by hash),
-and which will require execution:
+Failed execution tracebacks are also available on the project record:

-```console
-$ jcache stage add tests/notebooks/basic.ipynb tests/notebooks/basic_failing.ipynb tests/notebooks/basic_unrun.ipynb tests/notebooks/complex_outputs.ipynb tests/notebooks/external_output.ipynb
-Staging: ../tests/notebooks/basic.ipynb
-Staging: ../tests/notebooks/basic_failing.ipynb
-Staging: ../tests/notebooks/basic_unrun.ipynb
-Staging: ../tests/notebooks/complex_outputs.ipynb
-Staging: ../tests/notebooks/external_output.ipynb
-Success!
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: info
+:args: --tb tests/notebooks/basic_failing.ipynb
 ```

-```console
-$ jcache stage list
- ID URI Created Assets Cache ID
---- ------------------------------------- ---------------- -------- ----------
- 5 tests/notebooks/external_output.ipynb 2020-03-12 17:31 0 5
- 4 tests/notebooks/complex_outputs.ipynb 2020-03-12 17:31 0
- 3 tests/notebooks/basic_unrun.ipynb 2020-03-12 17:31 0 6
- 2 tests/notebooks/basic_failing.ipynb 2020-03-12 17:31 0 2
- 1 tests/notebooks/basic.ipynb 2020-03-12 17:31 0 6
 ```
+```{tip}
+Code cells can be tagged with `raises-exception` to let the executor know that a cell *may* raise an exception
+(see [this issue on its behaviour](https://github.com/jupyter/nbconvert/issues/730)).
+```

-You can remove a staged notebook by its URI or ID:
+## Retrieving executed notebooks

-```console
-$ jcache stage remove-ids 4
-Unstaging ID: 4
-Success!
-```
+Notebooks added to the project are not modified in any way during or after execution.

-You can then run a basic execution of the required notebooks:
+You can create a new "final" notebook, with the cached outputs merged into the source notebook, using the `merge` command:

-```console
-$ jcache cache remove 6 2
-Removing Cache ID = 6
-Removing Cache ID = 2
-Success!
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: merge
+:args: tests/notebooks/basic.md final_notebook.ipynb
 ```

-```console
-$ jcache execute
-Executing: ../tests/notebooks/basic.ipynb
-Execution Succeeded: ../tests/notebooks/basic.ipynb
-Executing: ../tests/notebooks/basic_failing.ipynb
-error: Execution Failed: ../tests/notebooks/basic_failing.ipynb
-Executing: ../tests/notebooks/basic_unrun.ipynb
-Execution Succeeded: ../tests/notebooks/basic_unrun.ipynb
-Finished! Successfully executed notebooks have been cached.
-succeeded:
-- ../tests/notebooks/basic.ipynb
-- ../tests/notebooks/basic_unrun.ipynb
-excepted:
-- ../tests/notebooks/basic_failing.ipynb
-errored: []
-
-```
-
-Successfully executed notebooks will be cached to the cache,
-along with any 'artefacts' created by the execution,
-that are inside the notebook folder, and data supplied by the executor.
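The same merge is available through the Python API via `merge_match_into_notebook` (declared in the `base.py` changes below); a rough sketch, passing only the required notebook argument, since the method's optional parameters are not shown in this diff:

```python
import nbformat as nbf

from jupyter_cache import get_cache

cache = get_cache(".jupyter_cache")  # hypothetical cache path
nb = nbf.read("tests/notebooks/basic.ipynb", nbf.NO_CONVERT)

# per the base.py docstring: returns the matched cache pk and the input
# notebook with cached code cells and metadata merged in
pk, merged = cache.merge_match_into_notebook(nb)
nbf.write(merged, "final_notebook.ipynb")
```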
+## Invalidating cached notebooks

-```console
-$ jcache stage list
- ID URI Created Assets Cache ID
---- ------------------------------------- ---------------- -------- ----------
- 5 tests/notebooks/external_output.ipynb 2020-03-12 17:31 0 5
- 3 tests/notebooks/basic_unrun.ipynb 2020-03-12 17:31 0 6
- 2 tests/notebooks/basic_failing.ipynb 2020-03-12 17:31 0
- 1 tests/notebooks/basic.ipynb 2020-03-12 17:31 0 6
+If you want to invalidate a notebook's cached execution,
+for example if you have changed the notebook's execution environment,
+you can do so by calling the `invalidate` command:
+
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: invalidate
+:args: tests/notebooks/basic.ipynb
+```
+
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: list
 ```

-Execution data (such as execution time) will be stored in the cache record:
+## Specifying notebooks with assets

-```console
-$ jcache cache show 6
-ID: 6
-Origin URI: ../tests/notebooks/basic_unrun.ipynb
-Created: 2020-03-12 17:31
-Accessed: 2020-03-12 17:31
-Hashkey: 818f3412b998fcf4fe9ca3cca11a3fc3
-Data:
- execution_seconds: 1.0559415130000005
+When executing in a temporary directory, you may want to specify additional "asset" files that also need to be copied to this directory for the notebook to run.

 ```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: remove
+:args: tests/notebooks/basic.ipynb
 ```

-Failed notebooks will not be cached, but the exception traceback will be added to the stage record:
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: add-with-assets
+:args: -nb tests/notebooks/basic.ipynb tests/notebooks/artifact_folder/artifact.txt
+```

-```console
-$ jcache stage show 2
-ID: 2
-URI: ../tests/notebooks/basic_failing.ipynb
-Created: 2020-03-12 17:31
-Failed Last Execution!
-Traceback (most recent call last):
- File "../jupyter_cache/executors/basic.py", line 152, in execute
- executenb(nb_bundle.nb, cwd=tmpdirname)
- File "/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py", line 737, in executenb
- return ep.preprocess(nb, resources, km=km)[0]
- File "/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py", line 405, in preprocess
- nb, resources = super(ExecutePreprocessor, self).preprocess(nb, resources)
- File "/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/base.py", line 69, in preprocess
- nb.cells[index], resources = self.preprocess_cell(cell, resources, index)
- File "/anaconda/envs/mistune/lib/python3.7/site-packages/nbconvert/preprocessors/execute.py", line 448, in preprocess_cell
- raise CellExecutionError.from_cell_and_msg(cell, out)
-nbconvert.preprocessors.execute.CellExecutionError: An error occurred while executing the following cell:
-------------------
-raise Exception('oopsie!')
-------------------
-
----------------------------------------------------------------------------
-Exception Traceback (most recent call last)
- in
-----> 1 raise Exception('oopsie!')
-
-Exception: oopsie!
-Exception: oopsie!
+```{jcache-cli} jupyter_cache.cli.commands.cmd_notebook:cmnd_notebook
+:command: info
+:args: tests/notebooks/basic.ipynb
+```

+## Adding notebooks directly to the cache
+
+Pre-executed notebooks can be added to the cache directly, without executing them.
+
+A check will be made that the notebooks look to have been executed correctly,
+i.e. the cell execution counts go sequentially up from 1.
+
+```{jcache-cli} jupyter_cache.cli.commands.cmd_cache:cmnd_cache
+:command: add
+:args: tests/notebooks/complex_outputs.ipynb
+:input: y
 ```

-```{tip}
-Code cells can be tagged with `raises-exception` to let the executor known that
-a cell *may* raise an exception (see [this issue on its behaviour](https://github.com/jupyter/nbconvert/issues/730)).
+Or to skip the validation:
+
+```{jcache-cli} jupyter_cache.cli.commands.cmd_cache:cmnd_cache
+:command: add
+:args: --no-validate tests/notebooks/external_output.ipynb
 ```

-Once executed you may leave staged notebooks, for later re-execution, or remove them:
+```{jcache-cli} jupyter_cache.cli.commands.cmd_cache:cmnd_cache
+:command: list
+```
+
+:::{tip}
+To only show the latest versions of cached notebooks:

 ```console
-$ jcache stage remove-ids --all
-Are you sure you want to remove all? [y/N]: y
-Unstaging ID: 1
-Unstaging ID: 2
-Unstaging ID: 3
-Unstaging ID: 5
-Success!
+$ jcache cache list --latest-only
 ```

-You can also stage notebooks with assets;
-external files that are required by the notebook during execution.
-As with artefacts, these files must be in the same folder as the notebook,
-or a sub-folder.
+:::

-```console
-$ jcache stage add-with-assets -nb tests/notebooks/basic.ipynb tests/notebooks/artifact_folder/artifact.txt
-Success!
+## Diffing notebooks
+
+You can diff any of the cached notebooks with any (external) notebook:
+
+```{warning}
+This requires `pip install nbdime`
 ```

-```console
-$ jcache stage show 1
-ID: 1
-URI: ../tests/notebooks/basic.ipynb
-Created: 2020-03-12 17:31
-Cache ID: 6
-Assets:
-- ../tests/notebooks/artifact_folder/artifact.txt
+```{jcache-cli} jupyter_cache.cli.commands.cmd_cache:cmnd_cache
+:command: diff
+:args: 1 tests/notebooks/basic_unrun.ipynb
 ```
diff --git a/docs/using/images/execution_flow.pptx b/docs/using/images/execution_flow.pptx
index 4d6108f..f4aad67 100644
Binary files a/docs/using/images/execution_flow.pptx and b/docs/using/images/execution_flow.pptx differ
diff --git a/docs/using/images/execution_process.svg b/docs/using/images/execution_process.svg
index 680c219..31fa770 100644
--- a/docs/using/images/execution_process.svg
+++ b/docs/using/images/execution_process.svg
@@ -1,743 +1,969 @@
[SVG markup diff elided: the redrawn execution_process.svg diagram shows a PROJECT FOLDER (Notebook 1, Notebook 2) and a PROJECT DATABASE recording each notebook's URI; an EXECUTOR whose EXECUTION steps read: 1. Get notebook path from database; 2. Get actual notebook from project folder; 3. Check if notebook already exists in the cache (via hash); 4. If not, read notebook and execute; 5. If successful, write executed notebook to the cache; and a CACHE FOLDER of COMMITTED NOTEBOOKS (Notebook A, Notebook B), keyed by HASH.]
diff --git a/jupyter_cache/base.py b/jupyter_cache/base.py
index dae26ac..a033fbf 100644
--- a/jupyter_cache/base.py
+++ b/jupyter_cache/base.py
@@ -3,17 +3,18 @@
 API access to the cache should use this interface,
 with no assumptions about the backend storage/retrieval mechanisms.
""" -import io from abc import ABC, abstractmethod +import io from pathlib import Path -from typing import Callable, Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Mapping, Optional, Tuple, Union import attr -import nbformat as nbf from attr.validators import instance_of, optional +import nbformat as nbf # TODO make these abstract -from jupyter_cache.cache.db import NbCacheRecord, NbStageRecord +from jupyter_cache.cache.db import NbCacheRecord, NbProjectRecord +from jupyter_cache.readers import DEFAULT_READ_DATA NB_VERSION = 4 @@ -37,6 +38,30 @@ def __init__(self, message, nb_bundle, *args, **kwargs): super().__init__(message, *args, **kwargs) +@attr.s(frozen=True, slots=True) +class ProjectNb: + """A notebook read from a project""" + + pk: int = attr.ib( + validator=instance_of(int), + metadata={"help": "the ID of the notebook"}, + ) + uri: str = attr.ib( + converter=str, + validator=instance_of(str), + metadata={"help": "the URI of the notebook"}, + ) + nb: nbf.NotebookNode = attr.ib( + validator=instance_of(nbf.NotebookNode), + repr=lambda nb: f"Notebook(cells={len(nb.cells)})", + metadata={"help": "the notebook"}, + ) + assets: List[Path] = attr.ib( + factory=list, + metadata={"help": "File paths required to run the notebook"}, + ) + + class NbArtifactsAbstract(ABC): """Container for artefacts of a notebook execution.""" @@ -44,26 +69,22 @@ class NbArtifactsAbstract(ABC): @abstractmethod def relative_paths(self) -> List[Path]: """Return the list of paths (relative to the notebook folder).""" - pass @abstractmethod def __iter__(self) -> Iterable[Tuple[Path, io.BufferedReader]]: """Yield the relative path and open files (in bytes mode)""" - pass def __repr__(self): - return "{0}(paths={1})".format( - self.__class__.__name__, len(self.relative_paths) - ) + return f"{self.__class__.__name__}(paths={len(self.relative_paths)})" @attr.s(frozen=True, slots=True) -class NbBundleIn: +class CacheBundleIn: """A container for notebooks and their associated data to cache.""" nb: nbf.NotebookNode = attr.ib( validator=instance_of(nbf.NotebookNode), - repr=lambda nb: "Notebook(cells={0})".format(len(nb.cells)), + repr=lambda nb: f"Notebook(cells={len(nb.cells)})", metadata={"help": "the notebook"}, ) uri: str = attr.ib( @@ -91,12 +112,12 @@ class NbBundleIn: @attr.s(frozen=True, slots=True) -class NbBundleOut: +class CacheBundleOut: """A container for notebooks and their associated data that have been cached.""" nb: nbf.NotebookNode = attr.ib( validator=instance_of(nbf.NotebookNode), - repr=lambda nb: "Notebook(cells={0})".format(len(nb.cells)), + repr=lambda nb: f"Notebook(cells={len(nb.cells)})", metadata={"help": "the notebook"}, ) record: NbCacheRecord = attr.ib(metadata={"help": "the cache record"}) @@ -107,16 +128,25 @@ class NbBundleOut: class JupyterCacheAbstract(ABC): - """An abstract cache for storing pre/post executed notebooks.""" + """An abstract cache for storing pre/post executed notebooks. + + Note: class instances should be pickleable. + """ + + @abstractmethod + def get_version(self) -> Optional[str]: + """Return the version of the cache.""" @abstractmethod - def clear_cache(self): + def clear_cache(self) -> None: """Clear the cache completely.""" - pass @abstractmethod def cache_notebook_bundle( - self, bundle: NbBundleIn, check_validity: bool = True, overwrite: bool = False + self, + bundle: CacheBundleIn, + check_validity: bool = True, + overwrite: bool = False, ) -> NbCacheRecord: """Commit an executed notebook, returning its cache record. 
@@ -128,7 +158,6 @@ def cache_notebook_bundle( :param overwrite: Allow overwrite of cache with matching hash :return: The primary key of the cache """ - pass @abstractmethod def cache_notebook_file( @@ -154,21 +183,18 @@ def cache_notebook_file( :param overwrite: Allow overwrite of cache with matching hash :return: The primary key of the cache """ - pass @abstractmethod def list_cache_records(self) -> List[NbCacheRecord]: """Return a list of cached notebook records.""" - pass + @abstractmethod def get_cache_record(self, pk: int) -> NbCacheRecord: """Return the record of a cache, by its primary key""" - pass @abstractmethod - def get_cache_bundle(self, pk: int) -> NbBundleOut: + def get_cache_bundle(self, pk: int) -> CacheBundleOut: """Return an executed notebook bundle, by its primary key""" - pass @abstractmethod def cache_artefacts_temppath(self, pk: int) -> Path: @@ -180,7 +206,6 @@ def cache_artefacts_temppath(self, pk: int) -> Path: with cache.cache_artefacts_temppath(1) as path: shutil.copytree(path, destination) """ - pass @abstractmethod def match_cache_notebook(self, nb: nbf.NotebookNode) -> NbCacheRecord: @@ -188,7 +213,6 @@ def match_cache_notebook(self, nb: nbf.NotebookNode) -> NbCacheRecord: :raises KeyError: if no match is found """ - pass def match_cache_file(self, path: str) -> NbCacheRecord: """Match to an executed notebook, returning its primary key. @@ -213,7 +237,6 @@ def merge_match_into_notebook( :raises KeyError: if no match is found :return: pk, input notebook with cached code cells and metadata merged. """ - pass def merge_match_into_file( self, @@ -240,7 +263,6 @@ def diff_nbnode_with_cache( Note: this will not diff markdown content, since it is not stored in the cache. """ - pass def diff_nbfile_with_cache( self, pk: int, path: str, as_str=False, **kwargs @@ -253,65 +275,57 @@ def diff_nbfile_with_cache( return self.diff_nbnode_with_cache(pk, nb, uri=path, as_str=as_str, **kwargs) @abstractmethod - def stage_notebook_file(self, uri: str, assets: List[str] = ()) -> NbStageRecord: - """Stage a single notebook for execution. + def add_nb_to_project( + self, + uri: str, + *, + read_data: Mapping = DEFAULT_READ_DATA, + assets: List[str] = (), + ) -> NbProjectRecord: + """Add a single notebook to the project. :param uri: The path to the file + :param read_data: Data to generate a function, to read the uri and return a NotebookNode :param assets: The path of files required by the notebook to run. :raises ValueError: assets not within the same folder as the notebook URI. 
""" - pass @abstractmethod - def discard_staged_notebook(self, uri_or_pk: Union[int, str]): - """Discard a staged notebook.""" - pass + def remove_nb_from_project(self, uri_or_pk: Union[int, str]): + """Remove a notebook from the project.""" @abstractmethod - def list_staged_records(self) -> List[NbStageRecord]: - """list staged notebook URI's in the cache.""" - pass + def list_project_records( + self, + filter_uris: Optional[List[str]] = None, + filter_pks: Optional[List[int]] = None, + ) -> List[NbProjectRecord]: + """Return a list of all notebook records in the project.""" @abstractmethod - def get_staged_record(self, uri_or_pk: Union[int, str]) -> NbStageRecord: - """Return the record of a staged notebook, by its primary key or URI.""" - pass + def get_project_record(self, uri_or_pk: Union[int, str]) -> NbProjectRecord: + """Return the record of a notebook in the project, by its primary key or URI.""" @abstractmethod - def get_staged_notebook( - self, uri_or_pk: Union[int, str], converter: Optional[Callable] = None - ) -> NbBundleIn: - """Return a single staged notebook, by its primary key or URI. + def get_project_notebook(self, uri_or_pk: Union[int, str]) -> ProjectNb: + """Return a single notebook in the project, by its primary key or URI. - :param converter: An optional converter for staged notebooks, - which takes the URI and returns a notebook node (default nbformat.read) + :raises NbReadError: if the notebook cannot be read """ - pass @abstractmethod - def get_cache_record_of_staged( - self, uri_or_pk: Union[int, str], converter: Optional[Callable] = None + def get_cached_project_nb( + self, uri_or_pk: Union[int, str] ) -> Optional[NbCacheRecord]: - pass - - @abstractmethod - def list_staged_unexecuted( - self, converter: Optional[Callable] = None - ) -> List[NbStageRecord]: - """List staged notebooks, whose hash is not present in the cache. + """Get cache record for a notebook in the project. - :param converter: An optional converter for staged notebooks, - which takes the URI and returns a notebook node (default nbformat.read) + :param uri_or_pk: The URI of pk of the file in the project """ - pass - - # removed until defined use case - # @abstractmethod - # def get_cache_codecell(self, pk: int, index: int) -> nbf.NotebookNode: - # """Return a code cell from a cached notebook. - - # NOTE: the index **only** refers to the list of code cells, e.g. 
- # `[codecell_0, textcell_1, codecell_2]` - # would map {0: codecell_0, 1: codecell_2} - # """ - # pass + + @abstractmethod + def list_unexecuted( + self, + filter_uris: Optional[List[str]] = None, + filter_pks: Optional[List[int]] = None, + ) -> List[NbProjectRecord]: + """List notebooks in the project, whose hash is not present in the cache.""" diff --git a/jupyter_cache/cache/db.py b/jupyter_cache/cache/db.py index 4ec08e0..3217a19 100644 --- a/jupyter_cache/cache/db.py +++ b/jupyter_cache/cache/db.py @@ -1,33 +1,62 @@ -import os from contextlib import contextmanager from datetime import datetime +import os from pathlib import Path -from typing import List, Optional +from typing import Any, Dict, List, Optional, Union from sqlalchemy import JSON, Column, DateTime, Integer, String, Text from sqlalchemy.engine import Engine, create_engine -from sqlalchemy.exc import IntegrityError +from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import Session, sessionmaker, validates from sqlalchemy.sql.expression import desc +from jupyter_cache import __version__ from jupyter_cache.utils import shorten_path OrmBase = declarative_base() +DB_NAME = "global.db" + +# version changes: +# 0.5.0: +# - __version__.txt file written to cache on creation +# - table: nbstage -> nbproject +# - added read_data and exec_data fields to nbproject + +def create_db(path: Union[str, Path]) -> Engine: + """Get or create a database at the given path. + + :param path: The path to the cache folder. + """ + exists = (Path(path) / DB_NAME).exists() + engine = create_engine(f"sqlite:///{os.path.join(path, DB_NAME)}") + if not exists: + # add all the tables, and a version identifier + OrmBase.metadata.create_all(engine) + Path(path).joinpath("__version__.txt").write_text(__version__) -def create_db(path, name="global.db") -> Engine: - engine = create_engine("sqlite:///{}".format(os.path.join(path, name))) - OrmBase.metadata.create_all(engine) return engine +def get_version(path: Union[str, Path]) -> Optional[str]: + """Attempt to get the version of the cache.""" + version_file = Path(path).joinpath("__version__.txt") + if version_file.exists(): + return version_file.read_text().strip() + + @contextmanager def session_context(engine: Engine): """Open a connection to the database.""" session = sessionmaker(bind=engine)() try: yield session + except OperationalError as exc: + session.rollback() + raise RuntimeError( + "Unexpected error accessing jupyter cache, it may need to be cleared." 
+        ) from exc
     except Exception:
         session.rollback()
         raise
@@ -45,7 +74,7 @@ class Setting(OrmBase):
     value = Column(JSON())

     def __repr__(self):
-        return "{0}(pk={1},{2}={3})".format(
+        return "{}(pk={},{}={})".format(
             self.__class__.__name__, self.pk, self.key, self.value
         )
@@ -68,10 +97,9 @@ def get_value(key: str, db: Engine, default=None):
         result = session.query(Setting.value).filter_by(key=key).one_or_none()
         if result is None:
             if default is not None:
-                Setting.set_value(key, default, db)
                 result = [default]
             else:
-                raise KeyError("Setting not found in DB: {}".format(key))
+                raise KeyError(f"Setting not found in DB: {key}")
         value = result[0]
     return value
@@ -82,6 +110,168 @@ def get_dict(db: Engine) -> dict:
         return {k: v for k, v in results}

+class NbProjectRecord(OrmBase):
+    """A record of a notebook within the project."""
+
+    __tablename__ = "nbproject"
+
+    pk = Column(Integer(), primary_key=True)
+    uri = Column(String(255), nullable=False, unique=True)
+    read_data = Column(JSON(), nullable=False)
+    """Data on how to read the uri to a notebook."""
+    assets = Column(JSON(), nullable=False, default=list)
+    """A list of file assets required for the notebook to run."""
+    exec_data = Column(JSON(), nullable=True)
+    """Data on how to execute the notebook."""
+    created = Column(DateTime, nullable=False, default=datetime.utcnow)
+    traceback = Column(Text(), nullable=True, default="")
+    """A traceback is added if a notebook fails to execute fully."""
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(pk={self.pk})"
+
+    def to_dict(self):
+        return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}
+
+    def format_dict(
+        self,
+        cache_record: Optional["NbCacheRecord"] = None,
+        path_length: Optional[int] = None,
+        assets: bool = True,
+        read_error: Optional[str] = None,
+        read_name: bool = True,
+    ) -> dict:
+        """Return data for display."""
+        status = "-"
+        if cache_record:
+            status = f"✅ [{cache_record.pk}]"
+        elif self.traceback:
+            status = "❌"
+        elif read_error:
+            status = "❗️ (unreadable)"
+        data = {
+            "ID": self.pk,
+            "URI": str(shorten_path(self.uri, path_length)),
+            "Reader": self.read_data.get("name", "-") if read_name else self.read_data,
+            "Added": self.created.isoformat(" ", "minutes"),
+            "Status": status,
+        }
+        if assets:
+            data["Assets"] = len(self.assets)
+        return data
+
+    @validates("read_data")
+    def validate_read_data(self, key, value):
+        if not isinstance(value, dict):
+            raise ValueError("read_data must be a dict")
+        if "name" not in value:
+            raise ValueError("read_data must have a name")
+        return value
+
+    @validates("assets")
+    def validator_assets(self, key, value):
+        return self.validate_assets(value)
+
+    @staticmethod
+    def validate_assets(paths, uri=None):
+        """Validate asset paths are within same folder as the notebook URI"""
+        if not (
+            isinstance(paths, (list, tuple)) and all(isinstance(v, str) for v in paths)
+        ):
+            raise TypeError(f"assets must be iterable of strings: {paths}")
+        if uri is None:
+            return list(paths)
+
+        uri_folder = Path(uri).parent
+        for path in paths:
+            try:
+                Path(path).relative_to(uri_folder)
+            except ValueError:
+                raise ValueError(f"Asset '{path}' is not in folder '{uri_folder}'")
+        return list(paths)
+
+    @staticmethod
+    def create_record(
+        uri: str,
+        db: Engine,
+        read_data: Dict[str, Any],
+        raise_on_exists=True,
+        *,
+        assets=(),
+    ) -> "NbProjectRecord":
+        assets = NbProjectRecord.validate_assets(assets, uri)
+        with session_context(db) as session:  # type: Session
+            record = NbProjectRecord(uri=uri,
read_data=read_data, assets=assets) + session.add(record) + try: + session.commit() + except IntegrityError: + if raise_on_exists: + raise ValueError(f"URI already in project: {uri}") + return NbProjectRecord.record_from_uri(uri, db) + session.refresh(record) + session.expunge(record) + return record + + def remove_pks(pks: List[int], db: Engine): + with session_context(db) as session: # type: Session + session.query(NbProjectRecord).filter(NbProjectRecord.pk.in_(pks)).delete( + synchronize_session=False + ) + session.commit() + + def remove_uris(uris: List[str], db: Engine): + with session_context(db) as session: # type: Session + session.query(NbProjectRecord).filter(NbProjectRecord.uri.in_(uris)).delete( + synchronize_session=False + ) + session.commit() + + @staticmethod + def record_from_pk(pk: int, db: Engine) -> "NbProjectRecord": + with session_context(db) as session: # type: Session + result = session.query(NbProjectRecord).filter_by(pk=pk).one_or_none() + if result is None: + raise KeyError(f"Project record not found for NB with PK: {pk}") + session.expunge(result) + return result + + @staticmethod + def record_from_uri(uri: str, db: Engine) -> "NbProjectRecord": + with session_context(db) as session: # type: Session + result = session.query(NbProjectRecord).filter_by(uri=uri).one_or_none() + if result is None: + raise KeyError(f"Project record not found for NB with URI: {uri}") + session.expunge(result) + return result + + @staticmethod + def records_all(db: Engine) -> "NbProjectRecord": + with session_context(db) as session: # type: Session + results = session.query(NbProjectRecord).order_by(NbProjectRecord.pk).all() + session.expunge_all() + return results + + def remove_tracebacks(pks, db: Engine): + """Remove all tracebacks.""" + with session_context(db) as session: # type: Session + session.query(NbProjectRecord).filter(NbProjectRecord.pk.in_(pks)).update( + {NbProjectRecord.traceback: None}, synchronize_session=False + ) + session.commit() + + def set_traceback(uri: str, traceback: Optional[str], db: Engine): + with session_context(db) as session: # type: Session + result = session.query(NbProjectRecord).filter_by(uri=uri).one_or_none() + if result is None: + raise KeyError(f"Project record not found for NB with URI: {uri}") + result.traceback = traceback + try: + session.commit() + except IntegrityError: + raise TypeError(traceback) + + class NbCacheRecord(OrmBase): """A record of an executed notebook cache.""" @@ -92,13 +282,14 @@ class NbCacheRecord(OrmBase): uri = Column(String(255), nullable=False, unique=False) description = Column(String(255), nullable=False, default="") data = Column(JSON()) + """Extra data, such as the execution time.""" created = Column(DateTime, nullable=False, default=datetime.utcnow) accessed = Column( DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow ) def __repr__(self): - return "{0}(pk={1})".format(self.__class__.__name__, self.pk) + return f"{self.__class__.__name__}(pk={self.pk})" def to_dict(self): return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} @@ -133,6 +324,12 @@ def create_record(uri: str, hashkey: str, db: Engine, **kwargs) -> "NbCacheRecor session.expunge(record) return record + def remove_record(pk: int, db: Engine): + with session_context(db) as session: # type: Session + record = session.get(NbCacheRecord, pk) + session.delete(record) + session.commit() + def remove_records(pks: List[int], db: Engine): with session_context(db) as session: # type: Session 
session.query(NbCacheRecord).filter(NbCacheRecord.pk.in_(pks)).delete( @@ -147,9 +344,7 @@ def record_from_hashkey(hashkey: str, db: Engine) -> "NbCacheRecord": session.query(NbCacheRecord).filter_by(hashkey=hashkey).one_or_none() ) if result is None: - raise KeyError( - "Cache record not found for NB with hashkey: {}".format(hashkey) - ) + raise KeyError(f"Cache record not found for NB with hashkey: {hashkey}") session.expunge(result) return result @@ -158,7 +353,7 @@ def record_from_pk(pk: int, db: Engine) -> "NbCacheRecord": with session_context(db) as session: # type: Session result = session.query(NbCacheRecord).filter_by(pk=pk).one_or_none() if result is None: - raise KeyError("Cache record not found for NB with PK: {}".format(pk)) + raise KeyError(f"Cache record not found for NB with PK: {pk}") session.expunge(result) return result @@ -167,7 +362,7 @@ def touch(pk, db: Engine): with session_context(db) as session: # type: Session record = session.query(NbCacheRecord).filter_by(pk=pk).one_or_none() if record is None: - raise KeyError("Cache record not found for NB with PK: {}".format(pk)) + raise KeyError(f"Cache record not found for NB with PK: {pk}") record.accessed = datetime.utcnow() session.commit() @@ -178,9 +373,7 @@ def touch_hashkey(hashkey, db: Engine): session.query(NbCacheRecord).filter_by(hashkey=hashkey).one_or_none() ) if record is None: - raise KeyError( - "Cache record not found for NB with hashkey: {}".format(hashkey) - ) + raise KeyError(f"Cache record not found for NB with hashkey: {hashkey}") record.accessed = datetime.utcnow() session.commit() @@ -215,135 +408,3 @@ def records_to_delete(keep: int, db: Engine) -> List[int]: .all() ] return pks_to_delete - - -class NbStageRecord(OrmBase): - """A record of a notebook staged for execution.""" - - __tablename__ = "nbstage" - - pk = Column(Integer(), primary_key=True) - uri = Column(String(255), nullable=False, unique=True) - assets = Column(JSON(), nullable=False, default=list) - traceback = Column(Text(), nullable=True, default="") - created = Column(DateTime, nullable=False, default=datetime.utcnow) - - def __repr__(self): - return "{0}(pk={1})".format(self.__class__.__name__, self.pk) - - def to_dict(self): - return {k: v for k, v in self.__dict__.items() if not k.startswith("_")} - - def format_dict(self, cache_record=None, path_length=None, assets=True): - data = { - "ID": self.pk, - "URI": str(shorten_path(self.uri, path_length)), - "Created": self.created.isoformat(" ", "minutes"), - } - if assets: - data["Assets"] = len(self.assets) - if cache_record is not None: - data["Cache ID"] = cache_record.pk - return data - - @validates("assets") - def validator_assets(self, key, value): - return self.validate_assets(value) - - @staticmethod - def validate_assets(paths, uri=None): - """Validate asset paths are within same folder as the notebook URI""" - if not ( - isinstance(paths, (list, tuple)) and all(isinstance(v, str) for v in paths) - ): - raise TypeError(f"assets must be interable of strings: {paths}") - if uri is None: - return list(paths) - - uri_folder = Path(uri).parent - for path in paths: - try: - Path(path).relative_to(uri_folder) - except ValueError: - raise ValueError(f"Asset '{path}' is not in folder '{uri_folder}''") - return list(paths) - - @staticmethod - def create_record( - uri: str, db: Engine, raise_on_exists=True, assets=() - ) -> "NbStageRecord": - assets = NbStageRecord.validate_assets(assets, uri) - with session_context(db) as session: # type: Session - record = NbStageRecord(uri=uri, 
assets=assets) - session.add(record) - try: - session.commit() - except IntegrityError: - if raise_on_exists: - raise ValueError(f"uri already staged: {uri}") - return NbStageRecord.record_from_uri(uri, db) - session.refresh(record) - session.expunge(record) - return record - - def remove_pks(pks: List[int], db: Engine): - with session_context(db) as session: # type: Session - session.query(NbStageRecord).filter(NbStageRecord.pk.in_(pks)).delete( - synchronize_session=False - ) - session.commit() - - def remove_uris(uris: List[str], db: Engine): - with session_context(db) as session: # type: Session - session.query(NbStageRecord).filter(NbStageRecord.uri.in_(uris)).delete( - synchronize_session=False - ) - session.commit() - - @staticmethod - def record_from_pk(pk: int, db: Engine) -> "NbStageRecord": - with session_context(db) as session: # type: Session - result = session.query(NbStageRecord).filter_by(pk=pk).one_or_none() - if result is None: - raise KeyError("Staging record not found for NB with PK: {}".format(pk)) - session.expunge(result) - return result - - @staticmethod - def record_from_uri(uri: str, db: Engine) -> "NbStageRecord": - with session_context(db) as session: # type: Session - result = session.query(NbStageRecord).filter_by(uri=uri).one_or_none() - if result is None: - raise KeyError( - "Staging record not found for NB with URI: {}".format(uri) - ) - session.expunge(result) - return result - - @staticmethod - def records_all(db: Engine) -> "NbStageRecord": - with session_context(db) as session: # type: Session - results = session.query(NbStageRecord).all() - session.expunge_all() - return results - - def remove_tracebacks(pks, db: Engine): - """Remove all tracebacks.""" - with session_context(db) as session: # type: Session - session.query(NbStageRecord).filter(NbStageRecord.pk.in_(pks)).update( - {NbStageRecord.traceback: None}, synchronize_session=False - ) - session.commit() - - def set_traceback(uri: str, traceback: Optional[str], db: Engine): - with session_context(db) as session: # type: Session - result = session.query(NbStageRecord).filter_by(uri=uri).one_or_none() - if result is None: - raise KeyError( - "Staging record not found for NB with URI: {}".format(uri) - ) - result.traceback = traceback - try: - session.commit() - except IntegrityError: - raise TypeError(traceback) diff --git a/jupyter_cache/cache/main.py b/jupyter_cache/cache/main.py index 8681d7d..ea63dbe 100644 --- a/jupyter_cache/cache/main.py +++ b/jupyter_cache/cache/main.py @@ -1,26 +1,28 @@ +from contextlib import contextmanager import copy import hashlib import io -import shutil -from contextlib import contextmanager from pathlib import Path -from typing import Callable, Iterable, List, Optional, Tuple, Union +import shutil +from typing import Iterable, List, Mapping, Optional, Tuple, Union import nbformat as nbf from jupyter_cache.base import ( # noqa: F401 NB_VERSION, + CacheBundleIn, + CacheBundleOut, CachingError, JupyterCacheAbstract, NbArtifactsAbstract, - NbBundleIn, - NbBundleOut, NbValidityError, + ProjectNb, RetrievalError, ) +from jupyter_cache.readers import DEFAULT_READ_DATA, NbReadError, get_reader from jupyter_cache.utils import to_relative_paths -from .db import NbCacheRecord, NbStageRecord, Setting, create_db +from .db import NbCacheRecord, NbProjectRecord, Setting, create_db, get_version CACHE_LIMIT_KEY = "cache_limit" DEFAULT_CACHE_LIMIT = 1000 @@ -72,7 +74,7 @@ def db(self): return self._db def __repr__(self): - return "{0}({1})".format(self.__class__.__name__, 
repr(str(self._path))) + return f"{self.__class__.__name__}({repr(str(self._path))})" def __getstate__(self): """For pickling instances, db must be removed.""" @@ -80,6 +82,9 @@ def __getstate__(self): state["_db"] = None return state + def get_version(self) -> Optional[str]: + return get_version(self.path) + def clear_cache(self): """Clear the cache completely.""" shutil.rmtree(self.path) @@ -89,7 +94,7 @@ def _get_notebook_path_cache(self, hashkey, raise_on_missing=False) -> Path: """Retrieve a relative path in the cache to a notebook, from its hash.""" path = self.path.joinpath(Path("executed", hashkey, "base.ipynb")) if not path.exists() and raise_on_missing: - raise RetrievalError("hashkey not in cache: {}".format(hashkey)) + raise RetrievalError(f"hashkey not in cache: {hashkey}") return path def _get_artifact_path_cache(self, hashkey) -> Path: @@ -171,7 +176,7 @@ def create_hashed_notebook( return (nb, hash_string) - def _validate_nb_bundle(self, nb_bundle: NbBundleIn): + def _validate_nb_bundle(self, nb_bundle: CacheBundleIn): """Validate that a notebook bundle should be cached. We check that the notebook has been executed correctly, @@ -194,7 +199,7 @@ def _validate_nb_bundle(self, nb_bundle: NbBundleIn): def cache_notebook_bundle( self, - bundle: NbBundleIn, + bundle: CacheBundleIn, check_validity: bool = True, overwrite: bool = False, description="", @@ -214,9 +219,13 @@ def cache_notebook_bundle( "Notebook already exists in cache and overwrite=False." ) shutil.rmtree(path.parent) + + try: record = NbCacheRecord.record_from_hashkey(hashkey, self.db) - # TODO record should be changed rather than deleted? - NbCacheRecord.remove_records([record.pk], self.db) + except KeyError: + pass + else: + NbCacheRecord.remove_record(record.pk, self.db) record = NbCacheRecord.create_record( uri=bundle.uri, @@ -266,7 +275,7 @@ def cache_notebook_file( """ notebook = nbf.read(str(path), nbf.NO_CONVERT) return self.cache_notebook_bundle( - NbBundleIn( + CacheBundleIn( notebook, uri or str(path), artifacts=NbArtifacts(artifacts, in_folder=Path(path).parent), @@ -282,17 +291,15 @@ def list_cache_records(self) -> List[NbCacheRecord]: def get_cache_record(self, pk: int) -> NbCacheRecord: return NbCacheRecord.record_from_pk(pk, self.db) - def get_cache_bundle(self, pk: int) -> NbBundleOut: + def get_cache_bundle(self, pk: int) -> CacheBundleOut: record = NbCacheRecord.record_from_pk(pk, self.db) NbCacheRecord.touch(pk, self.db) path = self._get_notebook_path_cache(record.hashkey) artifact_folder = self._get_artifact_path_cache(record.hashkey) if not path.exists(): - raise KeyError( - "Notebook file does not exist for cache record PK: {}".format(pk) - ) + raise KeyError(f"Notebook file does not exist for cache record PK: {pk}") - return NbBundleOut( + return CacheBundleOut( nbf.reads(path.read_text(encoding="utf8"), nbf.NO_CONVERT), record=record, artifacts=NbArtifacts( @@ -318,9 +325,7 @@ def remove_cache(self, pk: int): record = NbCacheRecord.record_from_pk(pk, self.db) path = self._get_notebook_path_cache(record.hashkey) if not path.exists(): - raise KeyError( - "Notebook file does not exist for cache record PK: {}".format(pk) - ) + raise KeyError(f"Notebook file does not exist for cache record PK: {pk}") shutil.rmtree(path.parent) NbCacheRecord.remove_records([pk], self.db) @@ -382,7 +387,12 @@ def diff_nbnode_with_cache( Note: this will not diff markdown content, since it is not stored in the cache. 
""" - import nbdime + try: + import nbdime + except ImportError: + raise ImportError( + "nbdime is required to diff notebooks, install with `pip install nbdime`" + ) from nbdime.prettyprint import PrettyPrintConfig, pretty_print_diff cached_nb = self.get_cache_bundle(pk).nb @@ -398,75 +408,91 @@ def diff_nbnode_with_cache( ) return stream.getvalue() - def stage_notebook_file(self, path: str, assets=()) -> NbStageRecord: - """Stage a single notebook for execution. - - :param uri: The path to the file - :param assets: The path of files required by the notebook to run. - These must be within the same folder as the notebook. - """ - return NbStageRecord.create_record( - str(Path(path).absolute()), self.db, raise_on_exists=False, assets=assets + def add_nb_to_project( + self, + path: str, + *, + read_data: Mapping = DEFAULT_READ_DATA, + assets: List[str] = (), + ) -> NbProjectRecord: + # check the reader can be loaded + read_data = dict(read_data) + _ = get_reader(read_data) + # TODO should we test that the file can be read by the reader? + return NbProjectRecord.create_record( + str(Path(path).absolute()), + self.db, + raise_on_exists=False, + read_data=read_data, + assets=assets, ) # TODO physically copy to cache? # TODO assets - def list_staged_records(self) -> List[NbStageRecord]: - return NbStageRecord.records_all(self.db) + def list_project_records( + self, + filter_uris: Optional[List[str]] = None, + filter_pks: Optional[List[int]] = None, + ) -> List[NbProjectRecord]: + records = NbProjectRecord.records_all(self.db) + if filter_uris is not None: + records = [r for r in records if r.uri in filter_uris] + if filter_pks is not None: + records = [r for r in records if r.pk in filter_pks] + return records - def get_staged_record(self, uri_or_pk: Union[int, str]) -> NbStageRecord: + def get_project_record(self, uri_or_pk: Union[int, str]) -> NbProjectRecord: if isinstance(uri_or_pk, int): - record = NbStageRecord.record_from_pk(uri_or_pk, self.db) + record = NbProjectRecord.record_from_pk(uri_or_pk, self.db) else: - record = NbStageRecord.record_from_uri(uri_or_pk, self.db) + record = NbProjectRecord.record_from_uri(uri_or_pk, self.db) return record - def discard_staged_notebook(self, uri_or_pk: Union[int, str]): - """Discard a staged notebook.""" + def remove_nb_from_project(self, uri_or_pk: Union[int, str]): if isinstance(uri_or_pk, int): - NbStageRecord.remove_pks([uri_or_pk], self.db) + NbProjectRecord.remove_pks([uri_or_pk], self.db) else: - NbStageRecord.remove_uris([uri_or_pk], self.db) + NbProjectRecord.remove_uris([uri_or_pk], self.db) - # TODO add discard all/multiple staged records method + # TODO add discard all/multiple project records method - def get_staged_notebook( - self, uri_or_pk: Union[int, str], converter: Optional[Callable] = None - ) -> NbBundleIn: - """Return a single staged notebook.""" + def get_project_notebook(self, uri_or_pk: Union[int, str]) -> ProjectNb: if isinstance(uri_or_pk, int): - uri_or_pk = NbStageRecord.record_from_pk(uri_or_pk, self.db).uri - if not Path(uri_or_pk).exists(): - raise IOError( - "The URI of the staged record no longer exists: {}".format(uri_or_pk) - ) - if converter is None: - notebook = nbf.read(uri_or_pk, nbf.NO_CONVERT) + record = NbProjectRecord.record_from_pk(uri_or_pk, self.db) else: - notebook = converter(uri_or_pk) - return NbBundleIn(notebook, uri_or_pk) - - def get_cache_record_of_staged( - self, uri_or_pk: Union[int, str], converter: Optional[Callable] = None + record = NbProjectRecord.record_from_uri(uri_or_pk, self.db) + if 
not Path(record.uri).exists(): + raise OSError( + f"The URI of the project record no longer exists: {record.uri}" + ) + try: + reader = get_reader(record.read_data) + notebook = reader(record.uri) + assert isinstance( + notebook, nbf.NotebookNode + ), f"Reader did not return a v4 NotebookNode: {type(notebook)} {notebook}" + except Exception as exc: + raise NbReadError(f"Failed to read the notebook: {exc}") from exc + return ProjectNb(record.pk, record.uri, notebook, record.assets) + + def get_cached_project_nb( + self, uri_or_pk: Union[int, str] ) -> Optional[NbCacheRecord]: - if isinstance(uri_or_pk, int): - record = NbStageRecord.record_from_pk(uri_or_pk, self.db) - else: - record = NbStageRecord.record_from_uri(uri_or_pk, self.db) - nb = self.get_staged_notebook(record.uri, converter=converter).nb + nb = self.get_project_notebook(uri_or_pk).nb _, hashkey = self.create_hashed_notebook(nb) try: return NbCacheRecord.record_from_hashkey(hashkey, self.db) except KeyError: return None - def list_staged_unexecuted( - self, converter: Optional[Callable] = None - ) -> List[NbStageRecord]: - """List staged notebooks, whose hash is not present in the cached notebooks.""" + def list_unexecuted( + self, + filter_uris: Optional[List[str]] = None, + filter_pks: Optional[List[int]] = None, + ) -> List[NbProjectRecord]: records = [] - for record in self.list_staged_records(): - nb = self.get_staged_notebook(record.uri, converter).nb + for record in self.list_project_records(filter_uris, filter_pks): + nb = self.get_project_notebook(record.uri).nb _, hashkey = self.create_hashed_notebook(nb) try: NbCacheRecord.record_from_hashkey(hashkey, self.db) diff --git a/jupyter_cache/cli/__init__.py b/jupyter_cache/cli/__init__.py index e69de29..2f272da 100644 --- a/jupyter_cache/cli/__init__.py +++ b/jupyter_cache/cli/__init__.py @@ -0,0 +1,45 @@ +import os +from pathlib import Path +from typing import TYPE_CHECKING + +import click + +if TYPE_CHECKING: + from jupyter_cache.base import JupyterCacheAbstract + + +class CacheContext: + """Context for retrieving the cache.""" + + def __init__(self, cache_path=None) -> None: + if cache_path is None: + self._cache_path = os.environ.get( + "JUPYTERCACHE", os.path.join(os.getcwd(), ".jupyter_cache") + ) + else: + self._cache_path = cache_path + + @property + def cache_path(self) -> Path: + return Path(self._cache_path) + + def get_cache(self, ask_on_missing=True) -> "JupyterCacheAbstract": + """Get the cache.""" + from jupyter_cache import get_cache + + if (not self.cache_path.exists()) and ask_on_missing: + click.secho("Cache path: ", fg="green", nl=False) + click.echo(str(self.cache_path)) + if not click.confirm( + "The cache does not yet exist, do you want to create it?" 
+ ): + raise click.Abort() + + # gets created lazily + return get_cache(self.cache_path) + + def set_cache_path(self, cache_path: str) -> None: + self._cache_path = cache_path + + +pass_cache = click.make_pass_decorator(CacheContext, ensure=True) diff --git a/jupyter_cache/cli/arguments.py b/jupyter_cache/cli/arguments.py index 12ef22f..7f2e9d5 100644 --- a/jupyter_cache/cli/arguments.py +++ b/jupyter_cache/cli/arguments.py @@ -30,7 +30,17 @@ type=click.Path(dir_okay=False, exists=True, readable=True, resolve_path=True), ) +OUTPUT_PATH = click.argument( + "outpath", + metavar="OUTPUT_PATH", + type=click.Path(dir_okay=False, writable=True, resolve_path=True), +) + PK = click.argument("pk", metavar="ID", type=int) PKS = click.argument("pks", metavar="IDs", nargs=-1, type=int) + +PK_OR_PATH = click.argument("pk_path", metavar="ID_OR_PATH", type=str) + +PK_OR_PATHS = click.argument("pk_paths", metavar="ID_OR_PATHS", nargs=-1) diff --git a/jupyter_cache/cli/commands/__init__.py b/jupyter_cache/cli/commands/__init__.py index df7c551..87bd10e 100644 --- a/jupyter_cache/cli/commands/__init__.py +++ b/jupyter_cache/cli/commands/__init__.py @@ -1,9 +1,5 @@ -import click_completion - -# Activate the completion of parameter types provided by the click_completion package -click_completion.init() +"""The jupyter-cache CLI.""" from .cmd_cache import * # noqa: F401,F403,E402 -from .cmd_config import * # noqa: F401,F403,E402 -from .cmd_exec import * # noqa: F401,F403,E402 -from .cmd_stage import * # noqa: F401,F403,E402 +from .cmd_notebook import * # noqa: F401,F403,E402 +from .cmd_project import * # noqa: F401,F403,E402 diff --git a/jupyter_cache/cli/commands/cmd_cache.py b/jupyter_cache/cli/commands/cmd_cache.py index a1d2b40..5194ef1 100644 --- a/jupyter_cache/cli/commands/cmd_cache.py +++ b/jupyter_cache/cli/commands/cmd_cache.py @@ -1,21 +1,19 @@ -import sys - import click -from jupyter_cache import get_cache -from jupyter_cache.cli import arguments, options +from jupyter_cache.cli import arguments, options, pass_cache from jupyter_cache.cli.commands.cmd_main import jcache from jupyter_cache.utils import tabulate_cache_records @jcache.group("cache") -def cmnd_cache(): - """Commands for adding to and inspecting the cache.""" - pass +@options.CACHE_PATH +@pass_cache +def cmnd_cache(cache, cache_path): + """Work with cached execution(s) in a project.""" + cache.set_cache_path(cache_path) @cmnd_cache.command("list") -@options.CACHE_PATH @click.option( "-l", "--latest-only", @@ -24,9 +22,10 @@ def cmnd_cache(): ) @click.option("-h", "--hashkeys", is_flag=True, help="Show the hashkey of notebook.") @options.PATH_LENGTH -def list_caches(cache_path, latest_only, hashkeys, path_length): - """List cached notebook records in the cache.""" - db = get_cache(cache_path) +@pass_cache +def list_caches(cache, latest_only, hashkeys, path_length): + """List cached notebook records.""" + db = cache.get_cache() records = db.list_cache_records() if not records: click.secho("No Cached Notebooks", fg="blue") @@ -45,19 +44,19 @@ def list_caches(cache_path, latest_only, hashkeys, path_length): ) -@cmnd_cache.command("show") -@options.CACHE_PATH +@cmnd_cache.command("info") @arguments.PK -def show_cache(cache_path, pk): - """Show details of a cached notebook in the cache.""" +@pass_cache +def cached_info(cache, pk): + """Show details of a cached notebook.""" import yaml - db = get_cache(cache_path) + db = cache.get_cache() try: record = db.get_cache_record(pk) except KeyError: - click.secho("ID {} does not exist, 
Aborting!".format(pk), fg="red") - sys.exit(1) + click.secho(f"ID {pk} does not exist, Aborting!", fg="red") + raise click.Abort() data = record.format_dict(hashkey=True, path_length=None) click.echo(yaml.safe_dump(data, sort_keys=False), nl=False) with db.cache_artefacts_temppath(pk) as folder: @@ -71,21 +70,21 @@ def show_cache(cache_path, pk): click.echo(f"- {path}") -@cmnd_cache.command("cat-artifact") -@options.CACHE_PATH +@cmnd_cache.command("cat-artefact") @arguments.PK @arguments.ARTIFACT_RPATH -def cat_artifact(cache_path, pk, artifact_rpath): +@pass_cache +def cat_artifact(cache, pk, artifact_rpath): """Print the contents of a cached artefact.""" - db = get_cache(cache_path) + db = cache.get_cache() with db.cache_artefacts_temppath(pk) as path: artifact_path = path.joinpath(artifact_rpath) if not artifact_path.exists(): click.secho("Artifact does not exist", fg="red") - sys.exit(1) + raise click.Abort() if not artifact_path.is_file(): click.secho("Artifact is not a file", fg="red") - sys.exit(1) + raise click.Abort() text = artifact_path.read_text(encoding="utf8") click.echo(text) @@ -94,7 +93,7 @@ def cache_file(db, nbpath, validate, overwrite, artifact_paths=()): from jupyter_cache.base import NbValidityError - click.echo("Caching: {}".format(nbpath)) + click.echo(f"Caching: {nbpath}") try: db.cache_notebook_file( nbpath, @@ -113,11 +112,11 @@ def cache_file(db, nbpath, validate, overwrite, artifact_paths=()): check_validity=False, overwrite=overwrite, ) - except IOError as error: + except OSError as error: click.secho("Artifact Error: ", fg="red", nl=False) click.echo(str(error)) return False - except IOError as error: + except OSError as error: click.secho("Artifact Error: ", fg="red", nl=False) click.echo(str(error)) return False @@ -127,12 +126,12 @@ def cache_file(db, nbpath, validate, overwrite, artifact_paths=()): @cmnd_cache.command("add-with-artefacts") @arguments.ARTIFACT_PATHS @options.NB_PATH -@options.CACHE_PATH @options.VALIDATE_NB @options.OVERWRITE_CACHED -def cache_nb(cache_path, artifact_paths, nbpath, validate, overwrite): +@pass_cache +def cache_nb(cache, artifact_paths, nbpath, validate, overwrite): """Cache a notebook, with possible artefact files.""" - db = get_cache(cache_path) + db = cache.get_cache() success = cache_file(db, nbpath, validate, overwrite, artifact_paths) if success: click.secho("Success!", fg="green") @@ -140,12 +139,12 @@ def cache_nb(cache_path, artifact_paths, nbpath, validate, overwrite): @cmnd_cache.command("add") @arguments.NB_PATHS -@options.CACHE_PATH @options.VALIDATE_NB @options.OVERWRITE_CACHED -def cache_nbs(cache_path, nbpaths, validate, overwrite): +@pass_cache +def cache_nbs(cache, nbpaths, validate, overwrite): """Cache notebook(s) that have already been executed.""" - db = get_cache(cache_path) + db = cache.get_cache() success = True for nbpath in nbpaths: # TODO deal with errors (print all at end? 
or option to ignore) @@ -155,20 +154,35 @@ def cache_nbs(cache_path, nbpaths, validate, overwrite): click.secho("Success!", fg="green") +@cmnd_cache.command("clear") +@options.FORCE +@pass_cache +def clear_cache_cmd(cache, force): + """Remove all executed notebooks from the cache.""" + db = cache.get_cache() + if not force: + click.confirm( + "Are you sure you want to permanently clear the cache!?", abort=True + ) + for record in db.list_cache_records(): + db.remove_cache(record.pk) + click.secho("Cache cleared!", fg="green") + + @cmnd_cache.command("remove") @arguments.PKS -@options.CACHE_PATH @options.REMOVE_ALL -def remove_caches(cache_path, pks, remove_all): +@pass_cache +def remove_caches(cache, pks, remove_all): """Remove notebooks stored in the cache.""" from jupyter_cache.base import CachingError - db = get_cache(cache_path) + db = cache.get_cache() if remove_all: pks = [r.pk for r in db.list_cache_records()] for pk in pks: # TODO deal with errors (print all at end? or option to ignore) - click.echo("Removing Cache ID = {}".format(pk)) + click.echo(f"Removing Cache ID = {pk}") try: db.remove_cache(pk) except KeyError: @@ -179,12 +193,12 @@ def remove_caches(cache_path, pks, remove_all): click.secho("Success!", fg="green") -@cmnd_cache.command("diff-nb") +@cmnd_cache.command("diff") @arguments.PK @arguments.NB_PATH -@options.CACHE_PATH -def diff_nb(cache_path, pk, nbpath): +@pass_cache +def diff_nb(cache, pk, nbpath): """Print a diff of a notebook to one stored in the cache.""" - db = get_cache(cache_path) + db = cache.get_cache() click.echo(db.diff_nbfile_with_cache(pk, nbpath, as_str=True)) click.secho("Success!", fg="green") diff --git a/jupyter_cache/cli/commands/cmd_config.py b/jupyter_cache/cli/commands/cmd_config.py deleted file mode 100644 index dfea25b..0000000 --- a/jupyter_cache/cli/commands/cmd_config.py +++ /dev/null @@ -1,21 +0,0 @@ -import click - -from jupyter_cache import get_cache -from jupyter_cache.cli import options -from jupyter_cache.cli.commands.cmd_main import jcache - - -@jcache.group("config") -def cmnd_config(): - """Commands for configuring the cache.""" - pass - - -@cmnd_config.command("cache-limit") -@options.CACHE_PATH -@click.argument("limit", metavar="CACHE_LIMIT", type=int) -def change_cache_limit(cache_path, limit): - """Change the maximum number of notebooks stored in the cache.""" - db = get_cache(cache_path) - db.change_cache_limit(limit) - click.secho("Cache limit changed!", fg="green") diff --git a/jupyter_cache/cli/commands/cmd_exec.py b/jupyter_cache/cli/commands/cmd_exec.py deleted file mode 100644 index bae852e..0000000 --- a/jupyter_cache/cli/commands/cmd_exec.py +++ /dev/null @@ -1,36 +0,0 @@ -import logging - -import click -import click_log - -from jupyter_cache import get_cache -from jupyter_cache.cli import arguments, options -from jupyter_cache.cli.commands.cmd_main import jcache - -logger = logging.getLogger(__name__) -click_log.basic_config(logger) - - -@jcache.command("execute") -@click_log.simple_verbosity_option(logger) -@options.EXEC_ENTRYPOINT -@options.EXEC_TIMEOUT -@options.CACHE_PATH -@arguments.PKS -def execute_nbs(cache_path, entry_point, pks, timeout): - """Execute staged notebooks that are outdated.""" - import yaml - - from jupyter_cache.executors import load_executor - - db = get_cache(cache_path) - try: - executor = load_executor("basic", db, logger=logger) - except ImportError as error: - logger.error(str(error)) - return 1 - result = executor.run_and_cache(filter_pks=pks or None, timeout=timeout) - click.secho( - 
"Finished! Successfully executed notebooks have been cached.", fg="green" - ) - click.echo(yaml.safe_dump(result, sort_keys=False)) diff --git a/jupyter_cache/cli/commands/cmd_main.py b/jupyter_cache/cli/commands/cmd_main.py index a7388de..1581902 100644 --- a/jupyter_cache/cli/commands/cmd_main.py +++ b/jupyter_cache/cli/commands/cmd_main.py @@ -12,15 +12,3 @@ @options.AUTOCOMPLETE def jcache(*args, **kwargs): """The command line interface of jupyter-cache.""" - - -@jcache.command("clear") -@options.CACHE_PATH -def clear_cache(cache_path): - """Clear the cache completely.""" - from jupyter_cache.cache.main import JupyterCacheBase - - db = JupyterCacheBase(cache_path) - click.confirm("Are you sure you want to permanently clear the cache!?", abort=True) - db.clear_cache() - click.secho("Cache cleared!", fg="green") diff --git a/jupyter_cache/cli/commands/cmd_notebook.py b/jupyter_cache/cli/commands/cmd_notebook.py new file mode 100644 index 0000000..5f41fb2 --- /dev/null +++ b/jupyter_cache/cli/commands/cmd_notebook.py @@ -0,0 +1,214 @@ +import logging +import os + +import click +import nbformat + +from jupyter_cache.cli import arguments, options, pass_cache, utils +from jupyter_cache.cli.commands.cmd_main import jcache +from jupyter_cache.readers import NbReadError +from jupyter_cache.utils import tabulate_project_records + +logger = logging.getLogger(__name__) +utils.setup_logger(logger) + + +@jcache.group("notebook") +@options.CACHE_PATH +@pass_cache +def cmnd_notebook(cache, cache_path): + """Work with notebook(s) in a project.""" + cache.set_cache_path(cache_path) + + +@cmnd_notebook.command("add") +@arguments.NB_PATHS +@options.READER_KEY +@pass_cache +def add_notebooks(cache, nbpaths, reader): + """Add notebook(s) to the project.""" + db = cache.get_cache() + for path in nbpaths: + # TODO deal with errors (print all at end? or option to ignore) + click.echo(f"Adding: {path}") + db.add_nb_to_project(path, read_data={"name": reader, "type": "plugin"}) + click.secho("Success!", fg="green") + + +@cmnd_notebook.command("add-with-assets") +@arguments.ASSET_PATHS +@options.NB_PATH +@options.READER_KEY +@pass_cache +def add_notebook(cache, nbpath, reader, asset_paths): + """Add notebook(s) to the project, with possible asset files.""" + db = cache.get_cache() + db.add_nb_to_project( + nbpath, read_data={"name": reader, "type": "plugin"}, assets=asset_paths + ) + click.secho("Success!", fg="green") + + +@cmnd_notebook.command("clear") +@options.FORCE +@pass_cache +def clear_nbs(cache, force): + """Remove all notebooks from the project.""" + db = cache.get_cache() + if not force: + click.confirm( + "Are you sure you want to permanently clear the project!?", abort=True + ) + for record in db.list_project_records(): + db.remove_nb_from_project(record.pk) + click.secho("Project cleared!", fg="green") + + +@cmnd_notebook.command("remove") +@arguments.PK_OR_PATHS +@pass_cache +def remove_nbs(cache, pk_paths): + """Remove notebook(s) from the project (by ID/URI).""" + db = cache.get_cache() + for pk_path in pk_paths: + # TODO deal with errors (print all at end? 
or option to ignore) + click.echo(f"Removing: {pk_path}") + db.remove_nb_from_project( + int(pk_path) if pk_path.isdigit() else os.path.abspath(pk_path) + ) + click.secho("Success!", fg="green") + + +@cmnd_notebook.command("invalidate") +@arguments.PK_OR_PATHS +@options.INVALIDATE_ALL +@pass_cache +def invalidate_nbs(cache, pk_paths, invalidate_all): + """Remove any matching cache of the notebook(s) (by ID/URI).""" + db = cache.get_cache() + if invalidate_all: + pk_paths = [str(record.pk) for record in db.list_project_records()] + for pk_path in pk_paths: + # TODO deal with errors (print all at end? or option to ignore) + click.echo(f"Invalidating: {pk_path}") + record = db.get_cached_project_nb( + int(pk_path) if pk_path.isdigit() else os.path.abspath(pk_path) + ) + if record is not None: + db.remove_cache(record.pk) + click.secho("Success!", fg="green") + + +@cmnd_notebook.command("list") +# @click.option( +# "--compare/--no-compare", +# default=True, +# show_default=True, +# help="Compare to cached notebooks (to find cache ID).", +# ) +@options.PATH_LENGTH +@click.option( + "--assets", + is_flag=True, + help="Show the number of assets associated with each notebook", +) +@pass_cache +def list_nbs_in_project(cache, path_length, assets): + """List notebooks in the project.""" + db = cache.get_cache() + records = db.list_project_records() + if not records: + click.secho("No notebooks in project", fg="blue") + click.echo( + tabulate_project_records( + records, path_length=path_length, cache=db, assets=assets + ) + ) + + +@cmnd_notebook.command("info") +@arguments.PK_OR_PATH +@click.option( + "--tb/--no-tb", + default=True, + show_default=True, + help="Show traceback, if last execution failed.", +) +@pass_cache +def show_project_record(cache, pk_path, tb): + """Show details of a notebook (by ID).""" + import yaml + + db = cache.get_cache() + try: + record = db.get_project_record( + int(pk_path) if pk_path.isdigit() else os.path.abspath(pk_path) + ) + except KeyError: + click.secho(f"ID {pk_path} does not exist, Aborting!", fg="red") + raise click.Abort() + cache_record = None + try: + cache_record = db.get_cached_project_nb(record.uri) + except NbReadError as exc: + click.secho(f"File could not be read: {exc}", fg="red") + data = record.format_dict( + cache_record=cache_record, path_length=None, assets=False, read_name=False + ) + click.echo(yaml.safe_dump(data, sort_keys=False, allow_unicode=True).rstrip()) + if record.assets: + click.echo("Assets:") + for path in record.assets: + click.echo(f"- {path}") + if record.traceback: + click.secho("Failed Last Execution!", fg="red") + if tb: + click.echo(record.traceback) + + +@cmnd_notebook.command("merge") +@arguments.PK_OR_PATH +@arguments.OUTPUT_PATH +@pass_cache +def merge_executed(cache, pk_path, outpath): + """Create notebook merged with cached outputs (by ID/URI).""" + db = cache.get_cache() + nb = db.get_project_notebook( + int(pk_path) if pk_path.isdigit() else os.path.abspath(pk_path) + ).nb + cached_pk, nb = db.merge_match_into_notebook(nb) + nbformat.write(nb, outpath) + click.echo(f"Merged with cache PK {cached_pk}") + click.secho("Success!", fg="green") + + +@cmnd_notebook.command("execute") +@arguments.PK_OR_PATHS +@options.EXECUTOR_KEY +@options.EXEC_TIMEOUT +@options.EXEC_FORCE(default=True) +@options.set_log_level(logger) +@pass_cache +def execute_nbs(cache, pk_paths, executor, timeout, force): + """Execute specific notebooks in the project.""" + import yaml + + from jupyter_cache.executors import load_executor + + uris = 
[os.path.abspath(p) for p in pk_paths if not p.isdigit()] or None + pks = [int(p) for p in pk_paths if p.isdigit()] or None + + db = cache.get_cache() + + try: + executor = load_executor(executor, db, logger=logger) + except ImportError as error: + logger.error(str(error)) + return 1 + result = executor.run_and_cache( + filter_pks=pks, filter_uris=uris, timeout=timeout, force=force + ) + click.secho( + "Finished! Successfully executed notebooks have been cached.", fg="green" + ) + click.echo(yaml.safe_dump(result.as_json(), sort_keys=False)) diff --git a/jupyter_cache/cli/commands/cmd_project.py b/jupyter_cache/cli/commands/cmd_project.py new file mode 100644 index 0000000..f2e9e62 --- /dev/null +++ b/jupyter_cache/cli/commands/cmd_project.py @@ -0,0 +1,88 @@ +import logging + +import click + +from jupyter_cache.cli import options, pass_cache, utils +from jupyter_cache.cli.commands.cmd_main import jcache + +logger = logging.getLogger(__name__) +utils.setup_logger(logger) + + +@jcache.group("project") +@options.CACHE_PATH +@pass_cache +def cmnd_project(cache, cache_path): + """Work with a project.""" + cache.set_cache_path(cache_path) + + +@cmnd_project.command("version") +@pass_cache +def version(cache): + """Print the version of the cache.""" + if not cache.cache_path.exists(): + click.secho("No cache found.", fg="red") + raise click.Abort() + version = cache.get_cache().get_version() + if version is None: + click.secho("Cache version not found", fg="red") + raise click.Abort() + click.echo(version) + + +@cmnd_project.command("clear") +@options.FORCE +@pass_cache +def clear_cache(cache, force): + """Clear the project cache completely.""" + if not cache.cache_path.exists(): + click.secho("Cache does not exist", fg="green") + raise click.Abort() + if not force: + click.echo(f"Cache path: {cache.cache_path}") + click.confirm( + "Are you sure you want to permanently clear the cache!?", + abort=True, + ) + cache.get_cache().clear_cache() + click.secho("Cache cleared!", fg="green") + + +@cmnd_project.command("cache-limit") +@click.argument("limit", metavar="CACHE_LIMIT", type=int, required=False) +@pass_cache +def change_cache_limit(cache, limit): + """Get/set maximum number of notebooks stored in the cache.""" + db = cache.get_cache() + if limit is None: + limit = db.get_cache_limit() + click.echo(f"Current cache limit: {limit}") + else: + db.change_cache_limit(limit) + click.secho("Cache limit changed!", fg="green") + + +@cmnd_project.command("execute") +@options.EXECUTOR_KEY +@options.EXEC_TIMEOUT +@options.EXEC_FORCE(default=False) +@options.set_log_level(logger) +@pass_cache +def execute_nbs(cache, executor, timeout, force): + """Execute all outdated notebooks in the project.""" + import yaml + + from jupyter_cache.executors import load_executor + + db = cache.get_cache() + try: + executor = load_executor(executor, db, logger=logger) + except ImportError as error: + logger.error(str(error)) + return 1 + result = executor.run_and_cache(timeout=timeout, force=force) + click.secho( + "Finished! 
Successfully executed notebooks have been cached.", fg="green" + ) + click.echo(yaml.safe_dump(result.as_json(), sort_keys=False)) diff --git a/jupyter_cache/cli/commands/cmd_stage.py b/jupyter_cache/cli/commands/cmd_stage.py deleted file mode 100644 index 40a98de..0000000 --- a/jupyter_cache/cli/commands/cmd_stage.py +++ /dev/null @@ -1,120 +0,0 @@ -import sys - -import click - -from jupyter_cache import get_cache -from jupyter_cache.cli import arguments, options -from jupyter_cache.cli.commands.cmd_main import jcache -from jupyter_cache.utils import tabulate_stage_records - - -@jcache.group("stage") -def cmnd_stage(): - """Commands for staging notebooks to be executed.""" - pass - - -@cmnd_stage.command("add") -@arguments.NB_PATHS -@options.CACHE_PATH -def stage_nbs(cache_path, nbpaths): - """Stage notebook(s) for execution.""" - db = get_cache(cache_path) - for path in nbpaths: - # TODO deal with errors (print all at end? or option to ignore) - click.echo("Staging: {}".format(path)) - db.stage_notebook_file(path) - click.secho("Success!", fg="green") - - -@cmnd_stage.command("add-with-assets") -@arguments.ASSET_PATHS -@options.NB_PATH -@options.CACHE_PATH -def stage_nb(cache_path, nbpath, asset_paths): - """Stage a notebook, with possible asset files.""" - db = get_cache(cache_path) - db.stage_notebook_file(nbpath, asset_paths) - click.secho("Success!", fg="green") - - -@cmnd_stage.command("remove-uris") -@arguments.NB_PATHS -@options.CACHE_PATH -@options.REMOVE_ALL -def unstage_nbs_uri(cache_path, nbpaths, remove_all): - """Un-stage notebook(s), by URI.""" - db = get_cache(cache_path) - if remove_all: - nbpaths = [record.uri for record in db.list_staged_records()] - for path in nbpaths: - # TODO deal with errors (print all at end? or option to ignore) - click.echo("Unstaging: {}".format(path)) - db.discard_staged_notebook(path) - click.secho("Success!", fg="green") - - -@cmnd_stage.command("remove-ids") -@arguments.PKS -@options.CACHE_PATH -@options.REMOVE_ALL -def unstage_nbs_id(cache_path, pks, remove_all): - """Un-stage notebook(s), by ID.""" - db = get_cache(cache_path) - if remove_all: - pks = [record.pk for record in db.list_staged_records()] - for pk in pks: - # TODO deal with errors (print all at end? 
or option to ignore) - click.echo("Unstaging ID: {}".format(pk)) - db.discard_staged_notebook(pk) - click.secho("Success!", fg="green") - - -@cmnd_stage.command("list") -@options.CACHE_PATH -@click.option( - "--compare/--no-compare", - default=True, - show_default=True, - help="Compare to cached notebooks (to find cache ID).", -) -@options.PATH_LENGTH -def list_staged(cache_path, compare, path_length): - """List notebooks staged for possible execution.""" - db = get_cache(cache_path) - records = db.list_staged_records() - if not records: - click.secho("No Staged Notebooks", fg="blue") - click.echo(tabulate_stage_records(records, path_length=path_length, cache=db)) - - -@cmnd_stage.command("show") -@options.CACHE_PATH -@arguments.PK -@click.option( - "--tb/--no-tb", - default=True, - show_default=True, - help="Show traceback, if last execution failed.", -) -def show_staged(cache_path, pk, tb): - """Show details of a staged notebook.""" - import yaml - - db = get_cache(cache_path) - try: - record = db.get_staged_record(pk) - except KeyError: - click.secho("ID {} does not exist, Aborting!".format(pk), fg="red") - sys.exit(1) - cache_record = db.get_cache_record_of_staged(record.uri) - data = record.format_dict(cache_record=cache_record, path_length=None, assets=False) - click.echo(yaml.safe_dump(data, sort_keys=False).rstrip()) - if record.assets: - click.echo("Assets:") - for path in record.assets: - click.echo(f"- {path}") - if record.traceback: - click.secho("Failed Last Execution!", fg="red") - if tb: - click.echo(record.traceback) diff --git a/jupyter_cache/cli/options.py b/jupyter_cache/cli/options.py index c33998a..ff439bf 100644 --- a/jupyter_cache/cli/options.py +++ b/jupyter_cache/cli/options.py @@ -1,7 +1,11 @@ +import logging import os import click +from jupyter_cache.entry_points import ENTRY_POINT_GROUP_EXEC, list_group_names +from jupyter_cache.readers import list_readers + def callback_autocomplete(ctx, param, value): if value and not ctx.resilient_parsing: @@ -34,7 +38,7 @@ def callback_print_cache_path(ctx, param, value): PRINT_CACHE_PATH = click.option( "-p", - "--cache-path", + "--print-path", help="Print the current cache path and exit.", is_flag=True, expose_value=True, @@ -43,24 +47,12 @@ def callback_print_cache_path(ctx, param, value): ) -def check_cache_exists(ctx, param, value): - if os.path.exists(value): - return value - click.secho("Cache path: ", fg="green", nl=False) - click.echo(value) - if not click.confirm("The cache does not yet exist, do you want to create it?"): - click.secho("Aborted!", bold=True, fg="red") - ctx.exit() - return value - - CACHE_PATH = click.option( "-p", "--cache-path", - help="Path to cache.", + help="Path to project cache.", default=default_cache_path, show_default=".jupyter_cache", - callback=check_cache_exists, ) @@ -72,12 +64,22 @@ def check_cache_exists(ctx, param, value): type=click.Path(dir_okay=False, exists=True, readable=True, resolve_path=True), ) +READER_KEY = click.option( + "-r", + "--reader", + help="The notebook reader to use.", + default="nbformat", + type=click.Choice(list_readers()), + show_default=True, +) + -EXEC_ENTRYPOINT = click.option( +EXECUTOR_KEY = click.option( "-e", - "--entry-point", - help="The entry-point from which to load the executor.", - default="basic", + "--executor", + help="The executor to use.", + default="local-serial", + type=click.Choice(list_group_names(ENTRY_POINT_GROUP_EXEC)), show_default=True, ) @@ -90,6 +92,17 @@ def check_cache_exists(ctx, param, value): ) +def 
EXEC_FORCE(default=False):
+    return click.option(
+        "-f",
+        "--force/--no-force",
+        help="Execute a notebook even if it is cached.",
+        is_flag=True,
+        default=default,
+        show_default=True,
+    )
+
+
 PATH_LENGTH = click.option(
     "-l", "--path-length", default=3, show_default=True, help="Maximum URI path."
 )
@@ -110,6 +123,10 @@ def check_cache_exists(ctx, param, value):
     help="Whether to overwrite an existing notebook with the same hash.",
 )

+FORCE = click.option(
+    "-f", "--force", default=False, is_flag=True, help="Do not ask for confirmation."
+)
+

 def confirm_remove_all(ctx, param, remove_all):
     if remove_all and not click.confirm("Are you sure you want to remove all?"):
@@ -126,3 +143,42 @@ def confirm_remove_all(ctx, param, remove_all):
     help="Remove all notebooks.",
     callback=confirm_remove_all,
 )
+
+
+def confirm_invalidate_all(ctx, param, remove_all):
+    if remove_all and not click.confirm("Are you sure you want to invalidate all?"):
+        click.secho("Aborted!", bold=True, fg="red")
+        ctx.exit()
+    return remove_all
+
+
+INVALIDATE_ALL = click.option(
+    "-a",
+    "--all",
+    "invalidate_all",
+    is_flag=True,
+    help="Invalidate all notebooks.",
+    callback=confirm_invalidate_all,
+)
+
+
+def set_log_level(logger):
+    """Set the log level of the logger."""
+
+    def _callback(ctx, param, value):
+        """Set logging level."""
+        level = getattr(logging, value.upper(), None)
+        if level is None:
+            raise click.BadParameter(f"Unknown log level: {value.upper()}")
+        logger.setLevel(level)
+
+    return click.option(
+        "-v",
+        "--verbosity",
+        type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]),
+        default="INFO",
+        show_default=True,
+        expose_value=False,
+        callback=_callback,
+        help="Set logging verbosity.",
+    )
diff --git a/jupyter_cache/cli/utils.py b/jupyter_cache/cli/utils.py
new file mode 100644
index 0000000..783cc0a
--- /dev/null
+++ b/jupyter_cache/cli/utils.py
@@ -0,0 +1,24 @@
+import logging
+
+import click
+
+
+class ClickLogHandler(logging.Handler):
+    _use_stderr = True
+
+    def emit(self, record):
+        try:
+            msg = self.format(record)
+            click.echo(msg, err=self._use_stderr)
+        except Exception:
+            self.handleError(record)
+
+
+def setup_logger(logger: logging.Logger) -> None:
+    """Add handler to log to click."""
+    try:
+        import click_log
+    except ImportError:
+        logger.addHandler(ClickLogHandler())
+    else:
+        click_log.basic_config(logger)
diff --git a/jupyter_cache/entry_points.py b/jupyter_cache/entry_points.py
new file mode 100644
index 0000000..846768d
--- /dev/null
+++ b/jupyter_cache/entry_points.py
@@ -0,0 +1,36 @@
+"""Module for dealing with entry points."""
+from typing import Optional, Set
+
+# TODO importlib.metadata was introduced into the standard library in python 3.8,
+# so we can change this when we drop support for 3.7.
+# Also, importlib_metadata changed its API in v4.0, to match the python 3.10 API;
+# however, because of https://github.com/python/importlib_metadata/issues/308
+# we do not assume we have this API, and instead use try/except for the new/old APIs.
+from importlib_metadata import EntryPoint
+from importlib_metadata import entry_points as eps
+
+ENTRY_POINT_GROUP_READER = "jcache.readers"
+ENTRY_POINT_GROUP_EXEC = "jcache.executors"
+
+
+def list_group_names(group: str) -> Set[str]:
+    """Return the names of the entry points within a group."""
+    all_eps = eps()
+    try:
+        # importlib_metadata v4 / python 3.10
+        return all_eps.select(group=group).names
+    except (AttributeError, TypeError):
+        return {ep.name for ep in all_eps.get(group, [])}
+
+
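+# For illustration only, a sketch of the lookup (the names shown are the
+# executors this package registers in its own setup.cfg, further below):
+#
+#   list_group_names(ENTRY_POINT_GROUP_EXEC)
+#   # {"local-serial", "temp-serial", "local-parallel", "temp-parallel"}
+
+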
+def get_entry_point(group: str, name: str) -> Optional[EntryPoint]:
+    """Return the entry point with the given name in the given group."""
+    all_eps = eps()
+    try:
+        # importlib_metadata v4 / python 3.10
+        found = all_eps.select(group=group, name=name)
+        ep = found[name] if name in found.names else None
+    except (AttributeError, TypeError):
+        found = {ep.name: ep for ep in all_eps.get(group, [])}
+        ep = found[name] if name in found else None
+    return ep
diff --git a/jupyter_cache/executors/base.py b/jupyter_cache/executors/base.py
index 5987206..26e35cd 100644
--- a/jupyter_cache/executors/base.py
+++ b/jupyter_cache/executors/base.py
@@ -1,15 +1,16 @@
-import logging
 from abc import ABC, abstractmethod
-from typing import Callable, List, Optional
+import logging
+from typing import Any, Dict, List, Optional, Set

-import pkg_resources
+import attr

 from jupyter_cache.base import JupyterCacheAbstract
-
-# TODO abstact
-from jupyter_cache.cache.db import NbCacheRecord
-
-ENTRY_POINT_GROUP = "jupyter_executors"
+from jupyter_cache.cache.db import NbProjectRecord
+from jupyter_cache.entry_points import (
+    ENTRY_POINT_GROUP_EXEC,
+    get_entry_point,
+    list_group_names,
+)

 base_logger = logging.getLogger(__name__)

@@ -18,6 +19,30 @@ class ExecutionError(Exception):
     pass


+@attr.s(slots=True)
+class ExecutorRunResult:
+    """A container for the execution result."""
+
+    # URIs of notebooks which were successfully executed
+    succeeded: List[str] = attr.ib(factory=list)
+    # URIs of notebooks which excepted during execution
+    excepted: List[str] = attr.ib(factory=list)
+    # URIs of notebooks which errored before execution
+    errored: List[str] = attr.ib(factory=list)
+
+    def all(self) -> List[str]:
+        """Return all notebooks."""
+        return self.succeeded + self.excepted + self.errored
+
+    def as_json(self) -> Dict[str, Any]:
+        """Return the result as a JSON serializable dict."""
+        return {
+            "succeeded": self.succeeded,
+            "excepted": self.excepted,
+            "errored": self.errored,
+        }
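+
+# For illustration only, a sketch of how a result might serialize, where one of
+# two project notebooks raised in a cell during execution:
+#
+#   ExecutorRunResult(succeeded=["a.ipynb"], excepted=["b.ipynb"]).as_json()
+#   # {"succeeded": ["a.ipynb"], "excepted": ["b.ipynb"], "errored": []}
+
+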
 class JupyterExecutorAbstract(ABC):
     """An abstract class for executing notebooks in a cache."""

@@ -26,7 +51,7 @@ def __init__(self, cache: JupyterCacheAbstract, logger=None):
         self._logger = logger or logging.getLogger(__name__)

     def __repr__(self):
-        return "{0}(cache={1})".format(self.__class__.__name__, self._cache)
+        return f"{self.__class__.__name__}(cache={self._cache})"

     @property
     def cache(self):
@@ -36,45 +61,62 @@ def cache(self):
     def logger(self):
         return self._logger

+    def get_records(
+        self,
+        filter_uris: Optional[List[str]] = None,
+        filter_pks: Optional[List[int]] = None,
+        clear_tracebacks: bool = True,
+        force: bool = False,
+    ) -> List[NbProjectRecord]:
+        """Return records to execute.
+
+        :param clear_tracebacks: Remove any tracebacks from previous executions
+        """
+        if force:
+            execute_records = self.cache.list_project_records(filter_uris, filter_pks)
+        else:
+            execute_records = self.cache.list_unexecuted(filter_uris, filter_pks)
+        if clear_tracebacks:
+            NbProjectRecord.remove_tracebacks(
+                [r.pk for r in execute_records], self.cache.db
+            )
+        return execute_records
+
     @abstractmethod
     def run_and_cache(
         self,
+        *,
         filter_uris: Optional[List[str]] = None,
         filter_pks: Optional[List[int]] = None,
-        converter: Optional[Callable] = None,
-        **kwargs
-    ) -> List[NbCacheRecord]:
+        timeout: Optional[int] = 30,
+        allow_errors: bool = False,
+        force: bool = False,
+        **kwargs: Any,
+    ) -> ExecutorRunResult:
         """Run execution, cache successfully executed notebooks and return their URIs

-        Parameters
-        ----------
-        filter_uris: list
-            If specified filter the staged notebooks to execute by these URIs
-        filter_pks: list
-            If specified filter the staged notebooks to execute by these PKs
-        converter:
-            An optional converter for staged notebooks,
-            which takes the URI and returns a notebook node
+        :param filter_uris: Filter the notebooks in the project to execute by these URIs
+        :param filter_pks: Filter the notebooks in the project to execute by these PKs
+        :param timeout: Maximum time in seconds to wait for a single cell to run
+        :param allow_errors: Whether to continue executing the notebook after the
+            first cell exception (a cell tagged as an expected exception never
+            halts execution)
+        :param force: Whether to force execution of all notebooks, even if they are cached
+        :param kwargs: Additional keyword arguments to pass to the executor
         """
-        pass


-def list_executors():
-    return list(pkg_resources.iter_entry_points(ENTRY_POINT_GROUP))
+def list_executors() -> Set[str]:
+    return list_group_names(ENTRY_POINT_GROUP_EXEC)


 def load_executor(
     entry_point: str, cache: JupyterCacheAbstract, logger=None
 ) -> JupyterExecutorAbstract:
     """Retrieve an initialised JupyterExecutor from an entry point."""
-    entry_points = list(pkg_resources.iter_entry_points(ENTRY_POINT_GROUP, entry_point))
-    if len(entry_points) == 0:
-        raise ImportError(
-            "Entry point not found: {}.{}".format(ENTRY_POINT_GROUP, entry_point)
-        )
-    if len(entry_points) != 1:
+    ep = get_entry_point(ENTRY_POINT_GROUP_EXEC, entry_point)
+    if ep is None:
         raise ImportError(
-            "Multiple entry points found: {}.{}".format(ENTRY_POINT_GROUP, entry_point)
+            f"Entry point not found: {ENTRY_POINT_GROUP_EXEC}:{entry_point}"
         )
-    execute_cls = entry_points[0].load()
+    execute_cls = ep.load()
     return execute_cls(cache=cache, logger=logger)
diff --git a/jupyter_cache/executors/basic.py b/jupyter_cache/executors/basic.py
index e7e8d0f..61849e8 100644
--- a/jupyter_cache/executors/basic.py
+++ b/jupyter_cache/executors/basic.py
@@ -1,200 +1,276 @@
-import shutil
-import tempfile
+import logging
+import multiprocessing as mproc
+import os
 from pathlib import Path
+import tempfile
+from typing import NamedTuple, Tuple

-from jupyter_cache.cache.db import NbStageRecord
-from jupyter_cache.cache.main import NbArtifacts, NbBundleIn
-from jupyter_cache.executors.base import JupyterExecutorAbstract
-from jupyter_cache.executors.utils import single_nb_execution
-from jupyter_cache.utils import to_relative_paths
+from jupyter_cache.base import JupyterCacheAbstract, ProjectNb
+from jupyter_cache.cache.db import NbProjectRecord
+from jupyter_cache.executors.base import ExecutorRunResult, JupyterExecutorAbstract
+from jupyter_cache.executors.utils import (
+    ExecutionResult,
+
copy_assets, + create_cache_bundle, + single_nb_execution, +) -# from jupyter_client.kernelspec import get_kernel_spec, NoSuchKernel +REPORT_LEVEL = logging.INFO + 1 +logging.addLevelName(REPORT_LEVEL, "REPORT") -class ExecutionError(Exception): - """An exception to signify a error during execution of a specific URI.""" +class ProcessData(NamedTuple): + """Data for the process worker.""" - def __init__(self, message, uri, exc): - self.uri = uri - self.exc = exc - return super().__init__(message) + pk: int + uri: str + cache: JupyterCacheAbstract + timeout: int + allow_errors: bool -class JupyterExecutorBasic(JupyterExecutorAbstract): - """A basic implementation of an executor. +class ExecutionWorkerBase: + """Base execution worker. - The execution is split into two methods: `run` and `execute`. - In this way access to the cache can be synchronous, but the execution can be - multi/async processed. Takes timeout parameter in seconds for execution + Note this must be pickleable. """ + @property + def logger(self) -> logging.Logger: + raise NotImplementedError + + def log_info(self, msg: str): + self.logger.info(msg) + + def execute(self, project_nb: ProjectNb, data: ProcessData) -> ExecutionResult: + raise NotImplementedError + + def __call__(self, data: ProcessData) -> Tuple[int, str]: + + try: + project_nb = data.cache.get_project_notebook(data.pk) + except Exception: + self.logger.error( + "Failed Retrieving: %s" % data.uri, + exc_info=True, + ) + return (2, data.uri) + + try: + self.log_info("Executing: %s" % project_nb.uri) + result = self.execute(project_nb, data) + except Exception: + self.logger.error( + "Failed Executing: %s" % data.uri, + exc_info=True, + ) + return (2, data.uri) + + if result.err: + self.logger.warning( + "Execution Excepted: %s\n%s: %s" + % (project_nb.uri, type(result.err).__name__, str(result.err)) + ) + NbProjectRecord.set_traceback( + project_nb.uri, result.exc_string, data.cache.db + ) + return (1, data.uri) + + self.log_info("Execution Successful: %s" % project_nb.uri) + try: + # TODO deal with artifact retrieval + bundle = create_cache_bundle( + project_nb, result.cwd, None, result.time, result.exc_string + ) + data.cache.cache_notebook_bundle( + bundle, check_validity=False, overwrite=True + ) + except Exception: + self.logger.error( + "Failed Caching: %s" % data.uri, + exc_info=True, + ) + return (2, data.uri) + + return (0, data.uri) + + +class ExecutionWorkerLocalSerial(ExecutionWorkerBase): + """Execution worker, that executes in local folder.""" + + def __init__(self, logger: logging.Logger) -> None: + super().__init__() + self._logger = logger + + @property + def logger(self) -> logging.Logger: + return self._logger + + @staticmethod + def execute(project_nb: ProjectNb, data: ProcessData) -> ExecutionResult: + cwd = str(Path(project_nb.uri).parent) + return single_nb_execution( + project_nb.nb, + cwd=cwd, + timeout=data.timeout, + allow_errors=data.allow_errors, + ) + + +class ExecutionWorkerTempSerial(ExecutionWorkerBase): + """Execution worker, that executes in temporary folder.""" + + def __init__(self, logger: logging.Logger) -> None: + super().__init__() + self._logger = logger + + @property + def logger(self) -> logging.Logger: + return self._logger + + @staticmethod + def execute(project_nb: ProjectNb, data: ProcessData) -> ExecutionResult: + with tempfile.TemporaryDirectory() as cwd: + copy_assets(project_nb.uri, project_nb.assets, cwd) + return single_nb_execution( + project_nb.nb, + cwd=cwd, + timeout=data.timeout, + 
allow_errors=data.allow_errors, + ) + + +class ExecutionWorkerLocalMProc(ExecutionWorkerBase): + """Execution worker, that executes in local folder.""" + + @property + def logger(self) -> logging.Logger: + return mproc.get_logger() + + def log_info(self, msg: str): + # multiprocessing logs a lot at info level that we do not want to see + self.logger.log(REPORT_LEVEL, msg) + + @staticmethod + def execute(project_nb: ProjectNb, data: ProcessData) -> ExecutionResult: + cwd = str(Path(project_nb.uri).parent) + return single_nb_execution( + project_nb.nb, + cwd=cwd, + timeout=data.timeout, + allow_errors=data.allow_errors, + ) + + +class ExecutionWorkerTempMProc(ExecutionWorkerBase): + """Execution worker, that executes in temporary folder.""" + + @property + def logger(self) -> logging.Logger: + return mproc.get_logger() + + def log_info(self, msg: str): + # multiprocessing logs a lot at info level that we do not want to see + self.logger.log(REPORT_LEVEL, msg) + + @staticmethod + def execute(project_nb: ProjectNb, data: ProcessData) -> ExecutionResult: + with tempfile.TemporaryDirectory() as cwd: + copy_assets(project_nb.uri, project_nb.assets, cwd) + return single_nb_execution( + project_nb.nb, + cwd=cwd, + timeout=data.timeout, + allow_errors=data.allow_errors, + ) + + +class JupyterExecutorLocalSerial(JupyterExecutorAbstract): + """An implementation of an executor; executing locally in serial.""" + + _EXECUTION_WORKER = ExecutionWorkerLocalSerial + def run_and_cache( self, + *, filter_uris=None, filter_pks=None, - converter=None, timeout=30, allow_errors=False, - run_in_temp=True, - ): - """This function interfaces with the cache, deferring execution to `execute`.""" - # Get the notebook tha require re-execution - stage_records = self.cache.list_staged_unexecuted(converter=converter) - if filter_uris is not None: - stage_records = [r for r in stage_records if r.uri in filter_uris] - if filter_pks is not None: - stage_records = [r for r in stage_records if r.pk in filter_pks] - - # remove any tracebacks from previous executions - NbStageRecord.remove_tracebacks([r.pk for r in stage_records], self.cache.db) - - # setup an dictionary to categorise all executed notebook uris: - # excepted are where the actual notebook execution raised an exception; - # errored is where any other exception was raised - result = {"succeeded": [], "excepted": [], "errored": []} - # we pass an iterator to the execute method, - # so that we don't have to read all notebooks before execution - - def _iterator(): - for stage_record in stage_records: - try: - nb_bundle = self.cache.get_staged_notebook( - stage_record.pk, converter - ) - except Exception: - self.logger.error( - "Failed Retrieving: {}".format(stage_record.uri), exc_info=True - ) - result["errored"].append(stage_record.uri) - else: - yield stage_record, nb_bundle - - # The execute method yields notebook bundles, or ExecutionError - for bundle_or_exc in self.execute( - _iterator(), int(timeout), allow_errors, run_in_temp - ): - if isinstance(bundle_or_exc, ExecutionError): - self.logger.error(bundle_or_exc.uri, exc_info=bundle_or_exc.exc) - result["errored"].append(bundle_or_exc.uri) - continue - elif bundle_or_exc.traceback is not None: - # The notebook raised an exception during execution - # TODO store excepted bundles - result["excepted"].append(bundle_or_exc.uri) - NbStageRecord.set_traceback( - bundle_or_exc.uri, bundle_or_exc.traceback, self.cache.db - ) - continue - try: - # cache a successfully executed notebook - self.cache.cache_notebook_bundle( 
-                    bundle_or_exc, check_validity=False, overwrite=True
-                )
-            except Exception:
-                self.logger.error(
-                    "Failed Caching: {}".format(bundle_or_exc.uri), exc_info=True
-                )
-                result["errored"].append(bundle_or_exc.uri)
-            else:
-                result["succeeded"].append(bundle_or_exc.uri)
-
-        # TODO it would also be ideal to tag all notebooks
-        # that were executed at the same time (just part of `data` or separate column?).
-        # TODO maybe the status of success/failure could be explicitly stored on
-        # the stage record (cache_status=Enum('OK', 'FAILED', 'MISSING'))
-        # although now traceback is so this is an implicit sign of failure,
-        # TODO failed notebooks could be stored in the cache, which would be
-        # accessed by stage pk (and would be deleted when removing the stage record)
-        # see: https://python.quantecon.org/status.html
-
-        return result
-
-    def execute(self, input_iterator, timeout=30, allow_errors=False, in_temp=True):
-        """This function is isolated from the cache, and is responsible for execution.
-
-        The method is only supplied with the staged record and input notebook bundle,
-        it then yield results for caching
-        """
-        for stage_record, nb_bundle in input_iterator:
-            try:
-                uri = nb_bundle.uri
-                self.logger.info("Executing: {}".format(uri))
-
-                if in_temp:
-                    with tempfile.TemporaryDirectory() as tmpdirname:
-
-                        try:
-                            asset_files = _copy_assets(stage_record, tmpdirname)
-                        except Exception as err:
-                            yield ExecutionError("Assets Retrieval Error", uri, err)
-                            continue
-
-                        yield self.execute_single(
-                            nb_bundle,
-                            uri,
-                            tmpdirname,
-                            timeout,
-                            allow_errors,
-                            asset_files,
-                        )
-                else:
-                    yield self.execute_single(
-                        nb_bundle,
-                        uri,
-                        str(Path(uri).parent),
-                        timeout,
-                        allow_errors,
-                        None,
-                    )
+        force=False,
+    ) -> ExecutorRunResult:
+        # Get the notebooks that require re-execution
+        execute_records = self.get_records(
+            filter_uris, filter_pks, clear_tracebacks=True, force=force
+        )
-            except Exception as err:
-                yield ExecutionError("Unexpected Error", uri, err)
+        self.logger.info("Executing %s notebook(s) in serial" % len(execute_records))
-    def execute_single(self, nb_bundle, uri, cwd, timeout, allow_errors, asset_files):
-        result = single_nb_execution(
-            nb_bundle.nb,
-            cwd=cwd,
-            timeout=timeout,
-            allow_errors=allow_errors,
-        )
-        if result.err:
-            self.logger.error("Execution Failed: {}".format(uri))
-            return _create_bundle(
-                nb_bundle,
-                cwd,
-                asset_files,
-                result.time,
-                result.exc_string,
+        results = [
+            self._EXECUTION_WORKER(self.logger)(
+                ProcessData(record.pk, record.uri, self.cache, timeout, allow_errors)
+            )
+            for record in execute_records
+        ]
-        self.logger.info("Execution Succeeded: {}".format(uri))
-        return _create_bundle(nb_bundle, cwd, asset_files, result.time, None)
-
-
-def _copy_assets(record, folder):
-    """Copy notebook assets to the folder the notebook will be executed in."""
-    asset_files = []
-    relative_paths = to_relative_paths(record.assets, Path(record.uri).parent)
-    for path, rel_path in zip(record.assets, relative_paths):
-        temp_file = Path(folder).joinpath(rel_path)
-        temp_file.parent.mkdir(parents=True, exist_ok=True)
-        shutil.copyfile(path, temp_file)
-        asset_files.append(temp_file)
-    return asset_files
-
-
-def _create_bundle(nb_bundle, tmpdirname, asset_files, exec_time, exec_tb):
-    """Create a cache bundle."""
-    return NbBundleIn(
-        nb_bundle.nb,
-        nb_bundle.uri,
-        # TODO retrieve assets that have changed file mtime?
-        artifacts=NbArtifacts(
-            [p for p in Path(tmpdirname).glob("**/*") if p not in asset_files],
-            tmpdirname,
+        return ExecutorRunResult(
+            succeeded=[p for i, p in results if i == 0],
+            excepted=[p for i, p in results if i == 1],
+            errored=[p for i, p in results if i == 2],
         )
-            if asset_files is not None
-            else None,
-        data={"execution_seconds": exec_time},
-        traceback=exec_tb,
-    )
+
+
+class JupyterExecutorTempSerial(JupyterExecutorLocalSerial):
+    """An implementation of an executor; executing in a temporary folder in serial."""
+
+    _EXECUTION_WORKER = ExecutionWorkerTempSerial
+
+
+class JupyterExecutorLocalMproc(JupyterExecutorAbstract):
+    """An implementation of an executor; executing locally in parallel."""
+
+    _EXECUTION_WORKER = ExecutionWorkerLocalMProc
+
+    def run_and_cache(
+        self,
+        *,
+        filter_uris=None,
+        filter_pks=None,
+        timeout=30,
+        allow_errors=False,
+        force=False,
+    ) -> ExecutorRunResult:
+        # Get the notebooks that require re-execution
+        execute_records = self.get_records(
+            filter_uris, filter_pks, clear_tracebacks=True, force=force
+        )
+
+        self.logger.info(
+            "Executing %s notebook(s) over pool of %s processors"
+            % (len(execute_records), os.cpu_count())
+        )
+        mproc.log_to_stderr(
+            REPORT_LEVEL if self.logger.level == logging.INFO else self.logger.level
+        )
+
+        with mproc.Pool() as pool:
+            results = pool.map(
+                self._EXECUTION_WORKER(),
+                [
+                    ProcessData(
+                        record.pk, record.uri, self.cache, timeout, allow_errors
+                    )
+                    for record in execute_records
+                ],
+            )
+        return ExecutorRunResult(
+            succeeded=[p for i, p in results if i == 0],
+            excepted=[p for i, p in results if i == 1],
+            errored=[p for i, p in results if i == 2],
+        )
+
+
+class JupyterExecutorTempMproc(JupyterExecutorLocalMproc):
+    """An implementation of an executor; executing in a temporary directory and in parallel."""
+
+    _EXECUTION_WORKER = ExecutionWorkerTempMProc
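
A minimal usage sketch of these executors (illustrative only: the entry-point
names are the ones registered under `jcache.executors` in setup.cfg below, and
the cache path is an assumed example):

    from jupyter_cache import get_cache
    from jupyter_cache.executors import load_executor

    db = get_cache(".jupyter_cache")
    executor = load_executor("temp-parallel", db)  # or e.g. "local-serial"
    result = executor.run_and_cache(timeout=30)    # an ExecutorRunResult
    print(result.as_json())
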
diff --git a/jupyter_cache/executors/utils.py b/jupyter_cache/executors/utils.py
index 01b90b0..33f4df8 100644
--- a/jupyter_cache/executors/utils.py
+++ b/jupyter_cache/executors/utils.py
@@ -1,17 +1,22 @@
+from pathlib import Path
+import shutil
 import traceback
-from typing import Optional, Union
+from typing import Any, List, Optional, Union

 import attr
 from nbclient import execute as executenb
 from nbclient.client import CellExecutionError, CellTimeoutError
 from nbformat import NotebookNode

-from jupyter_cache.utils import Timer
+from jupyter_cache.base import CacheBundleIn, ProjectNb
+from jupyter_cache.cache.main import NbArtifacts
+from jupyter_cache.utils import Timer, to_relative_paths


 @attr.s()
 class ExecutionResult:
     nb: NotebookNode = attr.ib()
+    cwd: str = attr.ib()
     time: float = attr.ib()
     err: Optional[Union[CellExecutionError, CellTimeoutError]] = attr.ib(default=None)
     exc_string: Optional[str] = attr.ib(default=None)
@@ -23,6 +28,8 @@ def single_nb_execution(
     timeout: Optional[int],
     allow_errors: bool,
     meta_override: bool = True,
+    record_timing: bool = False,
+    **kwargs: Any,
 ) -> ExecutionResult:
     """Execute notebook in place.

@@ -33,6 +40,7 @@
         execution is stopped and a ``CellExecutionError`` is raised.
     :param meta_override: If ``True`` then timeout and allow_errors may be overridden
         by equivalent keys in nb.metadata.execution
+    :param kwargs: Additional keyword arguments to pass to the ``NotebookClient``.
:returns: The execution time in seconds """ @@ -52,10 +60,46 @@ def single_nb_execution( cwd=cwd, timeout=timeout, allow_errors=allow_errors, - record_timing=False, + record_timing=record_timing, + **kwargs, ) except (CellExecutionError, CellTimeoutError) as err: error = err exc_string = "".join(traceback.format_exc()) - return ExecutionResult(nb, timer.last_split, error, exc_string) + return ExecutionResult(nb, cwd, timer.last_split, error, exc_string) + + +def copy_assets(uri: str, assets: List[str], folder: str) -> List[Path]: + """Copy notebook assets to the folder the notebook will be executed in.""" + asset_files = [] + relative_paths = to_relative_paths(assets, Path(uri).parent) + for path, rel_path in zip(assets, relative_paths): + temp_file = Path(folder).joinpath(rel_path) + temp_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copyfile(path, temp_file) + asset_files.append(temp_file) + return asset_files + + +def create_cache_bundle( + project_nb: ProjectNb, + execdir: Optional[str], + asset_files: Optional[List[Path]], + exec_time: float, + exec_tb: Optional[str], +) -> CacheBundleIn: + """Create a cache bundle to save.""" + return CacheBundleIn( + project_nb.nb, + project_nb.uri, + # TODO retrieve assets that have changed file mtime? + artifacts=NbArtifacts( + [p for p in Path(execdir).glob("**/*") if p not in asset_files], + execdir, + ) + if (execdir is not None and asset_files is not None) + else None, + data={"execution_seconds": exec_time}, + traceback=exec_tb, + ) diff --git a/jupyter_cache/readers.py b/jupyter_cache/readers.py new file mode 100644 index 0000000..669198f --- /dev/null +++ b/jupyter_cache/readers.py @@ -0,0 +1,41 @@ +"""Module for handling different functions to read "notebook-like" files.""" +from typing import Any, Callable, Dict, Set + +import nbformat as nbf + +from .entry_points import ENTRY_POINT_GROUP_READER, get_entry_point, list_group_names + +DEFAULT_READ_DATA = (("name", "nbformat"), ("type", "plugin")) + + +def nbf_reader(uri: str) -> nbf.NotebookNode: + """Standard notebook reader.""" + return nbf.read(uri, nbf.NO_CONVERT) + + +def jupytext_reader(uri: str) -> nbf.NotebookNode: + """Jupytext notebook reader.""" + try: + import jupytext + except ImportError: + raise ImportError("jupytext must be installed to use this reader") + return jupytext.read(uri) + + +def list_readers() -> Set[str]: + """List all available readers.""" + return list_group_names(ENTRY_POINT_GROUP_READER) + + +def get_reader(data: Dict[str, Any]) -> Callable[[str], nbf.NotebookNode]: + """Returns a function to read a file URI and return a notebook.""" + if data.get("type") == "plugin": + key = data.get("name", "") + reader = get_entry_point(ENTRY_POINT_GROUP_READER, key) + if reader is not None: + return reader.load() + raise ValueError(f"No reader found for: {data!r}") + + +class NbReadError(IOError): + """Error raised when a notebook cannot be read.""" diff --git a/jupyter_cache/utils.py b/jupyter_cache/utils.py index 9219c70..db40a97 100644 --- a/jupyter_cache/utils.py +++ b/jupyter_cache/utils.py @@ -1,8 +1,14 @@ """Non-core imports in this module are lazily loaded, in order to improve CLI speed """ -import time from pathlib import Path -from typing import List, Union +import time +from typing import TYPE_CHECKING, List, Optional, Union + +from jupyter_cache.readers import NbReadError + +if TYPE_CHECKING: + from jupyter_cache.base import JupyterCacheAbstract + from jupyter_cache.cache.db import NbCacheRecord, NbProjectRecord def to_relative_paths( @@ -23,13 
+29,13 @@ def to_relative_paths( for path in paths: path = Path(path).absolute() if check_existence and not path.exists(): - raise IOError(f"Path does not exist: {path}") + raise OSError(f"Path does not exist: {path}") if check_existence and not path.is_file(): - raise IOError(f"Path is not a file: {path}") + raise OSError(f"Path is not a file: {path}") try: rel_path = path.relative_to(folder) except ValueError: - raise IOError(f"Path '{path}' is not in folder '{folder}''") + raise OSError(f"Path '{path}' is not in folder '{folder}''") rel_paths.append(rel_path) return rel_paths @@ -64,7 +70,7 @@ def __exit__(self, *exc_info): self.split() -def shorten_path(file_path, length): +def shorten_path(file_path: Union[str, Path], length: Optional[int]) -> Path: """Split the path into separate parts, select the last 'length' elements and join them again """ @@ -73,7 +79,9 @@ def shorten_path(file_path, length): return Path(*Path(file_path).parts[-length:]) -def tabulate_cache_records(records: list, hashkeys=False, path_length=None) -> str: +def tabulate_cache_records( + records: List["NbCacheRecord"], hashkeys=False, path_length=None +) -> str: """Tabulate cache records. :param records: list of ``NbCacheRecord`` @@ -91,22 +99,37 @@ def tabulate_cache_records(records: list, hashkeys=False, path_length=None) -> s ) -def tabulate_stage_records(records: list, path_length=None, cache=None) -> str: +def tabulate_project_records( + records: List["NbProjectRecord"], + path_length: Optional[int] = None, + cache: Optional["JupyterCacheAbstract"] = None, + assets=False, +) -> str: """Tabulate cache records. - :param records: list of ``NbStageRecord`` + :param records: list of ``NbProjectRecord`` :param path_length: truncate URI paths to x components :param cache: If the cache is given, we use it to add a column of matched cached pk (if available) + :param assets: Show the number of assets """ import tabulate rows = [] - for record in sorted(records, key=lambda r: r.created, reverse=True): + for record in records: cache_record = None + read_error = None if cache is not None: - cache_record = cache.get_cache_record_of_staged(record.uri) + try: + cache_record = cache.get_cached_project_nb(record.uri) + except NbReadError as exc: + read_error = f"{exc.__class__.__name__}: {exc}" rows.append( - record.format_dict(cache_record=cache_record, path_length=path_length) + record.format_dict( + cache_record=cache_record, + path_length=path_length, + assets=assets, + read_error=read_error, + ) ) return tabulate.tabulate(rows, headers="keys") diff --git a/pyproject.toml b/pyproject.toml index dbc3d7d..b90d5e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,3 +5,4 @@ build-backend = "setuptools.build_meta" [tool.isort] profile = "black" src_paths = ["jupyter_cache", "tests"] +force_sort_within_sections = true diff --git a/setup.cfg b/setup.cfg index 6a30278..3d55bb1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -15,10 +15,10 @@ classifiers = Intended Audience :: Developers License :: OSI Approved :: MIT License Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 Programming Language :: Python :: Implementation :: CPython Topic :: Software Development :: Libraries :: Python Modules keywords = sphinx extension material design web components @@ -29,41 +29,50 @@ project_urls = packages = find: install_requires = attrs + click + importlib-metadata 
nbclient>=0.2,<0.6 - nbdime nbformat + pyyaml sqlalchemy>=1.3.12,<1.5 -python_requires = ~=3.6 + tabulate +python_requires = ~=3.7 include_package_data = True zip_safe = True [options.entry_points] console_scripts = jcache = jupyter_cache.cli.commands.cmd_main:jcache -jupyter_executors = - basic = jupyter_cache.executors.basic:JupyterExecutorBasic +jcache.executors = + local-serial = jupyter_cache.executors.basic:JupyterExecutorLocalSerial + temp-serial = jupyter_cache.executors.basic:JupyterExecutorTempSerial + local-parallel = jupyter_cache.executors.basic:JupyterExecutorLocalMproc + temp-parallel = jupyter_cache.executors.basic:JupyterExecutorTempMproc +jcache.readers = + nbformat = jupyter_cache.readers:nbf_reader + jupytext = jupyter_cache.readers:jupytext_reader [options.extras_require] cli = - click - click-completion click-log - pyyaml - tabulate code_style = pre-commit~=2.12 rtd = + nbdime + jupytext myst-nb~=0.12.3 sphinx-book-theme~=0.1.1 sphinx-copybutton testing = + nbdime coverage ipykernel + jupytext matplotlib nbformat>=5.1 numpy pandas - pytest>=3.6,<4 + pytest>=6,<7 pytest-cov pytest-regressions sympy diff --git a/tests/make_cli_readme.py b/tests/make_cli_readme.py deleted file mode 100644 index 41618e2..0000000 --- a/tests/make_cli_readme.py +++ /dev/null @@ -1,244 +0,0 @@ -import os -from datetime import datetime -from glob import glob -from textwrap import dedent - -from click.testing import CliRunner - -from jupyter_cache.cache.main import DEFAULT_CACHE_LIMIT -from jupyter_cache.cli.commands import cmd_cache, cmd_exec, cmd_main, cmd_stage - - -def get_string(cli, group=None, args=(), input=None): - command_str = ["jcache"] if cli.name != "jcache" else [] - if group: - command_str.append(group) - command_str.append(cli.name) - command_str = " ".join(command_str) - - runner = CliRunner() - result = runner.invoke(cli, args, input=input) - root_path = os.getcwd() + os.sep - return "```console\n$ {}{}\n{}```".format( - command_str, - (" " + " ".join(args)) if args else "", - result.output.replace(root_path, "../"), - ) - - -def main(): - - get_string(cmd_main.clear_cache, input="y") - - strings = [] - strings.append( - "".format( - datetime.now().isoformat(" ", "minutes"), __file__ - ) - ) - strings.append("From the checked-out repository folder:") - strings.append(get_string(cmd_main.jcache, None, ["--help"])) - strings.append( - dedent( - """\ - ````{tip} - Execute this in the terminal for auto-completion: - - ```console - eval "$(_JCACHE_COMPLETE=source jcache)" - ``` - ````""" - ) - ) - - # cache - strings.append("### Caching Executed Notebooks") - cache_name = cmd_cache.cmnd_cache.name - strings.append(get_string(cmd_cache.cmnd_cache, None, ["--help"])) - strings.append("The first time the cache is required, it will be lazily created:") - strings.append(get_string(cmd_cache.list_caches, cache_name, input="y")) - strings.append( - dedent( - """\ - You can add notebooks straight into the cache. - When caching, a check will be made that the notebooks look to have been executed - correctly, i.e. 
the cell execution counts go sequentially up from 1.""" - ) - ) - strings.append( - get_string( - cmd_cache.cache_nbs, cache_name, ["tests/notebooks/basic.ipynb"], input="y" - ) - ) - strings.append("Or to skip validation:") - strings.append( - get_string( - cmd_cache.cache_nbs, - cache_name, - ["--no-validate"] + glob("tests/notebooks/*.ipynb"), - ) - ) - strings.append( - dedent( - """\ - Once you've cached some notebooks, you can look at the 'cache records' - for what has been cached. - - Each notebook is hashed (code cells and kernel spec only), - which is used to compare against 'staged' notebooks. - Multiple hashes for the same URI can be added - (the URI is just there for inspetion) and the size of the cache is limited - (current default {}) so that, at this size, - the last accessed records begin to be deleted. - You can remove cached records by their ID.""".format( - DEFAULT_CACHE_LIMIT - ) - ) - ) - strings.append(get_string(cmd_cache.list_caches, cache_name)) - strings.append( - dedent( - """\ - ````{tip} - To only show the latest versions of cached notebooks. - - ```console - $ jcache cache list --latest-only - ``` - ````""" - ) - ) - strings.append( - dedent( - """\ - You can also cache notebooks with artefacts - (external outputs of the notebook execution).""" - ) - ) - strings.append( - get_string( - cmd_cache.cache_nb, - cache_name, - [ - "-nb", - "tests/notebooks/basic.ipynb", - "tests/notebooks/artifact_folder/artifact.txt", - ], - input="y", - ) - ) - strings.append( - "Show a full description of a cached notebook by referring to its ID" - ) - strings.append(get_string(cmd_cache.show_cache, cache_name, ["6"])) - strings.append("Note artefact paths must be 'upstream' of the notebook folder:") - strings.append( - get_string( - cmd_cache.cache_nb, - cache_name, - ["-nb", "tests/notebooks/basic.ipynb", "tests/test_db.py"], - ) - ) - strings.append("To view the contents of an execution artefact:") - strings.append( - get_string( - cmd_cache.cat_artifact, cache_name, ["6", "artifact_folder/artifact.txt"] - ) - ) - strings.append("You can directly remove a cached notebook by its ID:") - strings.append(get_string(cmd_cache.remove_caches, cache_name, ["4"])) - strings.append( - "You can also diff any of the cached notebooks with any (external) notebook:" - ) - strings.append( - get_string(cmd_cache.diff_nb, cache_name, ["2", "tests/notebooks/basic.ipynb"]) - ) - - # staging - strings.append("### Staging Notebooks for execution") - stage_name = cmd_stage.cmnd_stage.name - strings.append(get_string(cmd_stage.cmnd_stage, None, ["--help"])) - strings.append( - dedent( - """\ - Staged notebooks are recorded as pointers to their URI, - i.e. no physical copying takes place until execution time. 
-
-            If you stage some notebooks for execution, then
-            you can list them to see which have existing records in the cache (by hash),
-            and which will require execution:"""
-        )
-    )
-    strings.append(
-        get_string(cmd_stage.stage_nbs, stage_name, glob("tests/notebooks/*.ipynb"))
-    )
-    strings.append(get_string(cmd_stage.list_staged, stage_name))
-    strings.append("You can remove a staged notebook by its URI or ID:")
-    strings.append(get_string(cmd_stage.unstage_nbs_id, stage_name, ["4"]))
-    strings.append("You can then run a basic execution of the required notebooks:")
-    strings.append(get_string(cmd_cache.remove_caches, cache_name, ["6", "2"]))
-    strings.append(get_string(cmd_exec.execute_nbs, None))
-    strings.append(
-        dedent(
-            """\
-            Successfully executed notebooks will be added to the cache,
-            along with any 'artefacts' created by the execution,
-            that are inside the notebook folder, and data supplied by the executor."""
-        )
-    )
-    strings.append(get_string(cmd_stage.list_staged, stage_name))
-    strings.append(
-        "Execution data (such as execution time) will be stored in the cache record:"
-    )
-    strings.append(get_string(cmd_cache.show_cache, cache_name, ["6"]))
-    strings.append(
-        "Failed notebooks will not be cached, "
-        "but the exception traceback will be added to the stage record:"
-    )
-    strings.append(get_string(cmd_stage.show_staged, stage_name, ["2"]))
-    strings.append(
-        dedent(
-            """\
-            ```{tip}
-            Code cells can be tagged with `raises-exception` to let the executor know that
-            a cell *may* raise an exception (see
-            [this issue on its behaviour](https://github.com/jupyter/nbconvert/issues/730)).
-            ```"""
-        )
-    )
-    strings.append(
-        "Once executed, you may leave staged notebooks "
-        "for later re-execution, or remove them:"
-    )
-    strings.append(
-        get_string(cmd_stage.unstage_nbs_id, stage_name, ["--all"], input="y")
-    )
-
-    # assets
-    strings.append(
-        dedent(
-            """\
-            You can also stage notebooks with assets:
-            external files that are required by the notebook during execution.
-            As with artefacts, these files must be in the same folder as the notebook,
-            or a sub-folder."""
-        )
-    )
-    strings.append(
-        get_string(
-            cmd_stage.stage_nb,
-            stage_name,
-            [
-                "-nb",
-                "tests/notebooks/basic.ipynb",
-                "tests/notebooks/artifact_folder/artifact.txt",
-            ],
-        )
-    )
-    strings.append(get_string(cmd_stage.show_staged, stage_name, ["1"]))
-
-    return "\n\n".join(strings)
-
-
-if __name__ == "__main__":
-    print(main())
diff --git a/tests/notebooks/basic.md b/tests/notebooks/basic.md
new file mode 100644
index 0000000..1f4eba3
--- /dev/null
+++ b/tests/notebooks/basic.md
@@ -0,0 +1,21 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.11.3
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+# a title
+
+some text
+
+```{code-cell} ipython3
+a=1
+print(a)
+```
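The new MyST-markdown fixture above is what the `jupytext` reader entry point parses. A hedged sanity check, assuming `jupytext` is installed, that the fixture round-trips to a one-code-cell notebook:

```python
# Sketch: parse the markdown fixture into an nbformat NotebookNode.
import jupytext

nb = jupytext.read("tests/notebooks/basic.md")
code_cells = [c for c in nb.cells if c.cell_type == "code"]
assert len(code_cells) == 1
assert code_cells[0].source.startswith("a=1")
```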
diff --git a/tests/test_cache.py b/tests/test_cache.py
index ae871ee..be7c0ca 100644
--- a/tests/test_cache.py
+++ b/tests/test_cache.py
@@ -1,15 +1,23 @@
 import os
+import shutil
 from textwrap import dedent

 import nbformat as nbf
 import pytest

+from jupyter_cache import __version__
 from jupyter_cache.base import NbValidityError
 from jupyter_cache.cache.main import JupyterCacheBase

 NB_PATH = os.path.join(os.path.realpath(os.path.dirname(__file__)), "notebooks")


+def test_get_version(tmp_path):
+    cache = JupyterCacheBase(str(tmp_path))
+    cache.db  # access the property, to initialise the database
+    assert cache.get_version() == __version__
+
+
 def test_basic_workflow(tmp_path):
     cache = JupyterCacheBase(str(tmp_path))
     with pytest.raises(NbValidityError):
@@ -74,19 +82,19 @@ def test_basic_workflow(tmp_path):
         check_validity=False,
     )
     with pytest.raises(ValueError):
-        cache.stage_notebook_file(os.path.join(NB_PATH, "basic.ipynb"), assets=[""])
-    cache.stage_notebook_file(
+        cache.add_nb_to_project(os.path.join(NB_PATH, "basic.ipynb"), assets=[""])
+    cache.add_nb_to_project(
         os.path.join(NB_PATH, "basic.ipynb"),
         assets=[os.path.join(NB_PATH, "basic.ipynb")],
     )
-    assert [r.pk for r in cache.list_staged_records()] == [1]
-    assert [r.pk for r in cache.list_staged_unexecuted()] == []
+    assert [r.pk for r in cache.list_project_records()] == [1]
+    assert [r.pk for r in cache.list_unexecuted()] == []

-    cache.stage_notebook_file(os.path.join(NB_PATH, "basic_failing.ipynb"))
-    assert [r.pk for r in cache.list_staged_records()] == [1, 2]
-    assert [r.pk for r in cache.list_staged_unexecuted()] == [2]
+    cache.add_nb_to_project(os.path.join(NB_PATH, "basic_failing.ipynb"))
+    assert [r.pk for r in cache.list_project_records()] == [1, 2]
+    assert [r.pk for r in cache.list_unexecuted()] == [2]

-    bundle = cache.get_staged_notebook(os.path.join(NB_PATH, "basic_failing.ipynb"))
+    bundle = cache.get_project_notebook(os.path.join(NB_PATH, "basic_failing.ipynb"))
     assert bundle.nb.metadata

     cache.clear_cache()
@@ -165,6 +173,7 @@ def test_artifacts(tmp_path):
         str(p.relative_to(tmp_path)) for p in tmp_path.glob("**/*") if p.is_file()
     } == {
         "global.db",
+        "__version__.txt",
         f"executed/{hashkey}/base.ipynb",
         f"executed/{hashkey}/artifacts/artifact_folder/artifact.txt",
     }
@@ -181,32 +190,37 @@ def test_artifacts(tmp_path):
     assert path.joinpath("artifact_folder").exists()


-# jupyter_client/session.py:371: DeprecationWarning:
-# Session._key_changed is deprecated in traitlets: use @observe and @unobserve instead
-@pytest.mark.filterwarnings("ignore")
-def test_execution(tmp_path):
+@pytest.mark.parametrize(
+    "executor_key", ["local-serial", "temp-serial", "local-parallel", "temp-parallel"]
+)
+def test_execution(tmp_path, executor_key):
     from jupyter_cache.executors import load_executor

-    db = JupyterCacheBase(str(tmp_path))
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "basic_unrun.ipynb"))
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "basic_failing.ipynb"))
-    db.stage_notebook_file(
-        path=os.path.join(NB_PATH, "external_output.ipynb"),
-        assets=(os.path.join(NB_PATH, "basic.ipynb"),),
+    db = JupyterCacheBase(str(tmp_path / "cache"))
+    temp_nb_path = tmp_path / "notebooks"
+    shutil.copytree(NB_PATH, temp_nb_path)
+    db.add_nb_to_project(path=os.path.join(temp_nb_path, "basic_unrun.ipynb"))
+    db.add_nb_to_project(path=os.path.join(temp_nb_path, "basic_failing.ipynb"))
+    db.add_nb_to_project(
+        path=os.path.join(temp_nb_path, "external_output.ipynb"),
+        assets=(os.path.join(temp_nb_path, "basic.ipynb"),),
     )
-    executor = load_executor("basic", db)
+    executor = load_executor(executor_key, db)
     result = executor.run_and_cache()
-    print(result)
-    assert result == {
+    # print(result)
+    json_result = result.as_json()
+    json_result["succeeded"] = list(sorted(json_result.get("succeeded", [])))
+    assert json_result == {
         "succeeded": [
-            os.path.join(NB_PATH, "basic_unrun.ipynb"),
-            os.path.join(NB_PATH, "external_output.ipynb"),
+            os.path.join(temp_nb_path, "basic_unrun.ipynb"),
+            os.path.join(temp_nb_path, "external_output.ipynb"),
         ],
-        "excepted": [os.path.join(NB_PATH, "basic_failing.ipynb")],
+        "excepted": [os.path.join(temp_nb_path, "basic_failing.ipynb")],
         "errored": [],
     }
     assert len(db.list_cache_records()) == 2
-    bundle = db.get_cache_bundle(1)
+    cache_record = db.get_cached_project_nb(1)
+    bundle = db.get_cache_bundle(cache_record.pk)
     assert bundle.nb.cells[0] == {
         "cell_type": "code",
         "execution_count": 1,
@@ -215,51 +229,76 @@ def test_execution(tmp_path):
         "source": "a=1\nprint(a)",
     }
     assert "execution_seconds" in bundle.record.data
-    with db.cache_artefacts_temppath(2) as path:
-        paths = [str(p.relative_to(path)) for p in path.glob("**/*") if p.is_file()]
-        assert paths == ["artifact.txt"]
-        assert path.joinpath("artifact.txt").read_text(encoding="utf8") == "hi"
-    stage_record = db.get_staged_record(2)
-    assert stage_record.traceback is not None
-    assert "Exception: oopsie!" in stage_record.traceback
+
+    # TODO artifacts
+    # with db.cache_artefacts_temppath(2) as path:
+    #     paths = [str(p.relative_to(path)) for p in path.glob("**/*") if p.is_file()]
+    #     assert paths == ["artifact.txt"]
+    #     assert path.joinpath("artifact.txt").read_text(encoding="utf8") == "hi"
+
+    project_record = db.get_project_record(2)
+    assert project_record.traceback is not None
+    assert "Exception: oopsie!" in project_record.traceback
+
+
+def test_execution_jupytext(tmp_path):
+    """Test execution with the jupytext reader."""
+    from jupyter_cache.executors import load_executor
+
+    db = JupyterCacheBase(str(tmp_path / "cache"))
+    temp_nb_path = tmp_path / "notebooks"
+    shutil.copytree(NB_PATH, temp_nb_path)
+    db.add_nb_to_project(
+        path=os.path.join(temp_nb_path, "basic.md"),
+        read_data={"name": "jupytext", "type": "plugin"},
+    )
+    executor = load_executor("local-serial", db)
+    result = executor.run_and_cache()
+    print(result)
+    assert result.as_json() == {
+        "succeeded": [
+            os.path.join(temp_nb_path, "basic.md"),
+        ],
+        "excepted": [],
+        "errored": [],
+    }
+    assert len(db.list_cache_records()) == 1


-@pytest.mark.filterwarnings("ignore")
 def test_execution_timeout_config(tmp_path):
     """tests the timeout value passed to the executor"""
     from jupyter_cache.executors import load_executor

     db = JupyterCacheBase(str(tmp_path))
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "sleep_2.ipynb"))
-    executor = load_executor("basic", db)
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "sleep_2.ipynb"))
+    executor = load_executor("local-serial", db)
     result = executor.run_and_cache(timeout=10)
-    assert result == {
+    assert result.as_json() == {
         "succeeded": [os.path.join(NB_PATH, "sleep_2.ipynb")],
         "excepted": [],
         "errored": [],
     }

     db.clear_cache()
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "sleep_2.ipynb"))
-    executor = load_executor("basic", db)
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "sleep_2.ipynb"))
+    executor = load_executor("local-serial", db)
     result = executor.run_and_cache(timeout=1)
-    assert result == {
+    assert result.as_json() == {
         "succeeded": [],
         "excepted": [os.path.join(NB_PATH, "sleep_2.ipynb")],
         "errored": [],
     }


-@pytest.mark.filterwarnings("ignore")
 def test_execution_timeout_metadata(tmp_path):
     """tests the timeout metadata key in notebooks"""
     from jupyter_cache.executors import load_executor

     db = JupyterCacheBase(str(tmp_path))
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "sleep_2_timeout_1.ipynb"))
-    executor = load_executor("basic", db)
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "sleep_2_timeout_1.ipynb"))
+    executor = load_executor("local-serial", db)
     result = executor.run_and_cache()
-    assert result == {
+    assert result.as_json() == {
         "succeeded": [],
         "excepted": [os.path.join(NB_PATH, "sleep_2_timeout_1.ipynb")],
         "errored": [],
@@ -271,10 +310,10 @@ def test_execution_allow_errors_config(tmp_path):
     from jupyter_cache.executors import load_executor

     db = JupyterCacheBase(str(tmp_path))
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "basic_failing.ipynb"))
-    executor = load_executor("basic", db)
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "basic_failing.ipynb"))
+    executor = load_executor("local-serial", db)
     result = executor.run_and_cache(allow_errors=True)
-    assert result == {
+    assert result.as_json() == {
         "succeeded": [os.path.join(NB_PATH, "basic_failing.ipynb")],
         "excepted": [],
         "errored": [],
@@ -286,10 +325,10 @@ def test_run_in_temp_false(tmp_path):
     from jupyter_cache.executors import load_executor

     db = JupyterCacheBase(str(tmp_path))
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "basic.ipynb"))
-    executor = load_executor("basic", db)
-    result = executor.run_and_cache(run_in_temp=False)
-    assert result == {
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "basic.ipynb"))
+    executor = load_executor("temp-serial", db)
+    result = executor.run_and_cache()
+    assert result.as_json() == {
         "succeeded": [os.path.join(NB_PATH, "basic.ipynb")],
"basic.ipynb")], "excepted": [], "errored": [], diff --git a/tests/test_cli.py b/tests/test_cli.py index b9a6359..846a70c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,46 +1,62 @@ import os +from pathlib import Path from click.testing import CliRunner +import pytest from jupyter_cache.cache.main import JupyterCacheBase -from jupyter_cache.cli.commands import cmd_cache, cmd_main, cmd_stage +from jupyter_cache.cli import CacheContext +from jupyter_cache.cli.commands import cmd_cache, cmd_main, cmd_notebook, cmd_project NB_PATH = os.path.join(os.path.realpath(os.path.dirname(__file__)), "notebooks") -def test_base(): - runner = CliRunner() +class Runner(CliRunner): + def __init__(self, path) -> None: + super().__init__() + self._cache_path = path + + def create_cache(self) -> JupyterCacheBase: + return JupyterCacheBase(str(self._cache_path)) + + def invoke(self, *args, **kwargs): + return super().invoke(*args, **kwargs, obj=CacheContext(self._cache_path)) + + +@pytest.fixture() +def runner(tmp_path): + return Runner(tmp_path) + + +def test_base(runner: Runner): result = runner.invoke(cmd_main.jcache, "-v") assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "jupyter-cache version" in result.output.strip(), result.output -def test_clear_cache(tmp_path): - JupyterCacheBase(str(tmp_path)) - runner = CliRunner() - result = runner.invoke(cmd_main.clear_cache, ["-p", tmp_path], input="y") +def test_clear_cache(runner: Runner): + result = runner.invoke(cmd_project.clear_cache, input="y") assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "Cache cleared!" in result.output.strip(), result.output -def test_list_caches(tmp_path): - db = JupyterCacheBase(str(tmp_path)) +def test_list_caches(runner: Runner): + db = runner.create_cache() db.cache_notebook_file( path=os.path.join(NB_PATH, "basic.ipynb"), uri="basic.ipynb", check_validity=False, ) - runner = CliRunner() - result = runner.invoke(cmd_cache.list_caches, ["-p", tmp_path]) + result = runner.invoke(cmd_cache.list_caches, []) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "basic.ipynb" in result.output.strip(), result.output -def test_list_caches_latest_only(tmp_path): - db = JupyterCacheBase(str(tmp_path)) +def test_list_caches_latest_only(runner: Runner): + db = runner.create_cache() db.cache_notebook_file( path=os.path.join(NB_PATH, "basic.ipynb"), uri="basic.ipynb", @@ -51,68 +67,64 @@ def test_list_caches_latest_only(tmp_path): uri="basic.ipynb", check_validity=False, ) - runner = CliRunner() - result = runner.invoke(cmd_cache.list_caches, ["-p", tmp_path]) + result = runner.invoke(cmd_cache.list_caches, []) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert len(result.output.strip().splitlines()) == 4, result.output - result = runner.invoke(cmd_cache.list_caches, ["-p", tmp_path, "--latest-only"]) + result = runner.invoke(cmd_cache.list_caches, ["--latest-only"]) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert len(result.output.strip().splitlines()) == 3, result.output -def test_cache_with_artifact(tmp_path): - JupyterCacheBase(str(tmp_path)) +def test_cache_with_artifact(runner: Runner): + nb_path = os.path.join(NB_PATH, "basic.ipynb") a_path = os.path.join(NB_PATH, "artifact_folder", "artifact.txt") - runner = CliRunner() result = runner.invoke( - cmd_cache.cache_nb, ["-p", tmp_path, 
"--no-validate", "-nb", nb_path, a_path] + cmd_cache.cache_nb, ["--no-validate", "-nb", nb_path, a_path] ) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "basic.ipynb" in result.output.strip(), result.output - result = runner.invoke(cmd_cache.show_cache, ["-p", tmp_path, "1"]) + result = runner.invoke(cmd_cache.cached_info, ["1"]) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "- artifact_folder/artifact.txt" in result.output.strip(), result.output result = runner.invoke( - cmd_cache.cat_artifact, ["-p", tmp_path, "1", "artifact_folder/artifact.txt"] + cmd_cache.cat_artifact, ["1", "artifact_folder/artifact.txt"] ) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "An artifact" in result.output.strip(), result.output -def test_cache_nbs(tmp_path): - db = JupyterCacheBase(str(tmp_path)) +def test_cache_nbs(runner: Runner): + db = runner.create_cache() path = os.path.join(NB_PATH, "basic.ipynb") - runner = CliRunner() - result = runner.invoke(cmd_cache.cache_nbs, ["-p", tmp_path, "--no-validate", path]) + result = runner.invoke(cmd_cache.cache_nbs, ["--no-validate", path]) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "basic.ipynb" in result.output.strip(), result.output assert db.list_cache_records()[0].uri == path -def test_remove_caches(tmp_path): - db = JupyterCacheBase(str(tmp_path)) +def test_remove_caches(runner: Runner): + db = runner.create_cache() db.cache_notebook_file( path=os.path.join(NB_PATH, "basic.ipynb"), uri="basic.ipynb", check_validity=False, ) - runner = CliRunner() - result = runner.invoke(cmd_cache.remove_caches, ["-p", tmp_path, "1"]) + result = runner.invoke(cmd_cache.remove_caches, ["1"]) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "Success" in result.output.strip(), result.output assert db.list_cache_records() == [] -def test_diff_nbs(tmp_path): - db = JupyterCacheBase(str(tmp_path)) +def test_diff_nbs(runner: Runner): + db = runner.create_cache() path = os.path.join(NB_PATH, "basic.ipynb") path2 = os.path.join(NB_PATH, "basic_failing.ipynb") db.cache_notebook_file(path, check_validity=False) @@ -120,8 +132,7 @@ def test_diff_nbs(tmp_path): # nb_bundle.nb.cells[0].source = "# New Title" # db.stage_notebook_bundle(nb_bundle) - runner = CliRunner() - result = runner.invoke(cmd_cache.diff_nb, ["-p", tmp_path, "1", path2]) + result = runner.invoke(cmd_cache.diff_nb, ["1", path2]) assert result.exception is None, result.output assert result.exit_code == 0, result.output print(result.output.splitlines()[2:]) @@ -151,53 +162,99 @@ def test_diff_nbs(tmp_path): ] -def test_stage_nbs(tmp_path): - db = JupyterCacheBase(str(tmp_path)) +def test_add_nbs_to_project(runner: Runner): + db = runner.create_cache() path = os.path.join(NB_PATH, "basic.ipynb") - runner = CliRunner() - result = runner.invoke(cmd_stage.stage_nbs, ["-p", tmp_path, path]) + result = runner.invoke(cmd_notebook.add_notebooks, [path]) assert result.exception is None, result.output assert result.exit_code == 0, result.output assert "basic.ipynb" in result.output.strip(), result.output - assert db.list_staged_records()[0].uri == path + assert db.list_project_records()[0].uri == path -def test_unstage_nbs(tmp_path): - db = JupyterCacheBase(str(tmp_path)) +def test_remove_nbs_from_project(runner: Runner): + db = runner.create_cache() path = 
-    runner = CliRunner()
-    result = runner.invoke(cmd_stage.stage_nbs, ["-p", tmp_path, path])
-    result = runner.invoke(cmd_stage.unstage_nbs_uri, ["-p", tmp_path, path])
+    result = runner.invoke(cmd_notebook.add_notebooks, [path])
+    result = runner.invoke(cmd_notebook.remove_nbs, [path])
     assert result.exception is None, result.output
     assert result.exit_code == 0, result.output
     assert "basic.ipynb" in result.output.strip(), result.output
-    assert db.list_staged_records() == []
+    assert db.list_project_records() == []
+
+
+def test_clear_project(runner: Runner):
+    db = runner.create_cache()
+    path = os.path.join(NB_PATH, "basic.ipynb")
+    result = runner.invoke(cmd_notebook.add_notebooks, [path])
+    result = runner.invoke(cmd_notebook.clear_nbs, [], input="y")
+    assert result.exception is None, result.output
+    assert result.exit_code == 0, result.output
+    assert db.list_project_records() == []


-def test_list_staged(tmp_path):
-    db = JupyterCacheBase(str(tmp_path))
+def test_list_nbs_in_project(runner: Runner):
+    db = runner.create_cache()
     db.cache_notebook_file(
         path=os.path.join(NB_PATH, "basic.ipynb"), check_validity=False
     )
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "basic.ipynb"))
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "basic_failing.ipynb"))
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "basic.ipynb"))
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "basic_failing.ipynb"))

-    runner = CliRunner()
-    result = runner.invoke(cmd_stage.list_staged, ["-p", tmp_path])
+    result = runner.invoke(cmd_notebook.list_nbs_in_project, [])
     assert result.exception is None, result.output
     assert result.exit_code == 0, result.output
     assert "basic.ipynb" in result.output.strip(), result.output


-def test_show_staged(tmp_path):
-    db = JupyterCacheBase(str(tmp_path))
+def test_show_project_record(runner: Runner):
+    db = runner.create_cache()
     db.cache_notebook_file(
         path=os.path.join(NB_PATH, "basic.ipynb"), check_validity=False
     )
-    db.stage_notebook_file(path=os.path.join(NB_PATH, "basic.ipynb"))
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "basic.ipynb"))

-    runner = CliRunner()
-    result = runner.invoke(cmd_stage.show_staged, ["-p", tmp_path, "1"])
+    result = runner.invoke(cmd_notebook.show_project_record, ["1"])
     assert result.exception is None, result.output
     assert result.exit_code == 0, result.output
     assert "basic.ipynb" in result.output.strip(), result.output
+
+
+def test_project_execute(runner: Runner):
+    db = runner.create_cache()
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "basic.ipynb"))
+    result = runner.invoke(cmd_project.execute_nbs, [])
+    assert result.exception is None, result.output
+    assert result.exit_code == 0, result.output
+    assert len(db.list_cache_records()) == 1
+
+
+def test_project_merge(runner: Runner, tmp_path: Path):
+    db = runner.create_cache()
+    record = db.add_nb_to_project(path=os.path.join(NB_PATH, "basic_unrun.ipynb"))
+    db.cache_notebook_file(
+        path=os.path.join(NB_PATH, "basic.ipynb"),
+        uri="basic.ipynb",
+        check_validity=False,
+    )
+    result = runner.invoke(
+        cmd_notebook.merge_executed,
+        [str(record.pk), str(tmp_path / "output.ipynb")],
+    )
+    assert result.exception is None, result.output
+    assert result.exit_code == 0, result.output
+    assert (tmp_path / "output.ipynb").exists()
+
+
+def test_project_invalidate(runner: Runner):
+    db = runner.create_cache()
+    db.cache_notebook_file(
+        path=os.path.join(NB_PATH, "basic.ipynb"), check_validity=False
+    )
+    db.add_nb_to_project(path=os.path.join(NB_PATH, "basic.ipynb"))
"basic.ipynb")) + + result = runner.invoke(cmd_notebook.invalidate_nbs, ["1"]) + assert result.exception is None, result.output + assert result.exit_code == 0, result.output + assert db.list_project_records() + assert not db.list_cache_records() diff --git a/tox.ini b/tox.ini index c975f2d..dfbd14a 100644 --- a/tox.ini +++ b/tox.ini @@ -17,15 +17,25 @@ envlist = py38 usedevelop = true [testenv:py{36,37,38,39}] -extras = cli,testing +extras = testing +deps = + black + flake8 +setenv = + SQLALCHEMY_WARN_20 = 1 commands = pytest {posargs} [testenv:cli] -extras = cli +; extras = cli +deps = + ipykernel + jupytext commands = jcache {posargs} [testenv:docs-{clean,update}] -extras = rtd +extras = + cli + rtd whitelist_externals = echo rm