From 96fb7fe4fb0fc36306a5e582784c719d9ca8852a Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Sat, 28 Dec 2024 05:55:47 +0100 Subject: [PATCH 01/18] Remove unit tests for 3.9 and 3.8 --- .github/workflows/unitest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index e1fb7eb..0a53777 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.10", "3.11"] steps: - uses: actions/checkout@v4 @@ -30,4 +30,4 @@ jobs: - name: Run tests run: | - pytest \ No newline at end of file + pytest From eb73a0cc1f820bdffed3004a1e8b5711585625be Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sat, 28 Dec 2024 05:59:11 +0100 Subject: [PATCH 02/18] chore: add pre-commit config, type hints, badges, and lint codebase (#57) * chore: add pre-commit config, type hints, badges, and lint codebase - Add .pre-commit-config.yaml and pyproject.toml for Black and isort - Add missing type hints throughout the code (Dict[...] for Python 3.8 compatibility) - Added badges and convert existing badges to use format - Lint Markdown files - Lint Jinja templates with djlint * Resolve error and fix remaining type hint violations * Fix absolute imports and mock paths in test_clone.py to resolve test failures. * Replace deprecated 'dotenv' with 'python-dotenv' in requirements.txt to resolve installation errors. 
--- .github/workflows/unitest.yml | 6 +- .gitignore | 1 + .pre-commit-config.yaml | 78 +++++++++ CODE_OF_CONDUCT.md | 12 +- README.md | 80 ++++++--- SECURITY.md | 2 +- pyproject.toml | 17 ++ pytest.ini | 3 +- requirements.txt | 15 +- setup.py | 4 +- src/config.py | 2 +- src/gitingest/__init__.py | 10 +- src/gitingest/cli.py | 24 ++- src/gitingest/clone.py | 22 +-- src/gitingest/ignore_patterns.py | 102 +++++++++++ src/gitingest/ingest.py | 37 +++- src/gitingest/ingest_from_query.py | 136 ++++++++++----- src/gitingest/parse_query.py | 97 ++++------- src/gitingest/tests/conftest.py | 2 +- src/gitingest/tests/test_clone.py | 46 ++--- src/gitingest/tests/test_ingest.py | 72 ++++---- src/gitingest/tests/test_parse_query.py | 29 ++-- src/gitingest/utils.py | 19 +- src/main.py | 60 ++++--- src/process_query.py | 103 ++++++++--- src/routers/__init__.py | 8 +- src/routers/download.py | 23 +-- src/routers/dynamic.py | 28 +-- src/routers/index.py | 37 ++-- src/server_utils.py | 13 +- src/static/favicon.svg | 2 +- src/static/js/snow.js | 2 +- src/static/robots.txt | 3 +- src/templates/api.jinja | 60 +++---- src/templates/base.jinja | 111 ++++++------ src/templates/components/footer.jinja | 22 ++- src/templates/components/github_form.jinja | 108 ++++++------ src/templates/components/navbar.jinja | 30 ++-- src/templates/components/result.jinja | 191 +++++++++------------ src/templates/github.jinja | 50 +++--- src/templates/index.jinja | 104 +++++------ 41 files changed, 1048 insertions(+), 723 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml create mode 100644 src/gitingest/ignore_patterns.py diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index 0a53777..6759ecd 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -15,19 +15,19 @@ jobs: steps: - uses: actions/checkout@v4 - + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ 
matrix.python-version }} - + - name: Install dependencies run: | python -m pip install --upgrade pip pip install pytest pytest-asyncio if [ -f requirements.txt ]; then pip install -r requirements.txt; fi pip install -e . - + - name: Run tests run: | pytest diff --git a/.gitignore b/.gitignore index e98f538..09c9945 100644 --- a/.gitignore +++ b/.gitignore @@ -131,6 +131,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.python-version # Spyder project settings .spyderproject diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..1b3eabd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,78 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + # Files + - id: check-added-large-files + description: 'Prevent large files from being committed.' + args: ['--maxkb=10000'] + - id: check-case-conflict + description: 'Check for files that would conflict in case-insensitive filesystems.' + - id: fix-byte-order-marker + description: 'Remove utf-8 byte order marker.' + - id: mixed-line-ending + description: 'Replace mixed line ending.' + + # Links + - id: destroyed-symlinks + description: 'Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to.' + + # File files for parseable syntax: python + - id: check-ast + + # File and line endings + - id: end-of-file-fixer + description: 'Ensure that a file is either empty, or ends with one newline.' + - id: trailing-whitespace + description: 'Trim trailing whitespace.' + + # Python + - id: check-docstring-first + description: 'Check a common error of defining a docstring after code.' + - id: requirements-txt-fixer + description: 'Sort entries in requirements.txt.' + + - repo: https://github.com/MarcoGorelli/absolufy-imports + rev: v0.3.1 + hooks: + - id: absolufy-imports + description: 'Automatically convert relative imports to absolute. 
(Use `args: [--never]` to revert.)' + + - repo: https://github.com/psf/black + rev: 24.10.0 + hooks: + - id: black + + - repo: https://github.com/asottile/pyupgrade + rev: v3.19.1 + hooks: + - id: pyupgrade + description: 'Automatically upgrade syntax for newer versions.' + args: [--py3-plus, --py36-plus, --py38-plus] + + - repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.10.0 + hooks: + - id: python-check-blanket-noqa + description: 'Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`.' + - id: python-check-blanket-type-ignore + description: 'Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`.' + - id: python-use-type-annotations + description: 'Enforce that python3.6+ type annotations are used instead of type comments.' + + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + description: 'Sort imports alphabetically, and automatically separated into sections and by type.' + + - repo: https://github.com/hadialqattan/pycln + rev: v2.4.0 + hooks: + - id: pycln + description: 'Remove unused import statements.' + + - repo: https://github.com/djlint/djLint + rev: v1.36.4 + hooks: + - id: djlint-reformat-jinja diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 48ba75f..2293c26 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -60,7 +60,7 @@ representative at an online or offline event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -romain@coderamp.io. +. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the @@ -114,15 +114,13 @@ the community. 
## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.0, available at -https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. +. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/diversity). -[homepage]: https://www.contributor-covenant.org - For answers to common questions about this code of conduct, see the FAQ at -https://www.contributor-covenant.org/faq. Translations are available at -https://www.contributor-covenant.org/translations. +. Translations are available at +. diff --git a/README.md b/README.md index 991aeaf..6d0747a 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,56 @@ -[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com/) +[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) + + + + License + + + + PyPI version + + + + Downloads + + + + GitHub issues + + + + Code style: black + + + + + Discord + + +# GitIngest -![License](https://img.shields.io/badge/license-MIT-blue.svg) - -# GitIngest πŸ” Turn any Git repository into a prompt-friendly text ingest for LLMs. 
You can also replace `hub` with `ingest` in any github url to access the coresponding digest -[gitingest.com](https://gitingest.com/) - +[gitingest.com](https://gitingest.com) ## πŸš€ Features - **Easy code context**: Get a text digest from a git repository URL or a directory - **Smart Formatting**: Optimized output format for LLM prompts -- **Statistics about**: : +- **Statistics about**: - File and directory structure - Size of the extract - - Token count + - Token count - **CLI tool**: Run it as a command (Currently on Linux only) - **Python package**: Import it in your code - ## πŸ“¦ Installation -``` +``` bash pip install gitingest ``` - ## πŸ’‘ Command Line usage The `gitingest` command line tool allows you to analyze codebases and create a text dump of their contents. @@ -46,60 +68,62 @@ gitingest --help This will write the digest in a text file (default `digest.txt`) in your current working directory. - ## πŸ› Python package usage - ```python from gitingest import ingest summary, tree, content = ingest("path/to/directory") -#or from URL +# or from URL summary, tree, content = ingest("https://github.com/cyclotruc/gitingest") ``` By default, this won't write a file but can be enabled with the `output` argument - ## πŸ› οΈ Using + - Tailwind CSS - Frontend - [FastAPI](https://github.com/fastapi/fastapi) - Backend framework - [tiktoken](https://github.com/openai/tiktoken) - Token estimation - [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics +## 🌐 Self-host -## 🌐 Self-host 1. Build the image: -``` + +``` bash docker build -t gitingest . ``` 2. Run the container: -``` + +``` bash docker run -d --name gitingest -p 8000:8000 gitingest ``` + The application will be available at `http://localhost:8000` Ensure environment variables are set before running the application or deploying it via Docker. ## βœ”οΈ Contributing -Contributions are welcome! +Contributions are welcome! 
Gitingest aims to be friendly for first time contributors, with a simple python and html codebase. If you need any help while working with the code, reach out to us on [discord](https://discord.com/invite/zerRaGK9EC) -### Ways to contribute +### Ways to contribute 1. Provide your feedback and ideas on discord -2. Open an Issue on github to report a bug -2. Create a Pull request +2. Open an Issue on github to report a bug +3. Create a Pull request - Fork the repository - Make your changes and test them locally - Open a pull request for review and feedback ### πŸ”§ Local dev -#### Environment Configuration +#### Environment Configuration + - **`ALLOWED_HOSTS`**: Specify allowed hostnames for the application. Default: `"gitingest.com,*.gitingest.com,gitdigest.dev,localhost"`. You can configure the application using the following environment variables: @@ -108,23 +132,25 @@ ALLOWED_HOSTS="gitingest.local,localhost" ``` #### Run locally -1. Clone the repository + +1. Clone the repository + ```bash git clone https://github.com/cyclotruc/gitingest.git cd gitingest ``` 2. Install dependencies + ```bash pip install -r requirements.txt ``` 3. Run the application: + ```bash cd src uvicorn main:app --reload ``` -The frontend will be available at `localhost:8000` - - +The frontend will be available at `localhost:8000` diff --git a/SECURITY.md b/SECURITY.md index cf4a494..90a6d68 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,4 +2,4 @@ ## Reporting a Vulnerability -If you have discovered a vulnerability inside the project, report it privately at romain@coderamp.io. This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. +If you have discovered a vulnerability inside the project, report it privately at . This way the maintainer can work on a proper fix without disclosing the problem to the public before it has been solved. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f7d6c65 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,17 @@ +[tool.pylint.format] +max-line-length = 119 + +[tool.pycln] +all = true + +[tool.isort] +profile = "black" +line_length = 119 +remove_redundant_aliases = true +float_to_top = true +order_by_type = true +filter_files = true + +[tool.black] +line-length = 119 +skip-string-normalization = true diff --git a/pytest.ini b/pytest.ini index 7444d64..2a15500 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,7 +3,6 @@ pythonpath = src testpaths = src/gitingest/tests asyncio_mode = auto - python_files = test_*.py python_classes = Test* -python_functions = test_* \ No newline at end of file +python_functions = test_* diff --git a/requirements.txt b/requirements.txt index 6848603..2688a88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,13 @@ -fastapi[standard] -uvicorn +black +click>=8.0.0 +djlint fastapi-analytics -slowapi -tiktoken +fastapi[standard] +pre-commit pytest pytest-asyncio -click>=8.0.0 +python-dotenv +slowapi +starlette +tiktoken +uvicorn diff --git a/setup.py b/setup.py index 8afe6b7..6778a92 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( name="gitingest", @@ -28,4 +28,4 @@ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", ], -) \ No newline at end of file +) diff --git a/src/config.py b/src/config.py index cdf2849..b918fb2 100644 --- a/src/config.py +++ b/src/config.py @@ -1,4 +1,4 @@ -MAX_DISPLAY_SIZE = 300000 +MAX_DISPLAY_SIZE = 300_000 TMP_BASE_PATH = "../tmp" EXAMPLE_REPOS = [ diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index ed84b21..212fefc 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,6 +1,6 @@ -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query -from 
.ingest import ingest +from gitingest.clone import clone_repo +from gitingest.ingest import ingest +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query -__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] \ No newline at end of file +__all__ = ["ingest_from_query", "clone_repo", "parse_query", "ingest"] diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 81823e6..14df219 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,10 +1,12 @@ import os -import pathlib +from typing import Optional, Tuple + import click +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.ingest import ingest from gitingest.ingest_from_query import MAX_FILE_SIZE -from gitingest.parse_query import DEFAULT_IGNORE_PATTERNS + def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() @@ -13,30 +15,38 @@ def normalize_pattern(pattern: str) -> str: pattern += "*" return pattern + @click.command() @click.argument('source', type=str, required=True) @click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') @click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') @click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') @click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') -def main(source, output, max_size, exclude_pattern, include_pattern): +def main( + source: str, + output: Optional[str], + max_size: int, + exclude_pattern: Tuple[str, ...], + include_pattern: Tuple[str, ...], +) -> None: """Analyze a directory and create a text dump of its contents.""" try: # Combine default and custom ignore patterns exclude_patterns = list(exclude_pattern) include_patterns = list(set(include_pattern)) - + if not output: output = "digest.txt" summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, 
output=output) - + click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") click.echo(summary) - + except Exception as e: click.echo(f"Error: {str(e)}", err=True) raise click.Abort() + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index 4b69bc7..e7994c1 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,10 +1,11 @@ import asyncio -from typing import Tuple +from typing import Any, Dict, Tuple from gitingest.utils import async_timeout CLONE_TIMEOUT = 20 + async def check_repo_exists(url: str) -> bool: proc = await asyncio.create_subprocess_exec( "curl", @@ -20,14 +21,15 @@ async def check_repo_exists(url: str) -> bool: stdout_str = stdout.decode() return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str + @async_timeout(CLONE_TIMEOUT) -async def clone_repo(query: dict) -> str: +async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]: if not await check_repo_exists(query['url']): raise ValueError("Repository not found, make sure it is public") - + if query['commit']: proc = await asyncio.create_subprocess_exec( - "git", + "git", "clone", "--single-branch", query['url'], @@ -36,21 +38,21 @@ async def clone_repo(query: dict) -> str: stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() - + proc = await asyncio.create_subprocess_exec( "git", "-C", query['local_path'], "checkout", query['branch'], - stdout=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await proc.communicate() elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: proc = await asyncio.create_subprocess_exec( "git", - "clone", + "clone", "--depth=1", "--single-branch", "--branch", @@ -71,7 +73,7 @@ async def clone_repo(query: dict) -> str: stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - + stdout, stderr = 
await proc.communicate() - - return stdout, stderr \ No newline at end of file + + return stdout, stderr diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py new file mode 100644 index 0000000..803c6ed --- /dev/null +++ b/src/gitingest/ignore_patterns.py @@ -0,0 +1,102 @@ +from typing import List + +DEFAULT_IGNORE_PATTERNS: List[str] = [ + # Python + '*.pyc', + '*.pyo', + '*.pyd', + '__pycache__', + '.pytest_cache', + '.coverage', + '.tox', + '.nox', + '.mypy_cache', + '.ruff_cache', + '.hypothesis', + 'poetry.lock', + 'Pipfile.lock', + # JavaScript/Node + 'node_modules', + 'bower_components', + 'package-lock.json', + 'yarn.lock', + '.npm', + '.yarn', + '.pnpm-store', + # Version control + '.git', + '.svn', + '.hg', + '.gitignore', + '.gitattributes', + '.gitmodules', + # Images and media + '*.svg', + '*.png', + '*.jpg', + '*.jpeg', + '*.gif', + '*.ico', + '*.pdf', + '*.mov', + '*.mp4', + '*.mp3', + '*.wav', + # Virtual environments + 'venv', + '.venv', + 'env', + '.env', + 'virtualenv', + # IDEs and editors + '.idea', + '.vscode', + '.vs', + '*.swp', + '*.swo', + '*.swn', + '.settings', + '.project', + '.classpath', + '*.sublime-*', + # Temporary and cache files + '*.log', + '*.bak', + '*.swp', + '*.tmp', + '*.temp', + '.cache', + '.sass-cache', + '.eslintcache', + '.DS_Store', + 'Thumbs.db', + 'desktop.ini', + # Build directories and artifacts + 'build', + 'dist', + 'target', + 'out', + '*.egg-info', + '*.egg', + '*.whl', + '*.so', + '*.dylib', + '*.dll', + '*.class', + # Documentation + 'site-packages', + '.docusaurus', + '.next', + '.nuxt', + # Other common patterns + ## Minified files + '*.min.js', + '*.min.css', + ## Source maps + '*.map', + ## Terraform + '.terraform', + '*.tfstate*', + ## Dependencies in various languages + 'vendor/', +] diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index eac2081..22fae6d 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -1,18 +1,36 @@ import asyncio +import 
inspect import shutil -from typing import Union, List from pathlib import Path +from typing import List, Optional, Tuple, Union -from .ingest_from_query import ingest_from_query -from .clone import clone_repo -from .parse_query import parse_query +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query -def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: Union[List[str], str] = None, exclude_patterns: Union[List[str], str] = None, output: str = None) -> str: + +def ingest( + source: str, + max_file_size: int = 10 * 1024 * 1024, + include_patterns: Union[List[str], str, None] = None, + exclude_patterns: Union[List[str], str, None] = None, + output: Optional[str] = None, +) -> Tuple[str, str, str]: try: - query = parse_query(source, max_file_size, False, include_patterns, exclude_patterns) + query = parse_query( + source=source, + max_file_size=max_file_size, + from_web=False, + include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + ) if query['url']: - asyncio.run(clone_repo(query)) - + clone_result = clone_repo(query) + if inspect.iscoroutine(clone_result): + asyncio.run(clone_result) + else: + raise TypeError("clone_repo did not return a coroutine as expected.") + summary, tree, content = ingest_from_query(query) if output: @@ -20,9 +38,10 @@ def ingest(source: str, max_file_size: int = 10 * 1024 * 1024, include_patterns: f.write(tree + "\n" + content) return summary, tree, content + finally: # Clean up the temporary directory if it was created if query['url']: # Get parent directory two levels up from local_path (../tmp) cleanup_path = str(Path(query['local_path']).parents[1]) - shutil.rmtree(cleanup_path, ignore_errors=True) \ No newline at end of file + shutil.rmtree(cleanup_path, ignore_errors=True) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 4e7d5e7..0080c25 100644 --- 
a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,13 +1,13 @@ import os from fnmatch import fnmatch -from typing import Dict, List, Union -import tiktoken +from typing import Any, Dict, List, Optional, Set, Tuple +import tiktoken -MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB +MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal -MAX_FILES = 10000 # Maximum number of files to process -MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500MB +MAX_FILES = 10_000 # Maximum number of files to process +MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: @@ -18,6 +18,7 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo include = True return include + def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) for pattern in ignore_patterns: @@ -27,6 +28,7 @@ def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> boo return True return False + def is_safe_symlink(symlink_path: str, base_path: str) -> bool: """Check if a symlink points to a location within the base directory.""" try: @@ -37,23 +39,32 @@ def is_safe_symlink(symlink_path: str, base_path: str) -> bool: # If there's any error resolving the paths, consider it unsafe return False + def is_text_file(file_path: str) -> bool: """Determines if a file is likely a text file based on its content.""" try: with open(file_path, 'rb') as file: chunk = file.read(1024) return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) - except IOError: + except OSError: return False + def read_file_content(file_path: str) -> str: try: - with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + with open(file_path, encoding='utf-8', errors='ignore') as f: return f.read() except Exception as e: return 
f"Error reading file: {str(e)}" -def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = 0, stats: Dict = None) -> Dict: + +def scan_directory( + path: str, + query: Dict[str, Any], + seen_paths: Optional[Set[str]] = None, + depth: int = 0, + stats: Optional[Dict[str, int]] = None, +) -> Optional[Dict[str, Any]]: """Recursively analyzes a directory and its contents with safety limits.""" if seen_paths is None: seen_paths = set() @@ -76,6 +87,7 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = if real_path in seen_paths: print(f"Skipping already visited path: {path}") return None + seen_paths.add(real_path) result = { @@ -86,7 +98,7 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "file_count": 0, "dir_count": 0, "path": path, - "ignore_content": False + "ignore_content": False, } ignore_patterns = query['ignore_patterns'] @@ -137,14 +149,20 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "type": "file", "size": file_size, "content": content, - "path": item_path + "path": item_path, } result["children"].append(child) result["size"] += file_size result["file_count"] += 1 elif os.path.isdir(real_path): - subdir = scan_directory(real_path, query, seen_paths, depth + 1, stats) + subdir = scan_directory( + path=real_path, + query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) if subdir and (not include_patterns or subdir["file_count"] > 0): subdir["name"] = item subdir["path"] = item_path @@ -175,14 +193,20 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = "type": "file", "size": file_size, "content": content, - "path": item_path + "path": item_path, } result["children"].append(child) result["size"] += file_size result["file_count"] += 1 elif os.path.isdir(item_path): - subdir = scan_directory(item_path, query, seen_paths, depth + 1, stats) + subdir = scan_directory( + path=item_path, + 
query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) if subdir and (not include_patterns or subdir["file_count"] > 0): result["children"].append(subdir) result["size"] += subdir["size"] @@ -194,7 +218,13 @@ def scan_directory(path: str, query: dict, seen_paths: set = None, depth: int = return result -def extract_files_content(query: dict, node: Dict, max_file_size: int, files: List = None) -> List[Dict]: + +def extract_files_content( + query: Dict[str, Any], + node: Dict[str, Any], + max_file_size: int, + files: Optional[List[Dict[str, Any]]] = None, +) -> List[Dict[str, Any]]: """Recursively collects all text files with their contents.""" if files is None: files = [] @@ -204,17 +234,21 @@ def extract_files_content(query: dict, node: Dict, max_file_size: int, files: Li if node["size"] > max_file_size: content = None - files.append({ - "path": node["path"].replace(query['local_path'], ""), - "content": content, - "size": node["size"] - }) + files.append( + { + "path": node["path"].replace(query['local_path'], ""), + "content": content, + "size": node["size"], + }, + ) elif node["type"] == "directory": for child in node["children"]: - extract_files_content(query, child, max_file_size, files) + extract_files_content(query=query, node=child, max_file_size=max_file_size, files=files) + return files -def create_file_content_string(files: List[Dict]) -> str: + +def create_file_content_string(files: List[Dict[str, Any]]) -> str: """Creates a formatted string of file contents with separators.""" output = "" separator = "=" * 48 + "\n" @@ -223,6 +257,7 @@ def create_file_content_string(files: List[Dict]) -> str: for file in files: if not file['content']: continue + if file['path'].lower() == '/readme.md': output += separator output += f"File: {file['path']}\n" @@ -234,6 +269,7 @@ def create_file_content_string(files: List[Dict]) -> str: for file in files: if not file['content'] or file['path'].lower() == '/readme.md': continue + output += separator 
output += f"File: {file['path']}\n" output += separator @@ -241,12 +277,14 @@ def create_file_content_string(files: List[Dict]) -> str: return output -def create_summary_string(query: dict, nodes: Dict, files: List[Dict]) -> str: + +def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str: """Creates a summary string with file counts and content size.""" if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" else: summary = f"Repository: {query['slug']}\n" + summary += f"Files analyzed: {nodes['file_count']}\n" if 'subpath' in query and query['subpath'] != '/': @@ -255,11 +293,19 @@ def create_summary_string(query: dict, nodes: Dict, files: List[Dict]) -> str: summary += f"Commit: {query['commit']}\n" elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: summary += f"Branch: {query['branch']}\n" + return summary -def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bool = True) -> str: + +def create_tree_structure( + query: Dict[str, Any], + node: Dict[str, Any], + prefix: str = "", + is_last: bool = True, +) -> str: """Creates a tree-like string representation of the file structure.""" tree = "" + if not node["name"]: node["name"] = query['slug'] @@ -267,6 +313,7 @@ def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bo current_prefix = "└── " if is_last else "β”œβ”€β”€ " name = node["name"] + "/" if node["type"] == "directory" else node["name"] tree += prefix + current_prefix + name + "\n" + if node["type"] == "directory": # Adjust prefix only if we added a node name new_prefix = prefix + (" " if is_last else "β”‚ ") if node["name"] else prefix @@ -276,25 +323,29 @@ def create_tree_structure(query: dict, node: Dict, prefix: str = "", is_last: bo return tree -def generate_token_string(context_string: str) -> str: + +def generate_token_string(context_string: str) -> 
Optional[str]: """Returns the number of tokens in a text string.""" formatted_tokens = "" try: - encoding = tiktoken.get_encoding("cl100k_base", ) + encoding = tiktoken.get_encoding("cl100k_base") total_tokens = len(encoding.encode(context_string, disallowed_special=())) - + except Exception as e: print(e) return None - if total_tokens > 1000000: - formatted_tokens = f"{total_tokens/1000000:.1f}M" - elif total_tokens > 1000: - formatted_tokens = f"{total_tokens/1000:.1f}k" + + if total_tokens > 1_000_000: + formatted_tokens = f"{total_tokens / 1_000_000:.1f}M" + elif total_tokens > 1_000: + formatted_tokens = f"{total_tokens / 1_000:.1f}k" else: formatted_tokens = f"{total_tokens}" + return formatted_tokens -def ingest_single_file(path: str, query: dict) -> Dict: + +def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: if not os.path.isfile(path): raise ValueError(f"Path {path} is not a file") @@ -310,7 +361,7 @@ def ingest_single_file(path: str, query: dict) -> Dict: file_info = { "path": path.replace(query['local_path'], ""), "content": content, - "size": file_size + "size": file_size, } summary = ( @@ -326,11 +377,15 @@ def ingest_single_file(path: str, query: dict) -> Dict: formatted_tokens = generate_token_string(files_content) if formatted_tokens: summary += f"\nEstimated tokens: {formatted_tokens}" - return (summary, tree, files_content) -def ingest_directory(path: str, query: dict) -> Dict: - nodes = scan_directory(path, query) - files = extract_files_content(query, nodes, query['max_file_size']) + return summary, tree, files_content + + +def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: + nodes = scan_directory(path=path, query=query) + if not nodes: + raise ValueError(f"No files found in {path}") + files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) summary = create_summary_string(query, nodes, files) tree = "Directory structure:\n" + 
create_tree_structure(query, nodes) files_content = create_file_content_string(files) @@ -338,9 +393,11 @@ def ingest_directory(path: str, query: dict) -> Dict: formatted_tokens = generate_token_string(tree + files_content) if formatted_tokens: summary += f"\nEstimated tokens: {formatted_tokens}" - return (summary, tree, files_content) -def ingest_from_query(query: dict) -> Dict: + return summary, tree, files_content + + +def ingest_from_query(query: Dict[str, Any]) -> Tuple[str, str, str]: """Main entry point for analyzing a codebase directory or single file.""" path = f"{query['local_path']}{query['subpath']}" if not os.path.exists(path): @@ -348,6 +405,5 @@ def ingest_from_query(query: dict) -> Dict: if query.get('type') == 'blob': return ingest_single_file(path, query) - else: - return ingest_directory(path, query) + return ingest_directory(path, query) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 8b8f97a..669f28f 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,55 +1,13 @@ -from typing import List, Union +import os import uuid -import os - - -DEFAULT_IGNORE_PATTERNS = [ - # Python - '*.pyc', '*.pyo', '*.pyd', '__pycache__', '.pytest_cache', '.coverage', - '.tox', '.nox', '.mypy_cache', '.ruff_cache', '.hypothesis', - 'poetry.lock', 'Pipfile.lock', - - # JavaScript/Node - 'node_modules', 'bower_components', 'package-lock.json', 'yarn.lock', - '.npm', '.yarn', '.pnpm-store', - - # Version control - '.git', '.svn', '.hg', '.gitignore', '.gitattributes', '.gitmodules', - - # Images and media - '*.svg', '*.png', '*.jpg', '*.jpeg', '*.gif', '*.ico', '*.pdf', - '*.mov', '*.mp4', '*.mp3', '*.wav', - - # Virtual environments - 'venv', '.venv', 'env', '.env', 'virtualenv', - - # IDEs and editors - '.idea', '.vscode', '.vs', '*.swp', '*.swo', '*.swn', - '.settings', '.project', '.classpath', '*.sublime-*', - - # Temporary and cache files - '*.log', '*.bak', '*.swp', '*.tmp', '*.temp', - '.cache', 
'.sass-cache', '.eslintcache', - '.DS_Store', 'Thumbs.db', 'desktop.ini', - - # Build directories and artifacts - 'build', 'dist', 'target', 'out', - '*.egg-info', '*.egg', '*.whl', - '*.so', '*.dylib', '*.dll', '*.class', - - # Documentation - 'site-packages', '.docusaurus', '.next', '.nuxt', - - # Other common patterns - '*.min.js', '*.min.css', # Minified files - '*.map', # Source maps - '.terraform', '*.tfstate*', # Terraform - 'vendor/', # Dependencies in various languages -] +from typing import Any, Dict, List, Optional, Union + +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH = "../tmp" -def parse_url(url: str) -> dict: + +def parse_url(url: str) -> Dict[str, Any]: parsed = { "user_name": None, "repo_name": None, @@ -62,22 +20,22 @@ def parse_url(url: str) -> dict: "slug": None, "id": None, } - + url = url.split(" ")[0] if not url.startswith('https://'): url = 'https://' + url - + # Extract domain and path url_parts = url.split('/') domain = url_parts[2] path_parts = url_parts[3:] - + if len(path_parts) < 2: raise ValueError("Invalid repository URL. 
Please provide a valid Git repository URL.") - + parsed["user_name"] = path_parts[0] parsed["repo_name"] = path_parts[1] - + # Keep original URL format parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" @@ -89,10 +47,12 @@ def parse_url(url: str) -> dict: parsed["branch"] = path_parts[3] if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): parsed["commit"] = parsed['branch'] - + parsed["subpath"] = "/" + "/".join(path_parts[4:]) + return parsed + def normalize_pattern(pattern: str) -> str: pattern = pattern.strip() pattern = pattern.lstrip(os.sep) @@ -100,16 +60,21 @@ def normalize_pattern(pattern: str) -> str: pattern += "*" return pattern + def parse_patterns(pattern: Union[List[str], str]) -> List[str]: if isinstance(pattern, list): pattern = ",".join(pattern) for p in pattern.split(","): if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): - raise ValueError(f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed.") + raise ValueError( + f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " + "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." 
+ ) patterns = [normalize_pattern(p) for p in pattern.split(",")] return patterns + def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: for pattern in include_patterns: if pattern in ignore_patterns: @@ -117,8 +82,7 @@ def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[ return ignore_patterns -def parse_path(path: str) -> dict: - +def parse_path(path: str) -> Dict[str, Any]: query = { "local_path": os.path.abspath(path), "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), @@ -128,7 +92,14 @@ def parse_path(path: str) -> dict: } return query -def parse_query(source: str, max_file_size: int, from_web: bool, include_patterns: Union[List[str], str] = None, ignore_patterns: Union[List[str], str] = None) -> dict: + +def parse_query( + source: str, + max_file_size: int, + from_web: bool, + include_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, +) -> Dict[str, Any]: if from_web: query = parse_url(source) else: @@ -136,21 +107,21 @@ def parse_query(source: str, max_file_size: int, from_web: bool, include_pattern query = parse_url(source) else: query = parse_path(source) + query['max_file_size'] = max_file_size if ignore_patterns and ignore_patterns != "": ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) else: ignore_patterns = DEFAULT_IGNORE_PATTERNS - + if include_patterns and include_patterns != "": include_patterns = parse_patterns(include_patterns) ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) - else: + else: include_patterns = None - + query['ignore_patterns'] = ignore_patterns query['include_patterns'] = include_patterns - - return query + return query diff --git a/src/gitingest/tests/conftest.py b/src/gitingest/tests/conftest.py index d31de7b..31dba62 100644 --- a/src/gitingest/tests/conftest.py +++ b/src/gitingest/tests/conftest.py @@ -6,4 +6,4 
@@ # Add both the project root and src directory to PYTHONPATH sys.path.insert(0, project_root) -sys.path.insert(0, os.path.join(project_root, 'src')) \ No newline at end of file +sys.path.insert(0, os.path.join(project_root, 'src')) diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 06579b6..680181c 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -1,72 +1,78 @@ +from unittest.mock import AsyncMock, patch + import pytest -from clone import clone_repo, check_repo_exists -from unittest.mock import patch, AsyncMock + +from gitingest.clone import check_repo_exists, clone_repo + @pytest.mark.asyncio -async def test_clone_repo_with_commit(): +async def test_clone_repo_with_commit() -> None: query = { 'commit': 'a' * 40, # Simulating a valid commit hash 'branch': 'main', 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - - with patch('clone.check_repo_exists', return_value=True) as mock_check: + + with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process - + await clone_repo(query) mock_check.assert_called_once_with(query['url']) assert mock_exec.call_count == 2 # Clone and checkout calls + @pytest.mark.asyncio -async def test_clone_repo_without_commit(): +async def test_clone_repo_without_commit() -> None: query = { 'commit': None, 'branch': 'main', 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - - with patch('clone.check_repo_exists', return_value=True) as mock_check: + + with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() 
mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process - + await clone_repo(query) mock_check.assert_called_once_with(query['url']) assert mock_exec.call_count == 1 # Only clone call + @pytest.mark.asyncio -async def test_clone_repo_nonexistent_repository(): +async def test_clone_repo_nonexistent_repository() -> None: query = { 'commit': None, 'branch': 'main', 'url': 'https://github.com/user/nonexistent-repo', - 'local_path': '/tmp/repo' + 'local_path': '/tmp/repo', } - + with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): await clone_repo(query) mock_check.assert_called_once_with(query['url']) + @pytest.mark.asyncio -async def test_check_repo_exists(): +async def test_check_repo_exists() -> None: url = "https://github.com/user/repo" - + with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'HTTP/1.1 200 OK\n', b'') mock_exec.return_value = mock_process - + # Test existing repository mock_process.returncode = 0 assert await check_repo_exists(url) is True - + # Test non-existing repository (404 response) mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') mock_process.returncode = 0 @@ -74,4 +80,4 @@ async def test_check_repo_exists(): # Test failed request mock_process.returncode = 1 - assert await check_repo_exists(url) is False \ No newline at end of file + assert await check_repo_exists(url) is False diff --git a/src/gitingest/tests/test_ingest.py b/src/gitingest/tests/test_ingest.py index 19a57b5..33b174b 100644 --- a/src/gitingest/tests/test_ingest.py +++ b/src/gitingest/tests/test_ingest.py @@ -1,12 +1,14 @@ +from pathlib import Path +from typing import Any, Dict + import pytest -from src.gitingest.ingest_from_query import ( - scan_directory, - extract_files_content, -) + +from 
gitingest.ingest_from_query import extract_files_content, scan_directory + # Test fixtures @pytest.fixture -def sample_query(): +def sample_query() -> Dict[str, Any]: return { 'user_name': 'test_user', 'repo_name': 'test_repo', @@ -14,16 +16,16 @@ def sample_query(): 'subpath': '/', 'branch': 'main', 'commit': None, - 'max_file_size': 1000000, + 'max_file_size': 1_000_000, 'slug': 'test_user/test_repo', 'ignore_patterns': ['*.pyc', '__pycache__', '.git'], 'include_patterns': None, - 'pattern_type': 'exclude' - + 'pattern_type': 'exclude', } + @pytest.fixture -def temp_directory(tmp_path): +def temp_directory(tmp_path: Path) -> Path: # Creates the following structure: # test_repo/ # ├── file1.txt @@ -38,58 +40,57 @@ def temp_directory(tmp_path): # | └── file_dir1.txt # └── dir2/ # └── file_dir2.txt - + test_dir = tmp_path / "test_repo" test_dir.mkdir() - + # Root files (test_dir / "file1.txt").write_text("Hello World") (test_dir / "file2.py").write_text("print('Hello')") - + # src directory and its files src_dir = test_dir / "src" src_dir.mkdir() (src_dir / "subfile1.txt").write_text("Hello from src") (src_dir / "subfile2.py").write_text("print('Hello from src')") - + # src/subdir and its files subdir = src_dir / "subdir" subdir.mkdir() (subdir / "file_subdir.txt").write_text("Hello from subdir") (subdir / "file_subdir.py").write_text("print('Hello from subdir')") - + # dir1 and its file dir1 = test_dir / "dir1" dir1.mkdir() (dir1 / "file_dir1.txt").write_text("Hello from dir1") - + # dir2 and its file dir2 = test_dir / "dir2" dir2.mkdir() (dir2 / "file_dir2.txt").write_text("Hello from dir2") - + return test_dir -def test_scan_directory(temp_directory, sample_query): - result = scan_directory( - str(temp_directory), - query=sample_query - ) - + +def test_scan_directory(temp_directory: Path, sample_query: Dict[str, Any]) -> None: + result = scan_directory(str(temp_directory), query=sample_query) + if result is None: + assert False, "Result is None" + assert 
result['type'] == 'directory' assert result['file_count'] == 8 # All .txt and .py files - assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 + assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 -def test_extract_files_content(temp_directory, sample_query): - nodes = scan_directory( - str(temp_directory), - query=sample_query - ) - - files = extract_files_content(sample_query, nodes, max_file_size=1000000) + +def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any]) -> None: + nodes = scan_directory(str(temp_directory), query=sample_query) + if nodes is None: + assert False, "Nodes is None" + files = extract_files_content(query=sample_query, node=nodes, max_file_size=1_000_000) assert len(files) == 8 # All .txt and .py files - + # Check for presence of key files paths = [f['path'] for f in files] assert any('file1.txt' in p for p in paths) @@ -101,22 +102,17 @@ def test_extract_files_content(temp_directory, sample_query): assert any('file_dir2.txt' in p for p in paths) - # TODO: test with include patterns: ['*.txt'] # TODO: test with wrong include patterns: ['*.qwerty'] -#single folder patterns +# single folder patterns # TODO: test with include patterns: ['src/*'] # TODO: test with include patterns: ['/src/*'] # TODO: test with include patterns: ['/src/'] # TODO: test with include patterns: ['/src*'] -#multiple patterns +# multiple patterns # TODO: test with multiple include patterns: ['*.txt', '*.py'] # TODO: test with multiple include patterns: ['/src/*', '*.txt'] # TODO: test with multiple include patterns: ['/src*', '*.txt'] - - - - diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index da61404..ae4c165 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -1,12 +1,14 @@ import pytest -from gitingest.parse_query import parse_query, parse_url, 
DEFAULT_IGNORE_PATTERNS +from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS +from gitingest.parse_query import parse_query, parse_url -def test_parse_url_valid(): + +def test_parse_url_valid() -> None: test_cases = [ "https://github.com/user/repo", - "https://gitlab.com/user/repo", - "https://bitbucket.org/user/repo" + "https://gitlab.com/user/repo", + "https://bitbucket.org/user/repo", ] for url in test_cases: result = parse_url(url) @@ -14,16 +16,15 @@ def test_parse_url_valid(): assert result["repo_name"] == "repo" assert result["url"] == url -def test_parse_url_invalid(): + +def test_parse_url_invalid() -> None: url = "https://only-domain.com" with pytest.raises(ValueError, match="Invalid repository URL"): parse_url(url) -def test_parse_query_basic(): - test_cases = [ - "https://github.com/user/repo", - "https://gitlab.com/user/repo" - ] + +def test_parse_query_basic() -> None: + test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"] for url in test_cases: result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns='*.txt') assert result["user_name"] == "user" @@ -31,13 +32,15 @@ def test_parse_query_basic(): assert result["url"] == url assert "*.txt" in result["ignore_patterns"] -def test_parse_query_include_pattern(): + +def test_parse_query_include_pattern() -> None: url = "https://github.com/user/repo" result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') assert result["include_patterns"] == ["*.py"] assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS -def test_parse_query_invalid_pattern(): + +def test_parse_query_invalid_pattern() -> None: url = "https://github.com/user/repo" with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') \ No newline at end of file + parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') diff --git 
a/src/gitingest/utils.py b/src/gitingest/utils.py index ebbb409..1f07b53 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -1,22 +1,27 @@ - ## Async Timeout decorator import asyncio import functools -from typing import TypeVar, Callable +from typing import Awaitable, Callable, ParamSpec, TypeVar T = TypeVar("T") +P = ParamSpec("P") + class AsyncTimeoutError(Exception): """Raised when an async operation exceeds its timeout limit.""" + pass -def async_timeout(seconds: int = 10): - def decorator(func: Callable[..., T]) -> Callable[..., T]: + +def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: + def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: @functools.wraps(func) - async def wrapper(*args, **kwargs) -> T: + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: try: return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) except asyncio.TimeoutError: - raise AsyncTimeoutError(f"Clone timed out after {seconds} seconds") + raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") + return wrapper - return decorator \ No newline at end of file + + return decorator diff --git a/src/main.py b/src/main.py index 3fe9423..a50a1c5 100644 --- a/src/main.py +++ b/src/main.py @@ -1,27 +1,41 @@ import os -from dotenv import load_dotenv +from typing import Dict +from api_analytics.fastapi import Analytics +from dotenv import load_dotenv from fastapi import FastAPI, Request -from fastapi.templating import Jinja2Templates -from fastapi.responses import HTMLResponse, FileResponse, Response +from fastapi.responses import FileResponse, HTMLResponse, Response from fastapi.staticfiles import StaticFiles -from starlette.middleware.trustedhost import TrustedHostMiddleware -from api_analytics.fastapi import Analytics +from fastapi.templating import Jinja2Templates from slowapi import _rate_limit_exceeded_handler from slowapi.errors import RateLimitExceeded 
+from starlette.middleware.trustedhost import TrustedHostMiddleware -from server_utils import limiter from routers import download, dynamic, index - +from server_utils import limiter load_dotenv() app = FastAPI() app.state.limiter = limiter -app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) + + +# Define a wrapper handler with the correct signature +async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: + if isinstance(exc, RateLimitExceeded): + # Delegate to the actual handler + return _rate_limit_exceeded_handler(request, exc) + # Optionally, handle other exceptions or re-raise + raise exc + + +# Register the wrapper handler +app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) app.mount("/static", StaticFiles(directory="static"), name="static") -app.add_middleware(Analytics, api_key=os.getenv('API_ANALYTICS_KEY')) +app_analytics_key = os.getenv('API_ANALYTICS_KEY') +if app_analytics_key: + app.add_middleware(Analytics, api_key=app_analytics_key) # Define the default allowed hosts default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] @@ -36,31 +50,29 @@ app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) templates = Jinja2Templates(directory="templates") + @app.get("/health") -async def health_check(): +async def health_check() -> Dict[str, str]: return {"status": "healthy"} + @app.head("/") -async def head_root(): +async def head_root() -> HTMLResponse: """Mirror the headers and status code of the index page""" - return HTMLResponse( - content=None, - headers={ - "content-type": "text/html; charset=utf-8" - } - ) - + return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) + + @app.get("/api/", response_class=HTMLResponse) @app.get("/api", response_class=HTMLResponse) -async def api_docs(request: Request): - return templates.TemplateResponse( - "api.jinja", {"request": request} - ) +async def 
api_docs(request: Request) -> HTMLResponse: + return templates.TemplateResponse("api.jinja", {"request": request}) + @app.get("/robots.txt") -async def robots(): +async def robots() -> FileResponse: return FileResponse('static/robots.txt') + app.include_router(index) app.include_router(download) -app.include_router(dynamic) \ No newline at end of file +app.include_router(dynamic) diff --git a/src/process_query.py b/src/process_query.py index 18d8d76..466b11d 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -1,16 +1,27 @@ -from typing import List -from fastapi.templating import Jinja2Templates +from typing import Any, Dict + from fastapi import Request +from fastapi.templating import Jinja2Templates +from starlette.templating import _TemplateResponse -from config import MAX_DISPLAY_SIZE, EXAMPLE_REPOS -from gitingest import ingest_from_query, clone_repo, parse_query -from server_utils import logSliderToSize, Colors +from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE +from gitingest.clone import clone_repo +from gitingest.ingest_from_query import ingest_from_query +from gitingest.parse_query import parse_query +from server_utils import Colors, logSliderToSize templates = Jinja2Templates(directory="templates") -def print_query(query, request, max_file_size, pattern_type, pattern): + +def print_query( + query: Dict[str, Any], + request: Request, + max_file_size: int, + pattern_type: str, + pattern: str, +) -> None: print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") - if int(max_file_size/1024) != 50: + if int(max_file_size / 1024) != 50: print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") if pattern_type == "include" and pattern != "": print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") @@ -18,46 +29,74 @@ def print_query(query, request, max_file_size, pattern_type, pattern): print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") -def print_error(query, request, e, max_file_size, 
pattern_type, pattern): +def print_error( + query: Dict[str, Any], + request: Request, + e: Exception, + max_file_size: int, + pattern_type: str, + pattern: str, +) -> None: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print_query(query, request, max_file_size, pattern_type, pattern) print(f" | {Colors.RED}{e}{Colors.END}") -def print_success(query, request, max_file_size, pattern_type, pattern, summary): + +def print_success( + query: Dict[str, Any], + request: Request, + max_file_size: int, + pattern_type: str, + pattern: str, + summary: str, +) -> None: estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") print_query(query, request, max_file_size, pattern_type, pattern) print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") - -async def process_query(request: Request, input_text: str, slider_position: int, pattern_type: str = "exclude", pattern: str = "", is_index: bool = False) -> str: +async def process_query( + request: Request, + input_text: str, + slider_position: int, + pattern_type: str = "exclude", + pattern: str = "", + is_index: bool = False, +) -> _TemplateResponse: template = "index.jinja" if is_index else "github.jinja" max_file_size = logSliderToSize(slider_position) + if pattern_type == "include": include_patterns = pattern exclude_patterns = None elif pattern_type == "exclude": exclude_patterns = pattern include_patterns = None + try: - query = parse_query(input_text, max_file_size, True, include_patterns, exclude_patterns) + query = parse_query( + source=input_text, + max_file_size=max_file_size, + from_web=True, + include_patterns=include_patterns, + ignore_patterns=exclude_patterns, + ) await clone_repo(query) summary, tree, content = ingest_from_query(query) with open(f"{query['local_path']}.txt", "w") as f: f.write(tree + "\n" + content) - - except Exception as e: - #hack to print error message 
when query is not defined + # hack to print error message when query is not defined if 'query' in locals() and query is not None and isinstance(query, dict): - print_error(query, request, e, max_file_size, pattern_type, pattern) + print_error(query, request, e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") + return templates.TemplateResponse( - template, + template, { "request": request, "github_url": input_text, @@ -66,25 +105,37 @@ async def process_query(request: Request, input_text: str, slider_position: int, "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": pattern, - } + }, ) - + if len(content) > MAX_DISPLAY_SIZE: - content = f"(Files content cropped to {int(MAX_DISPLAY_SIZE/1000)}k characters, download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] - print_success(query, request, max_file_size, pattern_type, pattern, summary) + content = ( + f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " + "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + ) + + print_success( + query=query, + request=request, + max_file_size=max_file_size, + pattern_type=pattern_type, + pattern=pattern, + summary=summary, + ) + return templates.TemplateResponse( - template, + template, { - "request": request, + "request": request, "github_url": input_text, - "result": True, + "result": True, "summary": summary, - "tree": tree, + "tree": tree, "content": content, "examples": EXAMPLE_REPOS if is_index else [], "ingest_id": query['id'], "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": pattern, - } + }, ) diff --git a/src/routers/__init__.py b/src/routers/__init__.py index b1871c1..ace7bd0 100644 --- a/src/routers/__init__.py +++ b/src/routers/__init__.py @@ -1,5 +1,5 @@ -from .download import router as download -from .dynamic import router as dynamic -from .index 
import router as index +from routers.download import router as download +from routers.dynamic import router as dynamic +from routers.index import router as index -__all__ = ["download", "dynamic", "index"] \ No newline at end of file +__all__ = ["download", "dynamic", "index"] diff --git a/src/routers/download.py b/src/routers/download.py index e26b2df..95cec0f 100644 --- a/src/routers/download.py +++ b/src/routers/download.py @@ -1,29 +1,30 @@ -from fastapi import HTTPException, APIRouter +import os + +from fastapi import APIRouter, HTTPException from fastapi.responses import Response + from config import TMP_BASE_PATH -import os router = APIRouter() + @router.get("/download/{digest_id}") -async def download_ingest(digest_id: str): +async def download_ingest(digest_id: str) -> Response: try: # Find the first .txt file in the directory directory = f"{TMP_BASE_PATH}/{digest_id}" txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] - + if not txt_files: raise FileNotFoundError("No .txt file found") - - with open(f"{directory}/{txt_files[0]}", "r") as f: + + with open(f"{directory}/{txt_files[0]}") as f: content = f.read() - + return Response( content=content, media_type="text/plain", - headers={ - "Content-Disposition": f"attachment; filename={txt_files[0]}" - } + headers={"Content-Disposition": f"attachment; filename={txt_files[0]}"}, ) except FileNotFoundError: - raise HTTPException(status_code=404, detail="Digest not found") \ No newline at end of file + raise HTTPException(status_code=404, detail="Digest not found") diff --git a/src/routers/dynamic.py b/src/routers/dynamic.py index 6a0a2f9..12216f1 100644 --- a/src/routers/dynamic.py +++ b/src/routers/dynamic.py @@ -1,4 +1,4 @@ -from fastapi import APIRouter, Request, Form +from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates @@ -8,26 +8,34 @@ router = APIRouter() templates = Jinja2Templates(directory="templates") + 
@router.get("/{full_path:path}") -async def catch_all(request: Request, full_path: str): +async def catch_all(request: Request, full_path: str) -> HTMLResponse: return templates.TemplateResponse( "github.jinja", { "request": request, "github_url": f"https://github.com/{full_path}", "loading": True, - "default_file_size": 243 - } + "default_file_size": 243, + }, ) + @router.post("/{full_path:path}", response_class=HTMLResponse) -@limiter.limit("10/minute") +@limiter.limit("10/minute") async def process_catch_all( - request: Request, + request: Request, input_text: str = Form(...), max_file_size: int = Form(...), pattern_type: str = Form(...), - pattern: str = Form(...) -): - return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=False) - \ No newline at end of file + pattern: str = Form(...), +) -> HTMLResponse: + return await process_query( + request, + input_text, + max_file_size, + pattern_type, + pattern, + is_index=False, + ) diff --git a/src/routers/index.py b/src/routers/index.py index 610d87c..f272880 100644 --- a/src/routers/index.py +++ b/src/routers/index.py @@ -1,40 +1,41 @@ -from fastapi import APIRouter, Request, Form +from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates -from server_utils import limiter -from process_query import process_query from config import EXAMPLE_REPOS - +from process_query import process_query +from server_utils import limiter router = APIRouter() templates = Jinja2Templates(directory="templates") @router.get("/", response_class=HTMLResponse) -async def home(request: Request): +async def home(request: Request) -> HTMLResponse: return templates.TemplateResponse( - "index.jinja", + "index.jinja", { "request": request, "examples": EXAMPLE_REPOS, - "default_file_size": 243 - } + "default_file_size": 243, + }, ) @router.post("/", response_class=HTMLResponse) -@limiter.limit("10/minute") 
+@limiter.limit("10/minute") async def index_post( - request: Request, + request: Request, input_text: str = Form(...), max_file_size: int = Form(...), pattern_type: str = Form(...), - pattern: str = Form(...) -): - return await process_query(request, input_text, max_file_size, pattern_type, pattern, is_index=True) - - - - - + pattern: str = Form(...), +) -> HTMLResponse: + return await process_query( + request, + input_text, + max_file_size, + pattern_type, + pattern, + is_index=True, + ) diff --git a/src/server_utils.py b/src/server_utils.py index 584041b..2a6e186 100644 --- a/src/server_utils.py +++ b/src/server_utils.py @@ -1,21 +1,26 @@ +import math + ## Rate Limiter from slowapi import Limiter from slowapi.util import get_remote_address + limiter = Limiter(key_func=get_remote_address) -## Logarithmic slider to file size -import math -def logSliderToSize(position): + +## Logarithmic slider to file size conversion +def logSliderToSize(position: int) -> int: """Convert slider position to file size in KB""" maxp = 500 minv = math.log(1) maxv = math.log(102400) - + return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 + ## Color printing utility class Colors: """ANSI color codes""" + BLACK = "\033[0;30m" RED = "\033[0;31m" GREEN = "\033[0;32m" diff --git a/src/static/favicon.svg b/src/static/favicon.svg index f9b0ae4..dc5a443 100644 --- a/src/static/favicon.svg +++ b/src/static/favicon.svg @@ -1 +1 @@ -1 \ No newline at end of file +1 diff --git a/src/static/js/snow.js b/src/static/js/snow.js index 0576bff..a5e1d87 100644 --- a/src/static/js/snow.js +++ b/src/static/js/snow.js @@ -88,4 +88,4 @@ function initSnow() { document.addEventListener('DOMContentLoaded', initSnow); // Also initialize when the HTMX content is swapped -document.addEventListener('htmx:afterSettle', initSnow); \ No newline at end of file +document.addEventListener('htmx:afterSettle', initSnow); diff --git a/src/static/robots.txt b/src/static/robots.txt index 
49e4f2d..b757ab6 100644 --- a/src/static/robots.txt +++ b/src/static/robots.txt @@ -1,5 +1,4 @@ User-agent: * -Allow: / +Allow: / Allow: /api/ Allow: /cyclotruc/gitingest/ - diff --git a/src/templates/api.jinja b/src/templates/api.jinja index 41f0e83..c5e57bd 100644 --- a/src/templates/api.jinja +++ b/src/templates/api.jinja @@ -1,41 +1,35 @@ {% extends "base.jinja" %} - {% block title %}Git ingest API{% endblock %} - {% block content %} -
-
-
-

API Documentation

- - -
-
-
-
- - - -
-
-

- The API is currently under development.. -

+
+
+
+

API Documentation

+
+
+
+
+ + + +
+
+

The API is currently under development..

+
+

+ We're working on making our API available to the public. + In the meantime, you can + open an issue on github + to suggest features. +

-

- We're working on making our API available to the public. - In the meantime, you can - - open an issue on github - - to suggest features. -

-
-{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/src/templates/base.jinja b/src/templates/base.jinja index 3ef8bd7..e6c3fcd 100644 --- a/src/templates/base.jinja +++ b/src/templates/base.jinja @@ -1,41 +1,44 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - {% block title %}Git ingest{% endblock %} - - - - - - {% block extra_head %}{% endblock %} - - - - - {% include 'components/navbar.jinja' %} - - -
-
- {% block content %}{% endblock %} -
-
- - {% include 'components/footer.jinja' %} - - {% block extra_scripts %}{% endblock %} - - \ No newline at end of file + + {% block extra_head %}{% endblock %} + + + + {% include 'components/navbar.jinja' %} + +
+
+ {% block content %}{% endblock %} +
+
+ {% include 'components/footer.jinja' %} + {% block extra_scripts %}{% endblock %} + + diff --git a/src/templates/components/footer.jinja b/src/templates/components/footer.jinja index a082041..e8ffa9e 100644 --- a/src/templates/components/footer.jinja +++ b/src/templates/components/footer.jinja @@ -4,19 +4,23 @@
- \ No newline at end of file + diff --git a/src/templates/components/github_form.jinja b/src/templates/components/github_form.jinja index ec6054e..7be65ae 100644 --- a/src/templates/components/github_form.jinja +++ b/src/templates/components/github_form.jinja @@ -2,28 +2,30 @@
- + class="absolute md:block hidden left-0 h-[4.5rem] w-[4.5rem] bottom-0 -translate-x-full ml-3">
+ id="ingestForm" + onsubmit="handleSubmit(event{% if is_index %}, true{% endif %})">
- +
-
@@ -31,74 +33,62 @@
- - - + +
-
-
- - + +
- {% if show_examples %} - -
-

Try these example repositories:

-
- {% for example in examples %} - + +
+

Try these example repositories:

+
+ {% for example in examples %} + {% endfor %} +
-
{% endif %}
-
\ No newline at end of file +
diff --git a/src/templates/components/navbar.jinja b/src/templates/components/navbar.jinja index 6275cb8..6f4b2ce 100644 --- a/src/templates/components/navbar.jinja +++ b/src/templates/components/navbar.jinja @@ -21,7 +21,6 @@ fetchGitHubStars(); -
- \ No newline at end of file + diff --git a/src/templates/components/result.jinja b/src/templates/components/result.jinja index 00b6f93..cd0a978 100644 --- a/src/templates/components/result.jinja +++ b/src/templates/components/result.jinja @@ -1,115 +1,94 @@ {% if result %} -
-
-
-
- -
- -
-
-

Summary

-
- - -
-
-
- -
- {% if ingest_id %} -
-
-
-
- -
- {% endif %} - - +
+
+
+
+ +
+ +
+
+

Summary

+
+
+
+ +
+ {% if ingest_id %} + - - -
-
-

Directory Structure

-
-
-
- -
-
-
-
-
- -
+
+
+
-
- - -
-
-

Files Content

-
-
-
- -
-
-
-
-
- + + + + Copy +
+
+
+ +
+
+
+ +
+
+

Files Content

+
+
+ +
+
+
+
+
+
+
-{% endif %} \ No newline at end of file +{% endif %} diff --git a/src/templates/github.jinja b/src/templates/github.jinja index fdedcce..c373367 100644 --- a/src/templates/github.jinja +++ b/src/templates/github.jinja @@ -1,39 +1,33 @@ {% extends "base.jinja" %} - {% block content %} -{% if error_message %} -
- {{ error_message }} -
-{% endif %} - -{% with is_index=true, show_examples=false %} - {% include 'components/github_form.jinja' %} -{% endwith %} - -{% if loading %} -
-
-
-
-

Loading...

-
-
-{% endif %} - -{% include 'components/result.jinja' %} + {% if error_message %} +
{{ error_message }}
+ {% endif %} + {% with is_index=true, show_examples=false %} + {% include 'components/github_form.jinja' %} + {% endwith %} + {% if loading %} +
+
+
+
+

Loading...

+
+
+ {% endif %} + {% include 'components/result.jinja' %} {% endblock content %} - {% block extra_scripts %} - -{% endblock extra_scripts %} \ No newline at end of file + +{% endblock extra_scripts %} diff --git a/src/templates/index.jinja b/src/templates/index.jinja index 80015ad..e29066f 100644 --- a/src/templates/index.jinja +++ b/src/templates/index.jinja @@ -1,67 +1,57 @@ {% extends "base.jinja" %} - {% block extra_head %} - + {% endblock %} - {% block content %} -
-
- - - - -

- Prompt-friendly
codebase  -

- +
+
+ + + + + + +

+ Prompt-friendly +
+ codebase  +

+ +
+

+ Turn any Git repository into a simple text ingest of its codebase. +

+

+ This is useful for feeding a codebase into any LLM. +

+

+ You can also replace 'hub' with 'ingest' in any Github URL +

-

- Turn any Git repository into a simple text ingest of its codebase. -

-

- This is useful for feeding a codebase into any LLM. -

-

- You can also replace 'hub' with 'ingest' in any Github URL -

-
- -{% if error_message %} -
- {{ error_message }} -
-{% endif %} - -{% with is_index=true, show_examples=true %} - {% include 'components/github_form.jinja' %} -{% endwith %} - -{% include 'components/result.jinja' %} - - - - -{% endblock %} \ No newline at end of file + {% if error_message %} +
{{ error_message }}
+ {% endif %} + {% with is_index=true, show_examples=true %} + {% include 'components/github_form.jinja' %} + {% endwith %} + {% include 'components/result.jinja' %} +{% endblock %} From 16def8ab9f5d6dc9c858da7238dd5aa6a7b2df50 Mon Sep 17 00:00:00 2001 From: Harshit Wadhwani <77691984+harshit-wadhwani@users.noreply.github.com> Date: Sat, 28 Dec 2024 12:20:32 +0530 Subject: [PATCH 03/18] Fix: issue #40 : Bug: Branch names with "/" in the branch name are not cloned correctly (#52) * Enhance URL parsing to better handle branch names and commit hashes --- src/gitingest/parse_query.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 669f28f..fcf8186 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,7 +1,8 @@ import os import uuid -from typing import Any, Dict, List, Optional, Union +from urllib.parse import unquote +from typing import Any, Dict, List, Optional, Union from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH = "../tmp" @@ -22,6 +23,8 @@ def parse_url(url: str) -> Dict[str, Any]: } url = url.split(" ")[0] + url = unquote(url) # Decode URL-encoded characters + if not url.startswith('https://'): url = 'https://' + url @@ -36,19 +39,34 @@ def parse_url(url: str) -> Dict[str, Any]: parsed["user_name"] = path_parts[0] parsed["repo_name"] = path_parts[1] - # Keep original URL format + # Keep original URL format but with decoded components parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" parsed["id"] = str(uuid.uuid4()) parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" if len(path_parts) > 3: - parsed["type"] = path_parts[2] - parsed["branch"] = path_parts[3] - if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): - parsed["commit"] = 
parsed['branch'] - parsed["subpath"] = "/" + "/".join(path_parts[4:]) + parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' + + # Find the commit hash or reconstruct the branch name + remaining_parts = path_parts[3:] + if remaining_parts[0] and len(remaining_parts[0]) == 40 and all(c in '0123456789abcdefABCDEF' for c in remaining_parts[0]): + parsed["commit"] = remaining_parts[0] + parsed["subpath"] = "/" + "/".join(remaining_parts[1:]) if len(remaining_parts) > 1 else "/" + else: + # Handle branch names with slashes and special characters + for i, part in enumerate(remaining_parts): + if part in ('tree', 'blob'): + # Found another type indicator, everything before this was the branch name + parsed["branch"] = "/".join(remaining_parts[:i]) + parsed["subpath"] = "/" + "/".join(remaining_parts[i+2:]) if len(remaining_parts) > i+2 else "/" + break + else: + # No additional type indicator found, assume everything is part of the branch name + parsed["branch"] = "/".join(remaining_parts) + parsed["subpath"] = "/" + return parsed From 75ee8f77695a03fd665b54d2bfb66514300667e6 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 29 Dec 2024 03:26:45 +0100 Subject: [PATCH 04/18] Refactor/gitingest structure (#66) Refactor and enhance gitingest module for improved clarity, maintainability, and functionality. 
--- src/gitingest/cli.py | 2 +- src/gitingest/clone.py | 171 +++++++++++++------- src/gitingest/ingest.py | 15 +- src/gitingest/ingest_from_query.py | 11 +- src/gitingest/parse_query.py | 199 +++++++++++++++--------- src/gitingest/tests/test_clone.py | 49 +++--- src/gitingest/tests/test_parse_query.py | 2 +- src/gitingest/utils.py | 2 - src/process_query.py | 51 ++---- 9 files changed, 295 insertions(+), 207 deletions(-) diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 14df219..9e0e3c4 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -37,7 +37,7 @@ def main( if not output: output = "digest.txt" - summary, tree, content = ingest(source, max_size, include_patterns, exclude_patterns, output=output) + summary, _, _ = ingest(source, max_size, include_patterns, exclude_patterns, output=output) click.echo(f"Analysis complete! Output written to: {output}") click.echo("\nSummary:") diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index e7994c1..4a3fda3 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,12 +1,34 @@ import asyncio -from typing import Any, Dict, Tuple +from dataclasses import dataclass +from typing import Optional, Tuple -from gitingest.utils import async_timeout +from gitingest.utils import AsyncTimeoutError, async_timeout CLONE_TIMEOUT = 20 +@dataclass +class CloneConfig: + url: str + local_path: str + commit: Optional[str] = None + branch: Optional[str] = None + + async def check_repo_exists(url: str) -> bool: + """ + Check if a repository exists at the given URL using an HTTP HEAD request. + + Parameters + ---------- + url : str + The URL of the repository. + + Returns + ------- + bool + True if the repository exists, False otherwise. 
+ """ proc = await asyncio.create_subprocess_exec( "curl", "-I", @@ -14,7 +36,7 @@ async def check_repo_exists(url: str) -> bool: stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - stdout, stderr = await proc.communicate() + stdout, _ = await proc.communicate() if proc.returncode != 0: return False # Check if stdout contains "404" status code @@ -22,58 +44,101 @@ async def check_repo_exists(url: str) -> bool: return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str -@async_timeout(CLONE_TIMEOUT) -async def clone_repo(query: Dict[str, Any]) -> Tuple[bytes, bytes]: - if not await check_repo_exists(query['url']): - raise ValueError("Repository not found, make sure it is public") +async def run_git_command(*args: str) -> Tuple[bytes, bytes]: + """ + Executes a git command asynchronously and captures its output. - if query['commit']: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--single-branch", - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - - proc = await asyncio.create_subprocess_exec( - "git", - "-C", - query['local_path'], - "checkout", - query['branch'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - elif query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--depth=1", - "--single-branch", - "--branch", - query['branch'], - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - else: - proc = await asyncio.create_subprocess_exec( - "git", - "clone", - "--depth=1", - "--single-branch", - query['url'], - query['local_path'], - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) + Parameters + ---------- + *args : str + The git command and its arguments to execute. 
+ Returns + ------- + Tuple[bytes, bytes] + A tuple containing the stdout and stderr of the git command. + + Raises + ------ + RuntimeError + If the git command exits with a non-zero status. + """ + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) stdout, stderr = await proc.communicate() + if proc.returncode != 0: + error_message = stderr.decode().strip() + raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}") return stdout, stderr + + +@async_timeout(CLONE_TIMEOUT) +async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: + """ + Clones a repository to a local path based on the provided query parameters. + + Parameters + ---------- + config : CloneConfig + A dictionary containing the following keys: + - url (str): The URL of the repository. + - local_path (str): The local path to clone the repository to. + - commit (Optional[str]): The specific commit hash to checkout. + - branch (Optional[str]): The branch to clone. Defaults to 'main' or 'master' if not provided. + + Returns + ------- + Tuple[bytes, bytes] + A tuple containing the stdout and stderr of the git commands executed. + + Raises + ------ + ValueError + If the repository does not exist or if required query parameters are missing. + RuntimeError + If any git command fails during execution. + AsyncTimeoutError + If the cloning process exceeds the specified timeout. 
+ """ + # Extract and validate query parameters + url: str = config.url + local_path: str = config.local_path + commit: Optional[str] = config.commit + branch: Optional[str] = config.branch + + if not url: + raise ValueError("The 'url' parameter is required.") + + if not local_path: + raise ValueError("The 'local_path' parameter is required.") + + # Check if the repository exists + if not await check_repo_exists(url): + raise ValueError("Repository not found, make sure it is public") + + try: + if commit: + # Scenario 1: Clone and checkout a specific commit + # Clone the repository without depth to ensure full history for checkout + clone_cmd = ["git", "clone", "--single-branch", url, local_path] + await run_git_command(*clone_cmd) + + # Checkout the specific commit + checkout_cmd = ["git", "-C", local_path, "checkout", commit] + return await run_git_command(*checkout_cmd) + + if branch and branch.lower() not in ('main', 'master'): + # Scenario 2: Clone a specific branch with shallow depth + clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] + return await run_git_command(*clone_cmd) + + # Scenario 3: Clone the default branch with shallow depth + clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path] + return await run_git_command(*clone_cmd) + + except (RuntimeError, asyncio.TimeoutError, AsyncTimeoutError): + raise # Re-raise the exception diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index 22fae6d..4889bc5 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -4,14 +4,14 @@ from pathlib import Path from typing import List, Optional, Tuple, Union -from gitingest.clone import clone_repo +from gitingest.clone import CloneConfig, clone_repo from gitingest.ingest_from_query import ingest_from_query from gitingest.parse_query import parse_query def ingest( source: str, - max_file_size: int = 10 * 1024 * 1024, + max_file_size: int = 10 * 1024 * 1024, # 10 MB 
include_patterns: Union[List[str], str, None] = None, exclude_patterns: Union[List[str], str, None] = None, output: Optional[str] = None, @@ -25,7 +25,16 @@ def ingest( ignore_patterns=exclude_patterns, ) if query['url']: - clone_result = clone_repo(query) + + # Extract relevant fields for CloneConfig + clone_config = CloneConfig( + url=query["url"], + local_path=query['local_path'], + commit=query.get('commit'), + branch=query.get('branch'), + ) + clone_result = clone_repo(clone_config) + if inspect.iscoroutine(clone_result): asyncio.run(clone_result) else: diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 0080c25..a9130a3 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -278,7 +278,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str: return output -def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: List[Dict[str, Any]]) -> str: +def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str: """Creates a summary string with file counts and content size.""" if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" @@ -297,12 +297,7 @@ def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any], files: L return summary -def create_tree_structure( - query: Dict[str, Any], - node: Dict[str, Any], - prefix: str = "", - is_last: bool = True, -) -> str: +def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str: """Creates a tree-like string representation of the file structure.""" tree = "" @@ -386,7 +381,7 @@ def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: if not nodes: raise ValueError(f"No files found in {path}") files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) - summary = create_summary_string(query, nodes, files) + summary = 
create_summary_string(query, nodes) tree = "Directory structure:\n" + create_tree_structure(query, nodes) files_content = create_file_content_string(files) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index fcf8186..fe7b01a 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,30 +1,19 @@ import os +import string import uuid - -from urllib.parse import unquote from typing import Any, Dict, List, Optional, Union +from urllib.parse import unquote + from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH = "../tmp" +HEX_DIGITS = set(string.hexdigits) def parse_url(url: str) -> Dict[str, Any]: - parsed = { - "user_name": None, - "repo_name": None, - "type": None, - "branch": None, - "commit": None, - "subpath": "/", - "local_path": None, - "url": None, - "slug": None, - "id": None, - } - url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters - + if not url.startswith('https://'): url = 'https://' + url @@ -36,43 +25,63 @@ def parse_url(url: str) -> Dict[str, Any]: if len(path_parts) < 2: raise ValueError("Invalid repository URL. 
Please provide a valid Git repository URL.") - parsed["user_name"] = path_parts[0] - parsed["repo_name"] = path_parts[1] - - # Keep original URL format but with decoded components - parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" - parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" - parsed["id"] = str(uuid.uuid4()) - parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" - - if len(path_parts) > 3: - - parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' - - # Find the commit hash or reconstruct the branch name - remaining_parts = path_parts[3:] - if remaining_parts[0] and len(remaining_parts[0]) == 40 and all(c in '0123456789abcdefABCDEF' for c in remaining_parts[0]): - parsed["commit"] = remaining_parts[0] - parsed["subpath"] = "/" + "/".join(remaining_parts[1:]) if len(remaining_parts) > 1 else "/" - else: - # Handle branch names with slashes and special characters - for i, part in enumerate(remaining_parts): - if part in ('tree', 'blob'): - # Found another type indicator, everything before this was the branch name - parsed["branch"] = "/".join(remaining_parts[:i]) - parsed["subpath"] = "/" + "/".join(remaining_parts[i+2:]) if len(remaining_parts) > i+2 else "/" - break - else: - # No additional type indicator found, assume everything is part of the branch name - parsed["branch"] = "/".join(remaining_parts) - parsed["subpath"] = "/" + user_name = path_parts[0] + repo_name = path_parts[1] + _id = str(uuid.uuid4()) + slug = f"{user_name}-{repo_name}" + + parsed = { + "user_name": user_name, + "repo_name": repo_name, + "type": None, + "branch": None, + "commit": None, + "subpath": "/", + "local_path": f"{TMP_BASE_PATH}/{_id}/{slug}", + # Keep original URL format but with decoded components + "url": f"https://{domain}/{user_name}/{repo_name}", + "slug": slug, + "id": _id, + } + + if len(path_parts) < 4: + return parsed + + parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' + commit = 
path_parts[3] + + # Find the commit hash or reconstruct the branch name + remaining_parts = path_parts[3:] + + if _is_valid_git_commit_hash(commit): + parsed["commit"] = commit + if len(remaining_parts) > 1: + parsed["subpath"] += "/".join(remaining_parts[1:]) + return parsed + # Handle branch names with slashes and special characters + + # Find the index of the first type indicator ('tree' or 'blob'), if any + type_indicator_index = next((i for i, part in enumerate(remaining_parts) if part in ('tree', 'blob')), None) + + if type_indicator_index is None: + # No type indicator found; assume the entire input is the branch name + parsed["branch"] = "/".join(remaining_parts) + return parsed + + # Found a type indicator; update branch and subpath + parsed["branch"] = "/".join(remaining_parts[:type_indicator_index]) + if len(remaining_parts) > type_indicator_index + 2: + parsed["subpath"] += "/".join(remaining_parts[type_indicator_index + 2 :]) return parsed +def _is_valid_git_commit_hash(commit: str) -> bool: + return len(commit) == 40 and all(c in HEX_DIGITS for c in commit) + + def normalize_pattern(pattern: str) -> str: - pattern = pattern.strip() pattern = pattern.lstrip(os.sep) if pattern.endswith(os.sep): pattern += "*" @@ -80,33 +89,45 @@ def normalize_pattern(pattern: str) -> str: def parse_patterns(pattern: Union[List[str], str]) -> List[str]: - if isinstance(pattern, list): - pattern = ",".join(pattern) + patterns = pattern if isinstance(pattern, list) else [pattern] + patterns = [p.strip() for p in patterns] - for p in pattern.split(","): - if not all(c.isalnum() or c in "-_./+*" for c in p.strip()): + for p in patterns: + if not all(c.isalnum() or c in "-_./+*" for c in p): raise ValueError( f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." 
) - patterns = [normalize_pattern(p) for p in pattern.split(",")] - return patterns + + return [normalize_pattern(p) for p in patterns] def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: - for pattern in include_patterns: - if pattern in ignore_patterns: - ignore_patterns.remove(pattern) - return ignore_patterns + """ + Removes patterns from ignore_patterns that are present in include_patterns using set difference. + + Parameters + ---------- + ignore_patterns : List[str] + The list of patterns to potentially remove. + include_patterns : List[str] + The list of patterns to exclude from ignore_patterns. + + Returns + ------- + List[str] + A new list of ignore_patterns with specified patterns removed. + """ + return list(set(ignore_patterns) - set(include_patterns)) def parse_path(path: str) -> Dict[str, Any]: query = { + "url": None, "local_path": os.path.abspath(path), "slug": os.path.basename(os.path.dirname(path)) + "/" + os.path.basename(path), "subpath": "/", "id": str(uuid.uuid4()), - "url": None, } return query @@ -118,28 +139,52 @@ def parse_query( include_patterns: Optional[Union[List[str], str]] = None, ignore_patterns: Optional[Union[List[str], str]] = None, ) -> Dict[str, Any]: - if from_web: + """ + Parses the input source to construct a query dictionary with specified parameters. + + Parameters + ---------- + source : str + The source URL or file path to parse. + max_file_size : int + The maximum file size in bytes to include. + from_web : bool + Flag indicating whether the source is a web URL. + include_patterns : Optional[Union[List[str], str]], optional + Patterns to include, by default None. Can be a list of strings or a single string. + ignore_patterns : Optional[Union[List[str], str]], optional + Patterns to ignore, by default None. Can be a list of strings or a single string. 
+ + Returns + ------- + Dict[str, Any] + A dictionary containing the parsed query parameters, including 'max_file_size', + 'ignore_patterns', and 'include_patterns'. + """ + # Determine the parsing method based on the source type + if from_web or source.startswith("https://") or "github.com" in source: query = parse_url(source) else: - if source.startswith("https://") or "github.com" in source: - query = parse_url(source) - else: - query = parse_path(source) + query = parse_path(source) - query['max_file_size'] = max_file_size + # Process ignore patterns + ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy() + if ignore_patterns: + ignore_patterns_list += parse_patterns(ignore_patterns) - if ignore_patterns and ignore_patterns != "": - ignore_patterns = DEFAULT_IGNORE_PATTERNS + parse_patterns(ignore_patterns) + # Process include patterns and override ignore patterns accordingly + if include_patterns: + parsed_include = parse_patterns(include_patterns) + ignore_patterns_list = override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include) else: - ignore_patterns = DEFAULT_IGNORE_PATTERNS - - if include_patterns and include_patterns != "": - include_patterns = parse_patterns(include_patterns) - ignore_patterns = override_ignore_patterns(ignore_patterns, include_patterns) - else: - include_patterns = None - - query['ignore_patterns'] = ignore_patterns - query['include_patterns'] = include_patterns - + parsed_include = None + + # Update the query dictionary with max_file_size and processed patterns + query.update( + { + 'max_file_size': max_file_size, + 'ignore_patterns': ignore_patterns_list, + 'include_patterns': parsed_include, + } + ) return query diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 680181c..5f33b98 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -2,62 +2,55 @@ import pytest -from gitingest.clone import check_repo_exists, clone_repo +from 
gitingest.clone import CloneConfig, check_repo_exists, clone_repo @pytest.mark.asyncio async def test_clone_repo_with_commit() -> None: - query = { - 'commit': 'a' * 40, # Simulating a valid commit hash - 'branch': 'main', - 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo', - } + clone_config = CloneConfig( + url='https://github.com/user/repo', + local_path='/tmp/repo', + commit='a' * 40, # Simulating a valid commit hash + branch='main', + ) with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + with patch('gitingest.clone.run_git_command', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process - - await clone_repo(query) - mock_check.assert_called_once_with(query['url']) + await clone_repo(clone_config) + mock_check.assert_called_once_with(clone_config.url) assert mock_exec.call_count == 2 # Clone and checkout calls @pytest.mark.asyncio async def test_clone_repo_without_commit() -> None: - query = { - 'commit': None, - 'branch': 'main', - 'url': 'https://github.com/user/repo', - 'local_path': '/tmp/repo', - } + query = CloneConfig(url='https://github.com/user/repo', local_path='/tmp/repo', commit=None, branch='main') with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + with patch('gitingest.clone.run_git_command', new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b'output', b'error') mock_exec.return_value = mock_process await clone_repo(query) - mock_check.assert_called_once_with(query['url']) + mock_check.assert_called_once_with(query.url) assert mock_exec.call_count == 1 # Only clone call @pytest.mark.asyncio async def 
test_clone_repo_nonexistent_repository() -> None: - query = { - 'commit': None, - 'branch': 'main', - 'url': 'https://github.com/user/nonexistent-repo', - 'local_path': '/tmp/repo', - } - + clone_config = CloneConfig( + url='https://github.com/user/nonexistent-repo', + local_path='/tmp/repo', + commit=None, + branch='main', + ) with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): - await clone_repo(query) - mock_check.assert_called_once_with(query['url']) + await clone_repo(clone_config) + mock_check.assert_called_once_with(clone_config.url) @pytest.mark.asyncio diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index ae4c165..1ab5e44 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -37,7 +37,7 @@ def test_parse_query_include_pattern() -> None: url = "https://github.com/user/repo" result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') assert result["include_patterns"] == ["*.py"] - assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS + assert set(result["ignore_patterns"]) == set(DEFAULT_IGNORE_PATTERNS) def test_parse_query_invalid_pattern() -> None: diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index 1f07b53..2445f14 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -10,8 +10,6 @@ class AsyncTimeoutError(Exception): """Raised when an async operation exceeds its timeout limit.""" - pass - def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: diff --git a/src/process_query.py b/src/process_query.py index 466b11d..761fdf2 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -1,11 +1,9 @@ -from typing import Any, Dict - from fastapi import Request from fastapi.templating 
import Jinja2Templates from starlette.templating import _TemplateResponse from config import EXAMPLE_REPOS, MAX_DISPLAY_SIZE -from gitingest.clone import clone_repo +from gitingest.clone import CloneConfig, clone_repo from gitingest.ingest_from_query import ingest_from_query from gitingest.parse_query import parse_query from server_utils import Colors, logSliderToSize @@ -13,14 +11,8 @@ templates = Jinja2Templates(directory="templates") -def print_query( - query: Dict[str, Any], - request: Request, - max_file_size: int, - pattern_type: str, - pattern: str, -) -> None: - print(f"{Colors.WHITE}{query['url']:<20}{Colors.END}", end="") +def print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: + print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") if int(max_file_size / 1024) != 50: print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") if pattern_type == "include" and pattern != "": @@ -29,30 +21,16 @@ def print_query( print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") -def print_error( - query: Dict[str, Any], - request: Request, - e: Exception, - max_file_size: int, - pattern_type: str, - pattern: str, -) -> None: +def print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print_query(query, request, max_file_size, pattern_type, pattern) + print_query(url, max_file_size, pattern_type, pattern) print(f" | {Colors.RED}{e}{Colors.END}") -def print_success( - query: Dict[str, Any], - request: Request, - max_file_size: int, - pattern_type: str, - pattern: str, - summary: str, -) -> None: +def print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") - print_query(query, 
request, max_file_size, pattern_type, pattern) + print_query(url, max_file_size, pattern_type, pattern) print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") @@ -82,15 +60,21 @@ async def process_query( include_patterns=include_patterns, ignore_patterns=exclude_patterns, ) - await clone_repo(query) + clone_config = CloneConfig( + url=query["url"], + local_path=query['local_path'], + commit=query.get('commit'), + branch=query.get('branch'), + ) + await clone_repo(clone_config) summary, tree, content = ingest_from_query(query) - with open(f"{query['local_path']}.txt", "w") as f: + with open(f"{clone_config.local_path}.txt", "w") as f: f.write(tree + "\n" + content) except Exception as e: # hack to print error message when query is not defined if 'query' in locals() and query is not None and isinstance(query, dict): - print_error(query, request, e, max_file_size, pattern_type, pattern) + print_error(query['url'], e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") @@ -115,8 +99,7 @@ async def process_query( ) print_success( - query=query, - request=request, + url=query['url'], max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, From 3c5e7e95ae5be88e4b54de8a6179362f7c3ad51b Mon Sep 17 00:00:00 2001 From: raf <108032229+rafeyrana@users.noreply.github.com> Date: Sat, 28 Dec 2024 23:35:44 -0500 Subject: [PATCH 05/18] adding default ignore patterns for more languages: C / C++ / Java / Swift / Ruby / Rust / Go / C# (#63) --- src/gitingest/ignore_patterns.py | 60 ++++++++++++++++++++++++++++++++ src/gitingest/parse_query.py | 1 + 2 files changed, 61 insertions(+) diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py index 803c6ed..8c738b8 100644 --- a/src/gitingest/ignore_patterns.py +++ b/src/gitingest/ignore_patterns.py @@ -23,6 +23,66 @@ '.npm', '.yarn', '.pnpm-store', + # Java + '*.class', + '*.jar', + 
'*.war', + '*.ear', + '*.nar', + 'target/', + '.gradle/', + 'build/', + '.settings/', + '.project', + '.classpath', + 'gradle-app.setting', + '*.gradle', + # C/C++ + '*.o', + '*.obj', + '*.so', + '*.dll', + '*.dylib', + '*.exe', + '*.lib', + '*.out', + '*.a', + '*.pdb', + # Swift/Xcode + '.build/', + '*.xcodeproj/', + '*.xcworkspace/', + '*.pbxuser', + '*.mode1v3', + '*.mode2v3', + '*.perspectivev3', + '*.xcuserstate', + 'xcuserdata/', + '.swiftpm/', + # Ruby + '*.gem', + '.bundle/', + 'vendor/bundle', + 'Gemfile.lock', + '.ruby-version', + '.ruby-gemset', + '.rvmrc', + # Rust + 'target/', + 'Cargo.lock', + '**/*.rs.bk', + # Go + 'bin/', + 'pkg/', + # .NET/C# + 'bin/', + 'obj/', + '*.suo', + '*.user', + '*.userosscache', + '*.sln.docstates', + 'packages/', + '*.nupkg', # Version control '.git', '.svn', diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index fe7b01a..5053dfa 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -6,6 +6,7 @@ from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS + TMP_BASE_PATH = "../tmp" HEX_DIGITS = set(string.hexdigits) From 848b9dc1ed09a3bf5a01ca1b55fea4cbae2d5ac2 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 29 Dec 2024 06:52:24 +0100 Subject: [PATCH 06/18] Modernize Codebase and Enhance CI Workflow (#67) Updated Type Hints: Used pyupgrade with --py39-plus and --py310-plus flags to upgrade type hints to Python 3.9+ and 3.10+ syntax. Enforced Double Quotes: Removed skip-string-normalization = true from the black configuration in pyproject.toml. Reran black via pre-commit hooks to enforce double quotes in the codebase. Refactored Dependency Management: Split requirements.txt into requirements.txt (runtime dependencies) and requirements-dev.txt (development dependencies). Enhanced CI Workflow: Integrated pre-commit hooks into the CI pipeline to enforce code quality checks automatically. 
Added pip caching to the CI workflow to speed up dependency installation. Automated Package Publishing: Added a publish.yml GitHub Actions workflow to automate publishing to PyPI. The workflow triggers on release creation or manual dispatch, builds the package, and publishes it to PyPI using twine. --- .github/workflows/ci.yml | 47 ++++ .github/workflows/publish.yml | 41 ++++ .github/workflows/unitest.yml | 33 --- .pre-commit-config.yaml | 36 +-- pyproject.toml | 1 - requirements-dev.txt | 7 + requirements.txt | 5 - src/gitingest/cli.py | 20 +- src/gitingest/clone.py | 19 +- src/gitingest/ignore_patterns.py | 278 ++++++++++++------------ src/gitingest/ingest.py | 22 +- src/gitingest/ingest_from_query.py | 74 +++---- src/gitingest/parse_query.py | 32 +-- src/gitingest/tests/conftest.py | 2 +- src/gitingest/tests/test_clone.py | 37 ++-- src/gitingest/tests/test_ingest.py | 54 ++--- src/gitingest/tests/test_parse_query.py | 6 +- src/gitingest/utils.py | 3 +- src/main.py | 7 +- src/process_query.py | 14 +- src/routers/download.py | 2 +- 21 files changed, 398 insertions(+), 342 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/publish.yml delete mode 100644 .github/workflows/unitest.yml create mode 100644 requirements-dev.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..163c2a8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,47 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: true + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ 
hashFiles('**/*requirements*.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -r requirements-dev.txt + + - name: Run tests + run: | + pytest + + # Run pre-commit only on Python 3.13 + ubuntu. + - name: Run pre-commit hooks + if: ${{ matrix.python-version == '3.13' && matrix.os == 'ubuntu-latest' }} + run: | + pre-commit run --all-files diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..3b2764d --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,41 @@ +name: Publish to PyPI + +on: + release: + types: [created] # Trigger only when a release is created + workflow_dispatch: # Allows manual triggering of the workflow + +jobs: + publish: + runs-on: ubuntu-latest + + steps: + # Step 1: Check out the code + - name: Checkout code + uses: actions/checkout@v4 + + # Step 2: Set up Python + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.13 + + # Step 3: Install dependencies for building and publishing + - name: Install build tools + run: | + pip install --upgrade pip + pip install build twine + + # Step 4: Build the package + - name: Build the package + run: | + python -m build + + # Step 5: Publish to PyPI + - name: Publish to PyPI + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: | + python -m twine check dist/* + python -m twine upload --skip-existing dist/* diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml deleted file mode 100644 index 6759ecd..0000000 --- a/.github/workflows/unitest.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Unit Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - test: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.10", "3.11"] - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: 
actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest pytest-asyncio - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install -e . - - - name: Run tests - run: | - pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b3eabd..68eedb4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,39 +4,39 @@ repos: hooks: # Files - id: check-added-large-files - description: 'Prevent large files from being committed.' - args: ['--maxkb=10000'] + description: "Prevent large files from being committed." + args: ["--maxkb=10000"] - id: check-case-conflict - description: 'Check for files that would conflict in case-insensitive filesystems.' + description: "Check for files that would conflict in case-insensitive filesystems." - id: fix-byte-order-marker - description: 'Remove utf-8 byte order marker.' + description: "Remove utf-8 byte order marker." - id: mixed-line-ending - description: 'Replace mixed line ending.' + description: "Replace mixed line ending." # Links - id: destroyed-symlinks - description: 'Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to.' + description: "Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to." # File files for parseable syntax: python - id: check-ast # File and line endings - id: end-of-file-fixer - description: 'Ensure that a file is either empty, or ends with one newline.' + description: "Ensure that a file is either empty, or ends with one newline." - id: trailing-whitespace - description: 'Trim trailing whitespace.' + description: "Trim trailing whitespace." # Python - id: check-docstring-first - description: 'Check a common error of defining a docstring after code.' 
+ description: "Check a common error of defining a docstring after code." - id: requirements-txt-fixer - description: 'Sort entries in requirements.txt.' + description: "Sort entries in requirements.txt." - repo: https://github.com/MarcoGorelli/absolufy-imports rev: v0.3.1 hooks: - id: absolufy-imports - description: 'Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)' + description: "Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)" - repo: https://github.com/psf/black rev: 24.10.0 @@ -47,30 +47,30 @@ repos: rev: v3.19.1 hooks: - id: pyupgrade - description: 'Automatically upgrade syntax for newer versions.' - args: [--py3-plus, --py36-plus, --py38-plus] + description: "Automatically upgrade syntax for newer versions." + args: [--py3-plus, --py36-plus, --py38-plus, --py39-plus, --py310-plus] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: - id: python-check-blanket-noqa - description: 'Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`.' + description: "Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`." - id: python-check-blanket-type-ignore - description: 'Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`.' + description: "Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`." - id: python-use-type-annotations - description: 'Enforce that python3.6+ type annotations are used instead of type comments.' + description: "Enforce that python3.6+ type annotations are used instead of type comments." 
- repo: https://github.com/PyCQA/isort rev: 5.13.2 hooks: - id: isort - description: 'Sort imports alphabetically, and automatically separated into sections and by type.' + description: "Sort imports alphabetically, and automatically separated into sections and by type." - repo: https://github.com/hadialqattan/pycln rev: v2.4.0 hooks: - id: pycln - description: 'Remove unused import statements.' + description: "Remove unused import statements." - repo: https://github.com/djlint/djLint rev: v1.36.4 diff --git a/pyproject.toml b/pyproject.toml index f7d6c65..f30623c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,4 +14,3 @@ filter_files = true [tool.black] line-length = 119 -skip-string-normalization = true diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..eb733ff --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +-r requirements.txt +black +djlint +pre-commit +pylint +pytest +pytest-asyncio diff --git a/requirements.txt b/requirements.txt index 2688a88..e147ebf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,6 @@ -black click>=8.0.0 -djlint fastapi-analytics fastapi[standard] -pre-commit -pytest -pytest-asyncio python-dotenv slowapi starlette diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 9e0e3c4..c5f8a49 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,9 +1,7 @@ import os -from typing import Optional, Tuple import click -from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.ingest import ingest from gitingest.ingest_from_query import MAX_FILE_SIZE @@ -17,17 +15,17 @@ def normalize_pattern(pattern: str) -> str: @click.command() -@click.argument('source', type=str, required=True) -@click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') -@click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') -@click.option('--exclude-pattern', '-e', 
multiple=True, help='Patterns to exclude') -@click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') +@click.argument("source", type=str, required=True) +@click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") +@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") +@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude") +@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include") def main( source: str, - output: Optional[str], + output: str | None, max_size: int, - exclude_pattern: Tuple[str, ...], - include_pattern: Tuple[str, ...], + exclude_pattern: tuple[str, ...], + include_pattern: tuple[str, ...], ) -> None: """Analyze a directory and create a text dump of its contents.""" try: @@ -48,5 +46,5 @@ def main( raise click.Abort() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index 4a3fda3..df9aba8 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,6 +1,5 @@ import asyncio from dataclasses import dataclass -from typing import Optional, Tuple from gitingest.utils import AsyncTimeoutError, async_timeout @@ -11,8 +10,9 @@ class CloneConfig: url: str local_path: str - commit: Optional[str] = None - branch: Optional[str] = None + commit: str | None = None + branch: str | None = None + async def check_repo_exists(url: str) -> bool: @@ -44,7 +44,8 @@ async def check_repo_exists(url: str) -> bool: return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str -async def run_git_command(*args: str) -> Tuple[bytes, bytes]: +async def run_git_command(*args: str) -> tuple[bytes, bytes]: + """ Executes a git command asynchronously and captures its output. 
@@ -77,7 +78,7 @@ async def run_git_command(*args: str) -> Tuple[bytes, bytes]: @async_timeout(CLONE_TIMEOUT) -async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: +async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: """ Clones a repository to a local path based on the provided query parameters. @@ -107,8 +108,9 @@ async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: # Extract and validate query parameters url: str = config.url local_path: str = config.local_path - commit: Optional[str] = config.commit - branch: Optional[str] = config.branch + commit: str | None = config.commit + branch: str | None = config.branch + if not url: raise ValueError("The 'url' parameter is required.") @@ -131,7 +133,8 @@ async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: checkout_cmd = ["git", "-C", local_path, "checkout", commit] return await run_git_command(*checkout_cmd) - if branch and branch.lower() not in ('main', 'master'): + if branch and branch.lower() not in ("main", "master"): + # Scenario 2: Clone a specific branch with shallow depth clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] return await run_git_command(*clone_cmd) diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py index 8c738b8..f8ab453 100644 --- a/src/gitingest/ignore_patterns.py +++ b/src/gitingest/ignore_patterns.py @@ -1,162 +1,160 @@ -from typing import List - -DEFAULT_IGNORE_PATTERNS: List[str] = [ +DEFAULT_IGNORE_PATTERNS: list[str] = [ # Python - '*.pyc', - '*.pyo', - '*.pyd', - '__pycache__', - '.pytest_cache', - '.coverage', - '.tox', - '.nox', - '.mypy_cache', - '.ruff_cache', - '.hypothesis', - 'poetry.lock', - 'Pipfile.lock', + "*.pyc", + "*.pyo", + "*.pyd", + "__pycache__", + ".pytest_cache", + ".coverage", + ".tox", + ".nox", + ".mypy_cache", + ".ruff_cache", + ".hypothesis", + "poetry.lock", + "Pipfile.lock", # JavaScript/Node - 'node_modules', - 'bower_components', - 
'package-lock.json', - 'yarn.lock', - '.npm', - '.yarn', - '.pnpm-store', + "node_modules", + "bower_components", + "package-lock.json", + "yarn.lock", + ".npm", + ".yarn", + ".pnpm-store", # Java - '*.class', - '*.jar', - '*.war', - '*.ear', - '*.nar', - 'target/', - '.gradle/', - 'build/', - '.settings/', - '.project', - '.classpath', - 'gradle-app.setting', - '*.gradle', + "*.class", + "*.jar", + "*.war", + "*.ear", + "*.nar", + "target/", + ".gradle/", + "build/", + ".settings/", + ".project", + ".classpath", + "gradle-app.setting", + "*.gradle", # C/C++ - '*.o', - '*.obj', - '*.so', - '*.dll', - '*.dylib', - '*.exe', - '*.lib', - '*.out', - '*.a', - '*.pdb', + "*.o", + "*.obj", + "*.so", + "*.dll", + "*.dylib", + "*.exe", + "*.lib", + "*.out", + "*.a", + "*.pdb", # Swift/Xcode - '.build/', - '*.xcodeproj/', - '*.xcworkspace/', - '*.pbxuser', - '*.mode1v3', - '*.mode2v3', - '*.perspectivev3', - '*.xcuserstate', - 'xcuserdata/', - '.swiftpm/', + ".build/", + "*.xcodeproj/", + "*.xcworkspace/", + "*.pbxuser", + "*.mode1v3", + "*.mode2v3", + "*.perspectivev3", + "*.xcuserstate", + "xcuserdata/", + ".swiftpm/", # Ruby - '*.gem', - '.bundle/', - 'vendor/bundle', - 'Gemfile.lock', - '.ruby-version', - '.ruby-gemset', - '.rvmrc', + "*.gem", + ".bundle/", + "vendor/bundle", + "Gemfile.lock", + ".ruby-version", + ".ruby-gemset", + ".rvmrc", # Rust - 'target/', - 'Cargo.lock', - '**/*.rs.bk', + "target/", + "Cargo.lock", + "**/*.rs.bk", # Go - 'bin/', - 'pkg/', + "bin/", + "pkg/", # .NET/C# - 'bin/', - 'obj/', - '*.suo', - '*.user', - '*.userosscache', - '*.sln.docstates', - 'packages/', - '*.nupkg', + "bin/", + "obj/", + "*.suo", + "*.user", + "*.userosscache", + "*.sln.docstates", + "packages/", + "*.nupkg", # Version control - '.git', - '.svn', - '.hg', - '.gitignore', - '.gitattributes', - '.gitmodules', + ".git", + ".svn", + ".hg", + ".gitignore", + ".gitattributes", + ".gitmodules", # Images and media - '*.svg', - '*.png', - '*.jpg', - '*.jpeg', - '*.gif', - 
'*.ico', - '*.pdf', - '*.mov', - '*.mp4', - '*.mp3', - '*.wav', + "*.svg", + "*.png", + "*.jpg", + "*.jpeg", + "*.gif", + "*.ico", + "*.pdf", + "*.mov", + "*.mp4", + "*.mp3", + "*.wav", # Virtual environments - 'venv', - '.venv', - 'env', - '.env', - 'virtualenv', + "venv", + ".venv", + "env", + ".env", + "virtualenv", # IDEs and editors - '.idea', - '.vscode', - '.vs', - '*.swp', - '*.swo', - '*.swn', - '.settings', - '.project', - '.classpath', - '*.sublime-*', + ".idea", + ".vscode", + ".vs", + "*.swp", + "*.swo", + "*.swn", + ".settings", + ".project", + ".classpath", + "*.sublime-*", # Temporary and cache files - '*.log', - '*.bak', - '*.swp', - '*.tmp', - '*.temp', - '.cache', - '.sass-cache', - '.eslintcache', - '.DS_Store', - 'Thumbs.db', - 'desktop.ini', + "*.log", + "*.bak", + "*.swp", + "*.tmp", + "*.temp", + ".cache", + ".sass-cache", + ".eslintcache", + ".DS_Store", + "Thumbs.db", + "desktop.ini", # Build directories and artifacts - 'build', - 'dist', - 'target', - 'out', - '*.egg-info', - '*.egg', - '*.whl', - '*.so', - '*.dylib', - '*.dll', - '*.class', + "build", + "dist", + "target", + "out", + "*.egg-info", + "*.egg", + "*.whl", + "*.so", + "*.dylib", + "*.dll", + "*.class", # Documentation - 'site-packages', - '.docusaurus', - '.next', - '.nuxt', + "site-packages", + ".docusaurus", + ".next", + ".nuxt", # Other common patterns ## Minified files - '*.min.js', - '*.min.css', + "*.min.js", + "*.min.css", ## Source maps - '*.map', + "*.map", ## Terraform - '.terraform', - '*.tfstate*', + ".terraform", + "*.tfstate*", ## Dependencies in various languages - 'vendor/', + "vendor/", ] diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index 4889bc5..e4c673d 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -2,7 +2,6 @@ import inspect import shutil from pathlib import Path -from typing import List, Optional, Tuple, Union from gitingest.clone import CloneConfig, clone_repo from gitingest.ingest_from_query import 
ingest_from_query @@ -12,10 +11,11 @@ def ingest( source: str, max_file_size: int = 10 * 1024 * 1024, # 10 MB - include_patterns: Union[List[str], str, None] = None, - exclude_patterns: Union[List[str], str, None] = None, - output: Optional[str] = None, -) -> Tuple[str, str, str]: + include_patterns: list[str] | str | None = None, + exclude_patterns: list[str] | str | None = None, + output: str | None = None, +) -> tuple[str, str, str]: + try: query = parse_query( source=source, @@ -24,14 +24,14 @@ def ingest( include_patterns=include_patterns, ignore_patterns=exclude_patterns, ) - if query['url']: + if query["url"]: # Extract relevant fields for CloneConfig clone_config = CloneConfig( url=query["url"], - local_path=query['local_path'], - commit=query.get('commit'), - branch=query.get('branch'), + local_path=query["local_path"], + commit=query.get("commit"), + branch=query.get("branch"), ) clone_result = clone_repo(clone_config) @@ -50,7 +50,7 @@ def ingest( finally: # Clean up the temporary directory if it was created - if query['url']: + if query["url"]: # Get parent directory two levels up from local_path (../tmp) - cleanup_path = str(Path(query['local_path']).parents[1]) + cleanup_path = str(Path(query["local_path"]).parents[1]) shutil.rmtree(cleanup_path, ignore_errors=True) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index a9130a3..7dc6d29 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,6 +1,6 @@ import os from fnmatch import fnmatch -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any import tiktoken @@ -10,7 +10,7 @@ MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB -def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: +def should_include(path: str, base_path: str, include_patterns: list[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) include = False for pattern in include_patterns: @@ 
-19,10 +19,10 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo return include -def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: +def should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) for pattern in ignore_patterns: - if pattern == '': + if pattern == "": continue if fnmatch(rel_path, pattern): return True @@ -43,7 +43,7 @@ def is_safe_symlink(symlink_path: str, base_path: str) -> bool: def is_text_file(file_path: str) -> bool: """Determines if a file is likely a text file based on its content.""" try: - with open(file_path, 'rb') as file: + with open(file_path, "rb") as file: chunk = file.read(1024) return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) except OSError: @@ -52,7 +52,7 @@ def is_text_file(file_path: str) -> bool: def read_file_content(file_path: str) -> str: try: - with open(file_path, encoding='utf-8', errors='ignore') as f: + with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() except Exception as e: return f"Error reading file: {str(e)}" @@ -60,11 +60,11 @@ def read_file_content(file_path: str) -> str: def scan_directory( path: str, - query: Dict[str, Any], - seen_paths: Optional[Set[str]] = None, + query: dict[str, Any], + seen_paths: set[str] | None = None, depth: int = 0, - stats: Optional[Dict[str, int]] = None, -) -> Optional[Dict[str, Any]]: + stats: dict[str, int] | None = None, +) -> dict[str, Any] | None: """Recursively analyzes a directory and its contents with safety limits.""" if seen_paths is None: seen_paths = set() @@ -101,9 +101,9 @@ def scan_directory( "ignore_content": False, } - ignore_patterns = query['ignore_patterns'] - base_path = query['local_path'] - include_patterns = query['include_patterns'] + ignore_patterns = query["ignore_patterns"] + base_path = query["local_path"] + include_patterns = 
query["include_patterns"] try: for item in os.listdir(path): @@ -113,7 +113,7 @@ def scan_directory( continue is_file = os.path.isfile(item_path) - if is_file and query['include_patterns']: + if is_file and query["include_patterns"]: if not should_include(item_path, base_path, include_patterns): result["ignore_content"] = True continue @@ -220,11 +220,11 @@ def scan_directory( def extract_files_content( - query: Dict[str, Any], - node: Dict[str, Any], + query: dict[str, Any], + node: dict[str, Any], max_file_size: int, - files: Optional[List[Dict[str, Any]]] = None, -) -> List[Dict[str, Any]]: + files: list[dict[str, Any]] | None = None, +) -> list[dict[str, Any]]: """Recursively collects all text files with their contents.""" if files is None: files = [] @@ -236,7 +236,7 @@ def extract_files_content( files.append( { - "path": node["path"].replace(query['local_path'], ""), + "path": node["path"].replace(query["local_path"], ""), "content": content, "size": node["size"], }, @@ -248,17 +248,17 @@ def extract_files_content( return files -def create_file_content_string(files: List[Dict[str, Any]]) -> str: +def create_file_content_string(files: list[dict[str, Any]]) -> str: """Creates a formatted string of file contents with separators.""" output = "" separator = "=" * 48 + "\n" # First add README.md if it exists for file in files: - if not file['content']: + if not file["content"]: continue - if file['path'].lower() == '/readme.md': + if file["path"].lower() == "/readme.md": output += separator output += f"File: {file['path']}\n" output += separator @@ -267,7 +267,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str: # Then add all other files in their original order for file in files: - if not file['content'] or file['path'].lower() == '/readme.md': + if not file["content"] or file["path"].lower() == "/readme.md": continue output += separator @@ -278,7 +278,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str: return output -def 
create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str: +def create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: """Creates a summary string with file counts and content size.""" if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" @@ -287,22 +287,22 @@ def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str: summary += f"Files analyzed: {nodes['file_count']}\n" - if 'subpath' in query and query['subpath'] != '/': + if "subpath" in query and query["subpath"] != "/": summary += f"Subpath: {query['subpath']}\n" - if 'commit' in query and query['commit']: + if "commit" in query and query["commit"]: summary += f"Commit: {query['commit']}\n" - elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: + elif "branch" in query and query["branch"] != "main" and query["branch"] != "master" and query["branch"]: summary += f"Branch: {query['branch']}\n" return summary -def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str: +def create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: str = "", is_last: bool = True) -> str: """Creates a tree-like string representation of the file structure.""" tree = "" if not node["name"]: - node["name"] = query['slug'] + node["name"] = query["slug"] if node["name"]: current_prefix = "└── " if is_last else "├── " @@ -319,7 +319,7 @@ def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: s return tree -def generate_token_string(context_string: str) -> Optional[str]: +def generate_token_string(context_string: str) -> str | None: """Returns the number of tokens in a text string.""" formatted_tokens = "" try: @@ -340,7 +340,7 @@ def generate_token_string(context_string: str) -> Optional[str]: return formatted_tokens -def ingest_single_file(path: str, query: Dict[str, Any]) ->
Tuple[str, str, str]: +def ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str]: if not os.path.isfile(path): raise ValueError(f"Path {path} is not a file") @@ -350,11 +350,11 @@ def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str] raise ValueError(f"File {path} is not a text file") content = read_file_content(path) - if file_size > query['max_file_size']: + if file_size > query["max_file_size"]: content = "[Content ignored: file too large]" file_info = { - "path": path.replace(query['local_path'], ""), + "path": path.replace(query["local_path"], ""), "content": content, "size": file_size, } @@ -376,11 +376,11 @@ def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str] return summary, tree, files_content -def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: +def ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: nodes = scan_directory(path=path, query=query) if not nodes: raise ValueError(f"No files found in {path}") - files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) + files = extract_files_content(query=query, node=nodes, max_file_size=query["max_file_size"]) summary = create_summary_string(query, nodes) tree = "Directory structure:\n" + create_tree_structure(query, nodes) files_content = create_file_content_string(files) @@ -392,13 +392,13 @@ def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: return summary, tree, files_content -def ingest_from_query(query: Dict[str, Any]) -> Tuple[str, str, str]: +def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]: """Main entry point for analyzing a codebase directory or single file.""" path = f"{query['local_path']}{query['subpath']}" if not os.path.exists(path): raise ValueError(f"{query['slug']} cannot be found") - if query.get('type') == 'blob': + if query.get("type") == "blob": return 
ingest_single_file(path, query) return ingest_directory(path, query) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 5053dfa..73151c2 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,7 +1,7 @@ import os import string import uuid -from typing import Any, Dict, List, Optional, Union +from typing import Any from urllib.parse import unquote from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS @@ -11,15 +11,15 @@ HEX_DIGITS = set(string.hexdigits) -def parse_url(url: str) -> Dict[str, Any]: +def parse_url(url: str) -> dict[str, Any]: url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters - if not url.startswith('https://'): - url = 'https://' + url + if not url.startswith("https://"): + url = "https://" + url # Extract domain and path - url_parts = url.split('/') + url_parts = url.split("/") domain = url_parts[2] path_parts = url_parts[3:] @@ -62,8 +62,8 @@ def parse_url(url: str) -> Dict[str, Any]: # Handle branch names with slashes and special characters - # Find the index of the first type indicator ('tree' or 'blob'), if any - type_indicator_index = next((i for i, part in enumerate(remaining_parts) if part in ('tree', 'blob')), None) + # Find the index of the first type indicator ("tree" or "blob"), if any + type_indicator_index = next((i for i, part in enumerate(remaining_parts) if part in ("tree", "blob")), None) if type_indicator_index is None: # No type indicator found; assume the entire input is the branch name @@ -89,7 +89,7 @@ def normalize_pattern(pattern: str) -> str: return pattern -def parse_patterns(pattern: Union[List[str], str]) -> List[str]: +def parse_patterns(pattern: list[str] | str) -> list[str]: patterns = pattern if isinstance(pattern, list) else [pattern] patterns = [p.strip() for p in patterns] @@ -103,7 +103,7 @@ def parse_patterns(pattern: Union[List[str], str]) -> List[str]: return [normalize_pattern(p) for p in patterns] -def 
override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: +def override_ignore_patterns(ignore_patterns: list[str], include_patterns: list[str]) -> list[str]: """ Removes patterns from ignore_patterns that are present in include_patterns using set difference. @@ -122,7 +122,7 @@ def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[ return list(set(ignore_patterns) - set(include_patterns)) -def parse_path(path: str) -> Dict[str, Any]: +def parse_path(path: str) -> dict[str, Any]: query = { "url": None, "local_path": os.path.abspath(path), @@ -137,9 +137,9 @@ def parse_query( source: str, max_file_size: int, from_web: bool, - include_patterns: Optional[Union[List[str], str]] = None, - ignore_patterns: Optional[Union[List[str], str]] = None, -) -> Dict[str, Any]: + include_patterns: list[str] | str | None = None, + ignore_patterns: list[str] | str | None = None, +) -> dict[str, Any]: """ Parses the input source to construct a query dictionary with specified parameters. 
@@ -183,9 +183,9 @@ def parse_query( # Update the query dictionary with max_file_size and processed patterns query.update( { - 'max_file_size': max_file_size, - 'ignore_patterns': ignore_patterns_list, - 'include_patterns': parsed_include, + "max_file_size": max_file_size, + "ignore_patterns": ignore_patterns_list, + "include_patterns": parsed_include, } ) return query diff --git a/src/gitingest/tests/conftest.py b/src/gitingest/tests/conftest.py index 31dba62..ecb7e81 100644 --- a/src/gitingest/tests/conftest.py +++ b/src/gitingest/tests/conftest.py @@ -6,4 +6,4 @@ # Add both the project root and src directory to PYTHONPATH sys.path.insert(0, project_root) -sys.path.insert(0, os.path.join(project_root, 'src')) +sys.path.insert(0, os.path.join(project_root, "src")) diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 5f33b98..585ba6e 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -8,16 +8,17 @@ @pytest.mark.asyncio async def test_clone_repo_with_commit() -> None: clone_config = CloneConfig( - url='https://github.com/user/repo', - local_path='/tmp/repo', - commit='a' * 40, # Simulating a valid commit hash - branch='main', + url="https://github.com/user/repo", + local_path="/tmp/repo", + commit="a" * 40, # Simulating a valid commit hash + branch="main", ) - with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: - with patch('gitingest.clone.run_git_command', new_callable=AsyncMock) as mock_exec: + with patch("gitingest.clone.check_repo_exists", return_value=True) as mock_check: + with patch("gitingest.clone.run_git_command", new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() - mock_process.communicate.return_value = (b'output', b'error') + mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process await clone_repo(clone_config) mock_check.assert_called_once_with(clone_config.url) @@ -26,12 +27,12 @@ async 
def test_clone_repo_with_commit() -> None: @pytest.mark.asyncio async def test_clone_repo_without_commit() -> None: - query = CloneConfig(url='https://github.com/user/repo', local_path='/tmp/repo', commit=None, branch='main') + query = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", commit=None, branch="main") - with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: - with patch('gitingest.clone.run_git_command', new_callable=AsyncMock) as mock_exec: + with patch("gitingest.clone.check_repo_exists", return_value=True) as mock_check: + with patch("gitingest.clone.run_git_command", new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() - mock_process.communicate.return_value = (b'output', b'error') + mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process await clone_repo(query) @@ -42,12 +43,12 @@ async def test_clone_repo_without_commit() -> None: @pytest.mark.asyncio async def test_clone_repo_nonexistent_repository() -> None: clone_config = CloneConfig( - url='https://github.com/user/nonexistent-repo', - local_path='/tmp/repo', + url="https://github.com/user/nonexistent-repo", + local_path="/tmp/repo", commit=None, - branch='main', + branch="main", ) - with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: + with patch("gitingest.clone.check_repo_exists", return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): await clone_repo(clone_config) mock_check.assert_called_once_with(clone_config.url) @@ -57,9 +58,9 @@ async def test_clone_repo_nonexistent_repository() -> None: async def test_check_repo_exists() -> None: url = "https://github.com/user/repo" - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() - mock_process.communicate.return_value = 
(b'HTTP/1.1 200 OK\n', b'') + mock_process.communicate.return_value = (b"HTTP/1.1 200 OK\n", b"") mock_exec.return_value = mock_process # Test existing repository @@ -67,7 +68,7 @@ async def test_check_repo_exists() -> None: assert await check_repo_exists(url) is True # Test non-existing repository (404 response) - mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') + mock_process.communicate.return_value = (b"HTTP/1.1 404 Not Found\n", b"") mock_process.returncode = 0 assert await check_repo_exists(url) is False diff --git a/src/gitingest/tests/test_ingest.py b/src/gitingest/tests/test_ingest.py index 33b174b..fa8369a 100644 --- a/src/gitingest/tests/test_ingest.py +++ b/src/gitingest/tests/test_ingest.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, Dict +from typing import Any import pytest @@ -8,19 +8,19 @@ # Test fixtures @pytest.fixture -def sample_query() -> Dict[str, Any]: +def sample_query() -> dict[str, Any]: return { - 'user_name': 'test_user', - 'repo_name': 'test_repo', - 'local_path': '/tmp/test_repo', - 'subpath': '/', - 'branch': 'main', - 'commit': None, - 'max_file_size': 1_000_000, - 'slug': 'test_user/test_repo', - 'ignore_patterns': ['*.pyc', '__pycache__', '.git'], - 'include_patterns': None, - 'pattern_type': 'exclude', + "user_name": "test_user", + "repo_name": "test_repo", + "local_path": "/tmp/test_repo", + "subpath": "/", + "branch": "main", + "commit": None, + "max_file_size": 1_000_000, + "slug": "test_user/test_repo", + "ignore_patterns": ["*.pyc", "__pycache__", ".git"], + "include_patterns": None, + "pattern_type": "exclude", } @@ -73,18 +73,18 @@ def temp_directory(tmp_path: Path) -> Path: return test_dir -def test_scan_directory(temp_directory: Path, sample_query: Dict[str, Any]) -> None: +def test_scan_directory(temp_directory: Path, sample_query: dict[str, Any]) -> None: result = scan_directory(str(temp_directory), query=sample_query) if result is None: assert False, "Result is None" - 
assert result['type'] == 'directory' - assert result['file_count'] == 8 # All .txt and .py files - assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 - assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 + assert result["type"] == "directory" + assert result["file_count"] == 8 # All .txt and .py files + assert result["dir_count"] == 4 # src, src/subdir, dir1, dir2 + assert len(result["children"]) == 5 # file1.txt, file2.py, src, dir1, dir2 -def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any]) -> None: +def test_extract_files_content(temp_directory: Path, sample_query: dict[str, Any]) -> None: nodes = scan_directory(str(temp_directory), query=sample_query) if nodes is None: assert False, "Nodes is None" @@ -92,14 +92,14 @@ def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any assert len(files) == 8 # All .txt and .py files # Check for presence of key files - paths = [f['path'] for f in files] - assert any('file1.txt' in p for p in paths) - assert any('subfile1.txt' in p for p in paths) - assert any('file2.py' in p for p in paths) - assert any('subfile2.py' in p for p in paths) - assert any('file_subdir.txt' in p for p in paths) - assert any('file_dir1.txt' in p for p in paths) - assert any('file_dir2.txt' in p for p in paths) + paths = [f["path"] for f in files] + assert any("file1.txt" in p for p in paths) + assert any("subfile1.txt" in p for p in paths) + assert any("file2.py" in p for p in paths) + assert any("subfile2.py" in p for p in paths) + assert any("file_subdir.txt" in p for p in paths) + assert any("file_dir1.txt" in p for p in paths) + assert any("file_dir2.txt" in p for p in paths) # TODO: test with include patterns: ['*.txt'] diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index 1ab5e44..71ff71e 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -26,7 +26,7 @@ def 
test_parse_url_invalid() -> None: def test_parse_query_basic() -> None: test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"] for url in test_cases: - result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns='*.txt') + result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns="*.txt") assert result["user_name"] == "user" assert result["repo_name"] == "repo" assert result["url"] == url @@ -35,7 +35,7 @@ def test_parse_query_basic() -> None: def test_parse_query_include_pattern() -> None: url = "https://github.com/user/repo" - result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') + result = parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py") assert result["include_patterns"] == ["*.py"] assert set(result["ignore_patterns"]) == set(DEFAULT_IGNORE_PATTERNS) @@ -43,4 +43,4 @@ def test_parse_query_include_pattern() -> None: def test_parse_query_invalid_pattern() -> None: url = "https://github.com/user/repo" with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') + parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index 2445f14..8406d5c 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -1,7 +1,8 @@ ## Async Timeout decorator import asyncio import functools -from typing import Awaitable, Callable, ParamSpec, TypeVar +from collections.abc import Awaitable, Callable +from typing import ParamSpec, TypeVar T = TypeVar("T") P = ParamSpec("P") diff --git a/src/main.py b/src/main.py index a50a1c5..18de770 100644 --- a/src/main.py +++ b/src/main.py @@ -1,5 +1,4 @@ import os -from typing import Dict from api_analytics.fastapi import Analytics from dotenv import load_dotenv @@ -33,7 +32,7 @@ async def rate_limit_exception_handler(request: Request, exc: 
Exception) -> Resp app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) app.mount("/static", StaticFiles(directory="static"), name="static") -app_analytics_key = os.getenv('API_ANALYTICS_KEY') +app_analytics_key = os.getenv("API_ANALYTICS_KEY") if app_analytics_key: app.add_middleware(Analytics, api_key=app_analytics_key) @@ -52,7 +51,7 @@ async def rate_limit_exception_handler(request: Request, exc: Exception) -> Resp @app.get("/health") -async def health_check() -> Dict[str, str]: +async def health_check() -> dict[str, str]: return {"status": "healthy"} @@ -70,7 +69,7 @@ async def api_docs(request: Request) -> HTMLResponse: @app.get("/robots.txt") async def robots() -> FileResponse: - return FileResponse('static/robots.txt') + return FileResponse("static/robots.txt") app.include_router(index) diff --git a/src/process_query.py b/src/process_query.py index 761fdf2..f55068c 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -62,9 +62,9 @@ async def process_query( ) clone_config = CloneConfig( url=query["url"], - local_path=query['local_path'], - commit=query.get('commit'), - branch=query.get('branch'), + local_path=query["local_path"], + commit=query.get("commit"), + branch=query.get("branch"), ) await clone_repo(clone_config) summary, tree, content = ingest_from_query(query) @@ -73,8 +73,8 @@ async def process_query( except Exception as e: # hack to print error message when query is not defined - if 'query' in locals() and query is not None and isinstance(query, dict): - print_error(query['url'], e, max_file_size, pattern_type, pattern) + if "query" in locals() and query is not None and isinstance(query, dict): + print_error(query["url"], e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") @@ -99,7 +99,7 @@ async def process_query( ) print_success( - url=query['url'], + url=query["url"], max_file_size=max_file_size, 
pattern_type=pattern_type, pattern=pattern, @@ -116,7 +116,7 @@ async def process_query( "tree": tree, "content": content, "examples": EXAMPLE_REPOS if is_index else [], - "ingest_id": query['id'], + "ingest_id": query["id"], "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": pattern, diff --git a/src/routers/download.py b/src/routers/download.py index 95cec0f..2dc1022 100644 --- a/src/routers/download.py +++ b/src/routers/download.py @@ -13,7 +13,7 @@ async def download_ingest(digest_id: str) -> Response: try: # Find the first .txt file in the directory directory = f"{TMP_BASE_PATH}/{digest_id}" - txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] + txt_files = [f for f in os.listdir(directory) if f.endswith(".txt")] if not txt_files: raise FileNotFoundError("No .txt file found") From fd7b3b3d934186e9297b67c6a73a080ea87fefd2 Mon Sep 17 00:00:00 2001 From: cyclotruc Date: Sun, 29 Dec 2024 05:55:42 +0000 Subject: [PATCH 07/18] Fix format --- src/gitingest/clone.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index df9aba8..97a990a 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -14,7 +14,6 @@ class CloneConfig: branch: str | None = None - async def check_repo_exists(url: str) -> bool: """ Check if a repository exists at the given URL using an HTTP HEAD request. @@ -45,7 +44,6 @@ async def check_repo_exists(url: str) -> bool: async def run_git_command(*args: str) -> tuple[bytes, bytes]: - """ Executes a git command asynchronously and captures its output. 
@@ -111,7 +109,6 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: commit: str | None = config.commit branch: str | None = config.branch - if not url: raise ValueError("The 'url' parameter is required.") From 6ef399e23c14ab826f3eb0d4e136d6c7440dfc65 Mon Sep 17 00:00:00 2001 From: cyclotruc Date: Sun, 29 Dec 2024 06:01:31 +0000 Subject: [PATCH 08/18] Fix import sort --- src/gitingest/parse_query.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 73151c2..aacf9f5 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -6,7 +6,6 @@ from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS - TMP_BASE_PATH = "../tmp" HEX_DIGITS = set(string.hexdigits) From df19539c0f5a752b5345acf3f35d2a25c0b28e5d Mon Sep 17 00:00:00 2001 From: cyclotruc Date: Sun, 29 Dec 2024 06:27:55 +0000 Subject: [PATCH 09/18] Fix subbolders cloning --- src/gitingest/parse_query.py | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index aacf9f5..2e6470e 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -38,41 +38,29 @@ def parse_url(url: str) -> dict[str, Any]: "commit": None, "subpath": "/", "local_path": f"{TMP_BASE_PATH}/{_id}/{slug}", - # Keep original URL format but with decoded components "url": f"https://{domain}/{user_name}/{repo_name}", "slug": slug, "id": _id, } + # If this is an issues page, return early without processing subpath + if len(path_parts) > 2 and (path_parts[2] == "issues" or path_parts[2] == "pull"): + return parsed + if len(path_parts) < 4: return parsed parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' commit = path_parts[3] - # Find the commit hash or reconstruct the branch name - remaining_parts = path_parts[3:] - if _is_valid_git_commit_hash(commit): parsed["commit"] = commit - if len(remaining_parts) > 1: - 
parsed["subpath"] += "/".join(remaining_parts[1:]) - return parsed - - # Handle branch names with slashes and special characters - - # Find the index of the first type indicator ("tree" or "blob"), if any - type_indicator_index = next((i for i, part in enumerate(remaining_parts) if part in ("tree", "blob")), None) - - if type_indicator_index is None: - # No type indicator found; assume the entire input is the branch name - parsed["branch"] = "/".join(remaining_parts) - return parsed - - # Found a type indicator; update branch and subpath - parsed["branch"] = "/".join(remaining_parts[:type_indicator_index]) - if len(remaining_parts) > type_indicator_index + 2: - parsed["subpath"] += "/".join(remaining_parts[type_indicator_index + 2 :]) + if len(path_parts) > 4: + parsed["subpath"] += "/".join(path_parts[4:]) + else: + parsed["branch"] = commit + if len(path_parts) > 4: + parsed["subpath"] += "/".join(path_parts[4:]) return parsed From 65b4b4a589d7820c4d8b93e63ac2ebb98bff075d Mon Sep 17 00:00:00 2001 From: cyclotruc Date: Sun, 29 Dec 2024 06:54:53 +0000 Subject: [PATCH 10/18] Fix blob ingests --- src/gitingest/ingest_from_query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 7dc6d29..51cca8d 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -394,9 +394,9 @@ def ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]: """Main entry point for analyzing a codebase directory or single file.""" - path = f"{query['local_path']}{query['subpath']}" - if not os.path.exists(path): - raise ValueError(f"{query['slug']} cannot be found") + path = os.path.join(query["local_path"], query["subpath"].lstrip(os.sep)) + if not os.path.exists(path) and not os.path.exists(os.path.dirname(path)): + raise ValueError(f"{query['subpath']} cannot be found") if 
query.get("type") == "blob": return ingest_single_file(path, query) From 4e5c9521e89e0bf893df9a388284fcb30bd43c23 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Mon, 30 Dec 2024 02:53:17 +0100 Subject: [PATCH 11/18] Refactor, Documentation, and Code Cleanup (#70) Code Quality: - Add markdownlint to pre-commit hooks for consistent markdown formatting - Add missing type hints to constants and variables - Prefix helper functions with underscore for better encapsulation Documentation: - Update README.md to follow markdown best practices - Add docstrings to functions in process_query.py, server_utils.py and main.py Refactoring: - Remove redundant normalize_pattern function from client.py - Simplify logic in should_exclude function in ingest_from_query.py - Improve code organization in clone.py - Move async_timeout comment into function body for better context --- .pre-commit-config.yaml | 7 ++ README.md | 71 +++++-------- src/config.py | 6 +- src/gitingest/cli.py | 10 -- src/gitingest/clone.py | 134 ++++++++++++------------ src/gitingest/ingest_from_query.py | 83 ++++++++------- src/gitingest/parse_query.py | 24 ++--- src/gitingest/tests/test_clone.py | 19 ++-- src/gitingest/tests/test_ingest.py | 8 +- src/gitingest/tests/test_parse_query.py | 6 +- src/gitingest/utils.py | 2 +- src/main.py | 81 ++++++++++++-- src/process_query.py | 75 +++++++++++++ src/server_utils.py | 20 +++- 14 files changed, 332 insertions(+), 214 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 68eedb4..9dcf517 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -76,3 +76,10 @@ repos: rev: v1.36.4 hooks: - id: djlint-reformat-jinja + + - repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.43.0 + hooks: + - id: markdownlint + description: "Lint markdown files." 
+ args: ["--disable=line-length"] diff --git a/README.md b/README.md index 6d0747a..7e02c46 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,13 @@ -[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) +# GitIngest - - - License - - - - PyPI version - - - - Downloads - - - - GitHub issues - - - - Code style: black - - - - - Discord - +[![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/cyclotruc/gitingest/blob/main/LICENSE) +[![PyPI version](https://badge.fury.io/py/gitingest.svg)](https://badge.fury.io/py/gitingest) +[![Downloads](https://pepy.tech/badge/gitingest)](https://pepy.tech/project/gitingest) +[![GitHub issues](https://img.shields.io/github/issues/cyclotruc/gitingest)](https://github.com/cyclotruc/gitingest/issues) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Discord](https://dcbadge.limes.pink/api/server/https://discord.com/invite/zerRaGK9EC)](https://discord.com/invite/zerRaGK9EC) -# GitIngest +[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) Turn any Git repository into a prompt-friendly text ingest for LLMs. @@ -92,15 +73,15 @@ By default, this won't write a file but can be enabled with the `output` argumen 1. Build the image: -``` bash -docker build -t gitingest . -``` + ``` bash + docker build -t gitingest . + ``` 2. Run the container: -``` bash -docker run -d --name gitingest -p 8000:8000 gitingest -``` + ``` bash + docker run -d --name gitingest -p 8000:8000 gitingest + ``` The application will be available at `http://localhost:8000` Ensure environment variables are set before running the application or deploying it via Docker. @@ -135,22 +116,20 @@ ALLOWED_HOSTS="gitingest.local,localhost" 1. Clone the repository -```bash -git clone https://github.com/cyclotruc/gitingest.git -cd gitingest -``` + ```bash + git clone https://github.com/cyclotruc/gitingest.git + cd gitingest + ``` 2. 
Install dependencies -```bash -pip install -r requirements.txt -``` + ```bash + pip install -r requirements.txt + ``` 3. Run the application: -```bash -cd src -uvicorn main:app --reload -``` - -The frontend will be available at `localhost:8000` + ```bash + cd src + uvicorn main:app --reload + ``` diff --git a/src/config.py b/src/config.py index b918fb2..8da41da 100644 --- a/src/config.py +++ b/src/config.py @@ -1,7 +1,7 @@ -MAX_DISPLAY_SIZE = 300_000 -TMP_BASE_PATH = "../tmp" +MAX_DISPLAY_SIZE: int = 300_000 +TMP_BASE_PATH: str = "../tmp" -EXAMPLE_REPOS = [ +EXAMPLE_REPOS: list[dict[str, str]] = [ {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, {"name": "Flask", "url": "https://github.com/pallets/flask"}, diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index c5f8a49..f275efa 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,19 +1,9 @@ -import os - import click from gitingest.ingest import ingest from gitingest.ingest_from_query import MAX_FILE_SIZE -def normalize_pattern(pattern: str) -> str: - pattern = pattern.strip() - pattern = pattern.lstrip(os.sep) - if pattern.endswith(os.sep): - pattern += "*" - return pattern - - @click.command() @click.argument("source", type=str, required=True) @click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index 97a990a..a91b6a9 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -3,7 +3,7 @@ from gitingest.utils import AsyncTimeoutError, async_timeout -CLONE_TIMEOUT = 20 +CLONE_TIMEOUT: int = 20 @dataclass @@ -14,67 +14,6 @@ class CloneConfig: branch: str | None = None -async def check_repo_exists(url: str) -> bool: - """ - Check if a repository exists at the given URL using an HTTP HEAD request. - - Parameters - ---------- - url : str - The URL of the repository. 
- - Returns - ------- - bool - True if the repository exists, False otherwise. - """ - proc = await asyncio.create_subprocess_exec( - "curl", - "-I", - url, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, _ = await proc.communicate() - if proc.returncode != 0: - return False - # Check if stdout contains "404" status code - stdout_str = stdout.decode() - return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str - - -async def run_git_command(*args: str) -> tuple[bytes, bytes]: - """ - Executes a git command asynchronously and captures its output. - - Parameters - ---------- - *args : str - The git command and its arguments to execute. - - Returns - ------- - Tuple[bytes, bytes] - A tuple containing the stdout and stderr of the git command. - - Raises - ------ - RuntimeError - If the git command exits with a non-zero status. - """ - proc = await asyncio.create_subprocess_exec( - *args, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - stdout, stderr = await proc.communicate() - if proc.returncode != 0: - error_message = stderr.decode().strip() - raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}") - - return stdout, stderr - - @async_timeout(CLONE_TIMEOUT) async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: """ @@ -116,7 +55,7 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: raise ValueError("The 'local_path' parameter is required.") # Check if the repository exists - if not await check_repo_exists(url): + if not await _check_repo_exists(url): raise ValueError("Repository not found, make sure it is public") try: @@ -124,21 +63,82 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: # Scenario 1: Clone and checkout a specific commit # Clone the repository without depth to ensure full history for checkout clone_cmd = ["git", "clone", "--single-branch", url, local_path] - await run_git_command(*clone_cmd) + await 
_run_git_command(*clone_cmd) # Checkout the specific commit checkout_cmd = ["git", "-C", local_path, "checkout", commit] - return await run_git_command(*checkout_cmd) + return await _run_git_command(*checkout_cmd) if branch and branch.lower() not in ("main", "master"): # Scenario 2: Clone a specific branch with shallow depth clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] - return await run_git_command(*clone_cmd) + return await _run_git_command(*clone_cmd) # Scenario 3: Clone the default branch with shallow depth clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path] - return await run_git_command(*clone_cmd) + return await _run_git_command(*clone_cmd) except (RuntimeError, asyncio.TimeoutError, AsyncTimeoutError): raise # Re-raise the exception + + +async def _check_repo_exists(url: str) -> bool: + """ + Check if a repository exists at the given URL using an HTTP HEAD request. + + Parameters + ---------- + url : str + The URL of the repository. + + Returns + ------- + bool + True if the repository exists, False otherwise. + """ + proc = await asyncio.create_subprocess_exec( + "curl", + "-I", + url, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, _ = await proc.communicate() + if proc.returncode != 0: + return False + # Check if stdout contains "404" status code + stdout_str = stdout.decode() + return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str + + +async def _run_git_command(*args: str) -> tuple[bytes, bytes]: + """ + Executes a git command asynchronously and captures its output. + + Parameters + ---------- + *args : str + The git command and its arguments to execute. + + Returns + ------- + Tuple[bytes, bytes] + A tuple containing the stdout and stderr of the git command. + + Raises + ------ + RuntimeError + If the git command exits with a non-zero status. 
+ """ + proc = await asyncio.create_subprocess_exec( + *args, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if proc.returncode != 0: + error_message = stderr.decode().strip() + raise RuntimeError(f"Git command failed: {' '.join(args)}\nError: {error_message}") + + return stdout, stderr diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 51cca8d..886afa2 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -10,7 +10,7 @@ MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB -def should_include(path: str, base_path: str, include_patterns: list[str]) -> bool: +def _should_include(path: str, base_path: str, include_patterns: list[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) include = False for pattern in include_patterns: @@ -19,17 +19,15 @@ def should_include(path: str, base_path: str, include_patterns: list[str]) -> bo return include -def should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool: +def _should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) for pattern in ignore_patterns: - if pattern == "": - continue - if fnmatch(rel_path, pattern): + if pattern and fnmatch(rel_path, pattern): return True return False -def is_safe_symlink(symlink_path: str, base_path: str) -> bool: +def _is_safe_symlink(symlink_path: str, base_path: str) -> bool: """Check if a symlink points to a location within the base directory.""" try: target_path = os.path.realpath(symlink_path) @@ -40,7 +38,7 @@ def is_safe_symlink(symlink_path: str, base_path: str) -> bool: return False -def is_text_file(file_path: str) -> bool: +def _is_text_file(file_path: str) -> bool: """Determines if a file is likely a text file based on its content.""" try: with open(file_path, "rb") as file: @@ -50,7 +48,7 @@ def is_text_file(file_path: 
str) -> bool: return False -def read_file_content(file_path: str) -> str: +def _read_file_content(file_path: str) -> str: try: with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() @@ -58,7 +56,7 @@ def read_file_content(file_path: str) -> str: return f"Error reading file: {str(e)}" -def scan_directory( +def _scan_directory( path: str, query: dict[str, Any], seen_paths: set[str] | None = None, @@ -68,6 +66,7 @@ def scan_directory( """Recursively analyzes a directory and its contents with safety limits.""" if seen_paths is None: seen_paths = set() + if stats is None: stats = {"total_files": 0, "total_size": 0} @@ -109,18 +108,18 @@ def scan_directory( for item in os.listdir(path): item_path = os.path.join(path, item) - if should_exclude(item_path, base_path, ignore_patterns): + if _should_exclude(item_path, base_path, ignore_patterns): continue is_file = os.path.isfile(item_path) if is_file and query["include_patterns"]: - if not should_include(item_path, base_path, include_patterns): + if not _should_include(item_path, base_path, include_patterns): result["ignore_content"] = True continue # Handle symlinks if os.path.islink(item_path): - if not is_safe_symlink(item_path, base_path): + if not _is_safe_symlink(item_path, base_path): print(f"Skipping symlink that points outside base directory: {item_path}") continue real_path = os.path.realpath(item_path) @@ -141,8 +140,8 @@ def scan_directory( print(f"Maximum file limit ({MAX_FILES}) reached") return result - is_text = is_text_file(real_path) - content = read_file_content(real_path) if is_text else "[Non-text file]" + is_text = _is_text_file(real_path) + content = _read_file_content(real_path) if is_text else "[Non-text file]" child = { "name": item, @@ -156,7 +155,7 @@ def scan_directory( result["file_count"] += 1 elif os.path.isdir(real_path): - subdir = scan_directory( + subdir = _scan_directory( path=real_path, query=query, seen_paths=seen_paths, @@ -185,8 +184,8 @@ def scan_directory( 
print(f"Maximum file limit ({MAX_FILES}) reached") return result - is_text = is_text_file(item_path) - content = read_file_content(item_path) if is_text else "[Non-text file]" + is_text = _is_text_file(item_path) + content = _read_file_content(item_path) if is_text else "[Non-text file]" child = { "name": item, @@ -200,7 +199,7 @@ def scan_directory( result["file_count"] += 1 elif os.path.isdir(item_path): - subdir = scan_directory( + subdir = _scan_directory( path=item_path, query=query, seen_paths=seen_paths, @@ -219,7 +218,7 @@ def scan_directory( return result -def extract_files_content( +def _extract_files_content( query: dict[str, Any], node: dict[str, Any], max_file_size: int, @@ -243,12 +242,12 @@ def extract_files_content( ) elif node["type"] == "directory": for child in node["children"]: - extract_files_content(query=query, node=child, max_file_size=max_file_size, files=files) + _extract_files_content(query=query, node=child, max_file_size=max_file_size, files=files) return files -def create_file_content_string(files: list[dict[str, Any]]) -> str: +def _create_file_content_string(files: list[dict[str, Any]]) -> str: """Creates a formatted string of file contents with separators.""" output = "" separator = "=" * 48 + "\n" @@ -278,7 +277,7 @@ def create_file_content_string(files: list[dict[str, Any]]) -> str: return output -def create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: +def _create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: """Creates a summary string with file counts and content size.""" if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" @@ -297,7 +296,7 @@ def create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: return summary -def create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: str = "", is_last: bool = True) -> str: +def _create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: str = 
"", is_last: bool = True) -> str: """Creates a tree-like string representation of the file structure.""" tree = "" @@ -314,12 +313,12 @@ def create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: s new_prefix = prefix + (" " if is_last else "β”‚ ") if node["name"] else prefix children = node["children"] for i, child in enumerate(children): - tree += create_tree_structure(query, child, new_prefix, i == len(children) - 1) + tree += _create_tree_structure(query, child, new_prefix, i == len(children) - 1) return tree -def generate_token_string(context_string: str) -> str | None: +def _generate_token_string(context_string: str) -> str | None: """Returns the number of tokens in a text string.""" formatted_tokens = "" try: @@ -340,16 +339,16 @@ def generate_token_string(context_string: str) -> str | None: return formatted_tokens -def ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str]: +def _ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str]: if not os.path.isfile(path): raise ValueError(f"Path {path} is not a file") file_size = os.path.getsize(path) - is_text = is_text_file(path) + is_text = _is_text_file(path) if not is_text: raise ValueError(f"File {path} is not a text file") - content = read_file_content(path) + content = _read_file_content(path) if file_size > query["max_file_size"]: content = "[Content ignored: file too large]" @@ -366,26 +365,26 @@ def ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str] f"Lines: {len(content.splitlines()):,}\n" ) - files_content = create_file_content_string([file_info]) + files_content = _create_file_content_string([file_info]) tree = "Directory structure:\n└── " + os.path.basename(path) - formatted_tokens = generate_token_string(files_content) + formatted_tokens = _generate_token_string(files_content) if formatted_tokens: summary += f"\nEstimated tokens: {formatted_tokens}" return summary, tree, files_content -def ingest_directory(path: 
str, query: dict[str, Any]) -> tuple[str, str, str]: - nodes = scan_directory(path=path, query=query) +def _ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: + nodes = _scan_directory(path=path, query=query) if not nodes: raise ValueError(f"No files found in {path}") - files = extract_files_content(query=query, node=nodes, max_file_size=query["max_file_size"]) - summary = create_summary_string(query, nodes) - tree = "Directory structure:\n" + create_tree_structure(query, nodes) - files_content = create_file_content_string(files) + files = _extract_files_content(query=query, node=nodes, max_file_size=query["max_file_size"]) + summary = _create_summary_string(query, nodes) + tree = "Directory structure:\n" + _create_tree_structure(query, nodes) + files_content = _create_file_content_string(files) - formatted_tokens = generate_token_string(tree + files_content) + formatted_tokens = _generate_token_string(tree + files_content) if formatted_tokens: summary += f"\nEstimated tokens: {formatted_tokens}" @@ -394,11 +393,11 @@ def ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]: """Main entry point for analyzing a codebase directory or single file.""" - path = os.path.join(query["local_path"], query["subpath"].lstrip(os.sep)) - if not os.path.exists(path) and not os.path.exists(os.path.dirname(path)): - raise ValueError(f"{query['subpath']} cannot be found") + path = f"{query['local_path']}{query['subpath']}" + if not os.path.exists(path): + raise ValueError(f"{query['slug']} cannot be found") if query.get("type") == "blob": - return ingest_single_file(path, query) + return _ingest_single_file(path, query) - return ingest_directory(path, query) + return _ingest_directory(path, query) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 2e6470e..477520a 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py 
@@ -6,11 +6,11 @@ from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS -TMP_BASE_PATH = "../tmp" +TMP_BASE_PATH: str = "../tmp" HEX_DIGITS = set(string.hexdigits) -def parse_url(url: str) -> dict[str, Any]: +def _parse_url(url: str) -> dict[str, Any]: url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters @@ -69,14 +69,14 @@ def _is_valid_git_commit_hash(commit: str) -> bool: return len(commit) == 40 and all(c in HEX_DIGITS for c in commit) -def normalize_pattern(pattern: str) -> str: +def _normalize_pattern(pattern: str) -> str: pattern = pattern.lstrip(os.sep) if pattern.endswith(os.sep): pattern += "*" return pattern -def parse_patterns(pattern: list[str] | str) -> list[str]: +def _parse_patterns(pattern: list[str] | str) -> list[str]: patterns = pattern if isinstance(pattern, list) else [pattern] patterns = [p.strip() for p in patterns] @@ -87,10 +87,10 @@ def parse_patterns(pattern: list[str] | str) -> list[str]: "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." ) - return [normalize_pattern(p) for p in patterns] + return [_normalize_pattern(p) for p in patterns] -def override_ignore_patterns(ignore_patterns: list[str], include_patterns: list[str]) -> list[str]: +def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list[str]) -> list[str]: """ Removes patterns from ignore_patterns that are present in include_patterns using set difference. 
@@ -109,7 +109,7 @@ def override_ignore_patterns(ignore_patterns: list[str], include_patterns: list[ return list(set(ignore_patterns) - set(include_patterns)) -def parse_path(path: str) -> dict[str, Any]: +def _parse_path(path: str) -> dict[str, Any]: query = { "url": None, "local_path": os.path.abspath(path), @@ -151,19 +151,19 @@ def parse_query( """ # Determine the parsing method based on the source type if from_web or source.startswith("https://") or "github.com" in source: - query = parse_url(source) + query = _parse_url(source) else: - query = parse_path(source) + query = _parse_path(source) # Process ignore patterns ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy() if ignore_patterns: - ignore_patterns_list += parse_patterns(ignore_patterns) + ignore_patterns_list += _parse_patterns(ignore_patterns) # Process include patterns and override ignore patterns accordingly if include_patterns: - parsed_include = parse_patterns(include_patterns) - ignore_patterns_list = override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include) + parsed_include = _parse_patterns(include_patterns) + ignore_patterns_list = _override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include) else: parsed_include = None diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 585ba6e..e3b8128 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -2,7 +2,7 @@ import pytest -from gitingest.clone import CloneConfig, check_repo_exists, clone_repo +from gitingest.clone import CloneConfig, _check_repo_exists, clone_repo @pytest.mark.asyncio @@ -14,9 +14,8 @@ async def test_clone_repo_with_commit() -> None: branch="main", ) - with patch("gitingest.clone.check_repo_exists", return_value=True) as mock_check: - with patch("gitingest.clone.run_git_command", new_callable=AsyncMock) as mock_exec: - + with patch("gitingest.clone._check_repo_exists", return_value=True) as mock_check: + with 
patch("gitingest.clone._run_git_command", new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process @@ -29,8 +28,8 @@ async def test_clone_repo_with_commit() -> None: async def test_clone_repo_without_commit() -> None: query = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", commit=None, branch="main") - with patch("gitingest.clone.check_repo_exists", return_value=True) as mock_check: - with patch("gitingest.clone.run_git_command", new_callable=AsyncMock) as mock_exec: + with patch("gitingest.clone._check_repo_exists", return_value=True) as mock_check: + with patch("gitingest.clone._run_git_command", new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process @@ -48,7 +47,7 @@ async def test_clone_repo_nonexistent_repository() -> None: commit=None, branch="main", ) - with patch("gitingest.clone.check_repo_exists", return_value=False) as mock_check: + with patch("gitingest.clone._check_repo_exists", return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): await clone_repo(clone_config) mock_check.assert_called_once_with(clone_config.url) @@ -65,13 +64,13 @@ async def test_check_repo_exists() -> None: # Test existing repository mock_process.returncode = 0 - assert await check_repo_exists(url) is True + assert await _check_repo_exists(url) is True # Test non-existing repository (404 response) mock_process.communicate.return_value = (b"HTTP/1.1 404 Not Found\n", b"") mock_process.returncode = 0 - assert await check_repo_exists(url) is False + assert await _check_repo_exists(url) is False # Test failed request mock_process.returncode = 1 - assert await check_repo_exists(url) is False + assert await _check_repo_exists(url) is False diff --git a/src/gitingest/tests/test_ingest.py 
b/src/gitingest/tests/test_ingest.py index fa8369a..53257a1 100644 --- a/src/gitingest/tests/test_ingest.py +++ b/src/gitingest/tests/test_ingest.py @@ -3,7 +3,7 @@ import pytest -from gitingest.ingest_from_query import extract_files_content, scan_directory +from gitingest.ingest_from_query import _extract_files_content, _scan_directory # Test fixtures @@ -74,7 +74,7 @@ def temp_directory(tmp_path: Path) -> Path: def test_scan_directory(temp_directory: Path, sample_query: dict[str, Any]) -> None: - result = scan_directory(str(temp_directory), query=sample_query) + result = _scan_directory(str(temp_directory), query=sample_query) if result is None: assert False, "Result is None" @@ -85,10 +85,10 @@ def test_scan_directory(temp_directory: Path, sample_query: dict[str, Any]) -> N def test_extract_files_content(temp_directory: Path, sample_query: dict[str, Any]) -> None: - nodes = scan_directory(str(temp_directory), query=sample_query) + nodes = _scan_directory(str(temp_directory), query=sample_query) if nodes is None: assert False, "Nodes is None" - files = extract_files_content(query=sample_query, node=nodes, max_file_size=1_000_000) + files = _extract_files_content(query=sample_query, node=nodes, max_file_size=1_000_000) assert len(files) == 8 # All .txt and .py files # Check for presence of key files diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index 71ff71e..b87856d 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -1,7 +1,7 @@ import pytest from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS -from gitingest.parse_query import parse_query, parse_url +from gitingest.parse_query import _parse_url, parse_query def test_parse_url_valid() -> None: @@ -11,7 +11,7 @@ def test_parse_url_valid() -> None: "https://bitbucket.org/user/repo", ] for url in test_cases: - result = parse_url(url) + result = _parse_url(url) assert result["user_name"] == "user" assert 
result["repo_name"] == "repo" assert result["url"] == url @@ -20,7 +20,7 @@ def test_parse_url_valid() -> None: def test_parse_url_invalid() -> None: url = "https://only-domain.com" with pytest.raises(ValueError, match="Invalid repository URL"): - parse_url(url) + _parse_url(url) def test_parse_query_basic() -> None: diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index 8406d5c..82b8e30 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -1,4 +1,3 @@ -## Async Timeout decorator import asyncio import functools from collections.abc import Awaitable, Callable @@ -13,6 +12,7 @@ class AsyncTimeoutError(Exception): def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: + # Async Timeout decorator def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: @functools.wraps(func) async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: diff --git a/src/main.py b/src/main.py index 18de770..d00367b 100644 --- a/src/main.py +++ b/src/main.py @@ -13,65 +13,124 @@ from routers import download, dynamic, index from server_utils import limiter +# Load environment variables from .env file load_dotenv() +# Initialize the FastAPI application app = FastAPI() app.state.limiter = limiter -# Define a wrapper handler with the correct signature async def rate_limit_exception_handler(request: Request, exc: Exception) -> Response: + """ + Custom exception handler for rate-limiting errors. + + Parameters + ---------- + request : Request + The incoming HTTP request. + exc : Exception + The exception raised, expected to be RateLimitExceeded. + + Returns + ------- + Response + A response indicating that the rate limit has been exceeded. 
+ """ if isinstance(exc, RateLimitExceeded): - # Delegate to the actual handler + # Delegate to the default rate limit handler return _rate_limit_exceeded_handler(request, exc) - # Optionally, handle other exceptions or re-raise + # Re-raise other exceptions raise exc -# Register the wrapper handler +# Register the custom exception handler for rate limits app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) +# Mount static files to serve CSS, JS, and other static assets app.mount("/static", StaticFiles(directory="static"), name="static") -app_analytics_key = os.getenv("API_ANALYTICS_KEY") -if app_analytics_key: - app.add_middleware(Analytics, api_key=app_analytics_key) -# Define the default allowed hosts -default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] +# Set up API analytics middleware if an API key is provided +if app_analytics_key := os.getenv("API_ANALYTICS_KEY"): + app.add_middleware(Analytics, api_key=app_analytics_key) -# Fetch allowed hosts from the environment variable or use the default +# Fetch allowed hosts from the environment or use the default values allowed_hosts = os.getenv("ALLOWED_HOSTS") if allowed_hosts: allowed_hosts = allowed_hosts.split(",") else: + # Define the default allowed hosts for the application + default_allowed_hosts = ["gitingest.com", "*.gitingest.com", "localhost", "127.0.0.1"] allowed_hosts = default_allowed_hosts +# Add middleware to enforce allowed hosts app.add_middleware(TrustedHostMiddleware, allowed_hosts=allowed_hosts) + +# Set up template rendering templates = Jinja2Templates(directory="templates") @app.get("/health") async def health_check() -> dict[str, str]: + """ + Health check endpoint to verify that the server is running. + + Returns + ------- + dict[str, str] + A JSON object with a "status" key indicating the server's health status. 
+ """ return {"status": "healthy"} @app.head("/") async def head_root() -> HTMLResponse: - """Mirror the headers and status code of the index page""" + """ + Respond to HTTP HEAD requests for the root URL. + + Mirrors the headers and status code of the index page. + + Returns + ------- + HTMLResponse + An empty HTML response with appropriate headers. + """ return HTMLResponse(content=None, headers={"content-type": "text/html; charset=utf-8"}) @app.get("/api/", response_class=HTMLResponse) @app.get("/api", response_class=HTMLResponse) async def api_docs(request: Request) -> HTMLResponse: + """ + Render the API documentation page. + + Parameters + ---------- + request : Request + The incoming HTTP request. + + Returns + ------- + HTMLResponse + A rendered HTML page displaying API documentation. + """ return templates.TemplateResponse("api.jinja", {"request": request}) @app.get("/robots.txt") async def robots() -> FileResponse: + """ + Serve the `robots.txt` file to guide search engine crawlers. + + Returns + ------- + FileResponse + The `robots.txt` file located in the static directory. + """ return FileResponse("static/robots.txt") +# Include routers for modular endpoints app.include_router(index) app.include_router(download) app.include_router(dynamic) diff --git a/src/process_query.py b/src/process_query.py index f55068c..470b675 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -12,6 +12,21 @@ def print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: + """ + Print a formatted summary of the query details, including the URL, file size, + and pattern information, for easier debugging or logging. + + Parameters + ---------- + url : str + The URL associated with the query. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. 
+ """ print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") if int(max_file_size / 1024) != 50: print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") @@ -22,12 +37,46 @@ def print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) - def print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: + """ + Print a formatted error message including the URL, file size, pattern details, and the exception encountered, + for debugging or logging purposes. + + Parameters + ---------- + url : str + The URL associated with the query that caused the error. + e : Exception + The exception raised during the query or process. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. + """ print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print_query(url, max_file_size, pattern_type, pattern) print(f" | {Colors.RED}{e}{Colors.END}") def print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: + """ + Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated + tokens, for debugging or logging purposes. + + Parameters + ---------- + url : str + The URL associated with the successful query. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. + summary : str + A summary of the query result, including details like estimated tokens. 
+ """ estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") print_query(url, max_file_size, pattern_type, pattern) @@ -42,6 +91,32 @@ async def process_query( pattern: str = "", is_index: bool = False, ) -> _TemplateResponse: + """ + Process a query by parsing input, cloning a repository, and generating a summary. + + Handle user input, process GitHub repository data, and prepare + a response for rendering a template with the processed results or an error message. + + Parameters + ---------- + request : Request + The HTTP request object. + input_text : str + Input text provided by the user, typically a GitHub repository URL or slug. + slider_position : int + Position of the slider, representing the maximum file size in the query. + pattern_type : str, optional + Type of pattern to use, either "include" or "exclude" (default is "exclude"). + pattern : str, optional + Pattern to include or exclude in the query, depending on the pattern type. + is_index : bool, optional + Flag indicating whether the request is for the index page (default is False). + + Returns + ------- + _TemplateResponse + Rendered template response containing the processed results or an error message. + """ template = "index.jinja" if is_index else "github.jinja" max_file_size = logSliderToSize(slider_position) diff --git a/src/server_utils.py b/src/server_utils.py index 2a6e186..7af4b85 100644 --- a/src/server_utils.py +++ b/src/server_utils.py @@ -1,19 +1,29 @@ import math -## Rate Limiter from slowapi import Limiter from slowapi.util import get_remote_address +# Initialize a rate limiter limiter = Limiter(key_func=get_remote_address) -## Logarithmic slider to file size conversion def logSliderToSize(position: int) -> int: - """Convert slider position to file size in KB""" + """ + Convert a slider position to a file size in bytes using a logarithmic scale. 
+ + Parameters + ---------- + position : int + Slider position ranging from 0 to 500. + + Returns + ------- + int + File size in bytes corresponding to the slider position. + """ maxp = 500 minv = math.log(1) - maxv = math.log(102400) - + maxv = math.log(102_400) return round(math.exp(minv + (maxv - minv) * pow(position / maxp, 1.5))) * 1024 From d77741bddf2c8855a28ad02ae961a315597ee694 Mon Sep 17 00:00:00 2001 From: Alikae <35228727+Alikae@users.noreply.github.com> Date: Mon, 30 Dec 2024 04:56:28 +0100 Subject: [PATCH 12/18] Fix: include patterns (#76) --- src/gitingest/parse_query.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 477520a..0d41e75 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,4 +1,5 @@ import os +import re import string import uuid from typing import Any @@ -77,17 +78,45 @@ def _normalize_pattern(pattern: str) -> str: def _parse_patterns(pattern: list[str] | str) -> list[str]: + """ + Parse and validate file/directory patterns for inclusion or exclusion. + + Takes either a single pattern string or list of pattern strings and processes them into a normalized list. + Patterns are split on commas and spaces, validated for allowed characters, and normalized. + + Parameters + ---------- + pattern : list[str] | str + Pattern(s) to parse - either a single string or list of strings + + Returns + ------- + list[str] + List of normalized pattern strings + + Raises + ------ + ValueError + If any pattern contains invalid characters. Only alphanumeric characters, + dash (-), underscore (_), dot (.), forward slash (/), plus (+), and + asterisk (*) are allowed. 
+ """ patterns = pattern if isinstance(pattern, list) else [pattern] - patterns = [p.strip() for p in patterns] + parsed_patterns = [] for p in patterns: + parsed_patterns.extend(re.split(",| ", p)) + + parsed_patterns = [p for p in parsed_patterns if p != ""] + + for p in parsed_patterns: if not all(c.isalnum() or c in "-_./+*" for c in p): raise ValueError( f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." ) - return [_normalize_pattern(p) for p in patterns] + return [_normalize_pattern(p) for p in parsed_patterns] def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list[str]) -> list[str]: From fab90a6f4fbb4d33be0f4a8287075548721c8505 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Mon, 30 Dec 2024 12:16:54 +0100 Subject: [PATCH 13/18] Add docstrings, refactor process_query, and move AsyncTimeoutError to gitingest.exceptions (#77) * refactor: prefix helper functions with an underscore * Add docstrings to functions and move AsyncTimeoutError to gitingest.exceptions * Refactor: Move process_query to top and prefix helper functions with an underscore --- src/gitingest/cli.py | 27 ++- src/gitingest/clone.py | 31 ++- src/gitingest/exceptions.py | 29 +++ src/gitingest/ingest.py | 36 +++- src/gitingest/ingest_from_query.py | 290 ++++++++++++++++++++++++++++- src/gitingest/parse_query.py | 208 +++++++++++++++------ src/gitingest/utils.py | 31 ++- src/process_query.py | 148 +++++++-------- src/routers/download.py | 27 +++ src/routers/dynamic.py | 44 +++++ src/routers/index.py | 43 +++++ 11 files changed, 763 insertions(+), 151 deletions(-) create mode 100644 src/gitingest/exceptions.py diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index f275efa..57d9f2c 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -17,7 +17,32 @@ def main( 
exclude_pattern: tuple[str, ...], include_pattern: tuple[str, ...], ) -> None: - """Analyze a directory and create a text dump of its contents.""" + """ + Analyze a directory or repository and create a text dump of its contents. + + This command analyzes the contents of a specified source directory or repository, + applies custom include and exclude patterns, and generates a text summary of the analysis + which is then written to an output file. + + Parameters + ---------- + source : str + The source directory or repository to analyze. + output : str | None + The path where the output file will be written. If not specified, the output will be written + to a file named `.txt` in the current directory. + max_size : int + The maximum file size to process, in bytes. Files larger than this size will be ignored. + exclude_pattern : tuple[str, ...] + A tuple of patterns to exclude during the analysis. Files matching these patterns will be ignored. + include_pattern : tuple[str, ...] + A tuple of patterns to include during the analysis. Only files matching these patterns will be processed. + + Raises + ------ + click.Abort + If there is an error during the execution of the command, this exception is raised to abort the process. + """ try: # Combine default and custom ignore patterns exclude_patterns = list(exclude_pattern) diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index a91b6a9..da6550f 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,13 +1,32 @@ import asyncio from dataclasses import dataclass -from gitingest.utils import AsyncTimeoutError, async_timeout +from gitingest.exceptions import AsyncTimeoutError +from gitingest.utils import async_timeout CLONE_TIMEOUT: int = 20 @dataclass class CloneConfig: + """ + Configuration for cloning a Git repository. 
+ + This class holds the necessary parameters for cloning a repository to a local path, including + the repository's URL, the target local path, and optional parameters for a specific commit or branch. + + Attributes + ---------- + url : str + The URL of the Git repository to clone. + local_path : str + The local directory where the repository will be cloned. + commit : str | None, optional + The specific commit hash to check out after cloning (default is None). + branch : str | None, optional + The branch to clone (default is None). + """ + url: str local_path: str commit: str | None = None @@ -17,7 +36,11 @@ class CloneConfig: @async_timeout(CLONE_TIMEOUT) async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: """ - Clones a repository to a local path based on the provided query parameters. + Clones a repository to a local path based on the provided configuration. + + This function handles the process of cloning a Git repository to the local file system. + It can clone a specific branch or commit if provided, and it raises exceptions if + any errors occur during the cloning process. Parameters ---------- @@ -30,7 +53,7 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: Returns ------- - Tuple[bytes, bytes] + tuple[bytes, bytes] A tuple containing the stdout and stderr of the git commands executed. Raises @@ -123,7 +146,7 @@ async def _run_git_command(*args: str) -> tuple[bytes, bytes]: Returns ------- - Tuple[bytes, bytes] + tuple[bytes, bytes] A tuple containing the stdout and stderr of the git command. Raises diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py new file mode 100644 index 0000000..34263e4 --- /dev/null +++ b/src/gitingest/exceptions.py @@ -0,0 +1,29 @@ +class InvalidPatternError(ValueError): + """ + Exception raised when a pattern contains invalid characters. + + This exception is used to signal that a pattern provided for some operation + contains characters that are not allowed. 
The valid characters for the pattern + include alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), + plus (+), and asterisk (*). + + Parameters + ---------- + pattern : str + The invalid pattern that caused the error. + """ + + def __init__(self, pattern: str) -> None: + super().__init__( + f"Pattern '{pattern}' contains invalid characters. Only alphanumeric characters, dash (-), " + "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." + ) + + +class AsyncTimeoutError(Exception): + """ + Raised when an async operation exceeds its timeout limit. + + This exception is used by the `async_timeout` decorator to signal that the wrapped + asynchronous function has exceeded the specified time limit for execution. + """ diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index e4c673d..4bb329f 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -15,7 +15,39 @@ def ingest( exclude_patterns: list[str] | str | None = None, output: str | None = None, ) -> tuple[str, str, str]: + """ + Main entry point for ingesting a source and processing its contents. + This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), + and processes its files according to the specified query parameters. It returns a summary, a tree-like + structure of the files, and the content of the files. The results can optionally be written to an output file. + + Parameters + ---------- + source : str + The source to analyze, which can be a URL (for a GitHub repository) or a local directory path. + max_file_size : int, optional + The maximum allowed file size for file ingestion. Files larger than this size are ignored, by default 10*1024*1024 (10 MB). + include_patterns : list[str] | str | None, optional + A pattern or list of patterns specifying which files to include in the analysis. If `None`, all files are included. 
+ exclude_patterns : list[str] | str | None, optional + A pattern or list of patterns specifying which files to exclude from the analysis. If `None`, no files are excluded. + output : str | None, optional + The file path where the summary and content should be written. If `None`, the results are not written to a file. + + Returns + ------- + tuple[str, str, str] + A tuple containing: + - A summary string of the analyzed repository or directory. + - A tree-like string representation of the file structure. + - The content of the files in the repository or directory. + + Raises + ------ + TypeError + If `clone_repo` does not return a coroutine, or if the `source` is of an unsupported type. + """ try: query = parse_query( source=source, @@ -42,8 +74,8 @@ def ingest( summary, tree, content = ingest_from_query(query) - if output: - with open(f"{output}", "w") as f: + if output is not None: + with open(output, "w") as f: f.write(tree + "\n" + content) return summary, tree, content diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index 886afa2..d8f57b7 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -11,6 +11,27 @@ def _should_include(path: str, base_path: str, include_patterns: list[str]) -> bool: + """ + Determines if the given file or directory path matches any of the include patterns. + + This function checks whether the relative path of a file or directory matches + any of the specified patterns. If a match is found, it returns `True`, indicating + that the file or directory should be included in further processing. + + Parameters + ---------- + path : str + The absolute path of the file or directory to check. + base_path : str + The base directory from which the relative path is calculated. + include_patterns : list[str] + A list of patterns to check against the relative path. + + Returns + ------- + bool + `True` if the path matches any of the include patterns, `False` otherwise. 
+ """ rel_path = path.replace(base_path, "").lstrip(os.sep) include = False for pattern in include_patterns: @@ -20,6 +41,27 @@ def _should_include(path: str, base_path: str, include_patterns: list[str]) -> b def _should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool: + """ + Determines if the given file or directory path matches any of the ignore patterns. + + This function checks whether the relative path of a file or directory matches + any of the specified ignore patterns. If a match is found, it returns `True`, indicating + that the file or directory should be excluded from further processing. + + Parameters + ---------- + path : str + The absolute path of the file or directory to check. + base_path : str + The base directory from which the relative path is calculated. + ignore_patterns : list[str] + A list of patterns to check against the relative path. + + Returns + ------- + bool + `True` if the path matches any of the ignore patterns, `False` otherwise. + """ rel_path = path.replace(base_path, "").lstrip(os.sep) for pattern in ignore_patterns: if pattern and fnmatch(rel_path, pattern): @@ -28,7 +70,25 @@ def _should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bo def _is_safe_symlink(symlink_path: str, base_path: str) -> bool: - """Check if a symlink points to a location within the base directory.""" + """ + Check if a symlink points to a location within the base directory. + + This function resolves the target of a symlink and ensures it is within the specified + base directory, returning `True` if it is safe, or `False` if the symlink points outside + the base directory. + + Parameters + ---------- + symlink_path : str + The path of the symlink to check. + base_path : str + The base directory to ensure the symlink points within. + + Returns + ------- + bool + `True` if the symlink points within the base directory, `False` otherwise. 
+ """ try: target_path = os.path.realpath(symlink_path) base_path = os.path.realpath(base_path) @@ -39,7 +99,23 @@ def _is_safe_symlink(symlink_path: str, base_path: str) -> bool: def _is_text_file(file_path: str) -> bool: - """Determines if a file is likely a text file based on its content.""" + """ + Determine if a file is likely a text file based on its content. + + This function attempts to read the first 1024 bytes of a file and checks for the presence + of non-text characters. It returns `True` if the file is determined to be a text file, + otherwise returns `False`. + + Parameters + ---------- + file_path : str + The path to the file to check. + + Returns + ------- + bool + `True` if the file is likely a text file, `False` otherwise. + """ try: with open(file_path, "rb") as file: chunk = file.read(1024) @@ -49,6 +125,23 @@ def _is_text_file(file_path: str) -> bool: def _read_file_content(file_path: str) -> str: + """ + Reads the content of a file. + + This function attempts to open a file and read its contents using UTF-8 encoding. + If an error occurs during reading (e.g., file is not found or permission error), + it returns an error message. + + Parameters + ---------- + file_path : str + The path to the file to read. + + Returns + ------- + str + The content of the file, or an error message if the file could not be read. + """ try: with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() @@ -63,7 +156,31 @@ def _scan_directory( depth: int = 0, stats: dict[str, int] | None = None, ) -> dict[str, Any] | None: - """Recursively analyzes a directory and its contents with safety limits.""" + """ + Recursively analyze a directory and its contents with safety limits. + + This function scans a directory and its subdirectories up to a specified depth. It checks + for any file or directory that should be included or excluded based on the provided patterns + and limits. It also tracks the number of files and total size processed. 
+ + Parameters + ---------- + path : str + The path of the directory to scan. + query : dict[str, Any] + A dictionary containing the query parameters, such as include and ignore patterns. + seen_paths : set[str] | None, optional + A set to track already visited paths, by default None. + depth : int, optional + The current depth of directory traversal, by default 0. + stats : dict[str, int] | None, optional + A dictionary to track statistics such as total file count and size, by default None. + + Returns + ------- + dict[str, Any] | None + A dictionary representing the directory structure and contents, or `None` if limits are reached. + """ if seen_paths is None: seen_paths = set() @@ -224,7 +341,28 @@ def _extract_files_content( max_file_size: int, files: list[dict[str, Any]] | None = None, ) -> list[dict[str, Any]]: - """Recursively collects all text files with their contents.""" + """ + Recursively collect all text files with their contents. + + This function traverses the directory tree and extracts the contents of all text files + into a list, ignoring non-text files or files that exceed the specified size limit. + + Parameters + ---------- + query : dict[str, Any] + A dictionary containing the query parameters, including the base path of the repository. + node : dict[str, Any] + The current directory or file node being processed. + max_file_size : int + The maximum file size in bytes for which content should be extracted. + files : list[dict[str, Any]] | None, optional + A list to collect the extracted files' information, by default None. + + Returns + ------- + list[dict[str, Any]] + A list of dictionaries, each containing the path, content (or `None` if too large), and size of each file. 
+ """ if files is None: files = [] @@ -248,7 +386,22 @@ def _extract_files_content( def _create_file_content_string(files: list[dict[str, Any]]) -> str: - """Creates a formatted string of file contents with separators.""" + """ + Create a formatted string of file contents with separators. + + This function takes a list of files and generates a formatted string where each file’s + content is separated by a divider. If a README.md file is found, it is placed at the top. + + Parameters + ---------- + files : list[dict[str, Any]] + A list of dictionaries containing file information, including the path and content. + + Returns + ------- + str + A formatted string representing the contents of all the files with appropriate separators. + """ output = "" separator = "=" * 48 + "\n" @@ -278,7 +431,24 @@ def _create_file_content_string(files: list[dict[str, Any]]) -> str: def _create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: - """Creates a summary string with file counts and content size.""" + """ + Create a summary string with file counts and content size. + + This function generates a summary of the repository's contents, including the number + of files analyzed, the total content size, and other relevant details based on the query parameters. + + Parameters + ---------- + query : dict[str, Any] + A dictionary containing query parameters like repository name, commit, branch, and subpath. + nodes : dict[str, Any] + A dictionary representing the directory structure, including file and directory counts. + + Returns + ------- + str + A summary string containing details such as the repository name, file count, and other query-specific information. 
+ """ if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" else: @@ -297,7 +467,28 @@ def _create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: def _create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: str = "", is_last: bool = True) -> str: - """Creates a tree-like string representation of the file structure.""" + """ + Create a tree-like string representation of the file structure. + + This function generates a string representation of the directory structure, formatted + as a tree with appropriate indentation for nested directories and files. + + Parameters + ---------- + query : dict[str, Any] + A dictionary containing query parameters like repository name and subpath. + node : dict[str, Any] + The current directory or file node being processed. + prefix : str, optional + A string used for indentation and formatting of the tree structure, by default "". + is_last : bool, optional + A flag indicating whether the current node is the last in its directory, by default True. + + Returns + ------- + str + A string representing the directory structure formatted as a tree. + """ tree = "" if not node["name"]: @@ -319,7 +510,22 @@ def _create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: def _generate_token_string(context_string: str) -> str | None: - """Returns the number of tokens in a text string.""" + """ + Return the number of tokens in a text string. + + This function estimates the number of tokens in a given text string using the `tiktoken` + library. It returns the number of tokens in a human-readable format (e.g., '1.2k', '1.2M'). + + Parameters + ---------- + context_string : str + The text string for which the token count is to be estimated. + + Returns + ------- + str | None + The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. 
+ """ formatted_tokens = "" try: encoding = tiktoken.get_encoding("cl100k_base") @@ -340,6 +546,29 @@ def _generate_token_string(context_string: str) -> str | None: def _ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str]: + """ + Ingest a single file and return its summary, directory structure, and content. + + This function reads a file, generates a summary of its contents, and returns the content + along with its directory structure and token estimation. + + Parameters + ---------- + path : str + The path of the file to ingest. + query : dict[str, Any] + A dictionary containing query parameters, such as the maximum file size. + + Returns + ------- + tuple[str, str, str] + A tuple containing the summary, directory structure, and file content. + + Raises + ------ + ValueError + If the specified path is not a file or if the file is not a text file. + """ if not os.path.isfile(path): raise ValueError(f"Path {path} is not a file") @@ -376,6 +605,29 @@ def _ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str def _ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: + """ + Ingest an entire directory and return its summary, directory structure, and file contents. + + This function processes a directory, extracts its contents, and generates a summary, + directory structure, and file content. It recursively processes subdirectories as well. + + Parameters + ---------- + path : str + The path of the directory to ingest. + query : dict[str, Any] + A dictionary containing query parameters, including maximum file size. + + Returns + ------- + tuple[str, str, str] + A tuple containing the summary, directory structure, and file contents. + + Raises + ------ + ValueError + If no files are found in the directory. 
+ """ nodes = _scan_directory(path=path, query=query) if not nodes: raise ValueError(f"No files found in {path}") @@ -392,7 +644,27 @@ def _ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]: - """Main entry point for analyzing a codebase directory or single file.""" + """ + Main entry point for analyzing a codebase directory or single file. + + This function processes a file or directory based on the provided query, extracting its contents + and generating a summary, directory structure, and file content, along with token estimations. + + Parameters + ---------- + query : dict[str, Any] + A dictionary containing parameters like local path, subpath, file type, etc. + + Returns + ------- + tuple[str, str, str] + A tuple containing the summary, directory structure, and file contents. + + Raises + ------ + ValueError + If the specified path cannot be found or if the file is not a text file. + """ path = f"{query['local_path']}{query['subpath']}" if not os.path.exists(path): raise ValueError(f"{query['slug']} cannot be found") diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 0d41e75..18a78e9 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -5,13 +5,98 @@ from typing import Any from urllib.parse import unquote +from gitingest.exceptions import InvalidPatternError from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH: str = "../tmp" HEX_DIGITS = set(string.hexdigits) +def parse_query( + source: str, + max_file_size: int, + from_web: bool, + include_patterns: list[str] | str | None = None, + ignore_patterns: list[str] | str | None = None, +) -> dict[str, Any]: + """ + Parses the input source to construct a query dictionary with specified parameters. 
+ + This function processes the provided source (either a URL or file path) and builds a + query dictionary that includes information such as the source URL, maximum file size, + and any patterns to include or ignore. It handles both web and file-based sources. + + Parameters + ---------- + source : str + The source URL or file path to parse. + max_file_size : int + The maximum file size in bytes to include. + from_web : bool + Flag indicating whether the source is a web URL. + include_patterns : list[str] | str | None, optional + Patterns to include, by default None. Can be a list of strings or a single string. + ignore_patterns : list[str] | str | None, optional + Patterns to ignore, by default None. Can be a list of strings or a single string. + + Returns + ------- + dict[str, Any] + A dictionary containing the parsed query parameters, including 'max_file_size', + 'ignore_patterns', and 'include_patterns'. + """ + # Determine the parsing method based on the source type + if from_web or source.startswith("https://") or "github.com" in source: + query = _parse_url(source) + else: + query = _parse_path(source) + + # Process ignore patterns + ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy() + if ignore_patterns: + ignore_patterns_list += _parse_patterns(ignore_patterns) + + # Process include patterns and override ignore patterns accordingly + if include_patterns: + parsed_include = _parse_patterns(include_patterns) + ignore_patterns_list = _override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include) + else: + parsed_include = None + + # Update the query dictionary with max_file_size and processed patterns + query.update( + { + "max_file_size": max_file_size, + "ignore_patterns": ignore_patterns_list, + "include_patterns": parsed_include, + } + ) + return query + + def _parse_url(url: str) -> dict[str, Any]: + """ + Parses a GitHub repository URL into a structured query dictionary. 
+ + This function extracts relevant information from a GitHub URL, such as the username, + repository name, commit, branch, and subpath, and returns them in a structured format. + + Parameters + ---------- + url : str + The GitHub URL to parse. + + Returns + ------- + dict[str, Any] + A dictionary containing the parsed details of the GitHub repository, including + the username, repository name, commit, branch, and other relevant information. + + Raises + ------ + ValueError + If the URL is invalid or does not correspond to a valid Git repository. + """ url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters @@ -67,10 +152,42 @@ def _parse_url(url: str) -> dict[str, Any]: def _is_valid_git_commit_hash(commit: str) -> bool: + """ + Validates if the provided string is a valid Git commit hash. + + This function checks if the commit hash is a 40-character string consisting only + of hexadecimal digits, which is the standard format for Git commit hashes. + + Parameters + ---------- + commit : str + The string to validate as a Git commit hash. + + Returns + ------- + bool + True if the string is a valid 40-character Git commit hash, otherwise False. + """ return len(commit) == 40 and all(c in HEX_DIGITS for c in commit) def _normalize_pattern(pattern: str) -> str: + """ + Normalizes the given pattern by removing leading separators and appending a wildcard. + + This function processes the pattern string by stripping leading directory separators + and appending a wildcard (`*`) if the pattern ends with a separator. + + Parameters + ---------- + pattern : str + The pattern to normalize. + + Returns + ------- + str + The normalized pattern. + """ pattern = pattern.lstrip(os.sep) if pattern.endswith(os.sep): pattern += "*" @@ -96,7 +213,7 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]: Raises ------ - ValueError + InvalidPatternError If any pattern contains invalid characters. 
Only alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed. @@ -110,11 +227,8 @@ def _parse_patterns(pattern: list[str] | str) -> list[str]: parsed_patterns = [p for p in parsed_patterns if p != ""] for p in parsed_patterns: - if not all(c.isalnum() or c in "-_./+*" for c in p): - raise ValueError( - f"Pattern '{p}' contains invalid characters. Only alphanumeric characters, dash (-), " - "underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*) are allowed." - ) + if not _is_valid_pattern(p): + raise InvalidPatternError(p) return [_normalize_pattern(p) for p in parsed_patterns] @@ -125,20 +239,37 @@ def _override_ignore_patterns(ignore_patterns: list[str], include_patterns: list Parameters ---------- - ignore_patterns : List[str] + ignore_patterns : list[str] The list of patterns to potentially remove. - include_patterns : List[str] + include_patterns : list[str] The list of patterns to exclude from ignore_patterns. Returns ------- - List[str] + list[str] A new list of ignore_patterns with specified patterns removed. """ return list(set(ignore_patterns) - set(include_patterns)) def _parse_path(path: str) -> dict[str, Any]: + """ + Parses a file path into a structured query dictionary. + + This function takes a file path and constructs a query dictionary that includes + relevant details such as the absolute path and the slug (a combination of the + directory and file names). + + Parameters + ---------- + path : str + The file path to parse. + + Returns + ------- + dict[str, Any] + A dictionary containing parsed details such as the local file path and slug. 
+ """ query = { "url": None, "local_path": os.path.abspath(path), @@ -149,59 +280,22 @@ def _parse_path(path: str) -> dict[str, Any]: return query -def parse_query( - source: str, - max_file_size: int, - from_web: bool, - include_patterns: list[str] | str | None = None, - ignore_patterns: list[str] | str | None = None, -) -> dict[str, Any]: +def _is_valid_pattern(pattern: str) -> bool: """ - Parses the input source to construct a query dictionary with specified parameters. + Validates if the given pattern contains only valid characters. + + This function checks if the pattern contains only alphanumeric characters or one + of the following allowed characters: dash (`-`), underscore (`_`), dot (`.`), + forward slash (`/`), plus (`+`), or asterisk (`*`). Parameters ---------- - source : str - The source URL or file path to parse. - max_file_size : int - The maximum file size in bytes to include. - from_web : bool - Flag indicating whether the source is a web URL. - include_patterns : Optional[Union[List[str], str]], optional - Patterns to include, by default None. Can be a list of strings or a single string. - ignore_patterns : Optional[Union[List[str], str]], optional - Patterns to ignore, by default None. Can be a list of strings or a single string. + pattern : str + The pattern to validate. Returns ------- - Dict[str, Any] - A dictionary containing the parsed query parameters, including 'max_file_size', - 'ignore_patterns', and 'include_patterns'. + bool + True if the pattern is valid, otherwise False. 
""" - # Determine the parsing method based on the source type - if from_web or source.startswith("https://") or "github.com" in source: - query = _parse_url(source) - else: - query = _parse_path(source) - - # Process ignore patterns - ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy() - if ignore_patterns: - ignore_patterns_list += _parse_patterns(ignore_patterns) - - # Process include patterns and override ignore patterns accordingly - if include_patterns: - parsed_include = _parse_patterns(include_patterns) - ignore_patterns_list = _override_ignore_patterns(ignore_patterns_list, include_patterns=parsed_include) - else: - parsed_include = None - - # Update the query dictionary with max_file_size and processed patterns - query.update( - { - "max_file_size": max_file_size, - "ignore_patterns": ignore_patterns_list, - "include_patterns": parsed_include, - } - ) - return query + return all(c.isalnum() or c in "-_./+*" for c in pattern) diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index 82b8e30..bc95bfc 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -3,16 +3,39 @@ from collections.abc import Awaitable, Callable from typing import ParamSpec, TypeVar +from gitingest.exceptions import AsyncTimeoutError + T = TypeVar("T") P = ParamSpec("P") -class AsyncTimeoutError(Exception): - """Raised when an async operation exceeds its timeout limit.""" +def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: + """ + Async Timeout decorator. + This decorator wraps an asynchronous function and ensures it does not run for + longer than the specified number of seconds. If the function execution exceeds + this limit, it raises an `AsyncTimeoutError`. + + Parameters + ---------- + seconds : int, optional + The maximum allowed time (in seconds) for the asynchronous function to complete. + The default is 10 seconds. 
+ + Returns + ------- + Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]] + A decorator that, when applied to an async function, ensures the function + completes within the specified time limit. If the function takes too long, + an `AsyncTimeoutError` is raised. + + Raises + ------ + AsyncTimeoutError + If the wrapped asynchronous function does not complete within the specified time limit. + """ -def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: - # Async Timeout decorator def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: @functools.wraps(func) async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: diff --git a/src/process_query.py b/src/process_query.py index 470b675..4053e45 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -11,78 +11,6 @@ templates = Jinja2Templates(directory="templates") -def print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: - """ - Print a formatted summary of the query details, including the URL, file size, - and pattern information, for easier debugging or logging. - - Parameters - ---------- - url : str - The URL associated with the query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. 
- """ - print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") - if int(max_file_size / 1024) != 50: - print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") - if pattern_type == "include" and pattern != "": - print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") - elif pattern_type == "exclude" and pattern != "": - print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") - - -def print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: - """ - Print a formatted error message including the URL, file size, pattern details, and the exception encountered, - for debugging or logging purposes. - - Parameters - ---------- - url : str - The URL associated with the query that caused the error. - e : Exception - The exception raised during the query or process. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - """ - print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") - print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.RED}{e}{Colors.END}") - - -def print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: - """ - Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated - tokens, for debugging or logging purposes. - - Parameters - ---------- - url : str - The URL associated with the successful query. - max_file_size : int - The maximum file size allowed for the query, in bytes. - pattern_type : str - Specifies the type of pattern to use, either "include" or "exclude". - pattern : str - The actual pattern string to include or exclude in the query. - summary : str - A summary of the query result, including details like estimated tokens. 
- """ - estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] - print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") - print_query(url, max_file_size, pattern_type, pattern) - print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") - - async def process_query( request: Request, input_text: str, @@ -149,7 +77,7 @@ async def process_query( except Exception as e: # hack to print error message when query is not defined if "query" in locals() and query is not None and isinstance(query, dict): - print_error(query["url"], e, max_file_size, pattern_type, pattern) + _print_error(query["url"], e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") @@ -173,7 +101,7 @@ async def process_query( "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] ) - print_success( + _print_success( url=query["url"], max_file_size=max_file_size, pattern_type=pattern_type, @@ -197,3 +125,75 @@ async def process_query( "pattern": pattern, }, ) + + +def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: + """ + Print a formatted summary of the query details, including the URL, file size, + and pattern information, for easier debugging or logging. + + Parameters + ---------- + url : str + The URL associated with the query. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. 
+ """ + print(f"{Colors.WHITE}{url:<20}{Colors.END}", end="") + if int(max_file_size / 1024) != 50: + print(f" | {Colors.YELLOW}Size: {int(max_file_size/1024)}kb{Colors.END}", end="") + if pattern_type == "include" and pattern != "": + print(f" | {Colors.YELLOW}Include {pattern}{Colors.END}", end="") + elif pattern_type == "exclude" and pattern != "": + print(f" | {Colors.YELLOW}Exclude {pattern}{Colors.END}", end="") + + +def _print_error(url: str, e: Exception, max_file_size: int, pattern_type: str, pattern: str) -> None: + """ + Print a formatted error message including the URL, file size, pattern details, and the exception encountered, + for debugging or logging purposes. + + Parameters + ---------- + url : str + The URL associated with the query that caused the error. + e : Exception + The exception raised during the query or process. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. + """ + print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") + _print_query(url, max_file_size, pattern_type, pattern) + print(f" | {Colors.RED}{e}{Colors.END}") + + +def _print_success(url: str, max_file_size: int, pattern_type: str, pattern: str, summary: str) -> None: + """ + Print a formatted success message, including the URL, file size, pattern details, and a summary with estimated + tokens, for debugging or logging purposes. + + Parameters + ---------- + url : str + The URL associated with the successful query. + max_file_size : int + The maximum file size allowed for the query, in bytes. + pattern_type : str + Specifies the type of pattern to use, either "include" or "exclude". + pattern : str + The actual pattern string to include or exclude in the query. + summary : str + A summary of the query result, including details like estimated tokens. 
+ """ + estimated_tokens = summary[summary.index("Estimated tokens:") + len("Estimated ") :] + print(f"{Colors.GREEN}INFO{Colors.END}: {Colors.GREEN}<- {Colors.END}", end="") + _print_query(url, max_file_size, pattern_type, pattern) + print(f" | {Colors.PURPLE}{estimated_tokens}{Colors.END}") diff --git a/src/routers/download.py b/src/routers/download.py index 2dc1022..fdbf2bb 100644 --- a/src/routers/download.py +++ b/src/routers/download.py @@ -10,6 +10,33 @@ @router.get("/download/{digest_id}") async def download_ingest(digest_id: str) -> Response: + """ + Downloads a .txt file associated with a given digest ID. + + This function searches for a `.txt` file in a directory corresponding to the provided + digest ID. If a file is found, it is read and returned as a downloadable attachment. + If no `.txt` file is found, an error is raised. + + Parameters + ---------- + digest_id : str + The unique identifier for the digest. It is used to find the corresponding directory + and locate the .txt file within that directory. + + Returns + ------- + Response + A FastAPI Response object containing the content of the found `.txt` file. The file is + sent with the appropriate media type (`text/plain`) and the correct `Content-Disposition` + header to prompt a file download. + + Raises + ------ + FileNotFoundError + If no `.txt` file is found in the directory corresponding to the given `digest_id`. + HTTPException + If the digest directory is not found or if no `.txt` file exists in the directory. + """ try: # Find the first .txt file in the directory directory = f"{TMP_BASE_PATH}/{digest_id}" diff --git a/src/routers/dynamic.py b/src/routers/dynamic.py index 12216f1..bfd6d44 100644 --- a/src/routers/dynamic.py +++ b/src/routers/dynamic.py @@ -11,6 +11,25 @@ @router.get("/{full_path:path}") async def catch_all(request: Request, full_path: str) -> HTMLResponse: + """ + Renders a page with a GitHub URL based on the provided path. 
+ + This endpoint catches all GET requests with a dynamic path, constructs a GitHub URL + using the `full_path` parameter, and renders the `github.jinja` template with that URL. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + full_path : str + The full path extracted from the URL, which is used to build the GitHub URL. + + Returns + ------- + HTMLResponse + An HTML response containing the rendered template, with the GitHub URL + and other default parameters such as loading state and file size. + """ return templates.TemplateResponse( "github.jinja", { @@ -31,6 +50,31 @@ async def process_catch_all( pattern_type: str = Form(...), pattern: str = Form(...), ) -> HTMLResponse: + """ + Processes the form submission with user input for query parameters. + + This endpoint handles POST requests, processes the input parameters (e.g., text, file size, pattern), + and calls the `process_query` function to handle the query logic, returning the result as an HTML response. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + input_text : str, optional + The input text provided by the user for processing, by default taken from the form. + max_file_size : int, optional + The maximum allowed file size for the input, specified by the user. + pattern_type : str, optional + The type of pattern used for the query, specified by the user. + pattern : str, optional + The pattern string used in the query, specified by the user. + + Returns + ------- + HTMLResponse + An HTML response generated after processing the form input and query logic, + which will be rendered and returned to the user. 
+ """ return await process_query( request, input_text, diff --git a/src/routers/index.py b/src/routers/index.py index f272880..9665bd0 100644 --- a/src/routers/index.py +++ b/src/routers/index.py @@ -12,6 +12,23 @@ @router.get("/", response_class=HTMLResponse) async def home(request: Request) -> HTMLResponse: + """ + Renders the home page with example repositories and default parameters. + + This endpoint serves the home page of the application, rendering the `index.jinja` template + and providing it with a list of example repositories and default file size values. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + + Returns + ------- + HTMLResponse + An HTML response containing the rendered home page template, with example repositories + and other default parameters such as file size. + """ return templates.TemplateResponse( "index.jinja", { @@ -31,6 +48,32 @@ async def index_post( pattern_type: str = Form(...), pattern: str = Form(...), ) -> HTMLResponse: + """ + Processes the form submission with user input for query parameters. + + This endpoint handles POST requests from the home page form. It processes the user-submitted + input (e.g., text, file size, pattern type) and invokes the `process_query` function to handle + the query logic, returning the result as an HTML response. + + Parameters + ---------- + request : Request + The incoming request object, which provides context for rendering the response. + input_text : str, optional + The input text provided by the user for processing, by default taken from the form. + max_file_size : int, optional + The maximum allowed file size for the input, specified by the user. + pattern_type : str, optional + The type of pattern used for the query, specified by the user. + pattern : str, optional + The pattern string used in the query, specified by the user. 
+ + Returns + ------- + HTMLResponse + An HTML response containing the results of processing the form input and query logic, + which will be rendered and returned to the user. + """ return await process_query( request, input_text, From 2cefa678597c6895c8b657a419c1c5c7fe99d342 Mon Sep 17 00:00:00 2001 From: Romain Courtois Date: Tue, 31 Dec 2024 06:44:46 +0100 Subject: [PATCH 14/18] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7e02c46..70a2192 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # GitIngest +[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) + [![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/cyclotruc/gitingest/blob/main/LICENSE) [![PyPI version](https://badge.fury.io/py/gitingest.svg)](https://badge.fury.io/py/gitingest) [![Downloads](https://pepy.tech/badge/gitingest)](https://pepy.tech/project/gitingest) @@ -7,7 +9,6 @@ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Discord](https://dcbadge.limes.pink/api/server/https://discord.com/invite/zerRaGK9EC)](https://discord.com/invite/zerRaGK9EC) -[![Image](./docs/frontpage.png "GitIngest main page")](https://gitingest.com) Turn any Git repository into a prompt-friendly text ingest for LLMs. 
From d556b0abaf288ee53d5494d28711952cf9def209 Mon Sep 17 00:00:00 2001 From: cyclotruc Date: Tue, 31 Dec 2024 06:30:01 +0000 Subject: [PATCH 15/18] Update README.md --- README.md | 83 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 70a2192..01ab27d 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,6 @@ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Discord](https://dcbadge.limes.pink/api/server/https://discord.com/invite/zerRaGK9EC)](https://discord.com/invite/zerRaGK9EC) - Turn any Git repository into a prompt-friendly text ingest for LLMs. You can also replace `hub` with `ingest` in any github url to access the coresponding digest @@ -63,13 +62,6 @@ summary, tree, content = ingest("https://github.com/cyclotruc/gitingest") By default, this won't write a file but can be enabled with the `output` argument -## 🛠️ Using - -- Tailwind CSS - Frontend -- [FastAPI](https://github.com/fastapi/fastapi) - Backend framework -- [tiktoken](https://github.com/openai/tiktoken) - Token estimation -- [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics - ## 🌐 Self-host 1. Build the image: @@ -85,35 +77,45 @@ By default, this won't write a file but can be enabled with the `output` argumen ``` The application will be available at `http://localhost:8000` -Ensure environment variables are set before running the application or deploying it via Docker. -## ✔️ Contributing +If you are hosting it on a domain, you can specify the allowed hostnames via env variable `ALLOWED_HOSTS`. -Contributions are welcome! + ```bash + #Default: "gitingest.com,*.gitingest.com,localhost, 127.0.0.1". + ALLOWED_HOSTS="example.com, localhost, 127.0.0.1" + ``` -Gitingest aims to be friendly for first time contributors, with a simple python and html codebase.
If you need any help while working with the code, reach out to us on [discord](https://discord.com/invite/zerRaGK9EC) +## 🛠️ Stack -### Ways to contribute +- [Tailwind CSS](https://tailwindcss.com/) - Frontend +- [FastAPI](https://github.com/fastapi/fastapi) - Backend framework +- [Jinja2](https://jinja.palletsprojects.com/) - HTML templating +- [tiktoken](https://github.com/openai/tiktoken) - Token estimation +- [apianalytics.dev](https://www.apianalytics.dev/) - Simple Analytics -1. Provide your feedback and ideas on discord -2. Open an Issue on github to report a bug -3. Create a Pull request - - Fork the repository - - Make your changes and test them locally - - Open a pull request for review and feedback +## ✔️ Contributing to Gitingest -### 🔧 Local dev +Gitingest aims to be friendly for first time contributors, with a simple python and html codebase. + If you need any help while working with the code, reach out to us on [discord](https://discord.com/invite/zerRaGK9EC) -#### Environment Configuration +### Ways to help (non-technical) -- **`ALLOWED_HOSTS`**: Specify allowed hostnames for the application. Default: `"gitingest.com,*.gitingest.com,gitdigest.dev,localhost"`. -You can configure the application using the following environment variables: +- Provide your feedback and ideas on discord +- Open an Issue on github to report a bug / submit a feature request +- Talk about Gitingest on social media -```bash -ALLOWED_HOSTS="gitingest.local,localhost" -``` +### How to submit a PR + +1. Fork the repository & clone it locally +2. Setup the dev environment (see Development section below) +3. Run unit tests with `pytest` +4. Commit your changes and run `pre-commit` +5. Open a pull request on Github for review and feedback +6. (Optional) Invite project maintainer to your branch for easier collaboration + +## 🔧 Development -#### Run locally +### Run web UI locally 1.
Clone the repository @@ -125,7 +127,10 @@ ALLOWED_HOSTS="gitingest.local,localhost" 2. Install dependencies ```bash - pip install -r requirements.txt + pip install -r requirements-dev.txt + python -m venv .venv + source .venv/bin/activate + pre-commit install ``` 3. Run the application: @@ -134,3 +139,25 @@ ALLOWED_HOSTS="gitingest.local,localhost" cd src uvicorn main:app --reload ``` + +4. Run unit tests + + ```bash + pytest + ``` + +The application should be available at `http://localhost:8000` + +### Working on the CLI + +1. Install the package in dev mode + + ```bash + pip install -e . + ``` + +2. Run the CLI + + ```bash + gitingest --help + ``` From 49de436cfd06aa5974eb7601018cd4675470becf Mon Sep 17 00:00:00 2001 From: Joydeep Tripathy <113792434+joydeep049@users.noreply.github.com> Date: Tue, 31 Dec 2024 12:11:46 +0530 Subject: [PATCH 16/18] test: added unit tests for clone.py (#82) --- src/gitingest/tests/test_clone.py | 95 +++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index e3b8128..c124730 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -74,3 +74,98 @@ async def test_check_repo_exists() -> None: # Test failed request mock_process.returncode = 1 assert await _check_repo_exists(url) is False + + +@pytest.mark.asyncio +async def test_clone_repo_invalid_url() -> None: + clone_config = CloneConfig( + url="", + local_path="/tmp/repo", + ) + with pytest.raises(ValueError, match="The 'url' parameter is required."): + await clone_repo(clone_config) + + +@pytest.mark.asyncio +async def test_clone_repo_invalid_local_path() -> None: + clone_config = CloneConfig( + url="https://github.com/user/repo", + local_path="", + ) + with pytest.raises(ValueError, match="The 'local_path' parameter is required."): + await clone_repo(clone_config) + + +@pytest.mark.asyncio +async def test_clone_repo_with_custom_branch() -> None: + 
clone_config = CloneConfig( + url="https://github.com/user/repo", + local_path="/tmp/repo", + branch="feature-branch", + ) + with patch("gitingest.clone._check_repo_exists", return_value=True): + with patch("gitingest.clone._run_git_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) + mock_exec.assert_called_once_with( + "git", + "clone", + "--depth=1", + "--single-branch", + "--branch", + "feature-branch", + clone_config.url, + clone_config.local_path, + ) + + +@pytest.mark.asyncio +async def test_git_command_failure() -> None: + clone_config = CloneConfig( + url="https://github.com/user/repo", + local_path="/tmp/repo", + ) + with patch("gitingest.clone._check_repo_exists", return_value=True): + with patch("gitingest.clone._run_git_command", side_effect=RuntimeError("Git command failed")): + with pytest.raises(RuntimeError, match="Git command failed"): + await clone_repo(clone_config) + + +@pytest.mark.asyncio +async def test_clone_repo_default_shallow_clone() -> None: + clone_config = CloneConfig( + url="https://github.com/user/repo", + local_path="/tmp/repo", + ) + with patch("gitingest.clone._check_repo_exists", return_value=True): + with patch("gitingest.clone._run_git_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) + mock_exec.assert_called_once_with( + "git", "clone", "--depth=1", "--single-branch", clone_config.url, clone_config.local_path + ) + + +@pytest.mark.asyncio +async def test_clone_repo_commit_without_branch() -> None: + clone_config = CloneConfig( + url="https://github.com/user/repo", + local_path="/tmp/repo", + commit="a" * 40, # Simulating a valid commit hash + ) + with patch("gitingest.clone._check_repo_exists", return_value=True): + with patch("gitingest.clone._run_git_command", new_callable=AsyncMock) as mock_exec: + await clone_repo(clone_config) + assert mock_exec.call_count == 2 # Clone and checkout calls + mock_exec.assert_any_call("git", "clone", "--single-branch", 
clone_config.url, clone_config.local_path) + mock_exec.assert_any_call("git", "-C", clone_config.local_path, "checkout", clone_config.commit) + + +@pytest.mark.asyncio +async def test_check_repo_exists_with_redirect() -> None: + url = "https://github.com/user/repo" + with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() + mock_process.communicate.return_value = (b"HTTP/1.1 302 Found\n", b"") + mock_process.returncode = 0 # Simulate successful request + mock_exec.return_value = mock_process + + assert await _check_repo_exists(url) From 4b8896975fb7516085566d3fa30e555f99cb4871 Mon Sep 17 00:00:00 2001 From: Joydeep Tripathy <113792434+joydeep049@users.noreply.github.com> Date: Tue, 31 Dec 2024 12:39:01 +0530 Subject: [PATCH 17/18] ci: disable windows tests (#86) Signed-off-by: joydeep049 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 163c2a8..9fbbf5d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,7 +12,7 @@ jobs: strategy: fail-fast: true matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-latest, macos-latest] python-version: ["3.10", "3.11", "3.12", "3.13"] steps: From 36b04a5f95922e59af1225718df93a79a005b2bc Mon Sep 17 00:00:00 2001 From: Joydeep Tripathy <113792434+joydeep049@users.noreply.github.com> Date: Tue, 31 Dec 2024 13:05:03 +0530 Subject: [PATCH 18/18] test: added unit test for parse_query (#81) --- src/gitingest/tests/test_parse_query.py | 98 ++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index b87856d..8ce3ff0 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -1,7 +1,7 @@ import pytest from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS -from 
gitingest.parse_query import _parse_url, parse_query +from gitingest.parse_query import _parse_patterns, _parse_url, parse_query def test_parse_url_valid() -> None: @@ -44,3 +44,99 @@ def test_parse_query_invalid_pattern() -> None: url = "https://github.com/user/repo" with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") + + +def test_parse_url_with_subpaths() -> None: + url = "https://github.com/user/repo/tree/main/subdir/file" + result = _parse_url(url) + assert result["user_name"] == "user" + assert result["repo_name"] == "repo" + assert result["branch"] == "main" + assert result["subpath"] == "/subdir/file" + + +def test_parse_url_invalid_repo_structure() -> None: + url = "https://github.com/user" + with pytest.raises(ValueError, match="Invalid repository URL"): + _parse_url(url) + + +def test_parse_patterns_valid() -> None: + patterns = "*.py, *.md, docs/*" + result = _parse_patterns(patterns) + assert result == ["*.py", "*.md", "docs/*"] + + +def test_parse_patterns_invalid_characters() -> None: + patterns = "*.py;rm -rf" + with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): + _parse_patterns(patterns) + + +def test_parse_query_with_large_file_size() -> None: + url = "https://github.com/user/repo" + result = parse_query(url, max_file_size=10**9, from_web=True) + assert result["max_file_size"] == 10**9 + assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS + + +def test_parse_query_empty_patterns() -> None: + url = "https://github.com/user/repo" + result = parse_query(url, max_file_size=50, from_web=True, include_patterns="", ignore_patterns="") + assert result["include_patterns"] is None + assert result["ignore_patterns"] == DEFAULT_IGNORE_PATTERNS + + +def test_parse_query_include_and_ignore_overlap() -> None: + url = "https://github.com/user/repo" + result = parse_query( + url, + max_file_size=50, + from_web=True, + 
include_patterns="*.py", + ignore_patterns=["*.py", "*.txt"], + ) + assert result["include_patterns"] == ["*.py"] + assert "*.py" not in result["ignore_patterns"] + assert "*.txt" in result["ignore_patterns"] + + +def test_parse_query_local_path() -> None: + path = "/home/user/project" + result = parse_query(path, max_file_size=100, from_web=False) + assert result["local_path"] == "/home/user/project" + assert result["id"] is not None + assert result["slug"] == "user/project" + + +def test_parse_query_relative_path() -> None: + path = "./project" + result = parse_query(path, max_file_size=100, from_web=False) + assert result["local_path"].endswith("project") + assert result["slug"].endswith("project") + + +def test_parse_query_empty_source() -> None: + with pytest.raises(ValueError, match="Invalid repository URL"): + parse_query("", max_file_size=100, from_web=True) + + +def test_parse_url_branch_and_commit_distinction() -> None: + url_branch = "https://github.com/user/repo/tree/main" + url_commit = "https://github.com/user/repo/tree/abcd1234abcd1234abcd1234abcd1234abcd1234" + + result_branch = _parse_url(url_branch) + result_commit = _parse_url(url_commit) + + assert result_branch["branch"] == "main" + assert result_branch["commit"] is None + + assert result_commit["branch"] is None + assert result_commit["commit"] == "abcd1234abcd1234abcd1234abcd1234abcd1234" + + +def test_parse_query_uuid_uniqueness() -> None: + path = "/home/user/project" + result1 = parse_query(path, max_file_size=100, from_web=False) + result2 = parse_query(path, max_file_size=100, from_web=False) + assert result1["id"] != result2["id"]