From 61dbc8b327573d40b400c74bec98da336dc1bfa9 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Sun, 29 Dec 2024 10:37:48 +0100 Subject: [PATCH] Refactor project structure, enhance logic, update configurations, and improve code quality Refactoring and Logic Improvements - Refactored the `_scan_directory` function in `src/gitingest/ingest_from_query.py` by extracting loop logic into the new `_process_item` function, and further separating functionality into `_process_symlink` and `_process_file` - Replaced multiple return statements with error raising and catching, introducing custom exceptions (`MaxFilesReachedError`, `MaxFileSizeReachedError`, `AlreadyVisitedError`) in the `_process_item` and `_scan_directory` functions - Enhanced the logic in the `process_query` function in `src/process_query.py` for better flow and maintainability - Improved the logic in `_generate_token_string` in `src/gitingest/ingest_from_query.py` - Refined the `download_ingest` function in `src/routers/download.py` for better clarity and functionality Exception Handling Enhancements - Replaced broad `Exception` handling with specific `OSError` in the `_read_file_content` function in `src/gitingest/ingest_from_query.py` - Refined exception handling throughout the codebase, including removing redundant try-except-raise blocks, e.g., in `clone_repo` function in `src/gitingest/clone.py` - Added custom exceptions to `src/gitingest/exceptions.py`: `MaxFilesReachedError`, `MaxFileSizeReachedError`, and `AlreadyVisitedError` - Included explicit re-raising of exceptions in various functions for improved error propagation Test Suite Refactoring - Cleaned up and reorganized test files: - Moved tests from `src/gitingest/tests/` to `tests/` - Consolidated fixtures from `tests/test_ingest.py` into `tests/conftest.py` - Removed redundant content from `tests/conftest.py` - Migrated configuration from `pytest.ini` to `pyproject.toml`, deleted 
`pytest.ini`, and updated `.dockerignore` Documentation Improvements - Added `darglint` for enforcing `numpy` docstring style in `.pre-commit-config.yaml` for `src/` files - Updated docstrings throughout the codebase, including adding module docstrings where needed - Updated `README.md`: - Added "GitHub stars" badge - Moved the "Discord" badge to its own line - Replaced occurrences of "Gitingest" with "GitIngest" for consistency and clarity Linting and Code Quality - Integrated `pylint` into `.pre-commit-config.yaml` for both `src/` and `tests/` directories - Created `tests/.pylintrc` for linting configuration specific to test files Code Clean-up - Removed the redundant `src/__init__.py` file Naming Conventions and Code Style - Renamed `logSliderToSize` to `log_slider_to_size` in `src/server_utils.py` for consistency with Python's naming conventions - Added explicit encoding specification in multiple instances of `open` throughout the code --- .dockerignore | 1 - .pre-commit-config.yaml | 48 +++ README.md | 2 + pyproject.toml | 63 +++ pytest.ini | 8 - src/config.py | 4 +- src/gitingest/__init__.py | 2 + src/gitingest/cli.py | 6 +- src/gitingest/clone.py | 41 +- src/gitingest/exceptions.py | 26 +- src/gitingest/ignore_patterns.py | 2 + src/gitingest/ingest.py | 15 +- src/gitingest/ingest_from_query.py | 390 ++++++++++++------ src/gitingest/parse_query.py | 2 + src/gitingest/tests/__init__.py | 0 src/gitingest/tests/conftest.py | 9 - src/gitingest/utils.py | 13 +- src/main.py | 7 + src/process_query.py | 65 +-- src/routers/__init__.py | 2 + src/routers/download.py | 34 +- src/routers/dynamic.py | 10 +- src/routers/index.py | 10 +- src/server_utils.py | 4 +- tests/.pylintrc | 10 + {src => tests}/__init__.py | 0 tests/conftest.py | 72 ++++ {src/gitingest/tests => tests}/test_clone.py | 2 + {src/gitingest/tests => tests}/test_ingest.py | 71 +--- .../tests => tests}/test_parse_query.py | 2 + 30 files changed, 613 insertions(+), 308 deletions(-) delete mode 100644 
pytest.ini delete mode 100644 src/gitingest/tests/__init__.py delete mode 100644 src/gitingest/tests/conftest.py create mode 100644 tests/.pylintrc rename {src => tests}/__init__.py (100%) create mode 100644 tests/conftest.py rename {src/gitingest/tests => tests}/test_clone.py (99%) rename {src/gitingest/tests => tests}/test_ingest.py (50%) rename {src/gitingest/tests => tests}/test_parse_query.py (99%) diff --git a/.dockerignore b/.dockerignore index 4720766..c4fef71 100644 --- a/.dockerignore +++ b/.dockerignore @@ -37,5 +37,4 @@ docs/ tests/ *.md LICENSE -pytest.ini setup.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9dcf517..fedf41f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -83,3 +83,51 @@ repos: - id: markdownlint description: "Lint markdown files." args: ["--disable=line-length"] + + - repo: https://github.com/terrencepreilly/darglint + rev: v1.8.1 + hooks: + - id: darglint + name: darglint for source + args: [--docstring-style=numpy] + files: ^src/ + + - repo: https://github.com/pycqa/pylint + rev: v3.3.3 + hooks: + - id: pylint + name: pylint for source + files: ^src/ + additional_dependencies: + [ + click, + fastapi-analytics, + pytest-asyncio, + python-dotenv, + slowapi, + starlette, + tiktoken, + uvicorn, + ] + - id: pylint + name: pylint for tests + files: ^tests/ + args: + - --rcfile=tests/.pylintrc + additional_dependencies: + [ + click, + fastapi-analytics, + pytest, + pytest-asyncio, + python-dotenv, + slowapi, + starlette, + tiktoken, + uvicorn, + ] + + - repo: meta + hooks: + - id: check-hooks-apply + - id: check-useless-excludes diff --git a/README.md b/README.md index 01ab27d..47d3685 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,11 @@ [![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/cyclotruc/gitingest/blob/main/LICENSE) [![PyPI version](https://badge.fury.io/py/gitingest.svg)](https://badge.fury.io/py/gitingest) +[![GitHub 
stars](https://img.shields.io/github/stars/cyclotruc/gitingest?style=social)](https://github.com/cyclotruc/gitingest) [![Downloads](https://pepy.tech/badge/gitingest)](https://pepy.tech/project/gitingest) [![GitHub issues](https://img.shields.io/github/issues/cyclotruc/gitingest)](https://github.com/cyclotruc/gitingest/issues) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) + [![Discord](https://dcbadge.limes.pink/api/server/https://discord.com/invite/zerRaGK9EC)](https://discord.com/invite/zerRaGK9EC) Turn any Git repository into a prompt-friendly text ingest for LLMs. diff --git a/pyproject.toml b/pyproject.toml index f30623c..f056eac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,60 @@ +[project] +name = "gitingest" +version = "0.1.2" +description="CLI tool to analyze and create text dumps of codebases for LLMs" +readme = {file = "README.md", content-type = "text/markdown" } +requires-python = ">= 3.10" +dependencies = [ + "click>=8.0.0", + "fastapi-analytics", + "fastapi[standard]", + "python-dotenv", + "slowapi", + "starlette", + "tiktoken", + "uvicorn", +] +license = {file = "LICENSE"} +authors = [{name = "Romain Courtois", email = "romain@coderamp.io"}] +classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] + +[project.scripts] +gitingest = "gitingest.cli:main" + +[project.urls] +homepage = "https://gitingest.com" +github = "https://github.com/cyclotruc/gitingest" + +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = {find = {where = ["src"]}} +include-package-data = true + +# Linting configuration [tool.pylint.format] max-line-length = 119 
+[tool.pylint.'MESSAGES CONTROL'] +disable = [ + "too-many-arguments", + "too-many-positional-arguments", + "too-many-locals", + "too-few-public-methods", + "broad-exception-caught", + "duplicate-code", +] + [tool.pycln] all = true @@ -14,3 +68,12 @@ filter_files = true [tool.black] line-length = 119 + +# Test configuration +[tool.pytest.ini_options] +pythonpath = ["src"] +testpaths = ["tests/"] +python_files = "test_*.py" +asyncio_mode = "auto" +python_classes = "Test*" +python_functions = "test_*" diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 2a15500..0000000 --- a/pytest.ini +++ /dev/null @@ -1,8 +0,0 @@ -[pytest] -pythonpath = src -testpaths = src/gitingest/tests -asyncio_mode = auto - -python_files = test_*.py -python_classes = Test* -python_functions = test_* diff --git a/src/config.py b/src/config.py index 8da41da..c0c134f 100644 --- a/src/config.py +++ b/src/config.py @@ -1,8 +1,10 @@ +""" Configuration file for the project. """ + MAX_DISPLAY_SIZE: int = 300_000 TMP_BASE_PATH: str = "../tmp" EXAMPLE_REPOS: list[dict[str, str]] = [ - {"name": "Gitingest", "url": "https://github.com/cyclotruc/gitingest"}, + {"name": "GitIngest", "url": "https://github.com/cyclotruc/gitingest"}, {"name": "FastAPI", "url": "https://github.com/tiangolo/fastapi"}, {"name": "Flask", "url": "https://github.com/pallets/flask"}, {"name": "Tldraw", "url": "https://github.com/tldraw/tldraw"}, diff --git a/src/gitingest/__init__.py b/src/gitingest/__init__.py index 212fefc..e1d130b 100644 --- a/src/gitingest/__init__.py +++ b/src/gitingest/__init__.py @@ -1,3 +1,5 @@ +""" gitingest: A package for ingesting data from git repositories. 
""" + from gitingest.clone import clone_repo from gitingest.ingest import ingest from gitingest.ingest_from_query import ingest_from_query diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 57d9f2c..7da0b1f 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,3 +1,7 @@ +""" Command-line interface for the GitIngest package. """ + +# pylint: disable=no-value-for-parameter + import click from gitingest.ingest import ingest @@ -40,7 +44,7 @@ def main( Raises ------ - click.Abort + Abort If there is an error during the execution of the command, this exception is raised to abort the process. """ try: diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index da6550f..01ba387 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,7 +1,8 @@ +""" This module contains functions for cloning a Git repository to a local path. """ + import asyncio from dataclasses import dataclass -from gitingest.exceptions import AsyncTimeoutError from gitingest.utils import async_timeout CLONE_TIMEOUT: int = 20 @@ -59,11 +60,7 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: Raises ------ ValueError - If the repository does not exist or if required query parameters are missing. - RuntimeError - If any git command fails during execution. - AsyncTimeoutError - If the cloning process exceeds the specified timeout. + If the 'url' or 'local_path' parameters are missing, or if the repository is not found. 
""" # Extract and validate query parameters url: str = config.url @@ -81,29 +78,25 @@ async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: if not await _check_repo_exists(url): raise ValueError("Repository not found, make sure it is public") - try: - if commit: - # Scenario 1: Clone and checkout a specific commit - # Clone the repository without depth to ensure full history for checkout - clone_cmd = ["git", "clone", "--single-branch", url, local_path] - await _run_git_command(*clone_cmd) - - # Checkout the specific commit - checkout_cmd = ["git", "-C", local_path, "checkout", commit] - return await _run_git_command(*checkout_cmd) + if commit: + # Scenario 1: Clone and checkout a specific commit + # Clone the repository without depth to ensure full history for checkout + clone_cmd = ["git", "clone", "--single-branch", url, local_path] + await _run_git_command(*clone_cmd) - if branch and branch.lower() not in ("main", "master"): + # Checkout the specific commit + checkout_cmd = ["git", "-C", local_path, "checkout", commit] + return await _run_git_command(*checkout_cmd) - # Scenario 2: Clone a specific branch with shallow depth - clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] - return await _run_git_command(*clone_cmd) + if branch and branch.lower() not in ("main", "master"): - # Scenario 3: Clone the default branch with shallow depth - clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path] + # Scenario 2: Clone a specific branch with shallow depth + clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] return await _run_git_command(*clone_cmd) - except (RuntimeError, asyncio.TimeoutError, AsyncTimeoutError): - raise # Re-raise the exception + # Scenario 3: Clone the default branch with shallow depth + clone_cmd = ["git", "clone", "--depth=1", "--single-branch", url, local_path] + return await _run_git_command(*clone_cmd) async def 
_check_repo_exists(url: str) -> bool: diff --git a/src/gitingest/exceptions.py b/src/gitingest/exceptions.py index 34263e4..b101f2e 100644 --- a/src/gitingest/exceptions.py +++ b/src/gitingest/exceptions.py @@ -1,12 +1,13 @@ +""" Custom exceptions for the GitIngest package. """ + + class InvalidPatternError(ValueError): """ Exception raised when a pattern contains invalid characters. - This exception is used to signal that a pattern provided for some operation contains characters that are not allowed. The valid characters for the pattern include alphanumeric characters, dash (-), underscore (_), dot (.), forward slash (/), plus (+), and asterisk (*). - Parameters ---------- pattern : str @@ -27,3 +28,24 @@ class AsyncTimeoutError(Exception): This exception is used by the `async_timeout` decorator to signal that the wrapped asynchronous function has exceeded the specified time limit for execution. """ + + +class MaxFilesReachedError(Exception): + """Exception raised when the maximum number of files is reached.""" + + def __init__(self, max_files: int) -> None: + super().__init__(f"Maximum number of files ({max_files}) reached.") + + +class MaxFileSizeReachedError(Exception): + """Raised when the maximum file size is reached.""" + + def __init__(self, max_size: int): + super().__init__(f"Maximum file size limit ({max_size/1024/1024:.1f}MB) reached.") + + +class AlreadyVisitedError(Exception): + """Exception raised when a symlink target has already been visited.""" + + def __init__(self, path: str) -> None: + super().__init__(f"Symlink target already visited: {path}") diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py index f8ab453..c2b382c 100644 --- a/src/gitingest/ignore_patterns.py +++ b/src/gitingest/ignore_patterns.py @@ -1,3 +1,5 @@ +""" Default ignore patterns for GitIngest. 
""" + DEFAULT_IGNORE_PATTERNS: list[str] = [ # Python "*.pyc", diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index 4bb329f..1b00c43 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -1,3 +1,5 @@ +""" Main entry point for ingesting a source and processing its contents. """ + import asyncio import inspect import shutil @@ -26,14 +28,15 @@ def ingest( ---------- source : str The source to analyze, which can be a URL (for a GitHub repository) or a local directory path. - max_file_size : int, optional - The maximum allowed file size for file ingestion. Files larger than this size are ignored, by default 10*1024*1024 (10 MB). + max_file_size : int + Maximum allowed file size for file ingestion. Files larger than this size are ignored, by default + 10*1024*1024 (10 MB). include_patterns : list[str] | str | None, optional - A pattern or list of patterns specifying which files to include in the analysis. If `None`, all files are included. + Pattern or list of patterns specifying which files to include. If `None`, all files are included. exclude_patterns : list[str] | str | None, optional - A pattern or list of patterns specifying which files to exclude from the analysis. If `None`, no files are excluded. + Pattern or list of patterns specifying which files to exclude. If `None`, no files are excluded. output : str | None, optional - The file path where the summary and content should be written. If `None`, the results are not written to a file. + File path where the summary and content should be written. If `None`, the results are not written to a file. 
Returns ------- @@ -75,7 +78,7 @@ def ingest( summary, tree, content = ingest_from_query(query) if output is not None: - with open(output, "w") as f: + with open(output, "w", encoding="utf-8") as f: f.write(tree + "\n" + content) return summary, tree, content diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index d8f57b7..14f83cc 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,9 +1,13 @@ +""" Functions to ingest and analyze a codebase directory or single file. """ + import os from fnmatch import fnmatch from typing import Any import tiktoken +from gitingest.exceptions import AlreadyVisitedError, MaxFileSizeReachedError, MaxFilesReachedError + MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB MAX_DIRECTORY_DEPTH = 20 # Maximum depth of directory traversal MAX_FILES = 10_000 # Maximum number of files to process @@ -14,9 +18,8 @@ def _should_include(path: str, base_path: str, include_patterns: list[str]) -> b """ Determines if the given file or directory path matches any of the include patterns. - This function checks whether the relative path of a file or directory matches - any of the specified patterns. If a match is found, it returns `True`, indicating - that the file or directory should be included in further processing. + This function checks whether the relative path of a file or directory matches any of the specified patterns. If a + match is found, it returns `True`, indicating that the file or directory should be included in further processing. Parameters ---------- @@ -145,8 +148,8 @@ def _read_file_content(file_path: str) -> str: try: with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() - except Exception as e: - return f"Error reading file: {str(e)}" + except OSError as e: + return f"Error reading file: {e}" def _scan_directory( @@ -171,7 +174,7 @@ def _scan_directory( A dictionary containing the query parameters, such as include and ignore patterns. 
seen_paths : set[str] | None, optional A set to track already visited paths, by default None. - depth : int, optional + depth : int The current depth of directory traversal, by default 0. stats : dict[str, int] | None, optional A dictionary to track statistics such as total file count and size, by default None. @@ -224,117 +227,255 @@ def _scan_directory( try: for item in os.listdir(path): item_path = os.path.join(path, item) - - if _should_exclude(item_path, base_path, ignore_patterns): - continue - - is_file = os.path.isfile(item_path) - if is_file and query["include_patterns"]: - if not _should_include(item_path, base_path, include_patterns): - result["ignore_content"] = True - continue - - # Handle symlinks - if os.path.islink(item_path): - if not _is_safe_symlink(item_path, base_path): - print(f"Skipping symlink that points outside base directory: {item_path}") - continue - real_path = os.path.realpath(item_path) - if real_path in seen_paths: - print(f"Skipping already visited symlink target: {item_path}") - continue - - if os.path.isfile(real_path): - file_size = os.path.getsize(real_path) - if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: - print(f"Skipping file {item_path}: would exceed total size limit") - continue - - stats["total_files"] += 1 - stats["total_size"] += file_size - - if stats["total_files"] > MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") - return result - - is_text = _is_text_file(real_path) - content = _read_file_content(real_path) if is_text else "[Non-text file]" - - child = { - "name": item, - "type": "file", - "size": file_size, - "content": content, - "path": item_path, - } - result["children"].append(child) - result["size"] += file_size - result["file_count"] += 1 - - elif os.path.isdir(real_path): - subdir = _scan_directory( - path=real_path, - query=query, - seen_paths=seen_paths, - depth=depth + 1, - stats=stats, - ) - if subdir and (not include_patterns or subdir["file_count"] > 0): - subdir["name"] = 
item - subdir["path"] = item_path - result["children"].append(subdir) - result["size"] += subdir["size"] - result["file_count"] += subdir["file_count"] - result["dir_count"] += 1 + subdir["dir_count"] - continue - - if os.path.isfile(item_path): - file_size = os.path.getsize(item_path) - if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: - print(f"Skipping file {item_path}: would exceed total size limit") - continue - - stats["total_files"] += 1 - stats["total_size"] += file_size - - if stats["total_files"] > MAX_FILES: - print(f"Maximum file limit ({MAX_FILES}) reached") - return result - - is_text = _is_text_file(item_path) - content = _read_file_content(item_path) if is_text else "[Non-text file]" - - child = { - "name": item, - "type": "file", - "size": file_size, - "content": content, - "path": item_path, - } - result["children"].append(child) - result["size"] += file_size - result["file_count"] += 1 - - elif os.path.isdir(item_path): - subdir = _scan_directory( - path=item_path, - query=query, - seen_paths=seen_paths, - depth=depth + 1, - stats=stats, - ) - if subdir and (not include_patterns or subdir["file_count"] > 0): - result["children"].append(subdir) - result["size"] += subdir["size"] - result["file_count"] += subdir["file_count"] - result["dir_count"] += 1 + subdir["dir_count"] - + _process_item( + item=item, + item_path=item_path, + query=query, + result=result, + seen_paths=seen_paths, + stats=stats, + depth=depth, + ignore_patterns=ignore_patterns, + base_path=base_path, + include_patterns=include_patterns, + ) + except MaxFilesReachedError: + print(f"Maximum file limit ({MAX_FILES}) reached.") except PermissionError: - print(f"Permission denied: {path}") + print(f"Permission denied: {path}.") return result +def _process_symlink( + item: str, + item_path: str, + query: dict[str, Any], + result: dict[str, Any], + seen_paths: set[str], + stats: dict[str, int], + depth: int, + base_path: str, + include_patterns: list[str], +) -> None: + """ + 
Process a symlink in the file system. + + This function checks if a symlink is safe, resolves its target, and processes it accordingly. + If the symlink is not safe, an exception is raised. + + Parameters + ---------- + item : str + The name of the symlink. + item_path : str + The full path of the symlink. + query : dict[str, Any] + The query dictionary containing the parameters. + result : dict[str, Any] + The dictionary to accumulate the results. + seen_paths : set[str] + A set of already visited paths. + stats : dict[str, int] + The dictionary to track statistics such as file count and size. + depth : int + The current depth in the directory traversal. + base_path : str + The base path used for validation of the symlink. + include_patterns : list[str] + A list of include patterns for file filtering. + + Raises + ------ + AlreadyVisitedError + If the symlink has already been processed. + MaxFileSizeReachedError + If the file size exceeds the maximum limit. + MaxFilesReachedError + If the number of files exceeds the maximum limit. 
+ """ + if not _is_safe_symlink(item_path, base_path): + raise AlreadyVisitedError(item_path) + + real_path = os.path.realpath(item_path) + if real_path in seen_paths: + raise AlreadyVisitedError(item_path) + + if os.path.isfile(real_path): + file_size = os.path.getsize(real_path) + if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: + raise MaxFileSizeReachedError(MAX_TOTAL_SIZE_BYTES) + + stats["total_files"] += 1 + stats["total_size"] += file_size + + if stats["total_files"] > MAX_FILES: + print(f"Maximum file limit ({MAX_FILES}) reached") + raise MaxFilesReachedError(MAX_FILES) + + is_text = _is_text_file(real_path) + + child = { + "name": item, + "type": "file", + "size": file_size, + "content": _read_file_content(real_path) if is_text else "[Non-text file]", + "path": item_path, + } + result["children"].append(child) + result["size"] += file_size + result["file_count"] += 1 + + elif os.path.isdir(real_path): + subdir = _scan_directory( + path=real_path, + query=query, + seen_paths=seen_paths, + depth=depth + 1, + stats=stats, + ) + if subdir and (not include_patterns or subdir["file_count"] > 0): + subdir["name"] = item + subdir["path"] = item_path + result["children"].append(subdir) + result["size"] += subdir["size"] + result["file_count"] += subdir["file_count"] + result["dir_count"] += 1 + subdir["dir_count"] + + +def _process_file(item: str, item_path: str, result: dict[str, Any], stats: dict[str, int]) -> None: + """ + Process a file in the file system. + + This function checks the file's size, increments the statistics, and reads its content. + If the file size exceeds the maximum allowed, it raises an error. + + Parameters + ---------- + item : str + The name of the file. + item_path : str + The full path of the file. + result : dict[str, Any] + The dictionary to accumulate the results. + stats : dict[str, int] + The dictionary to track statistics such as file count and size. 
+ + Raises + ------ + MaxFileSizeReachedError + If the file size exceeds the maximum limit. + MaxFilesReachedError + If the number of files exceeds the maximum limit. + """ + file_size = os.path.getsize(item_path) + if stats["total_size"] + file_size > MAX_TOTAL_SIZE_BYTES: + print(f"Skipping file {item_path}: would exceed total size limit") + raise MaxFileSizeReachedError(MAX_TOTAL_SIZE_BYTES) + + stats["total_files"] += 1 + stats["total_size"] += file_size + + if stats["total_files"] > MAX_FILES: + print(f"Maximum file limit ({MAX_FILES}) reached") + raise MaxFilesReachedError(MAX_FILES) + + is_text = _is_text_file(item_path) + content = _read_file_content(item_path) if is_text else "[Non-text file]" + + child = { + "name": item, + "type": "file", + "size": file_size, + "content": content, + "path": item_path, + } + result["children"].append(child) + result["size"] += file_size + result["file_count"] += 1 + + +def _process_item( + item: str, + item_path: str, + query: dict[str, Any], + result: dict[str, Any], + seen_paths: set[str], + stats: dict[str, int], + depth: int, + ignore_patterns: list[str], + base_path: str, + include_patterns: list[str], +) -> None: + """ + Process a file or directory item within a directory. + + This function handles each file or directory item, checking if it should be included or excluded based on the + provided patterns. It handles symlinks, directories, and files accordingly. + + Parameters + ---------- + item : str + The name of the file or directory to process. + item_path : str + The full path of the file or directory to process. + query : dict[str, Any] + A dictionary of query parameters, including the base path and patterns. + result : dict[str, Any] + The result dictionary to accumulate processed file/directory data. + seen_paths : set[str] + A set of paths that have already been visited. + stats : dict[str, int] + A dictionary of statistics like the total file count and size. 
+ depth : int + The current depth of directory traversal. + ignore_patterns : list[str] + A list of patterns to exclude files or directories. + base_path : str + The base directory used for relative path calculations. + include_patterns : list[str] + A list of patterns to include files or directories. + """ + if _should_exclude(item_path, base_path, ignore_patterns): + return + + if ( + os.path.isfile(item_path) + and query["include_patterns"] + and not _should_include(item_path, base_path, include_patterns) + ): + result["ignore_content"] = True + return + + try: + if os.path.islink(item_path): + _process_symlink( + item=item, + item_path=item_path, + query=query, + result=result, + seen_paths=seen_paths, + stats=stats, + depth=depth, + base_path=base_path, + include_patterns=include_patterns, + ) + + if os.path.isfile(item_path): + _process_file(item=item, item_path=item_path, result=result, stats=stats) + + elif os.path.isdir(item_path): + subdir = _scan_directory(path=item_path, query=query, seen_paths=seen_paths, depth=depth + 1, stats=stats) + if subdir and (not include_patterns or subdir["file_count"] > 0): + result["children"].append(subdir) + result["size"] += subdir["size"] + result["file_count"] += subdir["file_count"] + result["dir_count"] += 1 + subdir["dir_count"] + + except (MaxFileSizeReachedError, AlreadyVisitedError) as e: + print(e) + + def _extract_files_content( query: dict[str, Any], node: dict[str, Any], @@ -440,14 +581,14 @@ def _create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: Parameters ---------- query : dict[str, Any] - A dictionary containing query parameters like repository name, commit, branch, and subpath. + Dictionary containing query parameters like repository name, commit, branch, and subpath. nodes : dict[str, Any] - A dictionary representing the directory structure, including file and directory counts. + Dictionary representing the directory structure, including file and directory counts. 
Returns ------- str - A summary string containing details such as the repository name, file count, and other query-specific information. + Summary string containing details such as repository name, file count, and other query-specific information. """ if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" @@ -479,9 +620,9 @@ def _create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: A dictionary containing query parameters like repository name and subpath. node : dict[str, Any] The current directory or file node being processed. - prefix : str, optional + prefix : str A string used for indentation and formatting of the tree structure, by default "". - is_last : bool, optional + is_last : bool A flag indicating whether the current node is the last in its directory, by default True. Returns @@ -526,23 +667,20 @@ def _generate_token_string(context_string: str) -> str | None: str | None The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. 
""" - formatted_tokens = "" try: encoding = tiktoken.get_encoding("cl100k_base") total_tokens = len(encoding.encode(context_string, disallowed_special=())) - - except Exception as e: + except (ValueError, UnicodeEncodeError) as e: print(e) return None if total_tokens > 1_000_000: - formatted_tokens = f"{total_tokens / 1_000_000:.1f}M" - elif total_tokens > 1_000: - formatted_tokens = f"{total_tokens / 1_000:.1f}k" - else: - formatted_tokens = f"{total_tokens}" + return f"{total_tokens / 1_000_000:.1f}M" + + if total_tokens > 1_000: + return f"{total_tokens / 1_000:.1f}k" - return formatted_tokens + return str(total_tokens) def _ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str]: diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 18a78e9..0ec5308 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,3 +1,5 @@ +""" This module contains functions to parse and validate input sources and patterns. """ + import os import re import string diff --git a/src/gitingest/tests/__init__.py b/src/gitingest/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/gitingest/tests/conftest.py b/src/gitingest/tests/conftest.py deleted file mode 100644 index ecb7e81..0000000 --- a/src/gitingest/tests/conftest.py +++ /dev/null @@ -1,9 +0,0 @@ -import os -import sys - -# Get the absolute path of the project root directory (one level up from tests) -project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -# Add both the project root and src directory to PYTHONPATH -sys.path.insert(0, project_root) -sys.path.insert(0, os.path.join(project_root, "src")) diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index bc95bfc..c93c26a 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -1,3 +1,5 @@ +""" Utility functions for the GitIngest package. 
""" + import asyncio import functools from collections.abc import Awaitable, Callable @@ -19,7 +21,7 @@ def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Ca Parameters ---------- - seconds : int, optional + seconds : int The maximum allowed time (in seconds) for the asynchronous function to complete. The default is 10 seconds. @@ -29,11 +31,6 @@ def async_timeout(seconds: int = 10) -> Callable[[Callable[P, Awaitable[T]]], Ca A decorator that, when applied to an async function, ensures the function completes within the specified time limit. If the function takes too long, an `AsyncTimeoutError` is raised. - - Raises - ------ - AsyncTimeoutError - If the wrapped asynchronous function does not complete within the specified time limit. """ def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: @@ -41,8 +38,8 @@ def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: try: return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds) - except asyncio.TimeoutError: - raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") + except asyncio.TimeoutError as exc: + raise AsyncTimeoutError(f"Operation timed out after {seconds} seconds") from exc return wrapper diff --git a/src/main.py b/src/main.py index d00367b..a5c8852 100644 --- a/src/main.py +++ b/src/main.py @@ -1,3 +1,5 @@ +""" Main module for the FastAPI application. """ + import os from api_analytics.fastapi import Analytics @@ -36,6 +38,11 @@ async def rate_limit_exception_handler(request: Request, exc: Exception) -> Resp ------- Response A response indicating that the rate limit has been exceeded. + + Raises + ------ + exc + If the exception is not a RateLimitExceeded error, it is re-raised. 
""" if isinstance(exc, RateLimitExceeded): # Delegate to the default rate limit handler diff --git a/src/process_query.py b/src/process_query.py index 4053e45..eb87ccc 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -1,3 +1,7 @@ +""" Process a query by parsing input, cloning a repository, and generating a summary. """ + +from functools import partial + from fastapi import Request from fastapi.templating import Jinja2Templates from starlette.templating import _TemplateResponse @@ -6,7 +10,7 @@ from gitingest.clone import CloneConfig, clone_repo from gitingest.ingest_from_query import ingest_from_query from gitingest.parse_query import parse_query -from server_utils import Colors, logSliderToSize +from server_utils import Colors, log_slider_to_size templates = Jinja2Templates(directory="templates") @@ -33,27 +37,44 @@ async def process_query( Input text provided by the user, typically a GitHub repository URL or slug. slider_position : int Position of the slider, representing the maximum file size in the query. - pattern_type : str, optional + pattern_type : str Type of pattern to use, either "include" or "exclude" (default is "exclude"). - pattern : str, optional + pattern : str Pattern to include or exclude in the query, depending on the pattern type. - is_index : bool, optional + is_index : bool Flag indicating whether the request is for the index page (default is False). Returns ------- _TemplateResponse Rendered template response containing the processed results or an error message. - """ - template = "index.jinja" if is_index else "github.jinja" - max_file_size = logSliderToSize(slider_position) + Raises + ------ + ValueError + If an invalid pattern type is provided. 
+ """ if pattern_type == "include": include_patterns = pattern exclude_patterns = None elif pattern_type == "exclude": exclude_patterns = pattern include_patterns = None + else: + raise ValueError(f"Invalid pattern type: {pattern_type}") + + template = "index.jinja" if is_index else "github.jinja" + template_response = partial(templates.TemplateResponse, name=template) + max_file_size = log_slider_to_size(slider_position) + + context = { + "request": request, + "github_url": input_text, + "examples": EXAMPLE_REPOS if is_index else [], + "default_file_size": slider_position, + "pattern_type": pattern_type, + "pattern": pattern, + } try: query = parse_query( @@ -71,9 +92,8 @@ async def process_query( ) await clone_repo(clone_config) summary, tree, content = ingest_from_query(query) - with open(f"{clone_config.local_path}.txt", "w") as f: + with open(f"{clone_config.local_path}.txt", "w", encoding="utf-8") as f: f.write(tree + "\n" + content) - except Exception as e: # hack to print error message when query is not defined if "query" in locals() and query is not None and isinstance(query, dict): @@ -82,18 +102,8 @@ async def process_query( print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") - return templates.TemplateResponse( - template, - { - "request": request, - "github_url": input_text, - "error_message": f"Error: {e}", - "examples": EXAMPLE_REPOS if is_index else [], - "default_file_size": slider_position, - "pattern_type": pattern_type, - "pattern": pattern, - }, - ) + context["error_message"] = f"Error: {e}" + return template_response(context) if len(content) > MAX_DISPLAY_SIZE: content = ( @@ -109,23 +119,18 @@ async def process_query( summary=summary, ) - return templates.TemplateResponse( - template, + context.update( { - "request": request, - "github_url": input_text, "result": True, "summary": summary, "tree": tree, "content": content, - "examples": EXAMPLE_REPOS if is_index else [], 
"ingest_id": query["id"], - "default_file_size": slider_position, - "pattern_type": pattern_type, - "pattern": pattern, - }, + } ) + return template_response(context) + def _print_query(url: str, max_file_size: int, pattern_type: str, pattern: str) -> None: """ diff --git a/src/routers/__init__.py b/src/routers/__init__.py index ace7bd0..d8d2409 100644 --- a/src/routers/__init__.py +++ b/src/routers/__init__.py @@ -1,3 +1,5 @@ +""" This module contains the routers for the FastAPI application. """ + from routers.download import router as download from routers.dynamic import router as dynamic from routers.index import router as index diff --git a/src/routers/download.py b/src/routers/download.py index fdbf2bb..513451c 100644 --- a/src/routers/download.py +++ b/src/routers/download.py @@ -1,3 +1,5 @@ +""" This module contains the FastAPI router for downloading a digest file. """ + import os from fastapi import APIRouter, HTTPException @@ -32,26 +34,30 @@ async def download_ingest(digest_id: str) -> Response: Raises ------ - FileNotFoundError - If no `.txt` file is found in the directory corresponding to the given `digest_id`. HTTPException If the digest directory is not found or if no `.txt` file exists in the directory. 
""" + directory = f"{TMP_BASE_PATH}/{digest_id}" + try: - # Find the first .txt file in the directory - directory = f"{TMP_BASE_PATH}/{digest_id}" - txt_files = [f for f in os.listdir(directory) if f.endswith(".txt")] + if not os.path.exists(directory): + raise FileNotFoundError("Directory not found") + txt_files = [f for f in os.listdir(directory) if f.endswith(".txt")] if not txt_files: raise FileNotFoundError("No .txt file found") - with open(f"{directory}/{txt_files[0]}") as f: - content = f.read() + except FileNotFoundError as exc: + raise HTTPException(status_code=404, detail="Digest not found") from exc + + # Find the first .txt file in the directory + first_file = txt_files[0] + + with open(f"{directory}/{first_file}", encoding="utf-8") as f: + content = f.read() - return Response( - content=content, - media_type="text/plain", - headers={"Content-Disposition": f"attachment; filename={txt_files[0]}"}, - ) - except FileNotFoundError: - raise HTTPException(status_code=404, detail="Digest not found") + return Response( + content=content, + media_type="text/plain", + headers={"Content-Disposition": f"attachment; filename={first_file}"}, + ) diff --git a/src/routers/dynamic.py b/src/routers/dynamic.py index bfd6d44..54d9184 100644 --- a/src/routers/dynamic.py +++ b/src/routers/dynamic.py @@ -1,3 +1,5 @@ +""" This module defines the dynamic router for handling dynamic path requests. """ + from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates @@ -60,13 +62,13 @@ async def process_catch_all( ---------- request : Request The incoming request object, which provides context for rendering the response. - input_text : str, optional + input_text : str The input text provided by the user for processing, by default taken from the form. - max_file_size : int, optional + max_file_size : int The maximum allowed file size for the input, specified by the user. 
- pattern_type : str, optional + pattern_type : str The type of pattern used for the query, specified by the user. - pattern : str, optional + pattern : str The pattern string used in the query, specified by the user. Returns diff --git a/src/routers/index.py b/src/routers/index.py index 9665bd0..a0b8235 100644 --- a/src/routers/index.py +++ b/src/routers/index.py @@ -1,3 +1,5 @@ +""" This module defines the FastAPI router for the home page of the application. """ + from fastapi import APIRouter, Form, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates @@ -59,13 +61,13 @@ async def index_post( ---------- request : Request The incoming request object, which provides context for rendering the response. - input_text : str, optional + input_text : str The input text provided by the user for processing, by default taken from the form. - max_file_size : int, optional + max_file_size : int The maximum allowed file size for the input, specified by the user. - pattern_type : str, optional + pattern_type : str The type of pattern used for the query, specified by the user. - pattern : str, optional + pattern : str The pattern string used in the query, specified by the user. Returns diff --git a/src/server_utils.py b/src/server_utils.py index 7af4b85..432bbb2 100644 --- a/src/server_utils.py +++ b/src/server_utils.py @@ -1,3 +1,5 @@ +""" Utility functions for the server. """ + import math from slowapi import Limiter @@ -7,7 +9,7 @@ limiter = Limiter(key_func=get_remote_address) -def logSliderToSize(position: int) -> int: +def log_slider_to_size(position: int) -> int: """ Convert a slider position to a file size in bytes using a logarithmic scale. 
diff --git a/tests/.pylintrc b/tests/.pylintrc new file mode 100644 index 0000000..32f19bc --- /dev/null +++ b/tests/.pylintrc @@ -0,0 +1,10 @@ +[MASTER] +init-hook= + import sys + sys.path.append('./src') + +[MESSAGES CONTROL] +disable=missing-class-docstring,missing-function-docstring,protected-access,fixme + +[FORMAT] +max-line-length=119 diff --git a/src/__init__.py b/tests/__init__.py similarity index 100% rename from src/__init__.py rename to tests/__init__.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..5779127 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,72 @@ +""" This module contains fixtures for the tests. """ + +from pathlib import Path +from typing import Any + +import pytest + + +@pytest.fixture +def sample_query() -> dict[str, Any]: + return { + "user_name": "test_user", + "repo_name": "test_repo", + "local_path": "/tmp/test_repo", + "subpath": "/", + "branch": "main", + "commit": None, + "max_file_size": 1_000_000, + "slug": "test_user/test_repo", + "ignore_patterns": ["*.pyc", "__pycache__", ".git"], + "include_patterns": None, + "pattern_type": "exclude", + } + + +@pytest.fixture +def temp_directory(tmp_path: Path) -> Path: + # Creates the following structure: + # test_repo/ + # ├── file1.txt + # ├── file2.py + # └── src/ + # | ├── subfile1.txt + # | └── subfile2.py + # | └── subdir/ + # | └── file_subdir.txt + # | └── file_subdir.py + # └── dir1/ + # | └── file_dir1.txt + # └── dir2/ + # └── file_dir2.txt + + test_dir = tmp_path / "test_repo" + test_dir.mkdir() + + # Root files + (test_dir / "file1.txt").write_text("Hello World") + (test_dir / "file2.py").write_text("print('Hello')") + + # src directory and its files + src_dir = test_dir / "src" + src_dir.mkdir() + (src_dir / "subfile1.txt").write_text("Hello from src") + (src_dir / "subfile2.py").write_text("print('Hello from src')") + + # src/subdir and its files + subdir = src_dir / "subdir" + subdir.mkdir() + (subdir / 
"file_subdir.txt").write_text("Hello from subdir") + (subdir / "file_subdir.py").write_text("print('Hello from subdir')") + + # dir1 and its file + dir1 = test_dir / "dir1" + dir1.mkdir() + (dir1 / "file_dir1.txt").write_text("Hello from dir1") + + # dir2 and its file + dir2 = test_dir / "dir2" + dir2.mkdir() + (dir2 / "file_dir2.txt").write_text("Hello from dir2") + + return test_dir diff --git a/src/gitingest/tests/test_clone.py b/tests/test_clone.py similarity index 99% rename from src/gitingest/tests/test_clone.py rename to tests/test_clone.py index c124730..2e58a0f 100644 --- a/src/gitingest/tests/test_clone.py +++ b/tests/test_clone.py @@ -1,3 +1,5 @@ +""" Tests for the clone module. """ + from unittest.mock import AsyncMock, patch import pytest diff --git a/src/gitingest/tests/test_ingest.py b/tests/test_ingest.py similarity index 50% rename from src/gitingest/tests/test_ingest.py rename to tests/test_ingest.py index 53257a1..b0a36d7 100644 --- a/src/gitingest/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -1,78 +1,11 @@ +""" Tests for the ingest_from_query module """ + from pathlib import Path from typing import Any -import pytest - from gitingest.ingest_from_query import _extract_files_content, _scan_directory -# Test fixtures -@pytest.fixture -def sample_query() -> dict[str, Any]: - return { - "user_name": "test_user", - "repo_name": "test_repo", - "local_path": "/tmp/test_repo", - "subpath": "/", - "branch": "main", - "commit": None, - "max_file_size": 1_000_000, - "slug": "test_user/test_repo", - "ignore_patterns": ["*.pyc", "__pycache__", ".git"], - "include_patterns": None, - "pattern_type": "exclude", - } - - -@pytest.fixture -def temp_directory(tmp_path: Path) -> Path: - # Creates the following structure: - # test_repo/ - # ├── file1.txt - # ├── file2.py - # └── src/ - # | ├── subfile1.txt - # | └── subfile2.py - # | └── subdir/ - # | └── file_subdir.txt - # | └── file_subdir.py - # └── dir1/ - # | └── file_dir1.txt - # └── dir2/ - # └── 
file_dir2.txt - - test_dir = tmp_path / "test_repo" - test_dir.mkdir() - - # Root files - (test_dir / "file1.txt").write_text("Hello World") - (test_dir / "file2.py").write_text("print('Hello')") - - # src directory and its files - src_dir = test_dir / "src" - src_dir.mkdir() - (src_dir / "subfile1.txt").write_text("Hello from src") - (src_dir / "subfile2.py").write_text("print('Hello from src')") - - # src/subdir and its files - subdir = src_dir / "subdir" - subdir.mkdir() - (subdir / "file_subdir.txt").write_text("Hello from subdir") - (subdir / "file_subdir.py").write_text("print('Hello from subdir')") - - # dir1 and its file - dir1 = test_dir / "dir1" - dir1.mkdir() - (dir1 / "file_dir1.txt").write_text("Hello from dir1") - - # dir2 and its file - dir2 = test_dir / "dir2" - dir2.mkdir() - (dir2 / "file_dir2.txt").write_text("Hello from dir2") - - return test_dir - - def test_scan_directory(temp_directory: Path, sample_query: dict[str, Any]) -> None: result = _scan_directory(str(temp_directory), query=sample_query) if result is None: diff --git a/src/gitingest/tests/test_parse_query.py b/tests/test_parse_query.py similarity index 99% rename from src/gitingest/tests/test_parse_query.py rename to tests/test_parse_query.py index 8ce3ff0..0a162a4 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/tests/test_parse_query.py @@ -1,3 +1,5 @@ +""" Tests for the parse_query module. """ + import pytest from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS