diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..163c2a8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,47 @@ +name: CI + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: true + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.10", "3.11", "3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/*requirements*.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -r requirements-dev.txt + + - name: Run tests + run: | + pytest + + # Run pre-commit only on Python 3.13 + ubuntu. + - name: Run pre-commit hooks + if: ${{ matrix.python-version == '3.13' && matrix.os == 'ubuntu-latest' }} + run: | + pre-commit run --all-files diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..3b2764d --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,41 @@ +name: Publish to PyPI + +on: + release: + types: [created] # Trigger only when a release is created + workflow_dispatch: # Allows manual triggering of the workflow + +jobs: + publish: + runs-on: ubuntu-latest + + steps: + # Step 1: Check out the code + - name: Checkout code + uses: actions/checkout@v4 + + # Step 2: Set up Python + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.13 + + # Step 3: Install dependencies for building and publishing + - name: Install build tools + run: | + pip install --upgrade pip + pip install build twine + + # Step 4: Build the package + - name: Build the package + run: | + python -m build + + # Step 5: Publish to PyPI + - name: Publish to PyPI + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: | + python -m twine check dist/* + python -m twine upload --skip-existing dist/* diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml deleted file mode 100644 index 6759ecd..0000000 --- a/.github/workflows/unitest.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Unit Tests - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - test: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.10", "3.11"] - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest pytest-asyncio - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install -e . - - - name: Run tests - run: | - pytest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b3eabd..68eedb4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,39 +4,39 @@ repos: hooks: # Files - id: check-added-large-files - description: 'Prevent large files from being committed.' - args: ['--maxkb=10000'] + description: "Prevent large files from being committed." + args: ["--maxkb=10000"] - id: check-case-conflict - description: 'Check for files that would conflict in case-insensitive filesystems.' + description: "Check for files that would conflict in case-insensitive filesystems." - id: fix-byte-order-marker - description: 'Remove utf-8 byte order marker.' + description: "Remove utf-8 byte order marker." - id: mixed-line-ending - description: 'Replace mixed line ending.' + description: "Replace mixed line ending." # Links - id: destroyed-symlinks - description: 'Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to.' + description: "Detect symlinks which are changed to regular files with a content of a path which that symlink was pointing to." # File files for parseable syntax: python - id: check-ast # File and line endings - id: end-of-file-fixer - description: 'Ensure that a file is either empty, or ends with one newline.' + description: "Ensure that a file is either empty, or ends with one newline." - id: trailing-whitespace - description: 'Trim trailing whitespace.' + description: "Trim trailing whitespace." # Python - id: check-docstring-first - description: 'Check a common error of defining a docstring after code.' + description: "Check a common error of defining a docstring after code." - id: requirements-txt-fixer - description: 'Sort entries in requirements.txt.' + description: "Sort entries in requirements.txt." - repo: https://github.com/MarcoGorelli/absolufy-imports rev: v0.3.1 hooks: - id: absolufy-imports - description: 'Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)' + description: "Automatically convert relative imports to absolute. (Use `args: [--never]` to revert.)" - repo: https://github.com/psf/black rev: 24.10.0 @@ -47,30 +47,30 @@ repos: rev: v3.19.1 hooks: - id: pyupgrade - description: 'Automatically upgrade syntax for newer versions.' - args: [--py3-plus, --py36-plus, --py38-plus] + description: "Automatically upgrade syntax for newer versions." + args: [--py3-plus, --py36-plus, --py38-plus, --py39-plus, --py310-plus] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 hooks: - id: python-check-blanket-noqa - description: 'Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`.' + description: "Enforce that `noqa` annotations always occur with specific codes. Sample annotations: `# noqa: F401`, `# noqa: F401,W203`." - id: python-check-blanket-type-ignore - description: 'Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`.' + description: "Enforce that `# type: ignore` annotations always occur with specific codes. Sample annotations: `# type: ignore[attr-defined]`, `# type: ignore[attr-defined, name-defined]`." - id: python-use-type-annotations - description: 'Enforce that python3.6+ type annotations are used instead of type comments.' + description: "Enforce that python3.6+ type annotations are used instead of type comments." - repo: https://github.com/PyCQA/isort rev: 5.13.2 hooks: - id: isort - description: 'Sort imports alphabetically, and automatically separated into sections and by type.' + description: "Sort imports alphabetically, and automatically separated into sections and by type." - repo: https://github.com/hadialqattan/pycln rev: v2.4.0 hooks: - id: pycln - description: 'Remove unused import statements.' + description: "Remove unused import statements." - repo: https://github.com/djlint/djLint rev: v1.36.4 diff --git a/pyproject.toml b/pyproject.toml index f7d6c65..f30623c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,4 +14,3 @@ filter_files = true [tool.black] line-length = 119 -skip-string-normalization = true diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..eb733ff --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +-r requirements.txt +black +djlint +pre-commit +pylint +pytest +pytest-asyncio diff --git a/requirements.txt b/requirements.txt index 2688a88..e147ebf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,6 @@ -black click>=8.0.0 -djlint fastapi-analytics fastapi[standard] -pre-commit -pytest -pytest-asyncio python-dotenv slowapi starlette diff --git a/src/gitingest/cli.py b/src/gitingest/cli.py index 9e0e3c4..c5f8a49 100644 --- a/src/gitingest/cli.py +++ b/src/gitingest/cli.py @@ -1,9 +1,7 @@ import os -from typing import Optional, Tuple import click -from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS from gitingest.ingest import ingest from gitingest.ingest_from_query import MAX_FILE_SIZE @@ -17,17 +15,17 @@ def normalize_pattern(pattern: str) -> str: @click.command() -@click.argument('source', type=str, required=True) -@click.option('--output', '-o', default=None, help='Output file path (default: .txt in current directory)') -@click.option('--max-size', '-s', default=MAX_FILE_SIZE, help='Maximum file size to process in bytes') -@click.option('--exclude-pattern', '-e', multiple=True, help='Patterns to exclude') -@click.option('--include-pattern', '-i', multiple=True, help='Patterns to include') +@click.argument("source", type=str, required=True) +@click.option("--output", "-o", default=None, help="Output file path (default: .txt in current directory)") +@click.option("--max-size", "-s", default=MAX_FILE_SIZE, help="Maximum file size to process in bytes") +@click.option("--exclude-pattern", "-e", multiple=True, help="Patterns to exclude") +@click.option("--include-pattern", "-i", multiple=True, help="Patterns to include") def main( source: str, - output: Optional[str], + output: str | None, max_size: int, - exclude_pattern: Tuple[str, ...], - include_pattern: Tuple[str, ...], + exclude_pattern: tuple[str, ...], + include_pattern: tuple[str, ...], ) -> None: """Analyze a directory and create a text dump of its contents.""" try: @@ -48,5 +46,5 @@ def main( raise click.Abort() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index 4a3fda3..df9aba8 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -1,6 +1,5 @@ import asyncio from dataclasses import dataclass -from typing import Optional, Tuple from gitingest.utils import AsyncTimeoutError, async_timeout @@ -11,8 +10,9 @@ class CloneConfig: url: str local_path: str - commit: Optional[str] = None - branch: Optional[str] = None + commit: str | None = None + branch: str | None = None + async def check_repo_exists(url: str) -> bool: @@ -44,7 +44,8 @@ async def check_repo_exists(url: str) -> bool: return "HTTP/1.1 404" not in stdout_str and "HTTP/2 404" not in stdout_str -async def run_git_command(*args: str) -> Tuple[bytes, bytes]: +async def run_git_command(*args: str) -> tuple[bytes, bytes]: + """ Executes a git command asynchronously and captures its output. @@ -77,7 +78,7 @@ async def run_git_command(*args: str) -> Tuple[bytes, bytes]: @async_timeout(CLONE_TIMEOUT) -async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: +async def clone_repo(config: CloneConfig) -> tuple[bytes, bytes]: """ Clones a repository to a local path based on the provided query parameters. @@ -107,8 +108,9 @@ async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: # Extract and validate query parameters url: str = config.url local_path: str = config.local_path - commit: Optional[str] = config.commit - branch: Optional[str] = config.branch + commit: str | None = config.commit + branch: str | None = config.branch + if not url: raise ValueError("The 'url' parameter is required.") @@ -131,7 +133,8 @@ async def clone_repo(config: CloneConfig) -> Tuple[bytes, bytes]: checkout_cmd = ["git", "-C", local_path, "checkout", commit] return await run_git_command(*checkout_cmd) - if branch and branch.lower() not in ('main', 'master'): + if branch and branch.lower() not in ("main", "master"): + # Scenario 2: Clone a specific branch with shallow depth clone_cmd = ["git", "clone", "--depth=1", "--single-branch", "--branch", branch, url, local_path] return await run_git_command(*clone_cmd) diff --git a/src/gitingest/ignore_patterns.py b/src/gitingest/ignore_patterns.py index 8c738b8..f8ab453 100644 --- a/src/gitingest/ignore_patterns.py +++ b/src/gitingest/ignore_patterns.py @@ -1,162 +1,160 @@ -from typing import List - -DEFAULT_IGNORE_PATTERNS: List[str] = [ +DEFAULT_IGNORE_PATTERNS: list[str] = [ # Python - '*.pyc', - '*.pyo', - '*.pyd', - '__pycache__', - '.pytest_cache', - '.coverage', - '.tox', - '.nox', - '.mypy_cache', - '.ruff_cache', - '.hypothesis', - 'poetry.lock', - 'Pipfile.lock', + "*.pyc", + "*.pyo", + "*.pyd", + "__pycache__", + ".pytest_cache", + ".coverage", + ".tox", + ".nox", + ".mypy_cache", + ".ruff_cache", + ".hypothesis", + "poetry.lock", + "Pipfile.lock", # JavaScript/Node - 'node_modules', - 'bower_components', - 'package-lock.json', - 'yarn.lock', - '.npm', - '.yarn', - '.pnpm-store', + "node_modules", + "bower_components", + "package-lock.json", + "yarn.lock", + ".npm", + ".yarn", + ".pnpm-store", # Java - '*.class', - '*.jar', - '*.war', - '*.ear', - '*.nar', - 'target/', - '.gradle/', - 'build/', - '.settings/', - '.project', - '.classpath', - 'gradle-app.setting', - '*.gradle', + "*.class", + "*.jar", + "*.war", + "*.ear", + "*.nar", + "target/", + ".gradle/", + "build/", + ".settings/", + ".project", + ".classpath", + "gradle-app.setting", + "*.gradle", # C/C++ - '*.o', - '*.obj', - '*.so', - '*.dll', - '*.dylib', - '*.exe', - '*.lib', - '*.out', - '*.a', - '*.pdb', + "*.o", + "*.obj", + "*.so", + "*.dll", + "*.dylib", + "*.exe", + "*.lib", + "*.out", + "*.a", + "*.pdb", # Swift/Xcode - '.build/', - '*.xcodeproj/', - '*.xcworkspace/', - '*.pbxuser', - '*.mode1v3', - '*.mode2v3', - '*.perspectivev3', - '*.xcuserstate', - 'xcuserdata/', - '.swiftpm/', + ".build/", + "*.xcodeproj/", + "*.xcworkspace/", + "*.pbxuser", + "*.mode1v3", + "*.mode2v3", + "*.perspectivev3", + "*.xcuserstate", + "xcuserdata/", + ".swiftpm/", # Ruby - '*.gem', - '.bundle/', - 'vendor/bundle', - 'Gemfile.lock', - '.ruby-version', - '.ruby-gemset', - '.rvmrc', + "*.gem", + ".bundle/", + "vendor/bundle", + "Gemfile.lock", + ".ruby-version", + ".ruby-gemset", + ".rvmrc", # Rust - 'target/', - 'Cargo.lock', - '**/*.rs.bk', + "target/", + "Cargo.lock", + "**/*.rs.bk", # Go - 'bin/', - 'pkg/', + "bin/", + "pkg/", # .NET/C# - 'bin/', - 'obj/', - '*.suo', - '*.user', - '*.userosscache', - '*.sln.docstates', - 'packages/', - '*.nupkg', + "bin/", + "obj/", + "*.suo", + "*.user", + "*.userosscache", + "*.sln.docstates", + "packages/", + "*.nupkg", # Version control - '.git', - '.svn', - '.hg', - '.gitignore', - '.gitattributes', - '.gitmodules', + ".git", + ".svn", + ".hg", + ".gitignore", + ".gitattributes", + ".gitmodules", # Images and media - '*.svg', - '*.png', - '*.jpg', - '*.jpeg', - '*.gif', - '*.ico', - '*.pdf', - '*.mov', - '*.mp4', - '*.mp3', - '*.wav', + "*.svg", + "*.png", + "*.jpg", + "*.jpeg", + "*.gif", + "*.ico", + "*.pdf", + "*.mov", + "*.mp4", + "*.mp3", + "*.wav", # Virtual environments - 'venv', - '.venv', - 'env', - '.env', - 'virtualenv', + "venv", + ".venv", + "env", + ".env", + "virtualenv", # IDEs and editors - '.idea', - '.vscode', - '.vs', - '*.swp', - '*.swo', - '*.swn', - '.settings', - '.project', - '.classpath', - '*.sublime-*', + ".idea", + ".vscode", + ".vs", + "*.swp", + "*.swo", + "*.swn", + ".settings", + ".project", + ".classpath", + "*.sublime-*", # Temporary and cache files - '*.log', - '*.bak', - '*.swp', - '*.tmp', - '*.temp', - '.cache', - '.sass-cache', - '.eslintcache', - '.DS_Store', - 'Thumbs.db', - 'desktop.ini', + "*.log", + "*.bak", + "*.swp", + "*.tmp", + "*.temp", + ".cache", + ".sass-cache", + ".eslintcache", + ".DS_Store", + "Thumbs.db", + "desktop.ini", # Build directories and artifacts - 'build', - 'dist', - 'target', - 'out', - '*.egg-info', - '*.egg', - '*.whl', - '*.so', - '*.dylib', - '*.dll', - '*.class', + "build", + "dist", + "target", + "out", + "*.egg-info", + "*.egg", + "*.whl", + "*.so", + "*.dylib", + "*.dll", + "*.class", # Documentation - 'site-packages', - '.docusaurus', - '.next', - '.nuxt', + "site-packages", + ".docusaurus", + ".next", + ".nuxt", # Other common patterns ## Minified files - '*.min.js', - '*.min.css', + "*.min.js", + "*.min.css", ## Source maps - '*.map', + "*.map", ## Terraform - '.terraform', - '*.tfstate*', + ".terraform", + "*.tfstate*", ## Dependencies in various languages - 'vendor/', + "vendor/", ] diff --git a/src/gitingest/ingest.py b/src/gitingest/ingest.py index 4889bc5..e4c673d 100644 --- a/src/gitingest/ingest.py +++ b/src/gitingest/ingest.py @@ -2,7 +2,6 @@ import inspect import shutil from pathlib import Path -from typing import List, Optional, Tuple, Union from gitingest.clone import CloneConfig, clone_repo from gitingest.ingest_from_query import ingest_from_query @@ -12,10 +11,11 @@ def ingest( source: str, max_file_size: int = 10 * 1024 * 1024, # 10 MB - include_patterns: Union[List[str], str, None] = None, - exclude_patterns: Union[List[str], str, None] = None, - output: Optional[str] = None, -) -> Tuple[str, str, str]: + include_patterns: list[str] | str | None = None, + exclude_patterns: list[str] | str | None = None, + output: str | None = None, +) -> tuple[str, str, str]: + try: query = parse_query( source=source, @@ -24,14 +24,14 @@ def ingest( include_patterns=include_patterns, ignore_patterns=exclude_patterns, ) - if query['url']: + if query["url"]: # Extract relevant fields for CloneConfig clone_config = CloneConfig( url=query["url"], - local_path=query['local_path'], - commit=query.get('commit'), - branch=query.get('branch'), + local_path=query["local_path"], + commit=query.get("commit"), + branch=query.get("branch"), ) clone_result = clone_repo(clone_config) @@ -50,7 +50,7 @@ def ingest( finally: # Clean up the temporary directory if it was created - if query['url']: + if query["url"]: # Get parent directory two levels up from local_path (../tmp) - cleanup_path = str(Path(query['local_path']).parents[1]) + cleanup_path = str(Path(query["local_path"]).parents[1]) shutil.rmtree(cleanup_path, ignore_errors=True) diff --git a/src/gitingest/ingest_from_query.py b/src/gitingest/ingest_from_query.py index a9130a3..7dc6d29 100644 --- a/src/gitingest/ingest_from_query.py +++ b/src/gitingest/ingest_from_query.py @@ -1,6 +1,6 @@ import os from fnmatch import fnmatch -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any import tiktoken @@ -10,7 +10,7 @@ MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB -def should_include(path: str, base_path: str, include_patterns: List[str]) -> bool: +def should_include(path: str, base_path: str, include_patterns: list[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) include = False for pattern in include_patterns: @@ -19,10 +19,10 @@ def should_include(path: str, base_path: str, include_patterns: List[str]) -> bo return include -def should_exclude(path: str, base_path: str, ignore_patterns: List[str]) -> bool: +def should_exclude(path: str, base_path: str, ignore_patterns: list[str]) -> bool: rel_path = path.replace(base_path, "").lstrip(os.sep) for pattern in ignore_patterns: - if pattern == '': + if pattern == "": continue if fnmatch(rel_path, pattern): return True @@ -43,7 +43,7 @@ def is_safe_symlink(symlink_path: str, base_path: str) -> bool: def is_text_file(file_path: str) -> bool: """Determines if a file is likely a text file based on its content.""" try: - with open(file_path, 'rb') as file: + with open(file_path, "rb") as file: chunk = file.read(1024) return not bool(chunk.translate(None, bytes([7, 8, 9, 10, 12, 13, 27] + list(range(0x20, 0x100))))) except OSError: @@ -52,7 +52,7 @@ def is_text_file(file_path: str) -> bool: def read_file_content(file_path: str) -> str: try: - with open(file_path, encoding='utf-8', errors='ignore') as f: + with open(file_path, encoding="utf-8", errors="ignore") as f: return f.read() except Exception as e: return f"Error reading file: {str(e)}" @@ -60,11 +60,11 @@ def read_file_content(file_path: str) -> str: def scan_directory( path: str, - query: Dict[str, Any], - seen_paths: Optional[Set[str]] = None, + query: dict[str, Any], + seen_paths: set[str] | None = None, depth: int = 0, - stats: Optional[Dict[str, int]] = None, -) -> Optional[Dict[str, Any]]: + stats: dict[str, int] | None = None, +) -> dict[str, Any] | None: """Recursively analyzes a directory and its contents with safety limits.""" if seen_paths is None: seen_paths = set() @@ -101,9 +101,9 @@ def scan_directory( "ignore_content": False, } - ignore_patterns = query['ignore_patterns'] - base_path = query['local_path'] - include_patterns = query['include_patterns'] + ignore_patterns = query["ignore_patterns"] + base_path = query["local_path"] + include_patterns = query["include_patterns"] try: for item in os.listdir(path): @@ -113,7 +113,7 @@ def scan_directory( continue is_file = os.path.isfile(item_path) - if is_file and query['include_patterns']: + if is_file and query["include_patterns"]: if not should_include(item_path, base_path, include_patterns): result["ignore_content"] = True continue @@ -220,11 +220,11 @@ def scan_directory( def extract_files_content( - query: Dict[str, Any], - node: Dict[str, Any], + query: dict[str, Any], + node: dict[str, Any], max_file_size: int, - files: Optional[List[Dict[str, Any]]] = None, -) -> List[Dict[str, Any]]: + files: list[dict[str, Any]] | None = None, +) -> list[dict[str, Any]]: """Recursively collects all text files with their contents.""" if files is None: files = [] @@ -236,7 +236,7 @@ def extract_files_content( files.append( { - "path": node["path"].replace(query['local_path'], ""), + "path": node["path"].replace(query["local_path"], ""), "content": content, "size": node["size"], }, @@ -248,17 +248,17 @@ def extract_files_content( return files -def create_file_content_string(files: List[Dict[str, Any]]) -> str: +def create_file_content_string(files: list[dict[str, Any]]) -> str: """Creates a formatted string of file contents with separators.""" output = "" separator = "=" * 48 + "\n" # First add README.md if it exists for file in files: - if not file['content']: + if not file["content"]: continue - if file['path'].lower() == '/readme.md': + if file["path"].lower() == "/readme.md": output += separator output += f"File: {file['path']}\n" output += separator @@ -267,7 +267,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str: # Then add all other files in their original order for file in files: - if not file['content'] or file['path'].lower() == '/readme.md': + if not file["content"] or file["path"].lower() == "/readme.md": continue output += separator @@ -278,7 +278,7 @@ def create_file_content_string(files: List[Dict[str, Any]]) -> str: return output -def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str: +def create_summary_string(query: dict[str, Any], nodes: dict[str, Any]) -> str: """Creates a summary string with file counts and content size.""" if "user_name" in query: summary = f"Repository: {query['user_name']}/{query['repo_name']}\n" @@ -287,22 +287,22 @@ def create_summary_string(query: Dict[str, Any], nodes: Dict[str, Any]) -> str: summary += f"Files analyzed: {nodes['file_count']}\n" - if 'subpath' in query and query['subpath'] != '/': + if "subpath" in query and query["subpath"] != "/": summary += f"Subpath: {query['subpath']}\n" - if 'commit' in query and query['commit']: + if "commit" in query and query["commit"]: summary += f"Commit: {query['commit']}\n" - elif 'branch' in query and query['branch'] != 'main' and query['branch'] != 'master' and query['branch']: + elif "branch" in query and query["branch"] != "main" and query["branch"] != "master" and query["branch"]: summary += f"Branch: {query['branch']}\n" return summary -def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: str = "", is_last: bool = True) -> str: +def create_tree_structure(query: dict[str, Any], node: dict[str, Any], prefix: str = "", is_last: bool = True) -> str: """Creates a tree-like string representation of the file structure.""" tree = "" if not node["name"]: - node["name"] = query['slug'] + node["name"] = query["slug"] if node["name"]: current_prefix = "└── " if is_last else "├── " @@ -319,7 +319,7 @@ def create_tree_structure(query: Dict[str, Any], node: Dict[str, Any], prefix: s return tree -def generate_token_string(context_string: str) -> Optional[str]: +def generate_token_string(context_string: str) -> str | None: """Returns the number of tokens in a text string.""" formatted_tokens = "" try: @@ -340,7 +340,7 @@ def generate_token_string(context_string: str) -> Optional[str]: return formatted_tokens -def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: +def ingest_single_file(path: str, query: dict[str, Any]) -> tuple[str, str, str]: if not os.path.isfile(path): raise ValueError(f"Path {path} is not a file") @@ -350,11 +350,11 @@ def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str] raise ValueError(f"File {path} is not a text file") content = read_file_content(path) - if file_size > query['max_file_size']: + if file_size > query["max_file_size"]: content = "[Content ignored: file too large]" file_info = { - "path": path.replace(query['local_path'], ""), + "path": path.replace(query["local_path"], ""), "content": content, "size": file_size, } @@ -376,11 +376,11 @@ def ingest_single_file(path: str, query: Dict[str, Any]) -> Tuple[str, str, str] return summary, tree, files_content -def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: +def ingest_directory(path: str, query: dict[str, Any]) -> tuple[str, str, str]: nodes = scan_directory(path=path, query=query) if not nodes: raise ValueError(f"No files found in {path}") - files = extract_files_content(query=query, node=nodes, max_file_size=query['max_file_size']) + files = extract_files_content(query=query, node=nodes, max_file_size=query["max_file_size"]) summary = create_summary_string(query, nodes) tree = "Directory structure:\n" + create_tree_structure(query, nodes) files_content = create_file_content_string(files) @@ -392,13 +392,13 @@ def ingest_directory(path: str, query: Dict[str, Any]) -> Tuple[str, str, str]: return summary, tree, files_content -def ingest_from_query(query: Dict[str, Any]) -> Tuple[str, str, str]: +def ingest_from_query(query: dict[str, Any]) -> tuple[str, str, str]: """Main entry point for analyzing a codebase directory or single file.""" path = f"{query['local_path']}{query['subpath']}" if not os.path.exists(path): raise ValueError(f"{query['slug']} cannot be found") - if query.get('type') == 'blob': + if query.get("type") == "blob": return ingest_single_file(path, query) return ingest_directory(path, query) diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 5053dfa..73151c2 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,7 +1,7 @@ import os import string import uuid -from typing import Any, Dict, List, Optional, Union +from typing import Any from urllib.parse import unquote from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS @@ -11,15 +11,15 @@ HEX_DIGITS = set(string.hexdigits) -def parse_url(url: str) -> Dict[str, Any]: +def parse_url(url: str) -> dict[str, Any]: url = url.split(" ")[0] url = unquote(url) # Decode URL-encoded characters - if not url.startswith('https://'): - url = 'https://' + url + if not url.startswith("https://"): + url = "https://" + url # Extract domain and path - url_parts = url.split('/') + url_parts = url.split("/") domain = url_parts[2] path_parts = url_parts[3:] @@ -62,8 +62,8 @@ def parse_url(url: str) -> Dict[str, Any]: # Handle branch names with slashes and special characters - # Find the index of the first type indicator ('tree' or 'blob'), if any - type_indicator_index = next((i for i, part in enumerate(remaining_parts) if part in ('tree', 'blob')), None) + # Find the index of the first type indicator ("tree" or "blob"), if any + type_indicator_index = next((i for i, part in enumerate(remaining_parts) if part in ("tree", "blob")), None) if type_indicator_index is None: # No type indicator found; assume the entire input is the branch name @@ -89,7 +89,7 @@ def normalize_pattern(pattern: str) -> str: return pattern -def parse_patterns(pattern: Union[List[str], str]) -> List[str]: +def parse_patterns(pattern: list[str] | str) -> list[str]: patterns = pattern if isinstance(pattern, list) else [pattern] patterns = [p.strip() for p in patterns] @@ -103,7 +103,7 @@ def parse_patterns(pattern: Union[List[str], str]) -> List[str]: return [normalize_pattern(p) for p in patterns] -def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[str]) -> List[str]: +def override_ignore_patterns(ignore_patterns: list[str], include_patterns: list[str]) -> list[str]: """ Removes patterns from ignore_patterns that are present in include_patterns using set difference. @@ -122,7 +122,7 @@ def override_ignore_patterns(ignore_patterns: List[str], include_patterns: List[ return list(set(ignore_patterns) - set(include_patterns)) -def parse_path(path: str) -> Dict[str, Any]: +def parse_path(path: str) -> dict[str, Any]: query = { "url": None, "local_path": os.path.abspath(path), @@ -137,9 +137,9 @@ def parse_query( source: str, max_file_size: int, from_web: bool, - include_patterns: Optional[Union[List[str], str]] = None, - ignore_patterns: Optional[Union[List[str], str]] = None, -) -> Dict[str, Any]: + include_patterns: list[str] | str | None = None, + ignore_patterns: list[str] | str | None = None, +) -> dict[str, Any]: """ Parses the input source to construct a query dictionary with specified parameters. @@ -183,9 +183,9 @@ def parse_query( # Update the query dictionary with max_file_size and processed patterns query.update( { - 'max_file_size': max_file_size, - 'ignore_patterns': ignore_patterns_list, - 'include_patterns': parsed_include, + "max_file_size": max_file_size, + "ignore_patterns": ignore_patterns_list, + "include_patterns": parsed_include, } ) return query diff --git a/src/gitingest/tests/conftest.py b/src/gitingest/tests/conftest.py index 31dba62..ecb7e81 100644 --- a/src/gitingest/tests/conftest.py +++ b/src/gitingest/tests/conftest.py @@ -6,4 +6,4 @@ # Add both the project root and src directory to PYTHONPATH sys.path.insert(0, project_root) -sys.path.insert(0, os.path.join(project_root, 'src')) +sys.path.insert(0, os.path.join(project_root, "src")) diff --git a/src/gitingest/tests/test_clone.py b/src/gitingest/tests/test_clone.py index 5f33b98..585ba6e 100644 --- a/src/gitingest/tests/test_clone.py +++ b/src/gitingest/tests/test_clone.py @@ -8,16 +8,17 @@ @pytest.mark.asyncio async def test_clone_repo_with_commit() -> None: clone_config = CloneConfig( - url='https://github.com/user/repo', - local_path='/tmp/repo', - commit='a' * 40, # Simulating a valid commit hash - branch='main', + url="https://github.com/user/repo", + local_path="/tmp/repo", + commit="a" * 40, # Simulating a valid commit hash + branch="main", ) - with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: - with patch('gitingest.clone.run_git_command', new_callable=AsyncMock) as mock_exec: + with patch("gitingest.clone.check_repo_exists", return_value=True) as mock_check: + with patch("gitingest.clone.run_git_command", new_callable=AsyncMock) as mock_exec: + mock_process = AsyncMock() - mock_process.communicate.return_value = (b'output', b'error') + mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process await clone_repo(clone_config) mock_check.assert_called_once_with(clone_config.url) @@ -26,12 +27,12 @@ async def test_clone_repo_with_commit() -> None: @pytest.mark.asyncio async def test_clone_repo_without_commit() -> None: - query = CloneConfig(url='https://github.com/user/repo', local_path='/tmp/repo', commit=None, branch='main') + query = CloneConfig(url="https://github.com/user/repo", local_path="/tmp/repo", commit=None, branch="main") - with patch('gitingest.clone.check_repo_exists', return_value=True) as mock_check: - with patch('gitingest.clone.run_git_command', new_callable=AsyncMock) as mock_exec: + with patch("gitingest.clone.check_repo_exists", return_value=True) as mock_check: + with patch("gitingest.clone.run_git_command", new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() - mock_process.communicate.return_value = (b'output', b'error') + mock_process.communicate.return_value = (b"output", b"error") mock_exec.return_value = mock_process await clone_repo(query) @@ -42,12 +43,12 @@ async def test_clone_repo_without_commit() -> None: @pytest.mark.asyncio async def test_clone_repo_nonexistent_repository() -> None: clone_config = CloneConfig( - url='https://github.com/user/nonexistent-repo', - local_path='/tmp/repo', + url="https://github.com/user/nonexistent-repo", + local_path="/tmp/repo", commit=None, - branch='main', + branch="main", ) - with patch('gitingest.clone.check_repo_exists', return_value=False) as mock_check: + with patch("gitingest.clone.check_repo_exists", return_value=False) as mock_check: with pytest.raises(ValueError, match="Repository not found"): await clone_repo(clone_config) mock_check.assert_called_once_with(clone_config.url) @@ -57,9 +58,9 @@ async def test_clone_repo_nonexistent_repository() -> None: async def test_check_repo_exists() -> None: url = "https://github.com/user/repo" - with patch('asyncio.create_subprocess_exec', new_callable=AsyncMock) as mock_exec: + with patch("asyncio.create_subprocess_exec", new_callable=AsyncMock) as mock_exec: mock_process = AsyncMock() - mock_process.communicate.return_value = (b'HTTP/1.1 200 OK\n', b'') + mock_process.communicate.return_value = (b"HTTP/1.1 200 OK\n", b"") mock_exec.return_value = mock_process # Test existing repository @@ -67,7 +68,7 @@ async def test_check_repo_exists() -> None: assert await check_repo_exists(url) is True # Test non-existing repository (404 response) - mock_process.communicate.return_value = (b'HTTP/1.1 404 Not Found\n', b'') + mock_process.communicate.return_value = (b"HTTP/1.1 404 Not Found\n", b"") mock_process.returncode = 0 assert await check_repo_exists(url) is False diff --git a/src/gitingest/tests/test_ingest.py b/src/gitingest/tests/test_ingest.py index 33b174b..fa8369a 100644 --- a/src/gitingest/tests/test_ingest.py +++ b/src/gitingest/tests/test_ingest.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import Any, Dict +from typing import Any import pytest @@ -8,19 +8,19 @@ # Test fixtures @pytest.fixture -def sample_query() -> Dict[str, Any]: +def sample_query() -> dict[str, Any]: return { - 'user_name': 'test_user', - 'repo_name': 'test_repo', - 'local_path': '/tmp/test_repo', - 'subpath': '/', - 'branch': 'main', - 'commit': None, - 'max_file_size': 1_000_000, - 'slug': 'test_user/test_repo', - 'ignore_patterns': ['*.pyc', '__pycache__', '.git'], - 'include_patterns': None, - 'pattern_type': 'exclude', + "user_name": "test_user", + "repo_name": "test_repo", + "local_path": "/tmp/test_repo", + "subpath": "/", + "branch": "main", + "commit": None, + "max_file_size": 1_000_000, + "slug": "test_user/test_repo", + "ignore_patterns": ["*.pyc", "__pycache__", ".git"], + "include_patterns": None, + "pattern_type": "exclude", } @@ -73,18 +73,18 @@ def temp_directory(tmp_path: Path) -> Path: return test_dir -def test_scan_directory(temp_directory: Path, sample_query: Dict[str, Any]) -> None: +def test_scan_directory(temp_directory: Path, sample_query: dict[str, Any]) -> None: result = scan_directory(str(temp_directory), query=sample_query) if result is None: assert False, "Result is None" - assert result['type'] == 'directory' - assert result['file_count'] == 8 # All .txt and .py files - assert result['dir_count'] == 4 # src, src/subdir, dir1, dir2 - assert len(result['children']) == 5 # file1.txt, file2.py, src, dir1, dir2 + assert result["type"] == "directory" + assert result["file_count"] == 8 # All .txt and .py files + assert result["dir_count"] == 4 # src, src/subdir, dir1, dir2 + assert len(result["children"]) == 5 # file1.txt, file2.py, src, dir1, dir2 -def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any]) -> None: +def test_extract_files_content(temp_directory: Path, sample_query: dict[str, Any]) -> None: nodes = scan_directory(str(temp_directory), query=sample_query) if nodes is None: assert False, "Nodes is None" @@ -92,14 +92,14 @@ def test_extract_files_content(temp_directory: Path, sample_query: Dict[str, Any assert len(files) == 8 # All .txt and .py files # Check for presence of key files - paths = [f['path'] for f in files] - assert any('file1.txt' in p for p in paths) - assert any('subfile1.txt' in p for p in paths) - assert any('file2.py' in p for p in paths) - assert any('subfile2.py' in p for p in paths) - assert any('file_subdir.txt' in p for p in paths) - assert any('file_dir1.txt' in p for p in paths) - assert any('file_dir2.txt' in p for p in paths) + paths = [f["path"] for f in files] + assert any("file1.txt" in p for p in paths) + assert any("subfile1.txt" in p for p in paths) + assert any("file2.py" in p for p in paths) + assert any("subfile2.py" in p for p in paths) + assert any("file_subdir.txt" in p for p in paths) + assert any("file_dir1.txt" in p for p in paths) + assert any("file_dir2.txt" in p for p in paths) # TODO: test with include patterns: ['*.txt'] diff --git a/src/gitingest/tests/test_parse_query.py b/src/gitingest/tests/test_parse_query.py index 1ab5e44..71ff71e 100644 --- a/src/gitingest/tests/test_parse_query.py +++ b/src/gitingest/tests/test_parse_query.py @@ -26,7 +26,7 @@ def test_parse_url_invalid() -> None: def test_parse_query_basic() -> None: test_cases = ["https://github.com/user/repo", "https://gitlab.com/user/repo"] for url in test_cases: - result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns='*.txt') + result = parse_query(url, max_file_size=50, from_web=True, ignore_patterns="*.txt") assert result["user_name"] == "user" assert result["repo_name"] == "repo" assert result["url"] == url @@ -35,7 +35,7 @@ def test_parse_query_basic() -> None: def test_parse_query_include_pattern() -> None: url = "https://github.com/user/repo" - result = parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py') + result = parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py") assert result["include_patterns"] == ["*.py"] assert set(result["ignore_patterns"]) == set(DEFAULT_IGNORE_PATTERNS) @@ -43,4 +43,4 @@ def test_parse_query_include_pattern() -> None: def test_parse_query_invalid_pattern() -> None: url = "https://github.com/user/repo" with pytest.raises(ValueError, match="Pattern.*contains invalid characters"): - parse_query(url, max_file_size=50, from_web=True, include_patterns='*.py;rm -rf') + parse_query(url, max_file_size=50, from_web=True, include_patterns="*.py;rm -rf") diff --git a/src/gitingest/utils.py b/src/gitingest/utils.py index 2445f14..8406d5c 100644 --- a/src/gitingest/utils.py +++ b/src/gitingest/utils.py @@ -1,7 +1,8 @@ ## Async Timeout decorator import asyncio import functools -from typing import Awaitable, Callable, ParamSpec, TypeVar +from collections.abc import Awaitable, Callable +from typing import ParamSpec, TypeVar T = TypeVar("T") P = ParamSpec("P") diff --git a/src/main.py b/src/main.py index a50a1c5..18de770 100644 --- a/src/main.py +++ b/src/main.py @@ -1,5 +1,4 @@ import os -from typing import Dict from api_analytics.fastapi import Analytics from dotenv import load_dotenv @@ -33,7 +32,7 @@ async def rate_limit_exception_handler(request: Request, exc: Exception) -> Resp app.add_exception_handler(RateLimitExceeded, rate_limit_exception_handler) app.mount("/static", StaticFiles(directory="static"), name="static") -app_analytics_key = os.getenv('API_ANALYTICS_KEY') +app_analytics_key = os.getenv("API_ANALYTICS_KEY") if app_analytics_key: app.add_middleware(Analytics, api_key=app_analytics_key) @@ -52,7 +51,7 @@ async def rate_limit_exception_handler(request: Request, exc: Exception) -> Resp @app.get("/health") -async def health_check() -> Dict[str, str]: +async def health_check() -> dict[str, str]: return {"status": "healthy"} @@ -70,7 +69,7 @@ async def api_docs(request: Request) -> HTMLResponse: @app.get("/robots.txt") async def robots() -> FileResponse: - return FileResponse('static/robots.txt') + return FileResponse("static/robots.txt") app.include_router(index) diff --git a/src/process_query.py b/src/process_query.py index 761fdf2..f55068c 100644 --- a/src/process_query.py +++ b/src/process_query.py @@ -62,9 +62,9 @@ async def process_query( ) clone_config = CloneConfig( url=query["url"], - local_path=query['local_path'], - commit=query.get('commit'), - branch=query.get('branch'), + local_path=query["local_path"], + commit=query.get("commit"), + branch=query.get("branch"), ) await clone_repo(clone_config) summary, tree, content = ingest_from_query(query) @@ -73,8 +73,8 @@ async def process_query( except Exception as e: # hack to print error message when query is not defined - if 'query' in locals() and query is not None and isinstance(query, dict): - print_error(query['url'], e, max_file_size, pattern_type, pattern) + if "query" in locals() and query is not None and isinstance(query, dict): + print_error(query["url"], e, max_file_size, pattern_type, pattern) else: print(f"{Colors.BROWN}WARN{Colors.END}: {Colors.RED}<- {Colors.END}", end="") print(f"{Colors.RED}{e}{Colors.END}") @@ -99,7 +99,7 @@ async def process_query( ) print_success( - url=query['url'], + url=query["url"], max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, @@ -116,7 +116,7 @@ async def process_query( "tree": tree, "content": content, "examples": EXAMPLE_REPOS if is_index else [], - "ingest_id": query['id'], + "ingest_id": query["id"], "default_file_size": slider_position, "pattern_type": pattern_type, "pattern": pattern, diff --git a/src/routers/download.py b/src/routers/download.py index 95cec0f..2dc1022 100644 --- a/src/routers/download.py +++ b/src/routers/download.py @@ -13,7 +13,7 @@ async def download_ingest(digest_id: str) -> Response: try: # Find the first .txt file in the directory directory = f"{TMP_BASE_PATH}/{digest_id}" - txt_files = [f for f in os.listdir(directory) if f.endswith('.txt')] + txt_files = [f for f in os.listdir(directory) if f.endswith(".txt")] if not txt_files: raise FileNotFoundError("No .txt file found")