diff --git a/src/gitingest/parse_query.py b/src/gitingest/parse_query.py index 669f28f..fcf8186 100644 --- a/src/gitingest/parse_query.py +++ b/src/gitingest/parse_query.py @@ -1,7 +1,8 @@ import os import uuid -from typing import Any, Dict, List, Optional, Union +from urllib.parse import unquote +from typing import Any, Dict, List, Optional, Union from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS TMP_BASE_PATH = "../tmp" @@ -22,6 +23,8 @@ def parse_url(url: str) -> Dict[str, Any]: } url = url.split(" ")[0] + url = unquote(url) # Decode URL-encoded characters + if not url.startswith('https://'): url = 'https://' + url @@ -36,19 +39,34 @@ def parse_url(url: str) -> Dict[str, Any]: parsed["user_name"] = path_parts[0] parsed["repo_name"] = path_parts[1] - # Keep original URL format + # Keep original URL format but with decoded components parsed["url"] = f"https://{domain}/{parsed['user_name']}/{parsed['repo_name']}" parsed['slug'] = f"{parsed['user_name']}-{parsed['repo_name']}" parsed["id"] = str(uuid.uuid4()) parsed["local_path"] = f"{TMP_BASE_PATH}/{parsed['id']}/{parsed['slug']}" if len(path_parts) > 3: - parsed["type"] = path_parts[2] - parsed["branch"] = path_parts[3] - if len(parsed['branch']) == 40 and all(c in '0123456789abcdefABCDEF' for c in parsed['branch']): - parsed["commit"] = parsed['branch'] - parsed["subpath"] = "/" + "/".join(path_parts[4:]) + parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' + + # Find the commit hash or reconstruct the branch name + remaining_parts = path_parts[3:] + if remaining_parts[0] and len(remaining_parts[0]) == 40 and all(c in '0123456789abcdefABCDEF' for c in remaining_parts[0]): + parsed["commit"] = remaining_parts[0] + parsed["subpath"] = "/" + "/".join(remaining_parts[1:]) if len(remaining_parts) > 1 else "/" + else: + # Handle branch names with slashes and special characters + for i, part in enumerate(remaining_parts): + if part in ('tree', 'blob'): + # Found another type indicator, everything before this was the branch name + parsed["branch"] = "/".join(remaining_parts[:i]) + parsed["subpath"] = "/" + "/".join(remaining_parts[i+2:]) if len(remaining_parts) > i+2 else "/" + break + else: + # No additional type indicator found, assume everything is part of the branch name + parsed["branch"] = "/".join(remaining_parts) + parsed["subpath"] = "/" + return parsed