From 9bdee8f7012cdf42485387eb11c09d741b37435d Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Fri, 10 Jan 2025 13:50:05 +0100 Subject: [PATCH] feat: make parser domain-agnostic to support multiple Git hosts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - added list of known domains/Git hosts in `query_parser.py` - fixed bug from [#115](https://github.com/cyclotruc/gitingest/pull/115): corrected case handling for URL components—scheme, domain, username, and repository are case-insensitive, but paths beyond (e.g., file names, branches) are case-sensitive - implemented `try_domains_for_user_and_repo` in `query_parser.py` to iteratively guess the correct domain until success or supported hosts are exhausted - added helper functions `_get_user_and_repo_from_path`, `_validate_host`, and `_validate_scheme` in `query_parser.py` - extended `_parse_repo_source` in `query_parser.py` to be Git host agnostic by using `try_domains_for_user_and_repo` - added tests `test_parse_url_unsupported_host` and `test_parse_query_with_branch` in `test_query_parser.py` - created new file `test_git_host_agnostic.py` to verify domain/Git host agnostic behavior --- src/gitingest/query_parser.py | 208 ++++++++++++++----- src/main.py | 2 +- tests/query_parser/test_git_host_agnostic.py | 71 +++++++ tests/test_query_parser.py | 22 ++ 4 files changed, 251 insertions(+), 52 deletions(-) create mode 100644 tests/query_parser/test_git_host_agnostic.py diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index bc35698..2981f09 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -11,8 +11,16 @@ from config import TMP_BASE_PATH from gitingest.exceptions import InvalidPatternError from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS +from gitingest.repository_clone import _check_repo_exists -HEX_DIGITS = set(string.hexdigits) +HEX_DIGITS: set[str] = set(string.hexdigits) + +KNOWN_GIT_HOSTS: list[str] = [ + "github.com", + "gitlab.com", + "bitbucket.org", + "gitea.com", +] async def parse_query( @@ -48,16 +56,16 @@ async def parse_query( A dictionary containing the parsed query parameters, including 'max_file_size', 'ignore_patterns', and 'include_patterns'. """ - # Normalize and clean up the source string to make it case-insensitive - source = source.lower().strip() # Determine the parsing method based on the source type - if from_web or source.startswith("https://") or "github.com" in source: - query = _parse_repo_source(source) + if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS): + # We either have a full URL or a domain-less slug + query = await _parse_repo_source(source) else: + # Local path scenario query = _parse_path(source) - # Process ignore patterns + # Combine ignore patterns ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy() if ignore_patterns: ignore_patterns_list += _parse_patterns(ignore_patterns) @@ -69,7 +77,6 @@ async def parse_query( else: parsed_include = None - # Update the query dictionary with max_file_size and processed patterns query.update( { "max_file_size": max_file_size, @@ -80,52 +87,54 @@ async def parse_query( return query -def _parse_repo_source(url: str) -> dict[str, Any]: +async def _parse_repo_source(source: str) -> dict[str, Any]: """ - Parse a GitHub repository URL into a structured query dictionary. + Parse a repository URL into a structured query dictionary. - This function extracts relevant information from a GitHub URL, such as the username, - repository name, commit, branch, and subpath, and returns them in a structured format. + If source is: + - A fully qualified URL (https://gitlab.com/...), parse & verify that domain + - A URL missing 'https://' (gitlab.com/...), add 'https://' and parse + - A 'slug' (like 'pandas-dev/pandas'), attempt known domains until we find one that exists. Parameters ---------- - url : str - The GitHub URL to parse. + source : str + The URL or domain-less slug to parse. Returns ------- dict[str, Any] - A dictionary containing the parsed details of the GitHub repository, including - the username, repository name, commit, branch, and other relevant information. - - Raises - ------ - ValueError - If the URL is invalid or does not correspond to a valid Git repository. + A dictionary containing the parsed details of the repository, including the username, + repository name, commit, branch, and other relevant information. """ - # Clean up the URL - url = url.split(" ")[0] # remove trailing text - url = unquote(url) # decode URL-encoded characters + source = unquote(source) - if not url.startswith(("https://", "http://")): - url = "https://" + url + # Attempt to parse + parsed_url = urlparse(source) - # Parse URL and reconstruct it without query parameters and fragments - parsed_url = urlparse(url) - url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}" + if parsed_url.scheme: + _validate_scheme(parsed_url.scheme) + _validate_host(parsed_url.netloc.lower()) - # Extract domain and path - url_parts = url.split("/") - domain = url_parts[2] - path_parts = url_parts[3:] + else: # Will be of the form 'host/user/repo' or 'user/repo' + tmp_host = source.split("/")[0].lower() + if "." in tmp_host: + _validate_host(tmp_host) + else: + # No scheme, no domain => user typed "user/repo", so we'll guess the domain. + host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source)) + source = f"{host}/{source}" - if len(path_parts) < 2: - raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.") + source = "https://" + source + parsed_url = urlparse(source) + + host = parsed_url.netloc.lower() + user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path) - user_name = path_parts[0] - repo_name = path_parts[1] _id = str(uuid.uuid4()) slug = f"{user_name}-{repo_name}" + local_path = Path(TMP_BASE_PATH) / _id / slug + url = f"https://{host}/{user_name}/{repo_name}" parsed = { "user_name": user_name, @@ -134,31 +143,39 @@ def _parse_repo_source(url: str) -> dict[str, Any]: "branch": None, "commit": None, "subpath": "/", - "local_path": Path(TMP_BASE_PATH) / _id / slug, - "url": f"https://{domain}/{user_name}/{repo_name}", - "slug": slug, + "local_path": local_path, + "url": url, + "slug": slug, # e.g. "pandas-dev-pandas" "id": _id, } - # If this is an issues page or pull requests, return early without processing subpath - if len(path_parts) > 2 and (path_parts[2] == "issues" or path_parts[2] == "pull"): + remaining_parts = parsed_url.path.strip("/").split("/")[2:] + + if not remaining_parts: return parsed + possible_type = remaining_parts.pop(0) # e.g. 'issues', 'pull', 'tree', 'blob' + # If no extra path parts, just return - if len(path_parts) < 4: + if not remaining_parts: + return parsed + + # If this is an issues page or pull requests, return early without processing subpath + if remaining_parts and possible_type in ("issues", "pull"): return parsed - parsed["type"] = path_parts[2] # Usually 'tree' or 'blob' - commit = path_parts[3] + parsed["type"] = possible_type - if _is_valid_git_commit_hash(commit): - parsed["commit"] = commit - if len(path_parts) > 4: - parsed["subpath"] += "/".join(path_parts[4:]) + # Commit or branch + commit_or_branch = remaining_parts.pop(0) + if _is_valid_git_commit_hash(commit_or_branch): + parsed["commit"] = commit_or_branch else: - parsed["branch"] = commit - if len(path_parts) > 4: - parsed["subpath"] += "/".join(path_parts[4:]) + parsed["branch"] = commit_or_branch + + # Subpath if anything left + if remaining_parts: + parsed["subpath"] += "/".join(remaining_parts) return parsed @@ -314,3 +331,92 @@ def _is_valid_pattern(pattern: str) -> bool: True if the pattern is valid, otherwise False. """ return all(c.isalnum() or c in "-_./+*" for c in pattern) + + +async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str: + """ + Attempt to find a valid repository host for the given user_name and repo_name. + + Parameters + ---------- + user_name : str + The username or owner of the repository. + repo_name : str + The name of the repository. + + Returns + ------- + str + The domain of the valid repository host. + + Raises + ------ + ValueError + If no valid repository host is found for the given user_name and repo_name. + """ + for domain in KNOWN_GIT_HOSTS: + candidate = f"https://{domain}/{user_name}/{repo_name}" + if await _check_repo_exists(candidate): + return domain + raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.") + + +def _get_user_and_repo_from_path(path: str) -> tuple[str, str]: + """ + Extract the user and repository names from a given path. + + Parameters + ---------- + path : str + The path to extract the user and repository names from. + + Returns + ------- + tuple[str, str] + A tuple containing the user and repository names. + + Raises + ------ + ValueError + If the path does not contain at least two parts. + """ + path_parts = path.lower().strip("/").split("/") + if len(path_parts) < 2: + raise ValueError(f"Invalid repository URL '{path}'") + return path_parts[0], path_parts[1] + + +def _validate_host(host: str) -> None: + """ + Validate the given host against the known Git hosts. + + Parameters + ---------- + host : str + The host to validate. + + Raises + ------ + ValueError + If the host is not a known Git host. + """ + if host not in KNOWN_GIT_HOSTS: + raise ValueError(f"Unknown domain '{host}' in URL") + + +def _validate_scheme(scheme: str) -> None: + """ + Validate the given scheme against the known schemes. + + Parameters + ---------- + scheme : str + The scheme to validate. + + Raises + ------ + ValueError + If the scheme is not 'http' or 'https'. + """ + if scheme not in ("https", "http"): + raise ValueError(f"Invalid URL scheme '{scheme}' in URL") diff --git a/src/main.py b/src/main.py index 7ba36a8..f2b63fd 100644 --- a/src/main.py +++ b/src/main.py @@ -78,7 +78,7 @@ async def process_folder(folder: Path) -> None: # Extract owner and repository name from the filename if txt_files and "-" in (filename := txt_files[0].stem): owner, repo = filename.split("-", 1) - repo_url = f"https://github.com/{owner}/{repo}" + repo_url = f"{owner}/{repo}" with open("history.txt", mode="a", encoding="utf-8") as history: history.write(f"{repo_url}\n") diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py new file mode 100644 index 0000000..1830811 --- /dev/null +++ b/tests/query_parser/test_git_host_agnostic.py @@ -0,0 +1,71 @@ +""" Tests to verify that the query parser is Git host agnostic. """ + +import pytest + +from gitingest.query_parser import parse_query + + +@pytest.mark.parametrize( + "urls, expected_user, expected_repo, expected_url", + [ + ( + [ + "https://github.com/tiangolo/fastapi", + "github.com/tiangolo/fastapi", + "tiangolo/fastapi", + ], + "tiangolo", + "fastapi", + "https://github.com/tiangolo/fastapi", + ), + ( + [ + "https://gitlab.com/gitlab-org/gitlab-runner", + "gitlab.com/gitlab-org/gitlab-runner", + "gitlab-org/gitlab-runner", + ], + "gitlab-org", + "gitlab-runner", + "https://gitlab.com/gitlab-org/gitlab-runner", + ), + ( + [ + "https://bitbucket.org/na-dna/llm-knowledge-share", + "bitbucket.org/na-dna/llm-knowledge-share", + "na-dna/llm-knowledge-share", + ], + "na-dna", + "llm-knowledge-share", + "https://bitbucket.org/na-dna/llm-knowledge-share", + ), + ( + [ + "https://gitea.com/xorm/xorm", + "gitea.com/xorm/xorm", + "xorm/xorm", + ], + "xorm", + "xorm", + "https://gitea.com/xorm/xorm", + ), + ], +) +@pytest.mark.asyncio +async def test_parse_query_without_host( + urls: list[str], + expected_user: str, + expected_repo: str, + expected_url: str, +) -> None: + for url in urls: + result = await parse_query(url, max_file_size=50, from_web=True) + # Common assertions for all cases + assert result["user_name"] == expected_user + assert result["repo_name"] == expected_repo + assert result["url"] == expected_url + assert result["slug"] == f"{expected_user}-{expected_repo}" + assert result["id"] is not None + assert result["subpath"] == "/" + assert result["branch"] is None + assert result["commit"] is None + assert result["type"] is None diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py index 1fe666b..0db65d3 100644 --- a/tests/test_query_parser.py +++ b/tests/test_query_parser.py @@ -252,3 +252,25 @@ async def test_parse_url_with_query_and_fragment() -> None: assert result["user_name"] == "user" assert result["repo_name"] == "repo" assert result["url"] == "https://github.com/user/repo" # URL should be cleaned + + +async def test_parse_url_unsupported_host() -> None: + url = "https://only-domain.com" + with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"): + await _parse_repo_source(url) + + +async def test_parse_query_with_branch() -> None: + url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" + result = await parse_query(url, max_file_size=10**9, from_web=True) + assert result["user_name"] == "pandas-dev" + assert result["repo_name"] == "pandas" + assert result["url"] == "https://github.com/pandas-dev/pandas" + assert result["slug"] == "pandas-dev-pandas" + assert result["id"] is not None + print('result["subpath"]', result["subpath"]) + print("/.github/ISSUE_TEMPLATE/documentation_improvement.yaml") + assert result["subpath"] == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml" + assert result["branch"] == "2.2.x" + assert result["commit"] is None + assert result["type"] == "blob"