Skip to content

Commit

Permalink
feat: make parser domain-agnostic to support multiple Git hosts
Browse files Browse the repository at this point in the history
- added list of known domains/Git hosts in `query_parser.py`
- fixed bug from [#115](#115): corrected case handling for URL components—scheme, domain, username, and repository are case-insensitive, but paths beyond (e.g., file names, branches) are case-sensitive
- implemented `try_domains_for_user_and_repo` in `query_parser.py` to iteratively guess the correct domain until success or supported hosts are exhausted
- added helper functions `_get_user_and_repo_from_path`, `_validate_host`, and `_validate_scheme` in `query_parser.py`
- extended `_parse_repo_source` in `query_parser.py` to be Git host agnostic by using `try_domains_for_user_and_repo`
- added tests `test_parse_url_unsupported_host` and `test_parse_query_with_branch` in `test_query_parser.py`
- created new file `test_git_host_agnostic.py` to verify domain/Git host agnostic behavior
  • Loading branch information
filipchristiansen committed Jan 10, 2025
1 parent a57f614 commit 9bdee8f
Show file tree
Hide file tree
Showing 4 changed files with 251 additions and 52 deletions.
208 changes: 157 additions & 51 deletions src/gitingest/query_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,16 @@
from config import TMP_BASE_PATH
from gitingest.exceptions import InvalidPatternError
from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
from gitingest.repository_clone import _check_repo_exists

HEX_DIGITS = set(string.hexdigits)
HEX_DIGITS: set[str] = set(string.hexdigits)

# Hosts accepted by `_validate_host`. `try_domains_for_user_and_repo` iterates
# this list in order and returns the first host on which the repository exists,
# so earlier entries win when a domain-less "user/repo" slug exists on several hosts.
KNOWN_GIT_HOSTS: list[str] = [
"github.com",
"gitlab.com",
"bitbucket.org",
"gitea.com",
]


async def parse_query(
Expand Down Expand Up @@ -48,16 +56,16 @@ async def parse_query(
A dictionary containing the parsed query parameters, including 'max_file_size',
'ignore_patterns', and 'include_patterns'.
"""
# Normalize and clean up the source string to make it case-insensitive
source = source.lower().strip()

# Determine the parsing method based on the source type
if from_web or source.startswith("https://") or "github.com" in source:
query = _parse_repo_source(source)
if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
# We either have a full URL or a domain-less slug
query = await _parse_repo_source(source)
else:
# Local path scenario
query = _parse_path(source)

# Process ignore patterns
# Combine ignore patterns
ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy()
if ignore_patterns:
ignore_patterns_list += _parse_patterns(ignore_patterns)
Expand All @@ -69,7 +77,6 @@ async def parse_query(
else:
parsed_include = None

# Update the query dictionary with max_file_size and processed patterns
query.update(
{
"max_file_size": max_file_size,
Expand All @@ -80,52 +87,54 @@ async def parse_query(
return query


def _parse_repo_source(url: str) -> dict[str, Any]:
async def _parse_repo_source(source: str) -> dict[str, Any]:
"""
Parse a GitHub repository URL into a structured query dictionary.
Parse a repository URL into a structured query dictionary.
This function extracts relevant information from a GitHub URL, such as the username,
repository name, commit, branch, and subpath, and returns them in a structured format.
If source is:
- A fully qualified URL (https://gitlab.com/...), parse & verify that domain
- A URL missing 'https://' (gitlab.com/...), add 'https://' and parse
- A 'slug' (like 'pandas-dev/pandas'), attempt known domains until we find one that exists.
Parameters
----------
url : str
The GitHub URL to parse.
source : str
The URL or domain-less slug to parse.
Returns
-------
dict[str, Any]
A dictionary containing the parsed details of the GitHub repository, including
the username, repository name, commit, branch, and other relevant information.
Raises
------
ValueError
If the URL is invalid or does not correspond to a valid Git repository.
A dictionary containing the parsed details of the repository, including the username,
repository name, commit, branch, and other relevant information.
"""
# Clean up the URL
url = url.split(" ")[0] # remove trailing text
url = unquote(url) # decode URL-encoded characters
source = unquote(source)

if not url.startswith(("https://", "http://")):
url = "https://" + url
# Attempt to parse
parsed_url = urlparse(source)

# Parse URL and reconstruct it without query parameters and fragments
parsed_url = urlparse(url)
url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
if parsed_url.scheme:
_validate_scheme(parsed_url.scheme)
_validate_host(parsed_url.netloc.lower())

# Extract domain and path
url_parts = url.split("/")
domain = url_parts[2]
path_parts = url_parts[3:]
else: # Will be of the form 'host/user/repo' or 'user/repo'
tmp_host = source.split("/")[0].lower()
if "." in tmp_host:
_validate_host(tmp_host)
else:
# No scheme, no domain => user typed "user/repo", so we'll guess the domain.
host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source))
source = f"{host}/{source}"

if len(path_parts) < 2:
raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.")
source = "https://" + source
parsed_url = urlparse(source)

host = parsed_url.netloc.lower()
user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path)

user_name = path_parts[0]
repo_name = path_parts[1]
_id = str(uuid.uuid4())
slug = f"{user_name}-{repo_name}"
local_path = Path(TMP_BASE_PATH) / _id / slug
url = f"https://{host}/{user_name}/{repo_name}"

parsed = {
"user_name": user_name,
Expand All @@ -134,31 +143,39 @@ def _parse_repo_source(url: str) -> dict[str, Any]:
"branch": None,
"commit": None,
"subpath": "/",
"local_path": Path(TMP_BASE_PATH) / _id / slug,
"url": f"https://{domain}/{user_name}/{repo_name}",
"slug": slug,
"local_path": local_path,
"url": url,
"slug": slug, # e.g. "pandas-dev-pandas"
"id": _id,
}

# If this is an issues page or pull requests, return early without processing subpath
if len(path_parts) > 2 and (path_parts[2] == "issues" or path_parts[2] == "pull"):
remaining_parts = parsed_url.path.strip("/").split("/")[2:]

if not remaining_parts:
return parsed

possible_type = remaining_parts.pop(0) # e.g. 'issues', 'pull', 'tree', 'blob'

# If no extra path parts, just return
if len(path_parts) < 4:
if not remaining_parts:
return parsed

# If this is an issues page or pull requests, return early without processing subpath
if remaining_parts and possible_type in ("issues", "pull"):
return parsed

parsed["type"] = path_parts[2] # Usually 'tree' or 'blob'
commit = path_parts[3]
parsed["type"] = possible_type

if _is_valid_git_commit_hash(commit):
parsed["commit"] = commit
if len(path_parts) > 4:
parsed["subpath"] += "/".join(path_parts[4:])
# Commit or branch
commit_or_branch = remaining_parts.pop(0)
if _is_valid_git_commit_hash(commit_or_branch):
parsed["commit"] = commit_or_branch
else:
parsed["branch"] = commit
if len(path_parts) > 4:
parsed["subpath"] += "/".join(path_parts[4:])
parsed["branch"] = commit_or_branch

# Subpath if anything left
if remaining_parts:
parsed["subpath"] += "/".join(remaining_parts)

return parsed

Expand Down Expand Up @@ -314,3 +331,92 @@ def _is_valid_pattern(pattern: str) -> bool:
True if the pattern is valid, otherwise False.
"""
return all(c.isalnum() or c in "-_./+*" for c in pattern)


async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
    """
    Find the first known Git host that serves the repository ``user_name/repo_name``.

    Each entry of ``KNOWN_GIT_HOSTS`` is probed in list order by building an
    ``https://`` URL and awaiting ``_check_repo_exists`` on it.

    Parameters
    ----------
    user_name : str
        The username or owner of the repository.
    repo_name : str
        The name of the repository.

    Returns
    -------
    str
        The domain of the first host for which the existence check succeeds.

    Raises
    ------
    ValueError
        If the existence check fails for every known host.
    """
    for host in KNOWN_GIT_HOSTS:
        repo_exists = await _check_repo_exists(f"https://{host}/{user_name}/{repo_name}")
        if not repo_exists:
            continue
        return host

    # Every supported host was tried without success.
    raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")


def _get_user_and_repo_from_path(path: str) -> tuple[str, str]:
"""
Extract the user and repository names from a given path.
Parameters
----------
path : str
The path to extract the user and repository names from.
Returns
-------
tuple[str, str]
A tuple containing the user and repository names.
Raises
------
ValueError
If the path does not contain at least two parts.
"""
path_parts = path.lower().strip("/").split("/")
if len(path_parts) < 2:
raise ValueError(f"Invalid repository URL '{path}'")
return path_parts[0], path_parts[1]


def _validate_host(host: str) -> None:
    """
    Ensure that ``host`` is one of the entries in ``KNOWN_GIT_HOSTS``.

    The comparison is an exact membership test, so callers are expected to
    pass the host already normalised (the callers in this module lower-case it).

    Parameters
    ----------
    host : str
        The host to validate.

    Raises
    ------
    ValueError
        If the host is not a known Git host.
    """
    if host in KNOWN_GIT_HOSTS:
        return
    raise ValueError(f"Unknown domain '{host}' in URL")


def _validate_scheme(scheme: str) -> None:
"""
Validate the given scheme against the known schemes.
Parameters
----------
scheme : str
The scheme to validate.
Raises
------
ValueError
If the scheme is not 'http' or 'https'.
"""
if scheme not in ("https", "http"):
raise ValueError(f"Invalid URL scheme '{scheme}' in URL")
2 changes: 1 addition & 1 deletion src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ async def process_folder(folder: Path) -> None:
# Extract owner and repository name from the filename
if txt_files and "-" in (filename := txt_files[0].stem):
owner, repo = filename.split("-", 1)
repo_url = f"https://github.com/{owner}/{repo}"
repo_url = f"{owner}/{repo}"
with open("history.txt", mode="a", encoding="utf-8") as history:
history.write(f"{repo_url}\n")

Expand Down
71 changes: 71 additions & 0 deletions tests/query_parser/test_git_host_agnostic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
""" Tests to verify that the query parser is Git host agnostic. """

import pytest

from gitingest.query_parser import parse_query


@pytest.mark.parametrize(
    "urls, expected_user, expected_repo, expected_url",
    [
        (
            [
                "https://github.com/tiangolo/fastapi",
                "github.com/tiangolo/fastapi",
                "tiangolo/fastapi",
            ],
            "tiangolo",
            "fastapi",
            "https://github.com/tiangolo/fastapi",
        ),
        (
            [
                "https://gitlab.com/gitlab-org/gitlab-runner",
                "gitlab.com/gitlab-org/gitlab-runner",
                "gitlab-org/gitlab-runner",
            ],
            "gitlab-org",
            "gitlab-runner",
            "https://gitlab.com/gitlab-org/gitlab-runner",
        ),
        (
            [
                "https://bitbucket.org/na-dna/llm-knowledge-share",
                "bitbucket.org/na-dna/llm-knowledge-share",
                "na-dna/llm-knowledge-share",
            ],
            "na-dna",
            "llm-knowledge-share",
            "https://bitbucket.org/na-dna/llm-knowledge-share",
        ),
        (
            [
                "https://gitea.com/xorm/xorm",
                "gitea.com/xorm/xorm",
                "xorm/xorm",
            ],
            "xorm",
            "xorm",
            "https://gitea.com/xorm/xorm",
        ),
    ],
)
@pytest.mark.asyncio
async def test_parse_query_without_host(
    urls: list[str],
    expected_user: str,
    expected_repo: str,
    expected_url: str,
) -> None:
    """
    Check that a full URL, a scheme-less URL and a bare 'user/repo' slug all
    parse to the same user, repository and canonical URL for each Git host.
    """
    expected_slug = f"{expected_user}-{expected_repo}"

    for source in urls:
        parsed = await parse_query(source, max_file_size=50, from_web=True)

        # Identity of the repository must not depend on how the source was written
        assert parsed["user_name"] == expected_user
        assert parsed["repo_name"] == expected_repo
        assert parsed["url"] == expected_url
        assert parsed["slug"] == expected_slug
        assert parsed["id"] is not None

        # No branch, commit or subpath information is present in these sources
        assert parsed["subpath"] == "/"
        assert parsed["branch"] is None
        assert parsed["commit"] is None
        assert parsed["type"] is None
22 changes: 22 additions & 0 deletions tests/test_query_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,3 +252,25 @@ async def test_parse_url_with_query_and_fragment() -> None:
assert result["user_name"] == "user"
assert result["repo_name"] == "repo"
assert result["url"] == "https://github.com/user/repo" # URL should be cleaned


async def test_parse_url_unsupported_host() -> None:
    """
    Check that a URL whose domain is not a known Git host is rejected.
    """
    unsupported_url = "https://only-domain.com"

    with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"):
        await _parse_repo_source(unsupported_url)


async def test_parse_query_with_branch() -> None:
    """
    Check that a 'blob' URL yields the branch name, the file subpath and the
    'blob' type, with the subpath keeping its original letter case.
    """
    url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
    result = await parse_query(url, max_file_size=10**9, from_web=True)

    assert result["user_name"] == "pandas-dev"
    assert result["repo_name"] == "pandas"
    assert result["url"] == "https://github.com/pandas-dev/pandas"
    assert result["slug"] == "pandas-dev-pandas"
    assert result["id"] is not None
    # The upper-case 'ISSUE_TEMPLATE' segment must survive parsing unchanged.
    assert result["subpath"] == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
    assert result["branch"] == "2.2.x"
    assert result["commit"] is None
    assert result["type"] == "blob"

0 comments on commit 9bdee8f

Please sign in to comment.