From 9bdee8f7012cdf42485387eb11c09d741b37435d Mon Sep 17 00:00:00 2001
From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com>
Date: Fri, 10 Jan 2025 13:50:05 +0100
Subject: [PATCH] feat: make parser domain-agnostic to support multiple Git
 hosts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- added list of known domains/Git hosts in `query_parser.py`
- fixed bug from [#115](https://github.com/cyclotruc/gitingest/pull/115): corrected case handling for URL components — scheme, domain, username, and repository are treated as case-insensitive, while everything after the repository name (e.g., branch names, file paths) remains case-sensitive
- implemented `try_domains_for_user_and_repo` in `query_parser.py`, which tries each known Git host in turn until the repository is found or all supported hosts are exhausted
- added helper functions `_get_user_and_repo_from_path`, `_validate_host`, and `_validate_scheme` in `query_parser.py`
- extended `_parse_repo_source` in `query_parser.py` to be Git host agnostic by using `try_domains_for_user_and_repo`; as a result it is now an `async` function and must be awaited
- added tests `test_parse_url_unsupported_host` and `test_parse_query_with_branch` in `test_query_parser.py`
- created new file `test_git_host_agnostic.py` to verify domain/Git host agnostic behavior
---
 src/gitingest/query_parser.py                | 208 ++++++++++++++-----
 src/main.py                                  |   2 +-
 tests/query_parser/test_git_host_agnostic.py |  71 +++++++
 tests/test_query_parser.py                   |  22 ++
 4 files changed, 251 insertions(+), 52 deletions(-)
 create mode 100644 tests/query_parser/test_git_host_agnostic.py

diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py
index bc35698..2981f09 100644
--- a/src/gitingest/query_parser.py
+++ b/src/gitingest/query_parser.py
@@ -11,8 +11,16 @@
 from config import TMP_BASE_PATH
 from gitingest.exceptions import InvalidPatternError
 from gitingest.ignore_patterns import DEFAULT_IGNORE_PATTERNS
+from gitingest.repository_clone import _check_repo_exists
 
-HEX_DIGITS = set(string.hexdigits)
+HEX_DIGITS: set[str] = set(string.hexdigits)
+
+KNOWN_GIT_HOSTS: list[str] = [
+    "github.com",
+    "gitlab.com",
+    "bitbucket.org",
+    "gitea.com",
+]
 
 
 async def parse_query(
@@ -48,16 +56,16 @@ async def parse_query(
         A dictionary containing the parsed query parameters, including 'max_file_size',
         'ignore_patterns', and 'include_patterns'.
     """
-    # Normalize and clean up the source string to make it case-insensitive
-    source = source.lower().strip()
 
     # Determine the parsing method based on the source type
-    if from_web or source.startswith("https://") or "github.com" in source:
-        query = _parse_repo_source(source)
+    if from_web or urlparse(source).scheme in ("https", "http") or any(h in source for h in KNOWN_GIT_HOSTS):
+        # We either have a full URL or a domain-less slug
+        query = await _parse_repo_source(source)
     else:
+        # Local path scenario
         query = _parse_path(source)
 
-    # Process ignore patterns
+    # Combine ignore patterns
     ignore_patterns_list = DEFAULT_IGNORE_PATTERNS.copy()
     if ignore_patterns:
         ignore_patterns_list += _parse_patterns(ignore_patterns)
@@ -69,7 +77,6 @@ async def parse_query(
     else:
         parsed_include = None
 
-    # Update the query dictionary with max_file_size and processed patterns
     query.update(
         {
             "max_file_size": max_file_size,
@@ -80,52 +87,54 @@ async def parse_query(
     return query
 
 
-def _parse_repo_source(url: str) -> dict[str, Any]:
+async def _parse_repo_source(source: str) -> dict[str, Any]:
     """
-    Parse a GitHub repository URL into a structured query dictionary.
+    Parse a repository URL into a structured query dictionary.
 
-    This function extracts relevant information from a GitHub URL, such as the username,
-    repository name, commit, branch, and subpath, and returns them in a structured format.
+    If source is:
+      - A fully qualified URL (https://gitlab.com/...), parse & verify that domain
+      - A URL missing 'https://' (gitlab.com/...), add 'https://' and parse
+      - A 'slug' (like 'pandas-dev/pandas'), attempt known domains until we find one that exists.
 
     Parameters
     ----------
-    url : str
-        The GitHub URL to parse.
+    source : str
+        The URL or domain-less slug to parse.
 
     Returns
     -------
     dict[str, Any]
-        A dictionary containing the parsed details of the GitHub repository, including
-        the username, repository name, commit, branch, and other relevant information.
-
-    Raises
-    ------
-    ValueError
-        If the URL is invalid or does not correspond to a valid Git repository.
+        A dictionary containing the parsed details of the repository, including the username,
+        repository name, commit, branch, and other relevant information.
     """
-    # Clean up the URL
-    url = url.split(" ")[0]  # remove trailing text
-    url = unquote(url)  # decode URL-encoded characters
+    source = unquote(source)
 
-    if not url.startswith(("https://", "http://")):
-        url = "https://" + url
+    # Attempt to parse
+    parsed_url = urlparse(source)
 
-    # Parse URL and reconstruct it without query parameters and fragments
-    parsed_url = urlparse(url)
-    url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+    if parsed_url.scheme:
+        _validate_scheme(parsed_url.scheme)
+        _validate_host(parsed_url.netloc.lower())
 
-    # Extract domain and path
-    url_parts = url.split("/")
-    domain = url_parts[2]
-    path_parts = url_parts[3:]
+    else:  # Will be of the form 'host/user/repo' or 'user/repo'
+        tmp_host = source.split("/")[0].lower()
+        if "." in tmp_host:
+            _validate_host(tmp_host)
+        else:
+            # No scheme, no domain => user typed "user/repo", so we'll guess the domain.
+            host = await try_domains_for_user_and_repo(*_get_user_and_repo_from_path(source))
+            source = f"{host}/{source}"
 
-    if len(path_parts) < 2:
-        raise ValueError("Invalid repository URL. Please provide a valid Git repository URL.")
+        source = "https://" + source
+        parsed_url = urlparse(source)
+
+    host = parsed_url.netloc.lower()
+    user_name, repo_name = _get_user_and_repo_from_path(parsed_url.path)
 
-    user_name = path_parts[0]
-    repo_name = path_parts[1]
     _id = str(uuid.uuid4())
     slug = f"{user_name}-{repo_name}"
+    local_path = Path(TMP_BASE_PATH) / _id / slug
+    url = f"https://{host}/{user_name}/{repo_name}"
 
     parsed = {
         "user_name": user_name,
@@ -134,31 +143,39 @@ def _parse_repo_source(url: str) -> dict[str, Any]:
         "branch": None,
         "commit": None,
         "subpath": "/",
-        "local_path": Path(TMP_BASE_PATH) / _id / slug,
-        "url": f"https://{domain}/{user_name}/{repo_name}",
-        "slug": slug,
+        "local_path": local_path,
+        "url": url,
+        "slug": slug,  # e.g. "pandas-dev-pandas"
         "id": _id,
     }
 
-    # If this is an issues page or pull requests, return early without processing subpath
-    if len(path_parts) > 2 and (path_parts[2] == "issues" or path_parts[2] == "pull"):
+    remaining_parts = parsed_url.path.strip("/").split("/")[2:]
+
+    if not remaining_parts:
         return parsed
 
+    possible_type = remaining_parts.pop(0)  # e.g. 'issues', 'pull', 'tree', 'blob'
+
     # If no extra path parts, just return
-    if len(path_parts) < 4:
+    if not remaining_parts:
+        return parsed
+
+    # If this is an issues page or pull requests, return early without processing subpath
+    if remaining_parts and possible_type in ("issues", "pull"):
         return parsed
 
-    parsed["type"] = path_parts[2]  # Usually 'tree' or 'blob'
-    commit = path_parts[3]
+    parsed["type"] = possible_type
 
-    if _is_valid_git_commit_hash(commit):
-        parsed["commit"] = commit
-        if len(path_parts) > 4:
-            parsed["subpath"] += "/".join(path_parts[4:])
+    # Commit or branch
+    commit_or_branch = remaining_parts.pop(0)
+    if _is_valid_git_commit_hash(commit_or_branch):
+        parsed["commit"] = commit_or_branch
     else:
-        parsed["branch"] = commit
-        if len(path_parts) > 4:
-            parsed["subpath"] += "/".join(path_parts[4:])
+        parsed["branch"] = commit_or_branch
+
+    # Subpath if anything left
+    if remaining_parts:
+        parsed["subpath"] += "/".join(remaining_parts)
 
     return parsed
 
@@ -314,3 +331,92 @@ def _is_valid_pattern(pattern: str) -> bool:
         True if the pattern is valid, otherwise False.
     """
     return all(c.isalnum() or c in "-_./+*" for c in pattern)
+
+
+async def try_domains_for_user_and_repo(user_name: str, repo_name: str) -> str:
+    """
+    Attempt to find a valid repository host for the given user_name and repo_name.
+
+    Parameters
+    ----------
+    user_name : str
+        The username or owner of the repository.
+    repo_name : str
+        The name of the repository.
+
+    Returns
+    -------
+    str
+        The domain of the valid repository host.
+
+    Raises
+    ------
+    ValueError
+        If no valid repository host is found for the given user_name and repo_name.
+    """
+    for domain in KNOWN_GIT_HOSTS:
+        candidate = f"https://{domain}/{user_name}/{repo_name}"
+        if await _check_repo_exists(candidate):
+            return domain
+    raise ValueError(f"Could not find a valid repository host for '{user_name}/{repo_name}'.")
+
+
+def _get_user_and_repo_from_path(path: str) -> tuple[str, str]:
+    """
+    Extract the user and repository names from a given path.
+
+    Parameters
+    ----------
+    path : str
+        The path to extract the user and repository names from.
+
+    Returns
+    -------
+    tuple[str, str]
+        A tuple containing the user and repository names.
+
+    Raises
+    ------
+    ValueError
+        If the path does not contain at least two parts.
+    """
+    path_parts = path.lower().strip("/").split("/")
+    if len(path_parts) < 2:
+        raise ValueError(f"Invalid repository URL '{path}'")
+    return path_parts[0], path_parts[1]
+
+
+def _validate_host(host: str) -> None:
+    """
+    Validate the given host against the known Git hosts.
+
+    Parameters
+    ----------
+    host : str
+        The host to validate.
+
+    Raises
+    ------
+    ValueError
+        If the host is not a known Git host.
+    """
+    if host not in KNOWN_GIT_HOSTS:
+        raise ValueError(f"Unknown domain '{host}' in URL")
+
+
+def _validate_scheme(scheme: str) -> None:
+    """
+    Validate the given scheme against the known schemes.
+
+    Parameters
+    ----------
+    scheme : str
+        The scheme to validate.
+
+    Raises
+    ------
+    ValueError
+        If the scheme is not 'http' or 'https'.
+    """
+    if scheme not in ("https", "http"):
+        raise ValueError(f"Invalid URL scheme '{scheme}' in URL")
diff --git a/src/main.py b/src/main.py
index 7ba36a8..f2b63fd 100644
--- a/src/main.py
+++ b/src/main.py
@@ -78,7 +78,7 @@ async def process_folder(folder: Path) -> None:
         # Extract owner and repository name from the filename
         if txt_files and "-" in (filename := txt_files[0].stem):
             owner, repo = filename.split("-", 1)
-            repo_url = f"https://github.com/{owner}/{repo}"
+            repo_url = f"{owner}/{repo}"
             with open("history.txt", mode="a", encoding="utf-8") as history:
                 history.write(f"{repo_url}\n")
 
diff --git a/tests/query_parser/test_git_host_agnostic.py b/tests/query_parser/test_git_host_agnostic.py
new file mode 100644
index 0000000..1830811
--- /dev/null
+++ b/tests/query_parser/test_git_host_agnostic.py
@@ -0,0 +1,71 @@
+""" Tests to verify that the query parser is Git host agnostic. """
+
+import pytest
+
+from gitingest.query_parser import parse_query
+
+
+@pytest.mark.parametrize(
+    "urls, expected_user, expected_repo, expected_url",
+    [
+        (
+            [
+                "https://github.com/tiangolo/fastapi",
+                "github.com/tiangolo/fastapi",
+                "tiangolo/fastapi",
+            ],
+            "tiangolo",
+            "fastapi",
+            "https://github.com/tiangolo/fastapi",
+        ),
+        (
+            [
+                "https://gitlab.com/gitlab-org/gitlab-runner",
+                "gitlab.com/gitlab-org/gitlab-runner",
+                "gitlab-org/gitlab-runner",
+            ],
+            "gitlab-org",
+            "gitlab-runner",
+            "https://gitlab.com/gitlab-org/gitlab-runner",
+        ),
+        (
+            [
+                "https://bitbucket.org/na-dna/llm-knowledge-share",
+                "bitbucket.org/na-dna/llm-knowledge-share",
+                "na-dna/llm-knowledge-share",
+            ],
+            "na-dna",
+            "llm-knowledge-share",
+            "https://bitbucket.org/na-dna/llm-knowledge-share",
+        ),
+        (
+            [
+                "https://gitea.com/xorm/xorm",
+                "gitea.com/xorm/xorm",
+                "xorm/xorm",
+            ],
+            "xorm",
+            "xorm",
+            "https://gitea.com/xorm/xorm",
+        ),
+    ],
+)
+@pytest.mark.asyncio
+async def test_parse_query_without_host(
+    urls: list[str],
+    expected_user: str,
+    expected_repo: str,
+    expected_url: str,
+) -> None:
+    for url in urls:
+        result = await parse_query(url, max_file_size=50, from_web=True)
+        # Common assertions for all cases
+        assert result["user_name"] == expected_user
+        assert result["repo_name"] == expected_repo
+        assert result["url"] == expected_url
+        assert result["slug"] == f"{expected_user}-{expected_repo}"
+        assert result["id"] is not None
+        assert result["subpath"] == "/"
+        assert result["branch"] is None
+        assert result["commit"] is None
+        assert result["type"] is None
diff --git a/tests/test_query_parser.py b/tests/test_query_parser.py
index 1fe666b..0db65d3 100644
--- a/tests/test_query_parser.py
+++ b/tests/test_query_parser.py
@@ -252,3 +252,25 @@ async def test_parse_url_with_query_and_fragment() -> None:
     assert result["user_name"] == "user"
     assert result["repo_name"] == "repo"
     assert result["url"] == "https://github.com/user/repo"  # URL should be cleaned
+
+
+async def test_parse_url_unsupported_host() -> None:
+    url = "https://only-domain.com"
+    with pytest.raises(ValueError, match="Unknown domain 'only-domain.com' in URL"):
+        await _parse_repo_source(url)
+
+
+async def test_parse_query_with_branch() -> None:
+    url = "https://github.com/pandas-dev/pandas/blob/2.2.x/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
+    result = await parse_query(url, max_file_size=10**9, from_web=True)
+    assert result["user_name"] == "pandas-dev"
+    assert result["repo_name"] == "pandas"
+    assert result["url"] == "https://github.com/pandas-dev/pandas"
+    assert result["slug"] == "pandas-dev-pandas"
+    assert result["id"] is not None
+    print('result["subpath"]', result["subpath"])
+    print("/.github/ISSUE_TEMPLATE/documentation_improvement.yaml")
+    assert result["subpath"] == "/.github/ISSUE_TEMPLATE/documentation_improvement.yaml"
+    assert result["branch"] == "2.2.x"
+    assert result["commit"] is None
+    assert result["type"] == "blob"