Commit ed5b102

Improve pagination on Gitea & Pagure
ricardobranco777 committed Oct 21, 2023
1 parent d33befc commit ed5b102
Showing 2 changed files with 69 additions and 36 deletions.
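
Both files move to the same two-step scheme: fetch page 1, work out the number of the last page from the response, then fetch pages 2 through last in parallel instead of walking the pages one by one. A minimal standalone sketch of that fan-out, assuming a hypothetical endpoint that returns a JSON list per page:

    from concurrent.futures import ThreadPoolExecutor

    import requests

    API_URL = "https://git.example.com/api/v1/repos/search"  # hypothetical endpoint

    def fetch_page(page: int) -> list[dict]:
        # Building params per call keeps threads from sharing one mutable dict
        got = requests.get(API_URL, params={"page": str(page), "limit": "100"}, timeout=30)
        got.raise_for_status()
        return got.json()

    def fetch_all(last_page: int) -> list[dict]:
        entries = fetch_page(1)  # page 1 is fetched up front
        # Fan out pages 2..last on up to 10 threads; map() yields results in page order
        with ThreadPoolExecutor(max_workers=10) as executor:
            for result in executor.map(fetch_page, range(2, last_page + 1)):
                entries.extend(result)
        return entries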
53 changes: 24 additions & 29 deletions services/gitea.py

@@ -31,43 +31,36 @@ def _get_paginated(self, url: str, params: dict[str, str] | None) -> list[dict]:
         if params is None:
             params = {}
         if "limit" not in params:
-            params["limit"] = str(100)
+            params["limit"] = "100"
         entries: list[dict] = []
-        while True:
+        try:
             got = self.session.get(url, params=params)
             got.raise_for_status()
-            entries.extend(got.json())
-            if "Link" in got.headers:
-                links = parse_header_links(got.headers["Link"])
-                last_link = next(
-                    (link["url"] for link in links if link.get("rel") == "last"), None
-                )
-                if last_link is not None:
-                    more_entries = self._get_paginated2(last_link, params=params)
-                    if more_entries is None:
-                        return None
-                    entries.extend(more_entries)
-                    return entries
-                next_link = next(
-                    (link["url"] for link in links if link.get("rel") == "next"), None
-                )
-                if next_link:
-                    url = next_link
-                    params = {}
-                    continue
-            break
+        except RequestException as exc:
+            logging.error("Gitea: %s: Error while fetching page 1: %s", url, exc)
+            raise
+        entries.extend(got.json())
+        if "Link" in got.headers:
+            links = parse_header_links(got.headers["Link"])
+            last_link = next(
+                (link["url"] for link in links if link.get("rel") == "last"), None
+            )
+            if last_link is not None:
+                last_page = int(parse_qs(urlparse(last_link).query)["page"][0])
+                more_entries = self._get_paginated2(url, params, last_page)
+                if more_entries is None:
+                    return None
+                entries.extend(more_entries)
+                return entries
         return entries
 
-    def _get_paginated2(self, url: str, params: dict[str, str] | None) -> list[dict]:
+    def _get_paginated2(
+        self, url: str, params: dict[str, str], last_page: int
+    ) -> list[dict]:
         """
         Get pages 2 to last using threads
         """
-        if params is None:
-            params = {}
         entries: list[dict] = []
-        parsed_url = urlparse(url)
-        last_page = int(parse_qs(parsed_url.query)["page"][0])
-        url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
 
         def get_page(page: int) -> list[dict]:
             try:
@@ -76,7 +69,9 @@ def get_page(page: int) -> list[dict]:
                 got.raise_for_status()
                 return got.json()
             except RequestException as exc:
-                logging.error("Gitea: Error while fetching page %d: %s", page, exc)
+                logging.error(
+                    "Gitea: %s: Error while fetching page %d: %s", url, page, exc
+                )
                 return []
 
         with ThreadPoolExecutor(max_workers=10) as executor:
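
On the Gitea side, the last page number now comes from the RFC 5988 Link header that the API sends with page 1, so _get_paginated2 can keep the original url and params and only vary the page number. A sketch of that extraction, using parse_header_links from requests.utils on a made-up header value:

    from urllib.parse import parse_qs, urlparse

    from requests.utils import parse_header_links

    # Made-up Link header of the kind Gitea returns on page 1
    link_header = (
        '<https://gitea.example.com/api/v1/repos/search?page=2&limit=100>; rel="next", '
        '<https://gitea.example.com/api/v1/repos/search?page=7&limit=100>; rel="last"'
    )

    links = parse_header_links(link_header)
    last_link = next((link["url"] for link in links if link.get("rel") == "last"), None)
    if last_link is not None:
        # The "page" query parameter of the rel="last" URL is the last page number
        last_page = int(parse_qs(urlparse(last_link).query)["page"][0])
        print(last_page)  # 7, so pages 2 to 7 can be fetched concurrently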
52 changes: 45 additions & 7 deletions services/pagure.py

@@ -3,6 +3,7 @@
 """
 
 import logging
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any
 
 from requests.exceptions import RequestException
@@ -43,15 +44,52 @@ def username(self) -> str:
     def _get_paginated(
         self, url: str, params: dict[str, str], key: str, next_key: str
     ) -> list[dict]:
-        got = self.session.get(url, params=params)
-        got.raise_for_status()
+        if "per_page" not in params:
+            params["per_page"] = "100"
+        try:
+            got = self.session.get(url, params=params)
+            got.raise_for_status()
+        except RequestException as exc:
+            logging.error("Pagure: %s: Error while fetching page 1: %s", url, exc)
+            raise
         data = got.json()
         entries = data[key]
-        while data[next_key]["next"]:
-            got = self.session.get(data[next_key]["next"], params=params)
-            got.raise_for_status()
-            data = got.json()
-            entries.extend(data[key])
+        if data[next_key]["next"] and data[next_key]["last"]:
+            more_entries = self._get_paginated2(
+                url, params, key, data[next_key]["pages"]
+            )
+            if more_entries is None:
+                return None
+            entries.extend(more_entries)
         return entries
 
+    def _get_paginated2(
+        self, url: str, params: dict[str, str], key: str, last_page: int
+    ) -> list[dict]:
+        """
+        Get pages 2 to last using threads
+        """
+        entries: list[dict] = []
+
+        def get_page(page: int) -> list[dict]:
+            try:
+                params["page"] = str(page)
+                got = self.session.get(url, params=params)
+                got.raise_for_status()
+                data = got.json()
+                return data[key]
+            except RequestException as exc:
+                logging.error(
+                    "Pagure: %s: Error while fetching page %d: %s", url, page, exc
+                )
+                return []
+
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            pages_to_fetch = range(2, last_page + 1)
+            results = executor.map(get_page, pages_to_fetch)
+            for result in results:
+                entries.extend(result)
+
+        return entries
+
     def _get_issues(self, username: str, **params) -> list[dict]:
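
Pagure carries its page metadata in the response body rather than in a Link header, which is what data[next_key] indexes into; the exact top-level names vary by endpoint, which is why the method takes both key and next_key as parameters. A sketch of the shape the new guard relies on, with illustrative values and "issues"/"pagination" as assumed key names:

    # Illustrative page-1 response body from a Pagure list endpoint
    data = {
        "issues": [{"id": 1, "title": "First issue"}],  # data[key]
        "pagination": {                                 # data[next_key]
            "page": 1,
            "pages": 7,          # becomes last_page for _get_paginated2
            "per_page": 100,
            "first": "https://pagure.io/api/0/example/issues?page=1",
            "next": "https://pagure.io/api/0/example/issues?page=2",
            "last": "https://pagure.io/api/0/example/issues?page=7",
            "prev": None,
        },
    }

    # The new guard: only fan out when the server reports both a next page
    # and a known last page
    if data["pagination"]["next"] and data["pagination"]["last"]:
        last_page = data["pagination"]["pages"]  # pages 2 to 7 go to the thread pool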
