Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions .github/workflows/weekly_download.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ on:
jobs:
download:
runs-on: ubuntu-latest
strategy:
matrix:
include:
- name: pypi
- name: npm
- name: dockerhub
steps:
- uses: actions/create-github-app-token@29824e69f54612133e76f7eaac726eef6c875baf # v2.2.1
id: app-token
Expand All @@ -25,15 +31,10 @@ jobs:
- name: Install the project
run: uv sync --locked --only-group download --python 3.14

- name: Download Pypi packages
- name: Download ${{ matrix.name }} packages
continue-on-error: true
run: |
uv run --no-project dependencies/scripts/download_packages.py download pypi

- name: Download NPM packages
continue-on-error: true
run: |
uv run --no-project dependencies/scripts/download_packages.py download npm
PYTHONPATH=dependencies/ uv run --no-project dependencies/scripts/download_packages.py download ${{ matrix.name }}

- name: Configure git
run: |
Expand All @@ -43,5 +44,5 @@ jobs:
- name: Push changes to repo
run: |
git add .
git commit -m "chore: Weekly update of trusted packages"
# Single quotes keep the backticks literal; inside double quotes the shell would execute the ecosystem name as a command substitution.
git commit -m 'chore: Weekly update of `${{ matrix.name }}` trusted packages'
git push origin HEAD:main
1 change: 1 addition & 0 deletions dependencies/dockerhub.json

Large diffs are not rendered by default.

98 changes: 13 additions & 85 deletions dependencies/scripts/download_packages.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
import logging
from collections.abc import Callable
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
Expand All @@ -10,6 +9,19 @@
import click
import httpx
import stamina
from requests.exceptions import InvalidJSONError

from scripts.exceptions import ServerError
from scripts.utils import (
DEPENDENCIES_DIR,
ECOSYSTEMS,
RETRY_ATTEMPTS,
RETRY_ON,
RETRY_WAIT_EXP_BASE,
RETRY_WAIT_JITTER,
RETRY_WAIT_MAX,
TIMEOUT,
)

logger = logging.getLogger("weekly_download")
logging.basicConfig(
Expand All @@ -19,90 +31,6 @@
)


class ServerError(Exception):
    """Error type representing an HTTP 5xx (server-side) response."""


class InvalidJSONError(Exception):
    """Error type signalling that received JSON does not have the expected format."""


# Directory name
DEPENDENCIES_DIR = "dependencies"
"""Directory name where dependency files will be saved."""

# Sources
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
"""URL for fetching top PyPI packages data."""

TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
"""URL for fetching top npm packages data from ecosyste.ms."""

# Retry constants
RETRY_ON = (httpx.TransportError, httpx.TimeoutException, ServerError)
"""Tuple of exceptions that should trigger retry attempts."""

RETRY_ATTEMPTS = 15
"""Maximum number of retry attempts for failed requests."""

RETRY_WAIT_JITTER = 1
"""Random jitter factor for retry wait times."""

RETRY_WAIT_EXP_BASE = 2
"""Exponential backoff base multiplier for retry wait times."""

RETRY_WAIT_MAX = 8
"""Maximum wait time between retry attempts in seconds."""

TIMEOUT = 90
"""HTTP request timeout in seconds."""


def parse_npm(data: list[dict[str, Any]]) -> set[str]:
    """Extract package names from npm package data.

    Args:
        data: List of package objects, each expected to carry a ``"name"`` key.

    Returns:
        The set of distinct package names found in ``data``.

    Raises:
        InvalidJSONError: If ``data`` is not an iterable of mappings that each
            contain a ``"name"`` key.
    """
    try:
        return {entry["name"] for entry in data}
    except (KeyError, TypeError) as e:
        # TypeError covers payloads with the wrong shape altogether (a JSON
        # object, null, or non-dict entries), which would otherwise escape
        # as a raw TypeError instead of the documented error type.
        raise InvalidJSONError from e


def parse_pypi(data: dict[str, Any]) -> set[str]:
    """Extract package names from PyPI package data.

    Args:
        data: Mapping with a ``"rows"`` list whose entries each carry a
            ``"project"`` key.

    Returns:
        The set of distinct project names found under ``data["rows"]``.

    Raises:
        InvalidJSONError: If ``data`` does not have the expected
            ``{"rows": [{"project": ...}, ...]}`` structure.
    """
    try:
        return {row["project"] for row in data["rows"]}
    except (KeyError, TypeError) as e:
        # TypeError covers payloads with the wrong shape altogether (a list or
        # null at the top level, or non-dict rows), not just missing keys.
        raise InvalidJSONError from e


@dataclass(frozen=True)
class Ecosystem:
"""Configuration for a package ecosystem (PyPI, npm, etc.)."""

url: str
parser: Callable[[Any], set[str]]
params: dict[str, Any] = field(default_factory=dict)
pages: int | None = None


pypi_ecosystem = Ecosystem(
url=TOP_PYPI_SOURCE,
parser=parse_pypi,
)
"""Ecosystem configuration for PyPI packages."""

npm_ecosystem = Ecosystem(
url=TOP_NPM_SOURCE,
parser=parse_npm,
params={"per_page": 100, "sort": "downloads"},
pages=150,
)
"""Ecosystem configuration for npm packages with pagination."""


ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem}
"""Dictionary mapping ecosystem names to their configurations."""


def get_params(params: dict[str, Any] | None, page: int | None) -> dict[str, Any]:
"""Combine base parameters with page parameter if provided."""
new_params: dict[str, Any] = {}
Expand Down
6 changes: 6 additions & 0 deletions dependencies/scripts/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class ServerError(Exception):
    """Error type representing an HTTP 5xx (server-side) response."""


class InvalidJSONError(Exception):
    """Error type signalling that received JSON does not have the expected format."""
93 changes: 93 additions & 0 deletions dependencies/scripts/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

import httpx
from requests.exceptions import InvalidJSONError

from scripts.exceptions import ServerError

DEPENDENCIES_DIR = "dependencies"
"""Directory name where dependency files will be saved."""

# Sources
TOP_PYPI_SOURCE = "https://hugovk.github.io/top-pypi-packages/top-pypi-packages.min.json"
"""URL for fetching top PyPI packages data."""

TOP_NPM_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/npmjs.org/packages"
"""URL for fetching top npm packages data from ecosyste.ms."""

TOP_DOCKERHUB_SOURCE = "https://packages.ecosyste.ms/api/v1/registries/hub.docker.com/packages"
"""URL for fetching top Docker Hub packages data from ecosyste.ms."""

# Retry constants
RETRY_ON = (httpx.TransportError, httpx.TimeoutException, ServerError)
"""Tuple of exceptions that should trigger retry attempts."""

RETRY_ATTEMPTS = 15
"""Maximum number of retry attempts for failed requests."""

RETRY_WAIT_JITTER = 1
"""Random jitter factor for retry wait times."""

RETRY_WAIT_EXP_BASE = 2
"""Exponential backoff base multiplier for retry wait times."""

RETRY_WAIT_MAX = 8
"""Maximum wait time between retry attempts in seconds."""

TIMEOUT = 90
"""HTTP request timeout in seconds."""


def parse_packages_ecosystems_source(data: list[dict[str, Any]]) -> set[str]:
    """Extract package names from a packages.ecosyste.ms registry listing.

    Both the npm and the Docker Hub ecosystem configurations use this parser.

    Args:
        data: List of package objects, each expected to carry a ``"name"`` key.

    Returns:
        The set of distinct package names found in ``data``.

    Raises:
        InvalidJSONError: If ``data`` is not an iterable of mappings that each
            contain a ``"name"`` key.
    """
    # NOTE(review): this module imports InvalidJSONError from requests.exceptions,
    # while scripts.exceptions defines a project class of the same name - confirm
    # which one is intended.
    try:
        return {entry["name"] for entry in data}
    except (KeyError, TypeError) as e:
        # TypeError covers payloads with the wrong shape altogether (a JSON
        # object, null, or non-dict entries), which would otherwise escape
        # as a raw TypeError instead of the documented error type.
        raise InvalidJSONError from e


def parse_pypi(data: dict[str, Any]) -> set[str]:
    """Extract package names from PyPI package data.

    Args:
        data: Mapping with a ``"rows"`` list whose entries each carry a
            ``"project"`` key.

    Returns:
        The set of distinct project names found under ``data["rows"]``.

    Raises:
        InvalidJSONError: If ``data`` does not have the expected
            ``{"rows": [{"project": ...}, ...]}`` structure.
    """
    # NOTE(review): this module imports InvalidJSONError from requests.exceptions,
    # while scripts.exceptions defines a project class of the same name - confirm
    # which one is intended.
    try:
        return {row["project"] for row in data["rows"]}
    except (KeyError, TypeError) as e:
        # TypeError covers payloads with the wrong shape altogether (a list or
        # null at the top level, or non-dict rows), not just missing keys.
        raise InvalidJSONError from e


@dataclass(frozen=True)
class Ecosystem:
"""Configuration for a package ecosystem (PyPI, npm, etc.)."""

url: str
parser: Callable[[Any], set[str]]
params: dict[str, Any] = field(default_factory=dict)
pages: int | None = None


pypi_ecosystem = Ecosystem(
url=TOP_PYPI_SOURCE,
parser=parse_pypi,
)
"""Ecosystem configuration for PyPI packages."""

npm_ecosystem = Ecosystem(
url=TOP_NPM_SOURCE,
parser=parse_packages_ecosystems_source,
params={"per_page": 100, "sort": "downloads"},
pages=150,
)
"""Ecosystem configuration for npm packages with pagination."""

dockerhub_ecosystem = Ecosystem(
url=TOP_DOCKERHUB_SOURCE,
parser=parse_packages_ecosystems_source,
params={"per_page": 100, "sort": "downloads"},
pages=150,
)
"""Ecosystem configuration for DockerHub packages with pagination."""


ECOSYSTEMS = {"pypi": pypi_ecosystem, "npm": npm_ecosystem, "dockerhub": dockerhub_ecosystem}
"""Dictionary mapping ecosystem names to their configurations."""
71 changes: 68 additions & 3 deletions dependencies/tests/test_download_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
DEPENDENCIES_DIR,
ECOSYSTEMS,
RETRY_ATTEMPTS,
Ecosystem,
InvalidJSONError,
ServerError,
_run,
download,
)
from scripts.utils import (
Ecosystem,
dockerhub_ecosystem,
npm_ecosystem,
parse_npm,
parse_packages_ecosystems_source,
parse_pypi,
)

Expand Down Expand Up @@ -70,6 +73,18 @@ def patch_npm_ecosystem(data: dict[str, Any]) -> Iterator[None]:
yield


@contextmanager
def patch_dockerhub_ecosystem(data: dict[str, Any]) -> Iterator[None]:
    """Temporarily override fields of the Docker Hub ecosystem configuration for testing.

    Args:
        data: Field overrides (e.g. ``{"pages": 2}``) merged over the values
            of ``dockerhub_ecosystem``.

    Yields:
        None. While the context is active, ``ECOSYSTEMS["dockerhub"]`` is the
        overridden ``Ecosystem``; ``patch.dict`` restores the mapping on exit.
    """
    # Ecosystem is a frozen dataclass, so build a modified copy rather than mutating it.
    overridden = Ecosystem(**dockerhub_ecosystem.__dict__ | data)
    with patch.dict(ECOSYSTEMS, {"dockerhub": overridden}):
        yield


@freeze_time("2025-01-01")
class TestDownload:
def test_pypi_download(self) -> None:
Expand Down Expand Up @@ -141,7 +156,7 @@ def test_invalid_pypi_json_format(self) -> None:
def test_invalid_npm_json_format(self) -> None:
"""Test that InvalidJSONError is raised when npm JSON data has invalid format."""
with pytest.raises(InvalidJSONError):
parse_npm([{"key": "val"}])
parse_packages_ecosystems_source([{"key": "val"}])

def test_invalid_downloaded_json(self) -> None:
"""Test that InvalidJSONError is raised when downloaded JSON cannot be parsed."""
Expand Down Expand Up @@ -219,6 +234,56 @@ def test_npm_download_with_multiple_pages(self) -> None:
assert set(m_save.call_args[0][0]["packages"]) == {"lodash", "@aws/sdk", "react", "express"}
assert m_save.call_args[0][1] == m_open().__enter__()

def test_dockerhub_download_with_multiple_pages(self) -> None:
"""Test that a paginated Docker Hub download requests each configured page and merges the package names."""
page1_data = [
{"name": "sundeepm1/weatherapi", "downloads": 12345},
{"name": "hitesh25/jenkins_argo", "downloads": 98765},
]
page2_data = [
{"name": "jchensg/sg-support-integration", "downloads": 87654},
]

with (
patch_client(None) as m_client, # Responses are supplied through side_effect below
patch_save_to_file() as m_save,
patch_open_file() as m_open,
patch_dockerhub_ecosystem({"pages": 2}),
):
# Build one mocked response per page; side_effect hands them out in call order
mock_responses = []
for data in [page1_data, page2_data]:
mock_response = Mock()
mock_response.json.return_value = data
mock_responses.append(mock_response)

m_client.side_effect = mock_responses

_run("dockerhub")

assert m_client.call_count == 2

# Each request must hit the Docker Hub registry endpoint with the base params plus its page number
assert m_client.call_args_list == [
call(
"https://packages.ecosyste.ms/api/v1/registries/hub.docker.com/packages",
params={"per_page": 100, "sort": "downloads", "page": 1},
),
call(
"https://packages.ecosyste.ms/api/v1/registries/hub.docker.com/packages",
params={"per_page": 100, "sort": "downloads", "page": 2},
),
]

# Verify that the packages from all pages were collected into a single save call
assert m_save.call_count == 1
assert m_save.call_args[0][0]["date"] == "2025-01-01T00:00:00+00:00"
assert set(m_save.call_args[0][0]["packages"]) == {
"sundeepm1/weatherapi",
"hitesh25/jenkins_argo",
"jchensg/sg-support-integration",
}
assert m_save.call_args[0][1] == m_open().__enter__()


class TestCli:
def test_non_existing_ecosystem_error(self) -> None:
Expand Down
2 changes: 1 addition & 1 deletion justfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,4 @@ install-dev: venv
uv pip install -e .

download ecosystem: venv
uv run --no-project dependencies/scripts/download_packages.py download {{ecosystem}}
PYTHONPATH=dependencies/ uv run --no-project dependencies/scripts/download_packages.py download {{ecosystem}}