diff --git a/analytics/Dockerfile b/analytics/Dockerfile index d3fbeff06..249c0dbc1 100644 --- a/analytics/Dockerfile +++ b/analytics/Dockerfile @@ -18,7 +18,6 @@ RUN apt-get update \ libpq-dev \ postgresql \ wget \ - jq \ # Install security updates # https://pythonspeed.com/articles/security-updates-in-docker/ && apt-get upgrade --yes \ @@ -28,24 +27,11 @@ RUN apt-get update \ libpq-dev \ postgresql \ wget \ - jq \ # Reduce the image size by clear apt cached lists # Complies with https://github.com/codacy/codacy-hadolint/blob/master/codacy-hadolint/docs/description/DL3009.md && rm -fr /var/lib/apt/lists/* \ && rm /etc/ssl/private/ssl-cert-snakeoil.key -# Install gh CLI -# docs: https://github.com/cli/cli/blob/trunk/docs/install_linux.md -SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN mkdir -p /etc/apt/keyrings \ - && wget -qO- https://cli.github.com/packages/githubcli-archive-keyring.gpg | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ - && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ - && apt-get update \ - && apt-get install gh -y \ - && rm -fr /var/lib/apt/lists/* \ - && gh --version - ARG RUN_UID ARG RUN_USER diff --git a/analytics/config.py b/analytics/config.py index 67f620c66..1ab4f43e1 100644 --- a/analytics/config.py +++ b/analytics/config.py @@ -1,14 +1,14 @@ """Loads configuration variables from settings files """ -import os +import os from typing import Optional from pydantic_settings import BaseSettings, SettingsConfigDict from pydantic import Field # reads environment variables from .env files defaulting to "local.env" class PydanticBaseEnvConfig(BaseSettings): - model_config = SettingsConfigDict(env_file="%s.env" % os.getenv("ENVIRONMENT", "local"), extra="allow") + model_config = SettingsConfigDict(env_file="%s.env" % os.getenv("ENVIRONMENT", "local"), extra="allow") class DBSettings(PydanticBaseEnvConfig): db_host: str = Field(alias="DB_HOST") @@ -19,6 +19,7 @@ class DBSettings(PydanticBaseEnvConfig): ssl_mode: str = Field("require", alias="DB_SSL_MODE") db_schema: str = Field ("app", alias="DB_SCHEMA") slack_bot_token: str = Field(alias="ANALYTICS_SLACK_BOT_TOKEN") + github_token: str = Field(alias="GH_TOKEN") reporting_channel_id: str = Field(alias="ANALYTICS_REPORTING_CHANNEL_ID") aws_region: Optional[str] = Field(None, alias="AWS_REGION") local_env: bool = True if os.getenv("ENVIRONMENT", "local") == "local" else False diff --git a/analytics/local.env b/analytics/local.env index 9960b72af..b00c90e65 100644 --- a/analytics/local.env +++ b/analytics/local.env @@ -31,7 +31,7 @@ MB_DB_PASS=secret123 MB_DB_HOST=grants-analytics-db ########################### -# Slack Configuration # +# Secret Configuration # ########################### # Do not add these values to this file # to avoid mistakenly committing them. 
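For reviewers unfamiliar with how these settings resolve: the new `github_token` field added to `config.py` above is populated from the `GH_TOKEN` environment variable (or the matching line in the selected `.env` file) through the pydantic-settings field alias, the same way the Slack values are. A minimal sketch, assuming the other required variables (DB_HOST, ANALYTICS_SLACK_BOT_TOKEN, etc.) are already exported, e.g. via local.env, and that `get_db_settings` is the accessor the new client imports from `config`:

    import os

    os.environ["GH_TOKEN"] = "ghp_example"  # hypothetical value; never commit a real token

    from config import get_db_settings

    settings = get_db_settings()
    assert settings.github_token == "ghp_example"
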
@@ -39,6 +39,7 @@ MB_DB_HOST=grants-analytics-db # by doing `export ANALYTICS_REPORTING_CHANNEL_ID=whatever` ANALYTICS_REPORTING_CHANNEL_ID=DO_NOT_SET_HERE ANALYTICS_SLACK_BOT_TOKEN=DO_NOT_SET_HERE +GH_TOKEN=DO_NOT_SET_HERE ############################ # Logging diff --git a/analytics/pyproject.toml b/analytics/pyproject.toml index c498c2699..749c828b1 100644 --- a/analytics/pyproject.toml +++ b/analytics/pyproject.toml @@ -68,8 +68,6 @@ line-length = 100 [tool.ruff.lint] select = ["ALL"] ignore = [ - "ANN101", # missing type annotation for self - "ANN102", # missing type annotation for cls "D203", # no blank line before class "D212", # multi-line summary first line "FIX002", # line contains TODO @@ -78,7 +76,6 @@ ignore = [ "PTH123", # `open()` should be replaced by `Path.open()` "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` "TD003", # missing an issue link on TODO - "PT004", # pytest fixture leading underscore - is marked deprecated "FA102", # Adding "from __future__ import annotations" to any new-style type annotation ] diff --git a/analytics/src/analytics/etl/github.py b/analytics/src/analytics/etl/github.py index 382382e84..a2b9ac6bb 100644 --- a/analytics/src/analytics/etl/github.py +++ b/analytics/src/analytics/etl/github.py @@ -101,6 +101,7 @@ def __init__(self, config: GitHubProjectConfig) -> None: self.config = config # Declare private attributes shared across ETL steps self._transient_files: list[InputFiles] + self.client = github.GitHubGraphqlClient() self.dataset: GitHubIssues def run(self) -> None: @@ -121,7 +122,8 @@ def extract(self) -> None: output_file=roadmap_file, ) - # Export sprint data + # Export sprint data for each GitHub project that the scrum teams use + # to manage their sprints, e.g. HHS/17 and HHS/13 input_files: list[InputFiles] = [] for sprint_board in self.config.sprint_projects: project = sprint_board.project_number @@ -167,6 +169,7 @@ def _export_roadmap_data( ) # Export the data github.export_roadmap_data( + client=self.client, owner=roadmap.owner, project=roadmap.project_number, quad_field=roadmap.quad_field, @@ -186,6 +189,7 @@ def _export_sprint_data( sprint_board.project_number, ) github.export_sprint_data( + client=self.client, owner=sprint_board.owner, project=sprint_board.project_number, sprint_field=sprint_board.sprint_field, @@ -201,6 +205,8 @@ def _export_sprint_data( def run_transformation_pipeline(files: InputFiles) -> list[dict]: """Load data from input files and apply transformations.""" + # Log the current sprint for which we're running the transformations + logger.info("Running transformations for sprint: %s", files.sprint) # Load sprint and roadmap data sprint_data_in = load_json_file(files.sprint) roadmap_data_in = load_json_file(files.roadmap) diff --git a/analytics/src/analytics/integrations/github/__init__.py b/analytics/src/analytics/integrations/github/__init__.py index c34934cb0..b057f7707 100644 --- a/analytics/src/analytics/integrations/github/__init__.py +++ b/analytics/src/analytics/integrations/github/__init__.py @@ -1,10 +1,12 @@ """Export data from GitHub.""" __all__ = [ + "GitHubGraphqlClient", "export_roadmap_data", "export_sprint_data", ] +from analytics.integrations.github.client import GitHubGraphqlClient from analytics.integrations.github.main import ( export_roadmap_data, export_sprint_data, diff --git a/analytics/src/analytics/integrations/github/client.py b/analytics/src/analytics/integrations/github/client.py new file mode 100644 index 000000000..2bdf90d2a --- /dev/null +++ 
b/analytics/src/analytics/integrations/github/client.py
@@ -0,0 +1,139 @@
+"""Expose a client for making calls to GitHub's GraphQL API."""
+
+import logging
+from typing import Any
+
+import requests
+
+from config import get_db_settings
+
+logger = logging.getLogger(__name__)
+
+
+class GraphqlError(Exception):
+    """
+    Exception raised for errors returned by the GraphQL API.
+
+    Attributes
+    ----------
+    errors : list
+        List of error details returned by the API.
+    message : str
+        Human-readable explanation of the error.
+
+    """
+
+    def __init__(self, errors: list[dict]) -> None:
+        """Initialize the GraphqlError."""
+        self.errors = errors
+        self.message = f"GraphQL API returned errors: {errors}"
+        super().__init__(self.message)
+
+
+class GitHubGraphqlClient:
+    """
+    A client to interact with GitHub's GraphQL API.
+
+    Methods
+    -------
+    execute_paginated_query(query, variables, path_to_nodes, batch_size=100)
+        Executes a paginated GraphQL query and returns all results.
+
+    """
+
+    def __init__(self) -> None:
+        """
+        Initialize the GitHubGraphqlClient.
+
+        Notes
+        -----
+        Reads the GitHub personal access token from the GH_TOKEN
+        environment variable via get_db_settings().
+
+        """
+        settings = get_db_settings()
+        self.endpoint = "https://api.github.com/graphql"
+        self.headers = {
+            "Authorization": f"Bearer {settings.github_token}",
+            "Content-Type": "application/json",
+            "GraphQL-Features": "sub_issues,issue_types",
+        }
+
+    def execute_query(self, query: str, variables: dict[str, str | int]) -> dict:
+        """
+        Make a POST request to the GitHub GraphQL API.
+
+        Parameters
+        ----------
+        query : str
+            The GraphQL query string.
+        variables : dict
+            A dictionary of variables to pass to the query.
+
+        Returns
+        -------
+        dict
+            The JSON response from the API.
+
+        """
+        response = requests.post(
+            self.endpoint,
+            headers=self.headers,
+            json={"query": query, "variables": variables},
+            timeout=60,
+        )
+        response.raise_for_status()
+        result = response.json()
+        if "errors" in result:
+            raise GraphqlError(result["errors"])
+        return result
+
+    def execute_paginated_query(
+        self,
+        query: str,
+        variables: dict[str, Any],
+        path_to_nodes: list[str],
+        batch_size: int = 100,
+    ) -> list[dict]:
+        """
+        Execute a paginated GraphQL query.
+
+        Parameters
+        ----------
+        query : str
+            The GraphQL query string.
+        variables : dict
+            A dictionary of variables to pass to the query.
+        path_to_nodes : list of str
+            The path to traverse the response data to extract the "nodes" list,
+            so the nodes can be combined from multiple paginated responses.
+        batch_size : int, optional
+            The number of items to fetch per batch, by default 100.
+
+        Returns
+        -------
+        list of dict
+            The combined results from all paginated responses.
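In practice, callers load a query, pass its variables, and point the client at the path to the nodes list to accumulate across pages. A minimal usage sketch (the real queries live in getSprintData.graphql / getRoadmapData.graphql; this query is a simplified stand-in, and the $batch / $endCursor variables are filled in by the client itself):

    client = GitHubGraphqlClient()  # token comes from GH_TOKEN via get_db_settings()

    query = """
    query ($login: String!, $project: Int!, $batch: Int!, $endCursor: String) {
      organization(login: $login) {
        projectV2(number: $project) {
          items(first: $batch, after: $endCursor) {
            pageInfo { hasNextPage endCursor }
            nodes { id }
          }
        }
      }
    }
    """  # simplified stand-in for getSprintData.graphql

    items = client.execute_paginated_query(
        query,
        variables={"login": "HHS", "project": 13},
        path_to_nodes=["organization", "projectV2", "items"],
    )
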
+ + """ + all_data = [] + has_next_page = True + variables["batch"] = batch_size + variables["endCursor"] = None + + while has_next_page: + response = self.execute_query(query, variables) + data = response["data"] + + # Traverse the data path to extract nodes + for key in path_to_nodes: + data = data[key] + + all_data.extend(data["nodes"]) + + # Handle pagination + page_info = data["pageInfo"] + has_next_page = page_info["hasNextPage"] + variables["endCursor"] = page_info["endCursor"] + + return all_data diff --git a/analytics/src/analytics/integrations/github/main.py b/analytics/src/analytics/integrations/github/main.py index dc4633ace..c1e6e973d 100644 --- a/analytics/src/analytics/integrations/github/main.py +++ b/analytics/src/analytics/integrations/github/main.py @@ -1,164 +1,156 @@ -"""Integrate with GitHub to read and write data from projects and repos.""" +""" +Export data from GitHub. + +TODO(widal001): 2025-01-04 Refactor and move this to src/analytics/etl/github when +we disable writing to disk in https://github.com/HHS/simpler-grants-gov/issues/3203 +""" -import shlex -import subprocess +import json +import logging from pathlib import Path +from pydantic import ValidationError + +from analytics.integrations.github.client import GitHubGraphqlClient +from analytics.integrations.github.validation import ProjectItem + +logger = logging.getLogger(__name__) + PARENT_DIR = Path(__file__).resolve().parent -def pipe_command_output_to_file(command: str, output_file: str) -> None: - """Write the command line output to a file.""" - # make sure the output file's directory exists - file_path = Path(output_file) - file_path.parent.mkdir(exist_ok=True, parents=True) - # invoke the command via a subprocess and write the output to a file - with open(output_file, "w", encoding="utf-8") as f: - subprocess.call(shlex.split(command), stdout=f) # noqa: S603 +def transform_project_data( + raw_data: list[dict], + owner: str, + project: int, + excluded_types: tuple = (), # By default include everything +) -> list[dict]: + """Pluck and reformat relevant fields for each item in the raw data.""" + transformed_data = [] + + for i, item in enumerate(raw_data): + try: + # Validate and parse the raw item + validated_item = ProjectItem.model_validate(item) + + # Skip excluded issue types + if validated_item.content.issue_type.name in excluded_types: + continue + + # Transform into flattened format + transformed = { + # project metadata + "project_owner": owner, + "project_number": project, + # issue metadata + "issue_title": validated_item.content.title, + "issue_url": validated_item.content.url, + "issue_parent": validated_item.content.parent.url, + "issue_type": validated_item.content.issue_type.name, + "issue_status": validated_item.status.name, + "issue_is_closed": validated_item.content.closed, + "issue_opened_at": validated_item.content.created_at, + "issue_closed_at": validated_item.content.closed_at, + "issue_points": validated_item.points.number, + # sprint metadata + "sprint_id": validated_item.sprint.iteration_id, + "sprint_name": validated_item.sprint.title, + "sprint_start": validated_item.sprint.start_date, + "sprint_length": validated_item.sprint.duration, + "sprint_end": validated_item.sprint.end_date, + # roadmap metadata + "deliverable_pillar": validated_item.pillar.name, + "quad_id": validated_item.quad.iteration_id, + "quad_name": validated_item.quad.title, + "quad_start": validated_item.quad.start_date, + "quad_length": validated_item.quad.duration, + "quad_end": 
validated_item.quad.end_date, + } + transformed_data.append(transformed) + except ValidationError as err: + logger.error("Error parsing row %d, skipped.", i) # noqa: TRY400 + logger.debug("Error: %s", err) + continue + + return transformed_data def export_sprint_data( + client: GitHubGraphqlClient, owner: str, project: int, sprint_field: str, points_field: str, output_file: str, ) -> None: - """ - Export the issue and project data from a Sprint Board. - - TODO(widal001): 2024-10-25 - Replace this with a direct call to the GraphQL API - https://github.com/HHS/simpler-grants-gov/issues/2590 - """ - # Get the path script and the GraphQL query - script = PARENT_DIR / "make-graphql-query.sh" + """Export the issue and project data from a Sprint Board.""" + # Load query query_path = PARENT_DIR / "getSprintData.graphql" - # Load the query with open(query_path) as f: query = f.read() - # Create the post-pagination transform jq - jq = f""" -[ - # iterate through each project item - .[] | - # reformat each item - {{ - project_owner: \"{owner}\", - project_number: {project}, - issue_title: .content.title, - issue_url: .content.url, - issue_parent: .content.parent.url, - issue_type: .content.issueType.name, - issue_status: .status.name, - issue_is_closed: .content.closed, - issue_opened_at: .content.createdAt, - issue_closed_at: .content.closedAt, - issue_points: .points.number, - sprint_id: .sprint.iterationId, - sprint_name: .sprint.title, - sprint_start: .sprint.startDate, - sprint_length: .sprint.duration, - sprint_end: ( - if .sprint.startDate == null - then null - else ( - (.sprint.startDate | strptime(\"%Y-%m-%d\") | mktime) - + (.sprint.duration * 86400) | strftime(\"%Y-%m-%d\") - ) - end - ), - }} | - # filter for task-level issues - select(.issue_type != \"Deliverable\") -] -""" - # Make the command - # fmt: off - command: list[str] = [ - str(script), - "--batch", "100", - "--field", f"login={owner}", - "--field", f"project={project}", - "--field", f"sprintField='{sprint_field}'", - "--field", f"pointsField='{points_field}'", - "--query", f"{query}", - "--paginate-jq", "'.data.organization.projectV2.items.nodes'", - "--transform-jq", jq, - ] - # fmt: on - # invoke the command via a subprocess and write the output to a file + + # Set query variables + variables = { + "login": owner, + "project": project, + "sprintField": sprint_field, + "pointsField": points_field, + } + + # Execute query + data = client.execute_paginated_query( + query, + variables, + ["organization", "projectV2", "items"], + ) + + # Transform data + # And exclude deliverables if they appear on the sprint boards + # so that we use their status value from the roadmap board instead + transformed_data = transform_project_data( + raw_data=data, + owner=owner, + project=project, + excluded_types=("Deliverable",), + ) + + # Write output with open(output_file, "w", encoding="utf-8") as f: - subprocess.call(command, stdout=f) # noqa: S603 + json.dump(transformed_data, f, indent=2) def export_roadmap_data( + client: GitHubGraphqlClient, owner: str, project: int, quad_field: str, pillar_field: str, output_file: str, ) -> None: - """ - Export the issue and project data from a Sprint Board. 
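To make the Deliverable exclusion above concrete: a deliverable that also appears on a sprint board is dropped by `transform_project_data`, while task-level issues are flattened into rows. A small sketch with made-up values:

    raw = [
        {"content": {"title": "Deliverable A", "url": "https://github.com/HHS/x/issues/1",
                     "closed": False, "createdAt": "2024-01-01T00:00:00Z",
                     "type": {"name": "Deliverable"}}},
        {"content": {"title": "Task B", "url": "https://github.com/HHS/x/issues/2",
                     "closed": False, "createdAt": "2024-01-01T00:00:00Z",
                     "type": {"name": "Task"}}},
    ]
    rows = transform_project_data(
        raw_data=raw, owner="HHS", project=13, excluded_types=("Deliverable",)
    )
    assert [row["issue_title"] for row in rows] == ["Task B"]
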
- - TODO(widal001): 2024-10-25 - Replace this with a direct call to the GraphQL API - https://github.com/HHS/simpler-grants-gov/issues/2590 - """ - # Get the path script and the GraphQL query - script = PARENT_DIR / "make-graphql-query.sh" + """Export the issue and project data from a Roadmap Board.""" + # Load query query_path = PARENT_DIR / "getRoadmapData.graphql" - # Load the query with open(query_path) as f: query = f.read() - # Create the post-pagination transform jq - jq = f""" -[ - # iterate through each project item - .[] | - # reformat each item - {{ - project_owner: \"{owner}\", - project_number: {project}, - issue_title: .content.title, - issue_url: .content.url, - issue_parent: .content.parent.url, - issue_type: .content.issueType.name, - issue_status: .status.name, - issue_is_closed: .content.closed, - issue_opened_at: .content.createdAt, - issue_closed_at: .content.closedAt, - deliverable_pillar: .pillar.name, - quad_id: .quad.iterationId, - quad_name: .quad.title, - quad_start: .quad.startDate, - quad_length: .quad.duration, - quad_end: ( - if .quad.startDate == null - then null - else ( - (.quad.startDate | strptime(\"%Y-%m-%d\") | mktime) - + (.quad.duration * 86400) | strftime(\"%Y-%m-%d\") - ) - end - ), - }} - -] -""" - # Make the command - # fmt: off - command: list[str] = [ - str(script), - "--batch", "100", - "--field", f"login={owner}", - "--field", f"project={project}", - "--field", f"quadField='{quad_field}'", - "--field", f"pillarField='{pillar_field}'", - "--query", f"{query}", - "--paginate-jq", "'.data.organization.projectV2.items.nodes'", - "--transform-jq", jq, - ] - # fmt: on - # invoke the command via a subprocess and write the output to a file + + # Set query variables + variables = { + "login": owner, + "project": project, + "quadField": quad_field, + "pillarField": pillar_field, + } + + # Execute query + data = client.execute_paginated_query( + query, + variables, + ["organization", "projectV2", "items"], + ) + + # Transform data + transformed_data = transform_project_data(data, owner, project) + + # Write output with open(output_file, "w", encoding="utf-8") as f: - subprocess.call(command, stdout=f) # noqa: S603 + json.dump(transformed_data, f, indent=2) diff --git a/analytics/src/analytics/integrations/github/make-graphql-query.sh b/analytics/src/analytics/integrations/github/make-graphql-query.sh deleted file mode 100755 index b51eabd0e..000000000 --- a/analytics/src/analytics/integrations/github/make-graphql-query.sh +++ /dev/null @@ -1,81 +0,0 @@ -#! 
/bin/bash -# Propagate project metadata from parent issues to their children -# Usage: -# ./export-issue-metadata.sh \ -# --org HHS \ -# --roadmap-project 12 \ -# --sprint-project 13 \ -# --roadmap-file data/roadmap-data.json -# --sprint-file data/sprint-data.json - - -# ####################################################### -# Parse command line args with format `--option arg` -# ####################################################### - -batch=100 -fields=() -while [[ $# -gt 0 ]]; do - case $1 in - --dry-run) - echo "Running in dry run mode" - dry_run=YES - shift # past argument - ;; - --batch) - batch="$2" - shift 2 # past argument and value - ;; - --query) - query="$2" - shift 2 # past argument and value - ;; - # jq query to include in each API request during pagination - --paginate-jq) - paginate_jq="$2" - shift 2 # past argument and value - ;; - # jq query to run after all pages have been retrieved - --transform-jq) - transform_jq="$2" - shift 2 # past argument and value - ;; - --field) - # Append field and value to newline - fields+=("--field $2") - shift 2 # past argument and value - ;; - -*|--*) - echo "Unknown option $1" - exit 1 - ;; - *) - positional_args+=("$1") # save positional arg - shift # past argument - ;; - esac -done - -# ####################################################### -# Execute a graphql query -# ####################################################### - -# Build the gh api graphql command with dynamic fields -command="gh api graphql \\ - --header 'GraphQL-Features:sub_issues' \\ - --header 'GraphQL-Features:issue_types' \\ - --paginate \\ - --field batch=$batch" - -# Loop over fields and append them individually, ensuring correct formatting -for field in "${fields[@]}"; do - command+=" \\ - $field" -done - -command+=" \\ - -f query='$query' \\ - --jq '$paginate_jq' | jq --slurp 'add'" - -# Use echo -e to interpret the newline characters -eval "$command" | jq "${transform_jq}" diff --git a/analytics/src/analytics/integrations/github/validation.py b/analytics/src/analytics/integrations/github/validation.py new file mode 100644 index 000000000..abd8fff35 --- /dev/null +++ b/analytics/src/analytics/integrations/github/validation.py @@ -0,0 +1,119 @@ +"""Pydantic schemas for validating GitHub API responses.""" + +# pylint: disable=no-self-argument +from datetime import datetime, timedelta + +from pydantic import BaseModel, Field, computed_field, model_validator + + +def safe_default_factory(data: dict, keys_to_replace: list[str]) -> dict: + """ + Replace keys that are explicitly set to None with an empty dict for default_factory. + + We need to do this because if a key is present, but its value is None or null, + it will raise a Pydantic ValidationError rather than using the default_factory. 
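The distinction called out above is easy to trip over, so here is a small illustration with throwaway models (Inner and Outer are hypothetical stand-ins; safe_default_factory is the helper defined here):

    from pydantic import BaseModel, Field, ValidationError

    class Inner(BaseModel):
        name: str | None = None

    class Outer(BaseModel):
        inner: Inner = Field(default_factory=Inner)

    Outer.model_validate({})  # key missing -> default_factory supplies Inner()
    try:
        Outer.model_validate({"inner": None})  # key present but null -> ValidationError
    except ValidationError:
        pass
    Outer.model_validate(safe_default_factory({"inner": None}, ["inner"]))  # null replaced with {}
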
+ """ + for key in keys_to_replace: + if data.get(key) is None: + data[key] = {} + return data + + +# ############################################# +# Issue content sub-schemas +# ############################################# + + +class IssueParent(BaseModel): + """Schema for the parent issue of a sub-issue.""" + + title: str | None = None + url: str | None = None + + +class IssueType(BaseModel): + """Schema for the type of an issue.""" + + name: str | None = None + + +class IssueContent(BaseModel): + """Schema for core issue metadata.""" + + title: str + url: str + closed: bool + created_at: str = Field(alias="createdAt") + closed_at: str | None = Field(alias="closedAt", default=None) + issue_type: IssueType = Field(alias="type", default_factory=IssueType) + parent: IssueParent = Field(default_factory=IssueParent) + + @model_validator(mode="before") + def replace_none_with_defaults(cls, values) -> dict: # noqa: ANN001, N805 + """Replace None with default_factory instances.""" + # Replace None with default_factory instances + return safe_default_factory(values, ["type", "parent"]) + + +# ############################################# +# Project field sub-schemas +# ############################################# + + +class IterationValue(BaseModel): + """Schema for iteration field values like Sprint or Quad.""" + + iteration_id: str | None = Field(alias="iterationId", default=None) + title: str | None = None + start_date: str | None = Field(alias="startDate", default=None) + duration: int | None = None + + @computed_field + def end_date(self) -> str | None: + """Calculate the end date of the iteration.""" + if not self.start_date or not self.duration: + return None + + start = datetime.strptime(self.start_date, "%Y-%m-%d") # noqa: DTZ007 + end = start + timedelta(days=self.duration) + return end.strftime("%Y-%m-%d") + + +class SingleSelectValue(BaseModel): + """Schema for single select field values like Status or Pillar.""" + + option_id: str | None = Field(alias="optionId", default=None) + name: str | None = None + + +class NumberValue(BaseModel): + """Schema for number field values like Points.""" + + number: int | None = None + + +# ############################################# +# Top-level project item schemas +# ############################################# + + +class ProjectItem(BaseModel): + """Schema that combines fields from both RoadmapItem and SprintItem.""" + + # Issue fields + content: IssueContent + status: SingleSelectValue = Field(default_factory=SingleSelectValue) + # Sprint fields + sprint: IterationValue = Field(default_factory=IterationValue) + points: NumberValue = Field(default_factory=NumberValue) + # Roadmap fields + quad: IterationValue = Field(default_factory=IterationValue) + pillar: SingleSelectValue = Field(default_factory=SingleSelectValue) + + @model_validator(mode="before") + def replace_none_with_defaults(cls, values) -> dict: # noqa: ANN001, N805 + """Replace None with default_factory instances.""" + return safe_default_factory( + values, + ["sprint", "points", "quad", "pillar", "status"], + ) diff --git a/analytics/tests/integrations/github/__init__.py b/analytics/tests/integrations/github/__init__.py new file mode 100644 index 000000000..ac9b0a189 --- /dev/null +++ b/analytics/tests/integrations/github/__init__.py @@ -0,0 +1 @@ +"""Test the integrations.github package.""" diff --git a/analytics/tests/integrations/github/test_client.py b/analytics/tests/integrations/github/test_client.py new file mode 100644 index 000000000..0e2cf3ba3 --- /dev/null +++ 
b/analytics/tests/integrations/github/test_client.py @@ -0,0 +1,124 @@ +"""Test the GitHubGraphqlClient class.""" + +from unittest.mock import Mock, patch + +import pytest +from analytics.integrations.github.client import GitHubGraphqlClient, GraphqlError + + +@pytest.fixture(name="client") +def mock_client() -> GitHubGraphqlClient: + """Fixture to initialize GitHubGraphqlClient with a mock token.""" + return GitHubGraphqlClient() + + +@pytest.fixture(name="sample_query") +def mock_query() -> str: + """Fixture for a sample GraphQL query.""" + return """ + query($login: String!, $first: Int!, $after: String) { + user(login: $login) { + repositories(first: $first, after: $after) { + pageInfo { + hasNextPage + endCursor + } + nodes { + name + } + } + } + } + """ + + +@patch("requests.post") # Mocks the requests.post() method +def test_paginated_query_success( + mock_post: Mock, + client: GitHubGraphqlClient, + sample_query: str, +) -> None: + """Test successfully making a paginated call and extracting data.""" + # Arrange - Mock the response from requests.post() + mock_response = { + "data": { + "user": { + "repositories": { + "nodes": [{"name": "repo1"}], + "pageInfo": {"hasNextPage": False, "endCursor": None}, + }, + }, + }, + } + mock_post.return_value = Mock( + status_code=200, + json=Mock(return_value=mock_response), + ) + + # Act - Set + variables: dict[str, str] = {"login": "octocat"} + path_to_nodes: list[str] = ["user", "repositories"] + result: list[dict[str, str]] = client.execute_paginated_query( + sample_query, + variables, + path_to_nodes, + ) + + assert result == [{"name": "repo1"}] + + +@patch("requests.post") +def test_invalid_path_to_nodes( + mock_post: Mock, + client: GitHubGraphqlClient, + sample_query: str, +) -> None: + """Test catching an error if the path_to_nodes is incorrect.""" + # Arrange - Mock the response from requests.post() + mock_response = { + "data": { + "user": { + "repositories": { + "nodes": [{"name": "repo1"}], + "pageInfo": {"hasNextPage": False, "endCursor": None}, + }, + }, + }, + } + mock_post.return_value = Mock( + status_code=200, + json=Mock(return_value=mock_response), + ) + + # Arrange - Set variables and incorrect path to nodes + variables: dict[str, str] = {"login": "octocat"} + path_to_nodes: list[str] = ["user", "invalid_path"] + + # Assert - Check that the incorrect path raises a KeyError + with pytest.raises(KeyError): + client.execute_paginated_query(sample_query, variables, path_to_nodes) + + +@patch("requests.post") +def test_graphql_error( + mock_post: Mock, + client: GitHubGraphqlClient, + sample_query: str, +) -> None: + """Test raising a GraphqlError if errors are present in the response.""" + # Arrange - Mock the response from requests.post() to include an error + mock_post.return_value = Mock( + status_code=200, + json=Mock(return_value={"errors": [{"message": "Test GitHub error"}]}), + ) + + # Arrange - Set the variables and path to nodes in the response body + variables: dict[str, str] = {"login": "octocat"} + path_to_nodes: list[str] = ["user", "repositories"] + + # Assert - Check that GraphqlError was raised + with pytest.raises(GraphqlError) as excinfo: + client.execute_paginated_query(sample_query, variables, path_to_nodes) + + # Assert - Check that it contains the error message from the mock response + assert "Test GitHub error" in str(excinfo.value) diff --git a/analytics/tests/integrations/github/test_validation.py b/analytics/tests/integrations/github/test_validation.py new file mode 100644 index 000000000..7f9dadee7 
--- /dev/null +++ b/analytics/tests/integrations/github/test_validation.py @@ -0,0 +1,260 @@ +"""Test the validation schemas for GitHub API responses.""" + +import pytest # noqa: I001 +from pydantic import ValidationError +from analytics.integrations.github.validation import ( + IssueContent, + IterationValue, + NumberValue, + ProjectItem, + SingleSelectValue, +) + +# ############################################# +# Test data constants +# ############################################# + +VALID_ISSUE_CONTENT = { + "title": "Test Issue", + "url": "https://github.com/test/repo/issues/1", + "closed": True, + "createdAt": "2024-01-01T00:00:00Z", + "closedAt": "2024-01-02T00:00:00Z", + "parent": { + "title": "Test Parent", + "url": "https://github.com/test/repo/issues/2", + }, + "type": { + "name": "Bug", + }, +} + +VALID_ITERATION_VALUE = { + "iterationId": "123", + "title": "Sprint 1", + "startDate": "2024-01-01", + "duration": 14, +} + +VALID_SINGLE_SELECT = { + "optionId": "456", + "name": "In Progress", +} + + +# ############################################# +# Project items tests +# ############################################# + + +class TestProjectItems: + """Test cases for project item schemas.""" + + def test_fully_populated(self) -> None: + """Test validating a fully populated project item.""" + data = { + "content": VALID_ISSUE_CONTENT, + "status": VALID_SINGLE_SELECT, + "sprint": VALID_ITERATION_VALUE, + "points": {"number": 5}, + "quad": VALID_ITERATION_VALUE, + "pillar": VALID_SINGLE_SELECT, + } + item = ProjectItem.model_validate(data) + # Check issue content + assert item.content.title == "Test Issue" + assert item.status.name == "In Progress" + # Check sprint fields + assert item.sprint.title == "Sprint 1" + assert item.points.number == 5 + # Check roadmap fields + assert item.quad.title == "Sprint 1" + assert item.pillar.name == "In Progress" + + def test_minimal(self) -> None: + """Test validating a project item with only required fields.""" + data = { + "content": VALID_ISSUE_CONTENT, + } + item = ProjectItem.model_validate(data) + # Check status defaults + assert item.status.name is None + assert item.status.option_id is None + # Check sprint defaults + assert item.sprint.title is None + assert item.sprint.iteration_id is None + assert item.points.number is None + # Check roadmap defaults + assert item.quad.title is None + assert item.quad.iteration_id is None + assert item.pillar.name is None + assert item.pillar.option_id is None + + def test_with_nulls(self) -> None: + """Test validating a project item with null values explicitly set.""" + data = { + "content": { + "title": "Test Issue", + "url": "https://github.com/test/repo/issues/1", + "closed": True, + "createdAt": "2024-01-01T00:00:00Z", + "closedAt": "2024-01-02T00:00:00Z", + "type": None, + "parent": None, + }, + "status": None, + "sprint": None, + "points": None, + "quad": None, + "pillar": None, + } + item = ProjectItem.model_validate(data) + # Check status defaults + assert item.status.name is None + assert item.status.option_id is None + # Check sprint defaults + assert item.sprint.title is None + assert item.sprint.iteration_id is None + assert item.points.number is None + # Check roadmap defaults + assert item.quad.title is None + assert item.quad.iteration_id is None + assert item.pillar.name is None + assert item.pillar.option_id is None + + +# ############################################# +# Issue content tests +# ############################################# + + +class TestIssueContent: + """Test cases 
for issue content schemas.""" + + def test_fully_populated(self) -> None: + """Test validating a fully populated issue content.""" + issue = IssueContent.model_validate(VALID_ISSUE_CONTENT) + assert issue.title == "Test Issue" + assert issue.url == "https://github.com/test/repo/issues/1" + assert issue.closed is True + assert issue.parent.title == "Test Parent" + assert issue.issue_type.name == "Bug" + + def test_minimal(self) -> None: + """Test validating an issue content with only required fields.""" + minimal_content = { + "title": "Test Issue", + "url": "https://github.com/test/repo/issues/1", + "closed": False, + "createdAt": "2024-01-01T00:00:00Z", + } + issue = IssueContent.model_validate(minimal_content) + assert issue.closed_at is None + assert issue.parent.title is None + assert issue.parent.url is None + assert issue.issue_type.name is None + + def test_with_nulls(self) -> None: + """Test validating an issue content with null values.""" + data = { + "title": "Test Issue", + "url": "https://github.com/test/repo/issues/1", + "closed": True, + "createdAt": "2024-01-01T00:00:00Z", + "closedAt": None, + "type": None, + "parent": None, + } + issue = IssueContent.model_validate(data) + assert issue.title == "Test Issue" + assert issue.closed_at is None + assert issue.issue_type.name is None + assert issue.parent.title is None + assert issue.parent.url is None + + def test_missing_title_raises_error(self) -> None: + """Test that validation fails when title is missing.""" + with pytest.raises(ValidationError): + IssueContent.model_validate( + { + "url": "https://github.com/test/repo/issues/1", + "closed": True, + "createdAt": "2024-01-01T00:00:00Z", + }, + ) + + def test_missing_url_raises_error(self) -> None: + """Test that validation fails when url is missing.""" + with pytest.raises(ValidationError): + IssueContent.model_validate( + { + "title": "Test Issue", + "closed": True, + "createdAt": "2024-01-01T00:00:00Z", + }, + ) + + def test_missing_closed_raises_error(self) -> None: + """Test that validation fails when closed is missing.""" + with pytest.raises(ValidationError): + IssueContent.model_validate( + { + "title": "Test Issue", + "url": "https://github.com/test/repo/issues/1", + "createdAt": "2024-01-01T00:00:00Z", + }, + ) + + def test_missing_created_at_raises_error(self) -> None: + """Test that validation fails when createdAt is missing.""" + with pytest.raises(ValidationError): + IssueContent.model_validate( + { + "title": "Test Issue", + "url": "https://github.com/test/repo/issues/1", + "closed": True, + }, + ) + + +# ############################################# +# Project field tests +# ############################################# + + +class TestProjectFields: + """Test cases for project field schemas.""" + + def test_iteration_value_fully_populated(self) -> None: + """Test validating a fully populated iteration value.""" + iteration = IterationValue.model_validate(VALID_ITERATION_VALUE) + assert iteration.iteration_id == "123" + assert iteration.title == "Sprint 1" + assert iteration.start_date == "2024-01-01" + assert iteration.duration == 14 + assert iteration.end_date == "2024-01-15" + + def test_iteration_value_with_empty_data(self) -> None: + """Test validating an iteration value with empty data.""" + iteration = IterationValue.model_validate({}) + assert iteration.iteration_id is None + assert iteration.title is None + assert iteration.start_date is None + assert iteration.duration is None + assert iteration.end_date is None + + def 
test_single_select_value_fully_populated(self) -> None: + """Test validating a fully populated single select value.""" + select = SingleSelectValue.model_validate(VALID_SINGLE_SELECT) + assert select.option_id == "456" + assert select.name == "In Progress" + + def test_number_value_with_number(self) -> None: + """Test validating number value with a number.""" + with_value = NumberValue.model_validate({"number": 5}) + assert with_value.number == 5 + + def test_number_value_with_empty_data(self) -> None: + """Test validating number value with empty data.""" + without_value = NumberValue.model_validate({}) + assert without_value.number is None diff --git a/analytics/tests/logs/test_logging.py b/analytics/tests/logs/test_logging.py index cebc20e86..ecebc4b34 100644 --- a/analytics/tests/logs/test_logging.py +++ b/analytics/tests/logs/test_logging.py @@ -40,7 +40,7 @@ def test_init( records = caplog.records assert len(records) == 2 assert re.match( - r"^start test_logging: \w+ [0-9.]+ \w+, hostname \S+, pid \d+, user \d+\(\w+\)$", + r"^start test_logging: \w+ [0-9.]+ \w+, hostname \S+, pid \d+, user \d+\([\w\.]+\)", records[0].message, ) assert re.match(r"^invoked as:", records[1].message)
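Taken together, the new pieces compose like this: the ETL builds one GitHubGraphqlClient, each export function runs a paginated query, ProjectItem validates every node, transform_project_data flattens it, and the result is written as JSON. A sketch of exercising export_sprint_data without hitting the API, in the spirit of the tests above (field names, issue URLs, and the output path are made up; assumes the packaged getSprintData.graphql is readable and the output directory already exists, since the new code opens the file directly):

    from unittest.mock import Mock

    from analytics.integrations.github.main import export_sprint_data

    mock_client = Mock()
    mock_client.execute_paginated_query.return_value = [
        {
            "content": {
                "title": "Task B",
                "url": "https://github.com/HHS/simpler-grants-gov/issues/2",
                "closed": False,
                "createdAt": "2024-01-01T00:00:00Z",
                "type": {"name": "Task"},
            },
            "status": {"name": "In Progress"},
            "sprint": {"iterationId": "abc", "title": "Sprint 1",
                       "startDate": "2024-01-01", "duration": 14},
            "points": {"number": 3},
        },
    ]

    export_sprint_data(
        client=mock_client,
        owner="HHS",
        project=13,
        sprint_field="Sprint",
        points_field="Points",
        output_file="sprint-data.json",
    )
    # sprint-data.json now holds one flattened row, with sprint_end computed as "2024-01-15"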