diff --git a/QUICK_START.md b/QUICK_START.md index a4a8744..15026e7 100644 --- a/QUICK_START.md +++ b/QUICK_START.md @@ -43,4 +43,4 @@ To wrap your own objective into a forever-loop driver, copy `example_scripts/cd_ - `sudo -n` prompts for password → sudoers drop-in missing. Re-run `sudo ./install.sh`. - `Invalid version format` → version string does not match `X.Y.Z` or `X.Y.Z-rcN`. - 404 from curl → version does not exist on `download.picknik.ai`. -- Service fails to start → check `journalctl -u moveit-pro@$USER.service -e`. If `SLACK_WEBHOOK_URL` is set in `/etc/default/moveit-pro`, `notify-crash.py` will also post to Slack. +- Service fails to start → check `journalctl -u moveit-pro@$USER.service -e`. If `SLACK_WEBHOOK_URL` / `MOVEIT_CD_GITHUB_TOKEN` are set in `/etc/default/moveit-pro`, `notify-crash.py` also posts to Slack and opens a GitHub issue. diff --git a/README.md b/README.md index 3ae90df..72d0362 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,10 @@ The full setup walkthrough lives at [Set Up CI/CD](https://docs.picknik.ai/how_t - `install.sh` — one-shot installer. Copies the wrapper, systemd unit, and sudoers drop-in into place. Run on each target machine. - `bin/install-moveit-pro` — root-owned installer wrapper. Validates the version string against a strict regex, downloads the `.deb` to a root-owned cache, installs it, and deletes the file. - `bin/moveit-pro@.service` — systemd template unit. Runs `moveit_pro run --no-browser` as `%i`. Restarts on failure. Reads optional environment from `/etc/default/moveit-pro`. -- `bin/notify-crash.py` — posts to Slack via `ExecStopPost` when the service exits non-zero. Reads `SLACK_WEBHOOK_URL` from the environment; if unset, the notification is skipped. +- `bin/notify-crash.py` — posts to Slack and opens/updates a GitHub issue via `ExecStopPost` when the service exits non-zero. 
Reads `SLACK_WEBHOOK_URL` and `MOVEIT_CD_GITHUB_TOKEN` from the environment; each notification is skipped if its variable is unset. +- `bin/notify_lib.py` — shared notification helpers (`slack_post`, `github_issue`) used by both `notify-crash.py` and `cd_objective_lib.py`. Installed to `/usr/lib/moveit-pro-scripts/`. `github_issue` deduplicates by exact title among the open issues carrying a label: a repeated failure bumps an occurrence counter and appends a row instead of opening a new issue. - `bin/ci-runner.sudoers.template` — sudoers drop-in. `install.sh` substitutes `__CI_USER__` with the local account and installs at `/etc/sudoers.d/<user>-ci`. Grants NOPASSWD on the installer and the user's own systemd unit only. -- `example_scripts/cd_objective_lib.py` — helper library for sending an Objective goal via rosbridge, used by the example scripts. +- `example_scripts/cd_objective_lib.py` — helper library for sending an Objective goal via rosbridge, used by the example scripts. On objective timeout or rosbridge failure it posts to Slack and opens/updates a GitHub issue (both via `notify_lib.py`), then stops the systemd unit. - `example_scripts/3-waypoint-pick-and-place.py`, `example_scripts/ml-segment-image.py`, `example_scripts/move-all-boxes.py` — example smoke-test scripts that drive an Objective on `localhost:3201` rosbridge. ## Install @@ -27,6 +28,7 @@ sudo ./install.sh This installs: - The objective scripts to `/usr/bin/`. +- `cd_objective_lib.py` and `notify_lib.py` to `/usr/lib/moveit-pro-scripts/`. - `notify-crash.py` to `/usr/bin/`. - `install-moveit-pro` to `/usr/local/sbin/` (root-owned, `0755`). - `/var/cache/moveit-pro/` as a root-owned download cache. @@ -60,17 +62,28 @@ WORKSPACE_PIN_TO_RELEASE=false `WORKSPACE_REPO` is regex-restricted to `https://github.com/<owner>/<repo>.git` or `git@github.com:<owner>/<repo>.git`. For the SSH form, the CI user needs a deploy key with read-only access. 
-### Optional: Slack crash notifications +### Optional: failure notifications (Slack + GitHub issues) -Set `SLACK_WEBHOOK_URL` in `/etc/default/moveit-pro` (root-owned). The systemd unit reads this file via `EnvironmentFile=`, so `notify-crash.py` and `cd_objective_lib.py` will post crash and CD-failure events to the webhook: +Both notifiers read their config from `/etc/default/moveit-pro` (root-owned). The systemd unit loads this file via `EnvironmentFile=`, so `notify-crash.py` and `cd_objective_lib.py` pick it up for crash and CD-failure events. Each notifier is independent: set only the variables you want. ```bash sudo install -m 0640 -o root -g root /dev/stdin /etc/default/moveit-pro <<'EOF' +# Slack incoming webhook. Unset -> Slack skipped. SLACK_WEBHOOK_URL=https://hooks.slack.com/services/XXX/YYY/ZZZ + +# GitHub issue on failure. Unset -> issue creation skipped. +MOVEIT_CD_GITHUB_TOKEN=github_pat_xxx +# Optional overrides (defaults shown): +# MOVEIT_CD_ISSUE_REPO=PickNikRobotics/moveit_pro +# MOVEIT_CD_ISSUE_LABEL=qa-deployment-failure EOF ``` -If the variable is unset, notifications are silently skipped. +If a variable is unset, that notification is skipped (a one-line note is written to stderr, nothing else happens) — this is how non-QA machines opt out of issue creation. + +`MOVEIT_CD_GITHUB_TOKEN` must be a **fine-grained PAT scoped to the issue repo with `Issues: Read and write` and nothing else** — the narrowest credential that can file an issue. Do not grant `Contents` or any other scope: a QA machine is a higher-exposure host, and the token only needs to open, edit, and comment on issues. The `qa-deployment-failure` label must already exist on the repo (the API does not create labels on demand). + +Repeated failures of the same kind on the same machine deduplicate to a single open issue (matched by exact title among the open issues carrying the label; once that issue is closed, the next failure opens a new one) — each recurrence bumps an occurrence counter, appends a table row with the version/machine/time/reason, and adds a comment for visibility. 
## Verify the install diff --git a/bin/notify-crash.py b/bin/notify-crash.py index d260cc1..124d291 100755 --- a/bin/notify-crash.py +++ b/bin/notify-crash.py @@ -1,27 +1,56 @@ #!/usr/bin/env python3 -import json -import os import socket import subprocess import sys -import urllib.request -WEBHOOK_URL = os.environ.get("SLACK_WEBHOOK_URL", "") - - -def get_payload_from_systemd(unit): - # Check if the service exited with a failure. - result = subprocess.run( - [ - "systemctl", - "show", - unit, - "--property=ExecMainStatus,ActiveEnterTimestamp,ActiveExitTimestamp", - ], - capture_output=True, - text=True, - ) +sys.path.insert(0, "/usr/lib/moveit-pro-scripts") + +try: + from notify_lib import build_payload, github_issue, slack_post +except ImportError as exc: + # ExecStopPost must never fail the service stop because a helper is missing. + print(f"notify_lib unavailable, notifications disabled: {exc}", file=sys.stderr) + + def build_payload(process_time, date=None): + return {"process_time": process_time} + + def slack_post(payload, dry_run=False): + pass + + def github_issue(title, reason, version=None, dry_run=False): + pass + + +# Bound the systemctl query so a hung call can't stall the service-stop path. +SYSTEMCTL_TIMEOUT_S = 10 + + +def get_crash_info(unit): + """Return (payload, reason) for a non-zero service exit, or (None, None). + + `payload` feeds Slack; `reason` is the human summary recorded on the + GitHub issue. A clean exit (status 0) returns (None, None) so a normal + `systemctl stop` does not notify. + """ + try: + result = subprocess.run( + [ + "systemctl", + "show", + unit, + "--property=ExecMainStatus,ActiveEnterTimestamp,ActiveExitTimestamp", + ], + capture_output=True, + text=True, + check=False, + timeout=SYSTEMCTL_TIMEOUT_S, + ) + except (OSError, subprocess.TimeoutExpired) as exc: + # systemctl missing (container/test VM) or hung. Never block or raise + # on the ExecStopPost path; just skip notification. 
+ print(f"systemctl unavailable, skipping crash notify: {exc}", file=sys.stderr) + return None, None props = {} for line in result.stdout.strip().splitlines(): @@ -30,7 +59,7 @@ def get_payload_from_systemd(unit): exit_code = props.get("ExecMainStatus", "0") if exit_code == "0": - return None + return None, None crash_time = props.get("ActiveExitTimestamp", "unknown") start_time = props.get("ActiveEnterTimestamp", "unknown") @@ -48,55 +77,33 @@ def get_payload_from_systemd(unit): else: process_time = "unknown" - return { - "date": crash_time, - "laptop_name": socket.gethostname(), - "process_time": process_time, - } - - -def get_dummy_payload(): - return { - "date": "Sun 2026-04-13 14:19:47 MDT", - "laptop_name": socket.gethostname(), - "process_time": "2:34:12", - } - - -def send(payload, dry_run=False): - data = json.dumps(payload).encode() - - if dry_run: - print(f"POST {WEBHOOK_URL or ''}") - print(json.dumps(payload, indent=2)) - return - - if not WEBHOOK_URL: - print("SLACK_WEBHOOK_URL not set; skipping notification", file=sys.stderr) - return - - req = urllib.request.Request( - WEBHOOK_URL, - data=data, - headers={"Content-Type": "application/json"}, - ) - urllib.request.urlopen(req) + payload = build_payload(process_time, date=crash_time) + reason = f"Service {unit} exited with status {exit_code} (uptime {process_time})" + return payload, reason def main(): dry_run = "--dry-run" in sys.argv send_test = "--send" in sys.argv args = [a for a in sys.argv[1:] if a not in ("--dry-run", "--send")] + unit = args[0] if args else "moveit-pro@unknown" + title = f"QA deployment crash: {socket.gethostname()}" if dry_run or send_test: - payload = get_dummy_payload() - send(payload, dry_run=not send_test) - else: - unit = args[0] if args else "moveit-pro@unknown" - payload = get_payload_from_systemd(unit) - if payload is None: - return - send(payload) + payload = build_payload("2:34:12", date="Sun 2026-04-13 14:19:47 MDT") + reason = f"Test crash notification for {unit}" 
+ slack_post(payload, dry_run=not send_test) + # Distinct title so a --send test never dedupes into the real crash + # issue stream. + test_title = f"QA deployment crash test: {socket.gethostname()}" + github_issue(test_title, reason, dry_run=not send_test) + return + + payload, reason = get_crash_info(unit) + if payload is None: + return + slack_post(payload) + github_issue(title, reason) if __name__ == "__main__": diff --git a/bin/notify_lib.py b/bin/notify_lib.py new file mode 100644 index 0000000..c1b8d5c --- /dev/null +++ b/bin/notify_lib.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +"""Shared failure-notification helpers for QA deployment hardware. + +Two call sites fire on a QA deployment failure: + + * notify-crash.py -- systemd ExecStopPost on non-zero service exit. + * cd_objective_lib.py -- objective-runner timeout / rosbridge failure. + +Both post to Slack (SLACK_WEBHOOK_URL) and, when a token is configured, open +or update a deduplicated GitHub issue on the MoveIt Pro repo. Every function +here is best-effort: a notification failure must never propagate and break the +service-stop path that called it. + +Environment (read from /etc/default/moveit-pro via the systemd unit): + SLACK_WEBHOOK_URL -- Slack incoming webhook. Unset -> Slack skipped. + MOVEIT_CD_GITHUB_TOKEN -- fine-grained PAT, Issues:RW on the issue repo. + Unset -> GitHub issue creation skipped (this is + how non-QA machines opt out). + MOVEIT_CD_ISSUE_REPO -- "owner/repo" for issues. Default below. + MOVEIT_CD_ISSUE_LABEL -- dedup label. Default below. 
+""" + +import json +import os +import re +import socket +import subprocess +import sys +import time +import urllib.error +import urllib.parse +import urllib.request + +WEBHOOK_URL_ENV = "SLACK_WEBHOOK_URL" +GITHUB_TOKEN_ENV = "MOVEIT_CD_GITHUB_TOKEN" +ISSUE_REPO_ENV = "MOVEIT_CD_ISSUE_REPO" +ISSUE_LABEL_ENV = "MOVEIT_CD_ISSUE_LABEL" + +DEFAULT_ISSUE_REPO = "PickNikRobotics/moveit_pro" +DEFAULT_ISSUE_LABEL = "qa-deployment-failure" + +# Debian package name installed by install-moveit-pro. +PACKAGE_NAME = "moveit-pro" + +GITHUB_API = "https://api.github.com" +# Kept well under systemd's default TimeoutStopSec (90s): this runs on the +# service-stop path, and a slow API must not eat the whole stop budget. +HTTP_TIMEOUT_S = 10 +# Safety ceiling on issue-list pagination so a malformed Link header can never +# loop forever on the stop path. 20 pages * 100 = far beyond any real backlog. +MAX_ISSUE_PAGES = 20 + +# owner/repo slug, matching GitHub's own naming constraint. Guards against an +# env value like "../../user" steering requests to other API endpoints. +_REPO_RE = re.compile(r"^[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+$") + + +def _env(name, default=""): + return os.environ.get(name, default) + + +def installed_version(): + """Return the installed MoveIt Pro package version, or "unknown". + + Queried from dpkg so the issue records exactly which build failed. + """ + try: + result = subprocess.run( + ["dpkg-query", "-W", "-f=${Version}", PACKAGE_NAME], + capture_output=True, + text=True, + check=False, + ) + version = result.stdout.strip() + return version or "unknown" + except (OSError, subprocess.SubprocessError): + return "unknown" + + +def build_payload(process_time, date=None): + """Build the Slack payload shared by both call sites. + + `process_time` carries either an uptime duration (crash) or a failure + reason (objective runner). `date` defaults to now; notify-crash passes the + systemd crash timestamp instead. 
+ """ + return { + "date": time.strftime("%a %Y-%m-%d %H:%M:%S %Z") if date is None else date, + "laptop_name": socket.gethostname(), + "process_time": process_time, + } + + +def slack_post(payload, dry_run=False): + """POST `payload` to the Slack webhook. Best-effort; never raises.""" + webhook = _env(WEBHOOK_URL_ENV) + + if dry_run: + print(f"POST {webhook or ''}") + print(json.dumps(payload, indent=2)) + return + + if not webhook: + print( + f"{WEBHOOK_URL_ENV} not set; skipping Slack notification", file=sys.stderr + ) + return + + # Broad catch on purpose: this runs on the service-stop path and must never + # raise. json.dumps (TypeError) and urlopen (OSError) are both in scope. + try: + req = urllib.request.Request( + webhook, + data=json.dumps(payload).encode(), + headers={"Content-Type": "application/json"}, + ) + urllib.request.urlopen(req, timeout=HTTP_TIMEOUT_S) + print("Slack notified") + except Exception as exc: + print(f"Slack notify failed: {exc}", file=sys.stderr) + + +def _gh_request(method, url, token, body=None): + """Issue an authenticated GitHub REST request; return (parsed JSON, Link + header string), or (None, "") on failure. The Link header is read inside + the response context so callers never touch a closed response. Never + raises. (urllib.error.URLError is an OSError subclass, so OSError covers + network errors, HTTP errors, and timeouts; ValueError covers JSON decode.) 
+ """ + data = json.dumps(body).encode() if body is not None else None + req = urllib.request.Request(url, data=data, method=method) + req.add_header("Authorization", f"Bearer {token}") + req.add_header("Accept", "application/vnd.github+json") + req.add_header("X-GitHub-Api-Version", "2022-11-28") + req.add_header("User-Agent", "moveit-pro-hardware-scripts") + if data is not None: + req.add_header("Content-Type", "application/json") + try: + with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT_S) as resp: + raw = resp.read().decode() + link = resp.headers.get("Link", "") + parsed = json.loads(raw) if raw else None + return parsed, link + except (OSError, ValueError) as exc: + print(f"GitHub API {method} {url} failed: {exc}", file=sys.stderr) + return None, "" + + +def _gh_list_open_issues(repo, label, token): + """Return all open issues carrying `label`, following pagination.""" + issues = [] + url = ( + f"{GITHUB_API}/repos/{repo}/issues" + f"?state=open&labels={urllib.parse.quote(label)}&per_page=100" + ) + pages = 0 + while url: + pages += 1 + if pages > MAX_ISSUE_PAGES: + print( + f"Issue pagination exceeded {MAX_ISSUE_PAGES} pages; stopping", + file=sys.stderr, + ) + break + parsed, link = _gh_request("GET", url, token) + if parsed is None: + break + # The issues endpoint also returns PRs; they carry pull_request and + # never our label, but filter defensively. + issues.extend(i for i in parsed if "pull_request" not in i) + url = _next_link(link) + return issues + + +_NEXT_LINK_RE = re.compile(r'<([^>]+)>\s*;\s*[^,]*rel="next"') + + +def _next_link(link_header): + """Extract the rel="next" URL from an RFC 5988 Link header, or None. + + Matches the bracketed URL directly rather than splitting on "," so a comma + inside a URL cannot corrupt the parse and silently truncate pagination. 
+ """ + match = _NEXT_LINK_RE.search(link_header) + return match.group(1) if match else None + + +def _sanitize_cell(text): + """Make `text` safe for a one-line Markdown cell/value. + + Strips newlines and pipes (table-breaking), neutralizes backticks (code + spans), and escapes brackets so a crafted reason/hostname cannot render as + a Markdown link. + """ + return ( + text.replace("\n", " ") + .replace("|", "/") + .replace("`", "'") + .replace("[", "\\[") + .replace("]", "\\]") + .strip() + ) + + +def github_issue(title, reason, version=None, dry_run=False): + """Open or update a deduplicated GitHub issue for a QA deployment failure. + + Dedup is by exact title within the configured label. An existing open issue + has its occurrence counter bumped, a new table row appended, and a comment + posted for notification visibility; otherwise a fresh issue is created. + + Skipped silently when no token is configured (non-QA machines). + """ + token = _env(GITHUB_TOKEN_ENV) + repo = _env(ISSUE_REPO_ENV, DEFAULT_ISSUE_REPO) + label = _env(ISSUE_LABEL_ENV, DEFAULT_ISSUE_LABEL) + version = version or installed_version() + hostname = socket.gethostname() + when = time.strftime("%a %Y-%m-%d %H:%M:%S %Z") + reason_cell = _sanitize_cell(reason) + + if dry_run: + print(f"GitHub issue on {repo} (label {label}): {title}") + print(f" version={version} host={hostname} reason={reason_cell}") + return + + if not token: + print( + f"{GITHUB_TOKEN_ENV} not set; skipping GitHub issue: {title}", + file=sys.stderr, + ) + return + + if not _REPO_RE.match(repo): + print( + f"Invalid {ISSUE_REPO_ENV} '{repo}'; skipping GitHub issue: {title}", + file=sys.stderr, + ) + return + + # Belt to _gh_request's suspenders: enforce the never-raise contract at the + # function boundary so a regex / int / response-shape surprise on the + # service-stop path cannot propagate. 
+ try: + _do_github_issue( + title, reason_cell, version, hostname, when, repo, label, token + ) + except Exception as exc: + print(f"GitHub issue creation failed: {exc}", file=sys.stderr) + + +def _do_github_issue(title, reason_cell, version, hostname, when, repo, label, token): + """Look up the deduplicated issue and create or update it. + + May raise; the public github_issue() wrapper contains it. + """ + existing = None + for issue in _gh_list_open_issues(repo, label, token): + if issue.get("title") == title: + existing = issue + break + + # Build the row with a plain f-string (no .format) so a "{" or "}" in any + # field can never raise KeyError / corrupt the row. + def _row(n): + return f"| {n} | `{version}` | `{hostname}` | {when} | {reason_cell} |" + + if existing is None: + occurrence = 1 + body = ( + f"A QA deployment failed on `{hostname}`.\n\n" + f"**Reason:** {reason_cell}\n" + f"**Occurrences:** {occurrence}\n\n" + "| # | Version | Machine | Time | Reason |\n" + "|---|---------|---------|------|--------|\n" + f"{_row(occurrence)}\n" + ) + created, _ = _gh_request( + "POST", + f"{GITHUB_API}/repos/{repo}/issues", + token, + {"title": title, "body": body, "labels": [label]}, + ) + if created is not None: + print(f"Opened GitHub issue #{created.get('number')}: {title}") + return + + number = existing.get("number") + body = existing.get("body") or "" + count_match = re.search(r"\*\*Occurrences:\*\*\s*(\d+)", body) + if count_match: + occurrence = int(count_match.group(1)) + 1 + body = re.sub( + r"\*\*Occurrences:\*\*\s*\d+", + f"**Occurrences:** {occurrence}", + body, + count=1, + ) + else: + # Body lost its counter (e.g. hand-edited). Re-seed it so the count + # keeps tracking instead of freezing on every later occurrence. 
+ occurrence = 2 + body = f"**Occurrences:** {occurrence}\n\n" + body + body = body.rstrip("\n") + "\n" + _row(occurrence) + "\n" + + _gh_request( + "PATCH", + f"{GITHUB_API}/repos/{repo}/issues/{number}", + token, + {"body": body}, + ) + _gh_request( + "POST", + f"{GITHUB_API}/repos/{repo}/issues/{number}/comments", + token, + { + "body": ( + f"QA deployment failed again (occurrence #{occurrence}).\n\n" + f"**Version:** `{version}`\n" + f"**Machine:** `{hostname}`\n" + f"**Time:** {when}\n" + f"**Reason:** {reason_cell}" + ) + }, + ) + print(f"Updated GitHub issue #{number} (occurrence #{occurrence}): {title}") diff --git a/example_scripts/cd_objective_lib.py b/example_scripts/cd_objective_lib.py index 1d6097e..777fa59 100644 --- a/example_scripts/cd_objective_lib.py +++ b/example_scripts/cd_objective_lib.py @@ -2,25 +2,45 @@ """Shared CD objective runner. Connects to rosbridge, waits up to 1 hour for the /do_objective action server, -sends the objective goal, then exits. On timeout: stops the moveit-pro service -and posts a Slack failure notification using the same webhook as notify-crash. +sends the objective goal, then exits. On timeout: stops the moveit-pro service, +posts a Slack failure notification using the same webhook as notify-crash, and +opens or updates a deduplicated GitHub issue (see notify_lib). Intended to be launched detached from CI over SSH; the calling shell can exit immediately and the script will continue running on the host. """ -import json import os import socket import subprocess import sys import time -import urllib.request from threading import Event import roslibpy from roslibpy import ActionClient +# Shared notification helpers live alongside this module once installed. 
+sys.path.insert(0, "/usr/lib/moveit-pro-scripts") + +try: + from notify_lib import build_payload, github_issue, slack_post +except ImportError as exc: + # A partial install / image skew must not stop the objective runner from + # running and stopping the service — notifications are auxiliary. Degrade + # to no-ops, loudly. + print(f"notify_lib unavailable, notifications disabled: {exc}", file=sys.stderr) + + def build_payload(process_time, date=None): + return {"process_time": process_time} + + def slack_post(payload, dry_run=False): + pass + + def github_issue(title, reason, version=None, dry_run=False): + pass + + ROSBRIDGE_HOST = "localhost" ROSBRIDGE_PORT = 3201 @@ -33,28 +53,18 @@ ROSAPI_CALL_TIMEOUT_S = 10 SEND_GOAL_DRAIN_S = 5 -WEBHOOK_URL = os.environ.get("SLACK_WEBHOOK_URL", "") +# Objective context for the GitHub issue title, set once the runner knows which +# objective(s) it is driving. None until then (e.g. rosbridge never came up). +_current_objective = None + + +def _failure_title() -> str: + suffix = f" — {_current_objective}" if _current_objective else "" + return f"QA deployment failure: {socket.gethostname()}{suffix}" def _slack_post(message: str) -> None: - if not WEBHOOK_URL: - print(f"SLACK_WEBHOOK_URL not set; skipping notify: {message}", file=sys.stderr) - return - payload = { - "date": time.strftime("%a %Y-%m-%d %H:%M:%S %Z"), - "laptop_name": socket.gethostname(), - "process_time": message, - } - try: - req = urllib.request.Request( - WEBHOOK_URL, - data=json.dumps(payload).encode(), - headers={"Content-Type": "application/json"}, - ) - urllib.request.urlopen(req, timeout=30) - print(f"Slack notified: {message}") - except Exception as exc: - print(f"Slack notify failed: {exc}", file=sys.stderr) + slack_post(build_payload(message)) def _stop_service() -> None: @@ -73,6 +83,7 @@ def _stop_service() -> None: def _fail(reason: str): print(reason, file=sys.stderr) _slack_post(reason) + github_issue(_failure_title(), reason) _stop_service() 
sys.exit(1) @@ -151,6 +162,8 @@ def _wait_for_action_server(client: roslibpy.Ros, deadline: float) -> None: def run_objective(objective_name: str) -> None: + global _current_objective + _current_objective = objective_name deadline = time.monotonic() + TOTAL_TIMEOUT_S client = roslibpy.Ros(host=ROSBRIDGE_HOST, port=ROSBRIDGE_PORT) @@ -187,6 +200,8 @@ def _send_and_wait( crashes still get caught by the systemd notify-crash hook. We only fail() on rosbridge errors or timeouts. """ + global _current_objective + _current_objective = objective_name done = Event() def _on_result(_result): @@ -226,11 +241,19 @@ def run_objectives_forever( Used by customer-config CD machines whose BT XML does not self-loop (Clean-Botix populate_mission_scene + test_change_tool, Auto Wash Test Run Job). Stuck objectives or rosbridge errors call _fail() which - Slacks and stops the systemd unit; healthy iterations log and continue. + Slacks, files a GitHub issue, and stops the systemd unit; healthy + iterations log and continue. """ if not objectives: _fail("run_objectives_forever called with empty objectives list") + # Seed the issue-title context with the whole list. _send_and_wait narrows + # it to the specific objective once we start sending; this initial value is + # only what a pre-send failure (rosbridge / action server never came up) + # reports. + global _current_objective + _current_objective = ", ".join(objectives) + deadline = time.monotonic() + TOTAL_TIMEOUT_S client = roslibpy.Ros(host=ROSBRIDGE_HOST, port=ROSBRIDGE_PORT) diff --git a/install.sh b/install.sh index a31d57a..9d240de 100755 --- a/install.sh +++ b/install.sh @@ -42,9 +42,10 @@ if ! 
python3 -c "import roslibpy" 2>/dev/null; then python3 -m pip install "${PIP_ARGS[@]}" roslibpy fi -echo "Installing CD objective shared library to /usr/lib/moveit-pro-scripts/" +echo "Installing shared libraries to /usr/lib/moveit-pro-scripts/" sudo install -d -m 0755 -o root -g root /usr/lib/moveit-pro-scripts sudo install -m 644 "$SCRIPT_DIR/example_scripts/cd_objective_lib.py" /usr/lib/moveit-pro-scripts/cd_objective_lib.py +sudo install -m 644 "$SCRIPT_DIR/bin/notify_lib.py" /usr/lib/moveit-pro-scripts/notify_lib.py echo "Installing objective scripts to /usr/bin/" sudo install -m 755 "$SCRIPT_DIR/example_scripts/3-waypoint-pick-and-place.py" /usr/bin/3-waypoint-pick-and-place.py diff --git a/test/test_notify_lib.py b/test/test_notify_lib.py new file mode 100644 index 0000000..7e651a3 --- /dev/null +++ b/test/test_notify_lib.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +"""Tests for notify_lib pure functions and GitHub-issue dedup logic. + +Network-free: GitHub REST is stubbed so the dedup branch (create vs. bump vs. +re-seed a hand-edited body) is exercised deterministically. 
Run with: + + python3 -m unittest discover -s test +""" + +import os +import sys +import unittest +from pathlib import Path +from unittest import mock + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "bin")) + +import notify_lib # noqa: E402 + + +class TestNextLink(unittest.TestCase): + """_next_link parses the rel="next" target out of a Link header.""" + + def test_returns_next_url(self) -> None: + header = ( + '; rel="next", ' + '; rel="last"' + ) + self.assertEqual( + notify_lib._next_link(header), + "https://api.github.com/repos/o/r/issues?page=2", + ) + + def test_comma_inside_url_does_not_truncate(self) -> None: + header = '; rel="next"' + self.assertEqual( + notify_lib._next_link(header), + "https://api.github.com/x?labels=a,b&page=2", + ) + + def test_no_next_returns_none(self) -> None: + header = '; rel="last"' + self.assertIsNone(notify_lib._next_link(header)) + + def test_empty_returns_none(self) -> None: + self.assertIsNone(notify_lib._next_link("")) + + +class TestSanitizeCell(unittest.TestCase): + """_sanitize_cell keeps text on one Markdown table cell.""" + + def test_strips_table_breaking_chars(self) -> None: + self.assertEqual( + notify_lib._sanitize_cell("a|b\nc`d "), + "a/b c'd", + ) + + +class TestBuildPayload(unittest.TestCase): + """build_payload shapes the Slack body for both call sites.""" + + def test_explicit_date_preserved(self) -> None: + payload = notify_lib.build_payload("reason X", date="D") + self.assertEqual(payload["date"], "D") + self.assertEqual(payload["process_time"], "reason X") + self.assertIn("laptop_name", payload) + + def test_default_date_filled(self) -> None: + payload = notify_lib.build_payload("reason X") + self.assertTrue(payload["date"]) + + +@mock.patch.dict(os.environ, {"MOVEIT_CD_GITHUB_TOKEN": "t"}) +@mock.patch.object(notify_lib, "installed_version", return_value="9.9.9") +@mock.patch.object(notify_lib.socket, "gethostname", return_value="qa-host") +class 
TestGithubIssueDedup(unittest.TestCase): + """github_issue creates, bumps, or re-seeds depending on the existing body.""" + + def test_creates_new_issue_when_none_exists(self, _host, _ver) -> None: + with ( + mock.patch.object(notify_lib, "_gh_list_open_issues", return_value=[]), + mock.patch.object( + notify_lib, "_gh_request", return_value=({"number": 1}, "") + ) as req, + ): + notify_lib.github_issue("QA deployment crash: qa-host", "boom") + + method, url, _token, body = req.call_args.args + self.assertEqual(method, "POST") + self.assertTrue(url.endswith("/issues")) + self.assertEqual(body["labels"], ["qa-deployment-failure"]) + self.assertIn("**Occurrences:** 1", body["body"]) + self.assertIn("| 1 | `9.9.9` | `qa-host` |", body["body"]) + + def test_bumps_existing_counter(self, _host, _ver) -> None: + existing = { + "number": 7, + "title": "QA deployment crash: qa-host", + "body": "intro\n\n**Occurrences:** 3\n\n| # |\n|---|\n| 3 | x |\n", + } + with ( + mock.patch.object( + notify_lib, "_gh_list_open_issues", return_value=[existing] + ), + mock.patch.object( + notify_lib, "_gh_request", return_value=(None, "") + ) as req, + ): + notify_lib.github_issue("QA deployment crash: qa-host", "boom again") + + patch_call = req.call_args_list[0] + self.assertEqual(patch_call.args[0], "PATCH") + patched_body = patch_call.args[3]["body"] + self.assertIn("**Occurrences:** 4", patched_body) + self.assertNotIn("**Occurrences:** 3", patched_body) + self.assertIn("| 4 | `9.9.9` | `qa-host` |", patched_body) + # A visibility comment is posted on the second call. 
+ self.assertEqual(req.call_args_list[1].args[0], "POST") + self.assertIn("occurrence #4", req.call_args_list[1].args[3]["body"]) + + @mock.patch.dict(os.environ, {"MOVEIT_CD_ISSUE_REPO": "../../user"}) + def test_invalid_repo_is_rejected_before_any_request(self, _host, _ver) -> None: + with mock.patch.object(notify_lib, "_gh_request") as req: + notify_lib.github_issue("QA deployment crash: qa-host", "boom") + req.assert_not_called() + + def test_never_raises_when_lookup_explodes(self, _host, _ver) -> None: + # An unexpected error in the work path must be contained, not propagate + # onto the systemd service-stop path. + with mock.patch.object( + notify_lib, "_gh_list_open_issues", side_effect=RuntimeError("boom") + ): + notify_lib.github_issue("QA deployment crash: qa-host", "boom") + + def test_reseeds_counter_when_missing(self, _host, _ver) -> None: + existing = { + "number": 9, + "title": "QA deployment crash: qa-host", + "body": "someone deleted the counter line\n\n| # |\n|---|\n| 1 | x |\n", + } + with ( + mock.patch.object( + notify_lib, "_gh_list_open_issues", return_value=[existing] + ), + mock.patch.object( + notify_lib, "_gh_request", return_value=(None, "") + ) as req, + ): + notify_lib.github_issue("QA deployment crash: qa-host", "boom") + + patched_body = req.call_args_list[0].args[3]["body"] + self.assertIn("**Occurrences:** 2", patched_body) + + +if __name__ == "__main__": + unittest.main()