From 7e47bb0246c250545d1c42b7657bd40844ef23ff Mon Sep 17 00:00:00 2001 From: jennypng <63012604+JennyPng@users.noreply.github.com> Date: Mon, 15 Jun 2026 11:22:11 -0700 Subject: [PATCH 1/2] dump --- .../.github/workflows/analyze.yml | 155 +++++ .../.github/workflows/ci.yml | 32 + copilot-review-analyzer/DESIGN.md | 398 ++++++++++++ copilot-review-analyzer/EXECUTION_LOG.md | 117 ++++ .../IMPLEMENTATION_PLAN.md | 578 ++++++++++++++++++ copilot-review-analyzer/README.md | 167 +++++ copilot-review-analyzer/analyzer.db | Bin 0 -> 876544 bytes copilot-review-analyzer/analyzer/__init__.py | 3 + copilot-review-analyzer/analyzer/cli.py | 212 +++++++ copilot-review-analyzer/analyzer/config.py | 169 +++++ .../analyzer/github/__init__.py | 1 + .../analyzer/github/client.py | 385 ++++++++++++ .../analyzer/github/queries.py | 370 +++++++++++ .../analyzer/llm/__init__.py | 1 + .../analyzer/llm/client.py | 108 ++++ copilot-review-analyzer/analyzer/llm/judge.py | 188 ++++++ .../analyzer/llm/prompts.py | 94 +++ .../analyzer/llm/suggest.py | 174 ++++++ .../analyzer/pipeline/__init__.py | 1 + .../analyzer/pipeline/attribute.py | 169 +++++ .../analyzer/pipeline/classify.py | 199 ++++++ .../analyzer/pipeline/gaps.py | 93 +++ .../analyzer/pipeline/ingest.py | 103 ++++ .../analyzer/pipeline/metrics.py | 137 +++++ .../analyzer/pipeline/orchestrate.py | 178 ++++++ .../analyzer/pipeline/suggest.py | 100 +++ .../analyzer/pipeline/themes.py | 185 ++++++ .../analyzer/report/__init__.py | 1 + .../analyzer/report/data.py | 138 +++++ .../analyzer/report/export.py | 220 +++++++ .../analyzer/report/render.py | 130 ++++ .../analyzer/store/__init__.py | 1 + copilot-review-analyzer/analyzer/store/db.py | 451 ++++++++++++++ .../analyzer/store/schema.sql | 122 ++++ copilot-review-analyzer/config.yaml | 43 ++ copilot-review-analyzer/pyproject.toml | 62 ++ copilot-review-analyzer/tests/__init__.py | 0 .../tests/fixtures/pr_page1.json | 143 +++++ .../tests/fixtures/pr_page2.json | 81 +++ 
.../tests/test_acted_on.py | 129 ++++ .../tests/test_attribute.py | 65 ++ .../tests/test_attribute_persist.py | 119 ++++ copilot-review-analyzer/tests/test_client.py | 185 ++++++ copilot-review-analyzer/tests/test_config.py | 79 +++ copilot-review-analyzer/tests/test_db.py | 132 ++++ copilot-review-analyzer/tests/test_export.py | 172 ++++++ copilot-review-analyzer/tests/test_gaps.py | 99 +++ copilot-review-analyzer/tests/test_judge.py | 137 +++++ copilot-review-analyzer/tests/test_metrics.py | 104 ++++ copilot-review-analyzer/tests/test_queries.py | 125 ++++ copilot-review-analyzer/tests/test_suggest.py | 141 +++++ copilot-review-analyzer/tests/test_themes.py | 147 +++++ 52 files changed, 7343 insertions(+) create mode 100644 copilot-review-analyzer/.github/workflows/analyze.yml create mode 100644 copilot-review-analyzer/.github/workflows/ci.yml create mode 100644 copilot-review-analyzer/DESIGN.md create mode 100644 copilot-review-analyzer/EXECUTION_LOG.md create mode 100644 copilot-review-analyzer/IMPLEMENTATION_PLAN.md create mode 100644 copilot-review-analyzer/README.md create mode 100644 copilot-review-analyzer/analyzer.db create mode 100644 copilot-review-analyzer/analyzer/__init__.py create mode 100644 copilot-review-analyzer/analyzer/cli.py create mode 100644 copilot-review-analyzer/analyzer/config.py create mode 100644 copilot-review-analyzer/analyzer/github/__init__.py create mode 100644 copilot-review-analyzer/analyzer/github/client.py create mode 100644 copilot-review-analyzer/analyzer/github/queries.py create mode 100644 copilot-review-analyzer/analyzer/llm/__init__.py create mode 100644 copilot-review-analyzer/analyzer/llm/client.py create mode 100644 copilot-review-analyzer/analyzer/llm/judge.py create mode 100644 copilot-review-analyzer/analyzer/llm/prompts.py create mode 100644 copilot-review-analyzer/analyzer/llm/suggest.py create mode 100644 copilot-review-analyzer/analyzer/pipeline/__init__.py create mode 100644 
copilot-review-analyzer/analyzer/pipeline/attribute.py create mode 100644 copilot-review-analyzer/analyzer/pipeline/classify.py create mode 100644 copilot-review-analyzer/analyzer/pipeline/gaps.py create mode 100644 copilot-review-analyzer/analyzer/pipeline/ingest.py create mode 100644 copilot-review-analyzer/analyzer/pipeline/metrics.py create mode 100644 copilot-review-analyzer/analyzer/pipeline/orchestrate.py create mode 100644 copilot-review-analyzer/analyzer/pipeline/suggest.py create mode 100644 copilot-review-analyzer/analyzer/pipeline/themes.py create mode 100644 copilot-review-analyzer/analyzer/report/__init__.py create mode 100644 copilot-review-analyzer/analyzer/report/data.py create mode 100644 copilot-review-analyzer/analyzer/report/export.py create mode 100644 copilot-review-analyzer/analyzer/report/render.py create mode 100644 copilot-review-analyzer/analyzer/store/__init__.py create mode 100644 copilot-review-analyzer/analyzer/store/db.py create mode 100644 copilot-review-analyzer/analyzer/store/schema.sql create mode 100644 copilot-review-analyzer/config.yaml create mode 100644 copilot-review-analyzer/pyproject.toml create mode 100644 copilot-review-analyzer/tests/__init__.py create mode 100644 copilot-review-analyzer/tests/fixtures/pr_page1.json create mode 100644 copilot-review-analyzer/tests/fixtures/pr_page2.json create mode 100644 copilot-review-analyzer/tests/test_acted_on.py create mode 100644 copilot-review-analyzer/tests/test_attribute.py create mode 100644 copilot-review-analyzer/tests/test_attribute_persist.py create mode 100644 copilot-review-analyzer/tests/test_client.py create mode 100644 copilot-review-analyzer/tests/test_config.py create mode 100644 copilot-review-analyzer/tests/test_db.py create mode 100644 copilot-review-analyzer/tests/test_export.py create mode 100644 copilot-review-analyzer/tests/test_gaps.py create mode 100644 copilot-review-analyzer/tests/test_judge.py create mode 100644 
copilot-review-analyzer/tests/test_metrics.py create mode 100644 copilot-review-analyzer/tests/test_queries.py create mode 100644 copilot-review-analyzer/tests/test_suggest.py create mode 100644 copilot-review-analyzer/tests/test_themes.py diff --git a/copilot-review-analyzer/.github/workflows/analyze.yml b/copilot-review-analyzer/.github/workflows/analyze.yml new file mode 100644 index 000000000000..e5a09c6dc95e --- /dev/null +++ b/copilot-review-analyzer/.github/workflows/analyze.yml @@ -0,0 +1,155 @@ +name: Analyze Copilot Reviews + +# Weekly mining of recently merged PRs to track Copilot-reviewer miss-rate/precision. +# +# DB persistence strategy (chosen per Phase 8): the SQLite DB is committed to a +# dedicated orphan branch `analyzer-data` so weekly trends accumulate durably across +# runs, AND uploaded as a build artifact for per-run audit snapshots. (Cache was +# rejected because eviction would silently break long-term trend continuity.) +# +# Token wiring: GITHUB_TOKEN authenticates repo reads, issue writes, and the data +# branch push. Provide the optional `ANALYZER_PAT` secret for cross-repo reads or +# higher GitHub Models limits — it is preferred when present. Tokens are never echoed. + +on: + schedule: + - cron: "0 6 * * 1" # Mondays 06:00 UTC + workflow_dispatch: + inputs: + since: + description: "Analysis window (e.g. 
7d, 24h, 2w)" + default: "7d" + repo: + description: "owner/name to analyze" + default: "Azure/azure-sdk-for-python" + +permissions: + contents: write # push analyzer.db to the data branch + issues: write # open/update the weekly summary issue + +concurrency: + group: analyze-copilot-reviews + cancel-in-progress: false + +defaults: + run: + working-directory: copilot-review-analyzer + +env: + GITHUB_TOKEN: ${{ secrets.ANALYZER_PAT || secrets.GITHUB_TOKEN }} + DATA_BRANCH: analyzer-data + ANALYZER_DB: analyzer.db + ISSUE_LABEL: copilot-review-analyzer + SINCE: ${{ github.event.inputs.since || '7d' }} + TARGET_REPO: ${{ github.event.inputs.repo || 'Azure/azure-sdk-for-python' }} + +jobs: + analyze: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Install + run: pip install -e . + + - name: Restore DB from data branch + run: | + set -euo pipefail + tmp="$(mktemp -d)" + if git clone --depth 1 --branch "$DATA_BRANCH" \ + "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" \ + "$tmp" >/dev/null 2>&1; then + if [ -f "$tmp/analyzer.db" ]; then + cp "$tmp/analyzer.db" "$ANALYZER_DB" + echo "Restored existing DB from $DATA_BRANCH." + else + echo "Data branch present but has no analyzer.db; starting fresh." + fi + else + echo "Data branch $DATA_BRANCH not found; starting fresh history." + fi + rm -rf "$tmp" + + - name: Run analysis + run: | + set -euo pipefail + analyzer run \ + --repo "$TARGET_REPO" \ + --since "$SINCE" \ + --state merged \ + --use-llm \ + --db "$ANALYZER_DB" + + - name: Build summary + run: | + set -euo pipefail + analyzer report --format markdown --db "$ANALYZER_DB" > summary.md || true + if [ ! 
-s summary.md ] || grep -qi '^No data' summary.md; then + printf '## Copilot Review Analyzer\n\nNo new data in the last %s.\n' "$SINCE" > summary.md + fi + { + echo "" + echo "---" + echo "### Proposed prompt deltas (requires human approval)" + echo "" + echo "> The themes above are the issue categories humans caught that the" + echo "> Copilot reviewer missed. Review the top themes and decide whether the" + echo "> judge/review prompts should emphasize them. **No prompt change is applied" + echo "> automatically** — edit \`analyzer/llm/prompts.py\` via PR if warranted." + } >> summary.md + cat summary.md >> "$GITHUB_STEP_SUMMARY" + + - name: Upload DB artifact + uses: actions/upload-artifact@v4 + with: + name: analyzer-db + path: copilot-review-analyzer/analyzer.db + if-no-files-found: warn + + - name: Persist DB to data branch + run: | + set -euo pipefail + tmp="$(mktemp -d)" + if ! git clone --depth 1 --branch "$DATA_BRANCH" \ + "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" \ + "$tmp" >/dev/null 2>&1; then + git clone --depth 1 \ + "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" \ + "$tmp" >/dev/null 2>&1 + ( cd "$tmp" && git checkout --orphan "$DATA_BRANCH" && git rm -rfq . || true ) + fi + cp "$ANALYZER_DB" "$tmp/analyzer.db" + cd "$tmp" + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add -f analyzer.db + if git diff --cached --quiet; then + echo "DB unchanged; nothing to persist." + else + git commit -q -m "Update analyzer.db ($(date -u +%FT%TZ))" + git push -q origin "$DATA_BRANCH" + echo "Persisted DB to $DATA_BRANCH." 
+ fi + + - name: Open or update summary issue + run: | + set -euo pipefail + title="Weekly Copilot Review Analysis" + existing="$(gh issue list --state open --label "$ISSUE_LABEL" \ + --json number --jq '.[0].number' --repo "$GITHUB_REPOSITORY" || true)" + gh label create "$ISSUE_LABEL" --color BFD4F2 \ + --description "Copilot review analyzer reports" \ + --repo "$GITHUB_REPOSITORY" >/dev/null 2>&1 || true + if [ -n "$existing" ] && [ "$existing" != "null" ]; then + gh issue edit "$existing" --body-file summary.md --repo "$GITHUB_REPOSITORY" + gh issue comment "$existing" \ + --body "Updated $(date -u +%FT%TZ) (window: $SINCE)." \ + --repo "$GITHUB_REPOSITORY" + else + gh issue create --title "$title" --label "$ISSUE_LABEL" \ + --body-file summary.md --repo "$GITHUB_REPOSITORY" + fi diff --git a/copilot-review-analyzer/.github/workflows/ci.yml b/copilot-review-analyzer/.github/workflows/ci.yml new file mode 100644 index 000000000000..95062c0b3770 --- /dev/null +++ b/copilot-review-analyzer/.github/workflows/ci.yml @@ -0,0 +1,32 @@ +name: CI + +on: + push: + paths: + - "copilot-review-analyzer/**" + pull_request: + paths: + - "copilot-review-analyzer/**" + +defaults: + run: + working-directory: copilot-review-analyzer + +jobs: + lint-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install + run: pip install -e ".[dev]" + - name: Ruff + run: ruff check analyzer tests + - name: Black + run: black --check analyzer tests + - name: Mypy + run: mypy analyzer + - name: Pytest + run: pytest diff --git a/copilot-review-analyzer/DESIGN.md b/copilot-review-analyzer/DESIGN.md new file mode 100644 index 000000000000..804bd083f963 --- /dev/null +++ b/copilot-review-analyzer/DESIGN.md @@ -0,0 +1,398 @@ +# Copilot Code-Review Effectiveness Analyzer — Design & Plan + +## 1. 
Goal +Periodically mine recently closed/merged PRs, separate Copilot-reviewer comments +from human comments, use an LLM judge to find **substantive, diff-detectable +issues that humans caught but Copilot missed**, cluster those into **themes**, and +track **miss-rate / precision metrics** over time. The output feeds prompt +improvements for the Copilot reviewer — proposed automatically, approved by a human. + +## 2. Tech stack +- **Language:** Python 3.10+ +- **GitHub access:** GraphQL (one query per PR for reviews + threads + commits) via + `httpx`; auth from `GH_TOKEN` / `gh auth token`. +- **LLM judge:** GitHub Models (`https://models.inference.ai.azure.com`, + OpenAI-compatible), same token. +- **Storage:** SQLite (`stdlib sqlite3`). +- **CLI:** `typer` (or `argparse`); report rendering with `rich`. +- **Optional web viewer (later):** `datasette analyzer.db` for zero-effort, or a + small FastAPI app reading the same DB. + +## 3. Pipeline architecture (5 stages) +``` +[1 Ingest] -> [2 Attribute] -> [3 Classify] -> [4 Gap/Theme] -> [5 Store] -> [Viewer] +``` + +1. **Ingest** — enumerate PRs in a date/state window, then per-PR GraphQL fetch of + metadata, diff hunks, review threads (file+line), top-level reviews, and the + commit timeline. +2. **Attribute** (deterministic, no LLM) — tag each comment author as + `copilot` / `human` / `other_bot`; map comments to `(file, line_range)`. +3. **Classify** (LLM judge, the only LLM step) — per human comment: + `is_substantive`, `diff_detectable`, `category`, `confidence`. Independently, + compute `acted_on` from the commit timeline (deterministic). +4. **Gap & theme detection** — a **gap** = substantive + diff_detectable + no + Copilot comment overlapping the same file/lines. Cluster gaps into **themes** + via a controlled vocabulary. +5. **Store** — write to SQLite; compute per-run metrics. + +## 4. 
File layout +``` +copilot-review-analyzer/ +├── pyproject.toml +├── README.md +├── DESIGN.md # this file +├── config.yaml # repos, bot logins, model, sampling, theme vocab +├── analyzer/ +│ ├── __init__.py +│ ├── cli.py # entrypoints: run, report, themes, trend, init-db +│ ├── config.py # load/validate config.yaml -> dataclass +│ ├── github/ +│ │ ├── client.py # httpx GraphQL/REST client, auth, pagination, retry +│ │ └── queries.py # GraphQL query strings + response parsers +│ ├── pipeline/ +│ │ ├── ingest.py # Stage 1 +│ │ ├── attribute.py # Stage 2: author_kind, line-range mapping, overlap +│ │ ├── classify.py # Stage 3: LLM judge + acted_on linkage +│ │ ├── gaps.py # Stage 4a: gap detection +│ │ ├── themes.py # Stage 4b: controlled-vocab tagging +│ │ └── metrics.py # Stage 5: per-run metrics +│ ├── llm/ +│ │ ├── judge.py # GitHub Models call, batching, JSON validation +│ │ └── prompts.py # judge + theme prompt templates +│ ├── store/ +│ │ ├── schema.sql # tables (see §6) +│ │ └── db.py # connection, migrations, typed upserts/queries +│ └── report/ +│ ├── render.py # rich tables, trend sparklines, theme summaries +│ └── export.py # JSON/CSV/markdown export for the Actions issue body +├── tests/ +│ ├── fixtures/ # recorded GraphQL responses, sample diffs +│ ├── test_attribute.py +│ ├── test_gaps.py +│ └── test_metrics.py +└── .github/workflows/ + └── analyze.yml # weekly cron: run + export + open summary issue +``` + +## 5. Module contracts (key) +- `github/queries.py` — per-PR GraphQL (see §7); page `reviewThreads` & `commits`. +- `pipeline/attribute.py` — pure functions: + - `classify_author(login) -> "copilot" | "human" | "other_bot"` using + `config.copilot_logins` + `login.endswith("[bot]")` heuristic. + - `overlaps(human_range, copilot_ranges) -> bool` — same file, intersecting line + ranges (±N line fuzz, configurable). +- `pipeline/classify.py`: + - `judge_comments(human_comments, diff) -> [Judgement]` (batched). 
+ - `acted_on(comment, commits) -> bool` — any commit to same `path` with + `committedDate > comment.createdAt` (coarse; documented soft signal). Note: `changedFilesIfAvailable` in §7 is only a changed-file count, so the touched paths of each commit must be fetched separately. +- `pipeline/gaps.py`: `gap = is_substantive and diff_detectable and not copilot_overlap`. +- `pipeline/themes.py`: LLM tags each gap into a controlled vocabulary from + `config.yaml` (`null-handling`, `error-handling`, `test-coverage`, `security`, + `api-design`, `concurrency`, `perf`, `docs`, ... + `other`). + +## 6. SQLite schema (`analyzer/store/schema.sql`) +```sql +PRAGMA foreign_keys = ON; + +CREATE TABLE IF NOT EXISTS runs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + repo TEXT NOT NULL, + started_at TEXT NOT NULL, -- ISO8601 + finished_at TEXT, + window_start TEXT NOT NULL, + window_end TEXT NOT NULL, + pr_state TEXT NOT NULL, -- merged | closed | all + pr_count INTEGER NOT NULL DEFAULT 0, + model TEXT NOT NULL, -- e.g. github-models/gpt-4o + config_hash TEXT -- for reproducibility +); + +CREATE TABLE IF NOT EXISTS prs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE, + number INTEGER NOT NULL, + title TEXT, + author TEXT, + state TEXT, + url TEXT, + created_at TEXT, + merged_at TEXT, + closed_at TEXT, + additions INTEGER, + deletions INTEGER, + UNIQUE (run_id, number) +); + +CREATE TABLE IF NOT EXISTS comments ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE, + pr_id INTEGER NOT NULL REFERENCES prs(id) ON DELETE CASCADE, + external_id INTEGER, -- GitHub databaseId + author TEXT, + author_kind TEXT NOT NULL, -- human | copilot | other_bot + is_review_body INTEGER NOT NULL DEFAULT 0, -- 1 = top-level review summary + file_path TEXT, + line_start INTEGER, + line_end INTEGER, + body TEXT, + diff_hunk TEXT, + created_at TEXT, + url TEXT, + + -- Stage 3 (LLM judge); NULL until classified + is_substantive INTEGER, + diff_detectable INTEGER, + category TEXT, + judge_rationale TEXT, + judge_confidence REAL, + + 
-- Deterministic enrichments + acted_on INTEGER, -- commit to same path after comment + copilot_overlap INTEGER, -- human comment overlapped by Copilot + + UNIQUE (run_id, external_id) +); + +CREATE TABLE IF NOT EXISTS themes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE, + label TEXT NOT NULL, -- controlled vocab term + description TEXT, + gap_count INTEGER NOT NULL DEFAULT 0, + UNIQUE (run_id, label) +); + +CREATE TABLE IF NOT EXISTS gaps ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id INTEGER NOT NULL REFERENCES runs(id) ON DELETE CASCADE, + pr_id INTEGER NOT NULL REFERENCES prs(id) ON DELETE CASCADE, + comment_id INTEGER NOT NULL REFERENCES comments(id) ON DELETE CASCADE, + category TEXT, + theme_id INTEGER REFERENCES themes(id) ON DELETE SET NULL +); + +CREATE TABLE IF NOT EXISTS metrics ( + run_id INTEGER PRIMARY KEY REFERENCES runs(id) ON DELETE CASCADE, + substantive_human_count INTEGER NOT NULL DEFAULT 0, + copilot_comment_count INTEGER NOT NULL DEFAULT 0, + gap_count INTEGER NOT NULL DEFAULT 0, + miss_rate REAL, + copilot_overlap_rate REAL, + copilot_acted_on_rate REAL, + human_burden_per_pr REAL +); + +CREATE INDEX IF NOT EXISTS idx_comments_run ON comments(run_id); +CREATE INDEX IF NOT EXISTS idx_comments_kind ON comments(run_id, author_kind); +CREATE INDEX IF NOT EXISTS idx_comments_pr ON comments(pr_id); +CREATE INDEX IF NOT EXISTS idx_gaps_run ON gaps(run_id); +CREATE INDEX IF NOT EXISTS idx_themes_run ON themes(run_id); +``` + +## 7. GraphQL query (`analyzer/github/queries.py`) +```graphql +query PRReviewData( + $owner: String! + $name: String! + $number: Int! 
+ $threadsAfter: String + $commitsAfter: String +) { + repository(owner: $owner, name: $name) { + pullRequest(number: $number) { + number + title + state + url + createdAt + mergedAt + closedAt + additions + deletions + author { login } + + reviewThreads(first: 50, after: $threadsAfter) { + pageInfo { hasNextPage endCursor } + nodes { + isResolved + isOutdated + path + line + originalLine + startLine + originalStartLine + comments(first: 50) { + nodes { + databaseId + author { login } + authorAssociation + body + createdAt + url + diffHunk + } + } + } + } + + reviews(first: 50) { + nodes { + databaseId + author { login } + state + body + submittedAt + url + } + } + + commits(first: 100, after: $commitsAfter) { + pageInfo { hasNextPage endCursor } + nodes { + commit { + oid + committedDate + changedFilesIfAvailable + } + } + } + } + } + rateLimit { remaining resetAt cost } +} +``` + +**Parser notes** +- Enumerate PR numbers first via REST `GET /repos/{o}/{r}/pulls?state=closed&sort=updated` + or the search API, then call this query per PR. +- Thread range = `(path, startLine|originalStartLine .. line|originalLine)`; fall back + to `originalLine` when `line` is null (outdated threads). +- The thread's **first comment** is the review point; replies are conversation. +- Page `reviewThreads` and `commits` via `pageInfo.endCursor` until `hasNextPage=false`. +- Use `rateLimit.cost` to throttle proactively. + +## 8. LLM judge & theme prompts (`analyzer/llm/prompts.py`) +```python +JUDGE_SYSTEM = """\ +You are an expert software code reviewer evaluating the review comments left on \ +a pull request. Your job is to classify each human review comment objectively. + +Rules: +- Judge ONLY from the information visible in the provided diff hunk(s). Do NOT \ +assume external context (chat, issues, runtime behavior, tribal knowledge). 
+- "substantive" means the comment identifies a real code-quality issue: a bug, \ +security flaw, performance problem, design/API concern, or a missing test. \ +Style nitpicks, typos, praise, questions, and process/social chatter are NOT \ +substantive. +- "diff_detectable" means a competent automated reviewer could plausibly raise \ +this issue from the diff alone, without external context. +- Be conservative: if a comment relies on knowledge not present in the diff, set \ +diff_detectable = false. + +Return STRICT JSON only, matching the schema. No prose, no markdown.""" + +JUDGE_USER_TEMPLATE = """\ +Classify each comment below. Return a JSON object: +{{"results": [{{"id": <int>, "is_substantive": <bool>, "category": <string>, \ +"diff_detectable": <bool>, "rationale": <string>, \ +"confidence": <float 0-1>}}]}} + +"category" must be exactly one of: +["bug", "security", "perf", "design", "test-gap", "docs", "nit", "style", \ +"question", "social"]. + +Comments to classify: +{comments_block}""" + +COMMENT_ITEM_TEMPLATE = """\ +--- COMMENT id={id} --- +File: {file_path} Lines: {line_start}-{line_end} +Diff hunk: +``` +{diff_hunk} +``` +Reviewer comment: +\"\"\"{body}\"\"\" +""" + +THEME_SYSTEM = """\ +You map code-review issues to a fixed taxonomy of recurring themes so they can \ +be trended over time. Use ONLY labels from the provided vocabulary; if none fit, \ +use "other". Return strict JSON.""" + +THEME_USER_TEMPLATE = """\ +Allowed theme labels: {vocab} + +For each gap, assign exactly one label. Return: +{{"results": [{{"id": , "theme":