|
| 1 | +#!/bin/bash |
| 2 | +# Reward: Partial credit (0.0-1.0) — each correct link in the chain scores independently |
| 3 | +# |
| 4 | +# DEPENDENCY CHAIN SCORER |
| 5 | +# ----------------------- |
| 6 | +# Scores agent output by comparing each step in the dependency chain against |
| 7 | +# ground truth. Each step is worth an equal fraction of the total score. |
| 8 | +# Matching is fuzzy on line numbers (+/- tolerance) and exact on repo/file. |
| 9 | + |
# Strict mode: abort on a failing command (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Deliberately not `readonly`: the Python invocation further down passes these
# again as VAR=value command prefixes, which bash rejects for readonly variables.
OUTPUT_PATH="/workspace/chain.json"      # chain file the agent is expected to write
GROUND_TRUTH="/tests/ground_truth.json"  # reference chain to score against
REWARD_FILE="/logs/verifier/reward.txt"  # the score is written here
LINE_TOLERANCE=50 # +/- 50 lines for line number matching

# Create the directory that will hold the reward file. It is derived from
# REWARD_FILE so the two cannot drift apart if the path is ever changed.
mkdir -p -- "$(dirname -- "$REWARD_FILE")"
| 18 | + |
# ── Check prerequisites ───────────────────────────────────────────────────
# Print each argument on its own line, record a 0.0 reward, and stop the
# script with exit status 0.
zero_reward_and_exit() {
  printf '%s\n' "$@"
  echo "0.0" > "$REWARD_FILE"
  exit 0
}

# Without the ground truth there is nothing to compare against.
[[ -f "$GROUND_TRUTH" ]] \
  || zero_reward_and_exit "ERROR: ground_truth.json not found at $GROUND_TRUTH"

# Without the agent's chain file there is nothing to score.
[[ -f "$OUTPUT_PATH" ]] \
  || zero_reward_and_exit \
    "No agent output found at $OUTPUT_PATH" \
    "Agent did not produce the required chain.json file."
| 32 | + |
# Announce which files are being compared, followed by a blank separator line.
printf '%s\n' \
  "Scoring dependency chain..." \
  "Output: $OUTPUT_PATH" \
  "Ground truth: $GROUND_TRUTH" \
  ""
| 37 | + |
# ── Delegate scoring to Python ────────────────────────────────────────────
# The paths and the tolerance reach the scorer through its environment. The
# quoted heredoc delimiter stops bash from expanding anything inside the
# Python source.
#
# If the interpreter exits non-zero (uncaught exception, python3 not found),
# a 0.0 reward is still recorded and the failing status is propagated, so a
# crash can never leave the reward file missing.
# The fallback has to sit on the same line as the heredoc operator because the
# heredoc body starts on the very next line.
OUTPUT_PATH="$OUTPUT_PATH" GROUND_TRUTH="$GROUND_TRUTH" \
REWARD_FILE="$REWARD_FILE" LINE_TOLERANCE="$LINE_TOLERANCE" \
python3 << 'PYEOF' || { rc=$?; echo "ERROR: scorer exited with status $rc; recording 0.0" >&2; echo "0.0" > "$REWARD_FILE"; exit "$rc"; }
| 42 | +import json, os, re, sys |
| 43 | +
|
# Scorer configuration, read from the process environment. The three paths are
# required (a missing one raises KeyError); the tolerance defaults to 50.
OUTPUT_PATH, GT_PATH, REWARD_PATH = (
    os.environ[key] for key in ("OUTPUT_PATH", "GROUND_TRUTH", "REWARD_FILE")
)
LINE_TOLERANCE = int(os.getenv("LINE_TOLERANCE", "50"))
| 48 | +
|
def write_reward(score):
    """Persist the score to REWARD_PATH (two decimals, newline-terminated) and echo it."""
    formatted = f"{score:.2f}"
    with open(REWARD_PATH, "w") as reward_file:
        reward_file.write(formatted + "\n")
    print(f"\nTests completed - Score: {formatted}")
| 54 | +
|
def strip_code_fences(text):
    """Return the body of the first markdown code fence in text, trimmed.

    The fence may carry a "json" language tag. When no fence is found, the
    whole text is returned with surrounding whitespace removed.
    """
    fenced = re.search(r'```(?:json)?\s*\n(.*?)```', text, re.DOTALL)
    if fenced is None:
        return text.strip()
    return fenced.group(1).strip()
| 59 | +
|
def normalize_path(path):
    """Reduce a file path to a comparable, repo-relative form.

    Surrounding whitespace is trimmed, a leading "./" is dropped, and a leading
    "/workspace/<dir>/" prefix (the workspace root plus one directory level) is
    removed.

    A value that is not a string (for example JSON null or a number coming from
    the agent's output) yields "" instead of raising.
    """
    if not isinstance(path, str):
        return ""
    path = path.strip()
    path = re.sub(r'^\./', '', path)
    path = re.sub(r'^/workspace/[^/]+/', '', path)
    return path
| 66 | +
|
def lines_match(line1, line2, tolerance):
    """Tell whether two line numbers are within `tolerance` of each other.

    Returns True when either value is None, so a missing line number is never
    penalized. Values are converted with int(), so numeric strings are accepted.
    A value that cannot be converted (e.g. "abc" or a list) counts as a
    mismatch rather than raising.
    """
    if line1 is None or line2 is None:
        return True  # Don't penalize if line number not provided
    try:
        distance = abs(int(line1) - int(line2))
    except (TypeError, ValueError):
        return False
    return distance <= tolerance
| 72 | +
|
# ── Load ground truth ────────────────────────────────────────────────────
# An unreadable or malformed ground-truth file is reported, scored 0.0 and the
# scorer exits with status 0, instead of dying with a traceback and no reward.
try:
    with open(GT_PATH) as f:
        gt = json.load(f)
except (OSError, ValueError) as e:  # ValueError covers json.JSONDecodeError
    print(f"ERROR: could not load ground truth from {GT_PATH}: {e}")
    write_reward(0.0)
    sys.exit(0)

# The root must be an object holding a non-empty "steps" list.
expected_steps = gt.get("steps", []) if isinstance(gt, dict) else []
if not isinstance(expected_steps, list) or not expected_steps:
    print("ERROR: ground_truth.json must have a 'steps' array")
    write_reward(0.0)
    sys.exit(0)

num_expected = len(expected_steps)
| 84 | +
|
# ── Load agent output ────────────────────────────────────────────────────
# Anything that does not parse to a JSON array is scored as an empty chain.
reported_steps = []
try:
    with open(OUTPUT_PATH) as f:
        raw = strip_code_fences(f.read())
    parsed = json.loads(raw)
except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
    print(f"Malformed JSON in agent output: {e}")
else:
    if isinstance(parsed, list):
        reported_steps = parsed
    else:
        print("Agent output is not a JSON array — scoring as empty.")

num_reported = len(reported_steps)

# With no reported steps there is nothing to compare: the score is zero.
if num_reported == 0:
    print("Agent output is empty — no chain steps to score.")
    print(f"Expected {num_expected} steps.")
    write_reward(0.0)
    sys.exit(0)
| 105 | +
|
# ── Score each step ──────────────────────────────────────────────────────
print("=== Dependency Chain Scoring ===")
print(f" Expected steps: {num_expected}")
print(f" Reported steps: {num_reported}")
print(f" Line tolerance: +/- {LINE_TOLERANCE}")
print()


def _text(value):
    """Return a stripped string field; non-string values (null, numbers, ...) become ''."""
    return value.strip() if isinstance(value, str) else ""


def _same_step(reported_id, expected_id):
    """Compare step identifiers, also accepting a stringified number ("2" vs 2)."""
    if reported_id is None:
        return False
    return reported_id == expected_id or str(reported_id).strip() == str(expected_id).strip()


correct_steps = 0
step_details = []

for i, expected in enumerate(expected_steps, start=1):
    wanted_step = expected.get("step", i)

    # Primary lookup: the first reported entry carrying the wanted step id.
    # Entries that are not JSON objects are ignored instead of crashing.
    reported = next(
        (r for r in reported_steps
         if isinstance(r, dict) and _same_step(r.get("step"), wanted_step)),
        None,
    )

    if reported is None and i <= num_reported:
        # Fallback: match by position, but only when that entry has no step
        # field. An entry that explicitly names another step must not also be
        # scored here, or one reported entry could earn credit twice.
        candidate = reported_steps[i - 1]
        if isinstance(candidate, dict) and candidate.get("step") is None:
            reported = candidate

    if reported is None:
        step_details.append({
            "step": i,
            "status": "MISSING",
            "expected": expected
        })
        continue

    # Check each field. Agent-supplied values are untrusted: a wrong type is a
    # mismatch, never an exception.
    repo_match = _text(expected.get("repo")) == _text(reported.get("repo"))
    file_match = (
        normalize_path(_text(expected.get("file")))
        == normalize_path(_text(reported.get("file")))
    )
    try:
        line_match = lines_match(expected.get("line"), reported.get("line"), LINE_TOLERANCE)
    except (TypeError, ValueError):
        line_match = False  # non-numeric line value

    all_match = repo_match and file_match and line_match

    if all_match:
        correct_steps += 1
        status = "CORRECT"
    else:
        # PARTIAL is informational only; it earns no credit.
        status = "PARTIAL" if (repo_match or file_match) else "WRONG"

    step_details.append({
        "step": i,
        "status": status,
        "repo_match": repo_match,
        "file_match": file_match,
        "line_match": line_match,
        "expected": expected,
        "reported": reported
    })
| 158 | +
|
# ── Compute score ────────────────────────────────────────────────────────
# Every expected step carries the same weight; only fully correct steps count.
if num_expected > 0:
    score = correct_steps / num_expected
else:
    score = 0.0

# ── Print detailed results ───────────────────────────────────────────────
# CORRECT and WRONG have their own marker; every other status is shown as "~".
STATUS_SYMBOLS = {"CORRECT": "✓", "WRONG": "✗"}

print("=== Step-by-Step Results ===")
for entry in step_details:
    status = entry["status"]
    symbol = STATUS_SYMBOLS.get(status, "~")
    print(f"\nStep {entry['step']}: [{symbol}] {status}")

    exp = entry["expected"]
    print(f" Expected: {exp.get('repo')} / {exp.get('file')} : {exp.get('line')}")
    print(f" {exp.get('context', 'N/A')}")

    # Only MISSING entries lack the "reported" key.
    if "reported" not in entry:
        print(" Reported: (missing)")
        continue

    rep = entry["reported"]
    print(f" Reported: {rep.get('repo')} / {rep.get('file')} : {rep.get('line')}")
    print(f" {rep.get('context', 'N/A')}")
    if status != "MISSING":
        print(f" Match: repo={entry['repo_match']}, file={entry['file_match']}, line={entry['line_match']}")

print("\n=== Summary ===")
print(f" Correct steps: {correct_steps}/{num_expected}")
print(f" Score: {score:.2f}")

write_reward(score)
| 189 | +PYEOF |
0 commit comments