#!/bin/bash
# Reward: checklist (0.0-1.0) — weighted pattern matching against ground_truth.json.
# Shared checklist verifier with soft length scaling and optional canonical-SHA bypass.
#
# Default category weights (overridable via the "weights" object in ground_truth.json):
#   required_findings: 0.40
#   file_references:   0.30
#   causal_chain:      0.20
#   negative_checks:   0.10

set -euo pipefail

# Inputs/outputs are env-overridable so the verifier can be reused across tasks.
REPORT_PATH="${REPORT_PATH:-/logs/agent/investigation.md}"
GROUND_TRUTH="${GROUND_TRUTH:-/tests/ground_truth.json}"
REWARD_FILE="${REWARD_FILE:-/logs/verifier/reward.txt}"
# Soft target size: shorter (but correct) reports are scaled down, not zeroed.
MIN_REPORT_BYTES="${MIN_REPORT_BYTES:-100}"
# Below this, output is treated as effectively missing / unusable.
MIN_ABS_BYTES="${MIN_ABS_BYTES:-24}"

mkdir -p "$(dirname "$REWARD_FILE")"

# ── Check prerequisites ────────────────────────────────────────────────
if [ ! -f "$GROUND_TRUTH" ]; then
  echo "ERROR: ground_truth.json not found at $GROUND_TRUTH"
  echo "0.0" > "$REWARD_FILE"
  exit 0
fi

if [ ! -f "$REPORT_PATH" ]; then
  echo "No agent output found at $REPORT_PATH"
  echo "0.0" > "$REWARD_FILE"
  exit 0
fi

REPORT_SIZE=$(wc -c < "$REPORT_PATH")
if [ "$REPORT_SIZE" -lt "$MIN_ABS_BYTES" ]; then
  echo "Agent output too small (${REPORT_SIZE} bytes, minimum usable ${MIN_ABS_BYTES})."
  echo "0.0" > "$REWARD_FILE"
  exit 0
fi

echo "Scoring agent output ($REPORT_SIZE bytes)..."
echo "Report: $REPORT_PATH"
echo "Ground truth: $GROUND_TRUTH"
echo ""

# ── Delegate scoring to Python (avoids shell escaping issues with regex) ─
# The heredoc delimiter is quoted, so nothing inside is expanded by the shell;
# all values cross into Python via the environment.
REPORT_PATH="$REPORT_PATH" GROUND_TRUTH="$GROUND_TRUTH" REWARD_FILE="$REWARD_FILE" \
REPORT_SIZE="$REPORT_SIZE" MIN_REPORT_BYTES="$MIN_REPORT_BYTES" python3 <<'PYEOF'
import hashlib
import json
import os
import re

REPORT_PATH = os.environ["REPORT_PATH"]
GT_PATH = os.environ["GROUND_TRUTH"]
REWARD_PATH = os.environ["REWARD_FILE"]
REPORT_SIZE = int(os.environ["REPORT_SIZE"])
MIN_REPORT_BYTES = max(1, int(os.environ["MIN_REPORT_BYTES"]))

with open(REPORT_PATH) as f:
    report = f.read()
with open(GT_PATH) as f:
    gt = json.load(f)

# Canonical source (when provided) should always score 1.0 if the text is exact.
doc_sha = hashlib.sha256(report.encode("utf-8")).hexdigest().lower()
canonical_sha = (
    (((gt.get("ground_truth_provenance") or {}).get("canonical_source") or {}).get("sha256") or "").lower()
)
if canonical_sha and doc_sha == canonical_sha:
    print("Canonical source SHA match: awarding 1.0")
    with open(REWARD_PATH, "w") as f:
        f.write("1.00\n")
    raise SystemExit(0)


def check_any_pattern(patterns, text):
    """Return True if at least one pattern matches (case-insensitive).

    An invalid regex falls back to a literal, case-insensitive substring test.
    """
    for p in patterns:
        try:
            if re.search(p, text, re.IGNORECASE):
                return True
        except re.error:
            if p.lower() in text.lower():
                return True
    return False


def check_all_patterns(patterns, text):
    """Return True only if ALL patterns match (each is a step in a causal chain).

    Mirrors check_any_pattern's substring fallback for invalid regexes.
    """
    for p in patterns:
        try:
            if not re.search(p, text, re.IGNORECASE):
                return False
        except re.error:
            if p.lower() not in text.lower():
                return False
    return True


def score_category(items, label, use_all=False, negate=False):
    """Score one checklist category and return its weighted pass ratio in [0, 1].

    use_all: every pattern in an item must match (causal chains).
    negate:  an item passes when its patterns are NOT found (negative checks);
             an empty negated category scores 1.0 (nothing wrong was claimed).
    """
    print(f"=== {label} ===")
    score = 0.0
    total = 0.0
    for item in items:
        w = float(item["weight"])
        total += w
        matched = (
            check_all_patterns(item["patterns"], report)
            if use_all
            else check_any_pattern(item["patterns"], report)
        )
        passed = (not matched) if negate else matched
        if passed:
            score += w
            print(f"  [x] {item['description']} (weight: {w})")
        else:
            msg = " -- wrong conclusion found" if negate else ""
            prefix = "FAIL: " if negate else ""
            print(f"  [ ] {prefix}{item['description']} (weight: {w}){msg}")
    ratio = score / total if total > 0 else (1.0 if negate else 0.0)
    print(f"  Score: {score:.2f} / {total:.2f} = {ratio:.2f}")
    print()
    return ratio


f_ratio = score_category(gt.get("required_findings", []), "Required Findings")
r_ratio = score_category(gt.get("file_references", []), "File References")
c_ratio = score_category(gt.get("causal_chain", []), "Causal Chain", use_all=True)
n_ratio = score_category(gt.get("negative_checks", []), "Negative Checks", negate=True)

weights = gt.get("weights", {
    "required_findings": 0.40,
    "file_references": 0.30,
    "causal_chain": 0.20,
    "negative_checks": 0.10,
})
base = (
    f_ratio * float(weights.get("required_findings", 0.40))
    + r_ratio * float(weights.get("file_references", 0.30))
    + c_ratio * float(weights.get("causal_chain", 0.20))
    + n_ratio * float(weights.get("negative_checks", 0.10))
)

# Soft length scaling: avoid a hard 0 for concise but correct outputs.
length_factor = min(1.0, REPORT_SIZE / float(MIN_REPORT_BYTES))
final = max(0.0, min(1.0, base * length_factor))

print("=== Final Score ===")
print(f"  Base checklist: {base:.3f}")
print(f"  Length factor: min(1.0, {REPORT_SIZE}/{MIN_REPORT_BYTES}) = {length_factor:.3f}")
print(f"  TOTAL: {final:.2f}")

with open(REWARD_PATH, "w") as f:
    f.write(f"{final:.2f}\n")

print()
print(f"Tests completed - Score: {final:.2f}")
PYEOF