diff --git a/skills/workflow-build/SKILL.md b/skills/workflow-build/SKILL.md
index 7c8be88b..99f508ec 100644
--- a/skills/workflow-build/SKILL.md
+++ b/skills/workflow-build/SKILL.md
@@ -98,13 +98,11 @@ Apply the CEO Review Gate protocol:
 
 *On RELOOP: return to `builder` (max 3 iterations)*
 
-## Phase 5: Evaluator
+## Step: Eval
 
 
 ```bash
-factory agent evaluator --task "Run eval: factory eval $PROJECT_PATH. Capture composite score and per-dimension breakdown. Report delta from baseline. Interpret which dimensions improved/regressed.
-Read: .factory/reviews/builder-latest.md
-Write output to: .factory/reviews/evaluator-latest.md" --project "$PROJECT_PATH" --timeout 600
+factory eval "$PROJECT_PATH"
 ```
 
 ### Gate — Precheck (Automated)
diff --git a/skills/workflow-design/SKILL.md b/skills/workflow-design/SKILL.md
index 1713343e..d2929e45 100644
--- a/skills/workflow-design/SKILL.md
+++ b/skills/workflow-design/SKILL.md
@@ -93,13 +93,11 @@ Apply the CEO Review Gate protocol:
 
 *On RELOOP: return to `builder` (max 3 iterations)*
 
-## Phase 5: Evaluator
+## Step: Eval
 
 
 ```bash
-factory agent evaluator --task "Run eval: factory eval $PROJECT_PATH. Capture composite score and per-dimension breakdown. Report delta from baseline. Interpret which dimensions improved/regressed.
-Read: .factory/reviews/builder-latest.md
-Write output to: .factory/reviews/evaluator-latest.md" --project "$PROJECT_PATH" --timeout 600
+factory eval "$PROJECT_PATH"
 ```
 
 ### Gate — Precheck (Automated)
diff --git a/skills/workflow-improve/SKILL.md b/skills/workflow-improve/SKILL.md
index 4d06b10f..77697b3c 100644
--- a/skills/workflow-improve/SKILL.md
+++ b/skills/workflow-improve/SKILL.md
@@ -93,13 +93,11 @@ Apply the CEO Review Gate protocol:
 
 *On RELOOP: return to `builder` (max 3 iterations)*
 
-## Phase 5: Evaluator
+## Step: Eval
 
 
 ```bash
-factory agent evaluator --task "Run eval: factory eval $PROJECT_PATH. Capture composite score. Report delta from baseline. Interpret dimension changes.
-Read: .factory/reviews/builder-latest.md
-Write output to: .factory/reviews/evaluator-latest.md" --project "$PROJECT_PATH" --timeout 600
+factory eval "$PROJECT_PATH"
 ```
 
 ### Gate — Precheck (Automated)
diff --git a/skills/workflow-refine/SKILL.md b/skills/workflow-refine/SKILL.md
index 5fae4bab..e93cb92f 100644
--- a/skills/workflow-refine/SKILL.md
+++ b/skills/workflow-refine/SKILL.md
@@ -64,7 +64,7 @@ Write output to: .factory/reviews/builder-latest.md" --project "$PROJECT_PATH" -
 
 
 ```bash
-factory agent reviewer --task "Verify the refinement. Run all 3 verification sections: 1. Health Check — run factory eval. Report composite score and delta. 2. Code Review — read PR diff, evaluate 7-category checklist. Run factory guard with --check-scope. 3. Adversarial QA — run/test the project, verify the refinement works.
+factory agent qa --task "Verify the refinement. Run all 3 verification sections: 1. Health Check — run factory eval. Report composite score and delta. 2. Code Review — read PR diff, evaluate 7-category checklist. Run factory guard with --check-scope. 3. Adversarial QA — run/test the project, verify the refinement works.
 Read: .factory/reviews/builder-latest.md
 Write output to: .factory/reviews/qa-latest.md" --project "$PROJECT_PATH" --timeout 600
 ```
diff --git a/skills/workflow-research/SKILL.md b/skills/workflow-research/SKILL.md
index eb43f7b1..7262c319 100644
--- a/skills/workflow-research/SKILL.md
+++ b/skills/workflow-research/SKILL.md
@@ -13,7 +13,7 @@ The user wants: **$ARGUMENTS**
 
 
 ```bash
-factory agent evaluator --task "Run eval and report results." --project "$PROJECT_PATH" --timeout 300
+factory eval "$PROJECT_PATH"
 ```
 
 ## Phase 1: Failure Analyst
@@ -98,11 +98,11 @@ Apply the CEO Review Gate protocol:
 
 *On RELOOP: return to `builder` (max 3 iterations)*
 
-## Step: Evaluator
+## Step: Eval
 
 
 ```bash
-factory agent evaluator --task "Run eval and report results." --project "$PROJECT_PATH" --timeout 300
+factory eval "$PROJECT_PATH"
 ```
 
 ### Gate — Precheck (Automated)
diff --git a/skills/workflow-review/SKILL.md b/skills/workflow-review/SKILL.md
index ae47858e..59a9cf05 100644
--- a/skills/workflow-review/SKILL.md
+++ b/skills/workflow-review/SKILL.md
@@ -56,7 +56,7 @@ factory init $PROJECT_PATH
 
 
 ```bash
-factory agent evaluator --task "Run eval and report results." --project "$PROJECT_PATH" --timeout 300
+factory eval "$PROJECT_PATH"
 ```
 
 ## Step: Commit
diff --git a/tests/test_qa_delegation.py b/tests/test_qa_delegation.py
index e51badc6..4e8fcea2 100644
--- a/tests/test_qa_delegation.py
+++ b/tests/test_qa_delegation.py
@@ -3,7 +3,7 @@
 Verifies that:
 - The QA prompt covers all 3 verification sections
 - The CEO prompt references skill-based routing (mode sections moved to SKILL.md)
-- Generated workflow skills delegate eval to QA Agent, not direct factory eval
+- Generated workflow skills do not reference nonexistent agent roles (`evaluator`, `reviewer`)
 - Builder precedes Evaluator in generated workflow skills (graph ordering)
 - Event-based flow validation detects Builder→QA sequencing
 """
@@ -91,24 +91,17 @@ def test_ceo_prompt_references_skill_routing(
             "CEO prompt must reference SKILL.md files"
         )
 
-    def test_workflow_skills_delegate_eval_to_agents(self) -> None:
-        """Generated workflow skills must not contain standalone factory eval calls."""
-        for skill_dir in SKILLS_DIR.glob("workflow-*"):
-            skill_path = skill_dir / "SKILL.md"
+    def test_workflow_skills_use_valid_agent_roles(self) -> None:
+        """Workflow skills must not reference nonexistent agent roles."""
+        invalid_roles = ['factory agent evaluator', 'factory agent reviewer']
+        for skill_dir in SKILLS_DIR.glob('workflow-*'):
+            skill_path = skill_dir / 'SKILL.md'
             if not skill_path.exists():
                 continue
             content = skill_path.read_text()
-            for match in re.finditer(r"factory eval", content):
-                pos = match.start()
-                preceding = content[:pos]
-                last_agent_task = preceding.rfind('factory agent')
-                last_code_block_end = preceding.rfind('```\n')
-                if last_agent_task > last_code_block_end:
-                    continue
-                context = content[max(0, pos - 80):pos + 40]
-                pytest.fail(
-                    f"Direct 'factory eval' in {skill_path.name} outside "
-                    f"agent task. Context: ...{context}..."
+            for role in invalid_roles:
+                assert role not in content, (
+                    f'Invalid agent role in {skill_dir.name}/{skill_path.name}: {role}'
                 )