Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def _build_metric_group_map() -> dict[str, str]:
"Other": "#AAAAAA",
}

_NON_NORMALIZED_METRICS = {"response_speed"}
_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}

# EVA composite scores to show in the bar chart
_EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"]
simulation_version = "0.1.0"
# Bump when metrics pipeline changes (metrics code, judge prompts, pricing,
# postprocessor). Old metric results become stale — cheap to recompute.
metrics_version = "0.1.1"
metrics_version = "0.1.2"

[tool.mypy]
python_version = "3.11"
Expand Down
2 changes: 1 addition & 1 deletion src/eva/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@

# Bump metrics_version when changes affect metric computation (metrics code,
# judge prompts, pricing tables, postprocessor).
metrics_version = "0.1.1"
metrics_version = "0.1.2"
150 changes: 123 additions & 27 deletions src/eva/metrics/diagnostic/response_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,54 +4,93 @@
final evaluation scores.
"""

import json
from abc import abstractmethod
from pathlib import Path

from eva.metrics.base import CodeMetric, MetricContext
from eva.metrics.registry import register_metric
from eva.models.results import MetricScore


@register_metric
class ResponseSpeedMetric(CodeMetric):
"""Response speed metric.
def _split_turn_taking_latencies_by_tool_calls(
context: MetricContext,
) -> tuple[list[float], list[float]]:
"""Partition turn_taking per_turn_latency values into (with_tool_calls, no_tool_calls).

Measures the elapsed time between the end of the user's utterance
and the beginning of the assistant's response.
Reads metrics/turn_taking/details/per_turn_latency from the record's
metrics.json, then checks conversation_trace to determine which turn_ids
had at least one tool call.

Reports raw latency values in seconds — no normalization applied.
Returns:
(with_tool_latencies, no_tool_latencies)
"""
if not context.output_dir:
return [], []

This is a diagnostic metric used for diagnosing model performance issues.
It is not directly used in final evaluation scores.
metrics_path = Path(context.output_dir) / "metrics.json"
if not metrics_path.exists():
return [], []

with open(metrics_path) as f:
data = json.load(f)

per_turn_latency: dict[str, float] = (
data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {})
)
if not per_turn_latency:
return [], []

tool_call_turn_ids = {
entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call"
}

with_tool: list[float] = []
no_tool: list[float] = []
for turn_id_str, latency in per_turn_latency.items():
if int(turn_id_str) in tool_call_turn_ids:
with_tool.append(latency)
else:
no_tool.append(latency)

return with_tool, no_tool


class _ResponseSpeedBase(CodeMetric):
"""Base class for response-speed metrics.

Subclasses implement `_get_latencies` to return the subset of latencies
to compute over; everything else is shared.
"""

name = "response_speed"
description = "Debug metric: latency between user utterance end and assistant response start"
category = "diagnostic"
exclude_from_pass_at_k = True

@abstractmethod
def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
"""Return (latencies, error_if_empty) for this metric variant."""

async def compute(self, context: MetricContext) -> MetricScore:
"""Compute response speed from Pipecat's UserBotLatencyObserver measurements."""
try:
# Check if we have response speed latencies from UserBotLatencyObserver
if not context.response_speed_latencies:
latencies, empty_error = self._get_latencies(context)

if not latencies:
return MetricScore(
name=self.name,
score=0.0,
normalized_score=None,
error="No response latencies available (UserBotLatencyObserver data missing)",
error=empty_error,
)

# Use latencies measured by Pipecat's UserBotLatencyObserver
# These measure the time from user stopped speaking to assistant started speaking
speeds = []
per_turn_speeds = []

for response_speed in context.response_speed_latencies:
# Filter out invalid values (negative or extremely large)
if 0 < response_speed < 1000: # Sanity check: under 1000 seconds
speeds.append(response_speed)
per_turn_speeds.append(round(response_speed, 3))
for latency in latencies:
if 0 < latency < 1000:
speeds.append(latency)
per_turn_speeds.append(round(latency, 3))
else:
self.logger.warning(
f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds"
f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds"
)

if not speeds:
Expand All @@ -63,19 +102,76 @@ async def compute(self, context: MetricContext) -> MetricScore:
)

mean_speed = sum(speeds) / len(speeds)
max_speed = max(speeds)

return MetricScore(
name=self.name,
score=round(mean_speed, 3), # Mean response speed in seconds
normalized_score=None, # Raw latency in seconds; not normalizable to [0,1]
score=round(mean_speed, 3),
normalized_score=None,
details={
"mean_speed_seconds": round(mean_speed, 3),
"max_speed_seconds": round(max_speed, 3),
"max_speed_seconds": round(max(speeds), 3),
"num_turns": len(speeds),
"per_turn_speeds": per_turn_speeds,
},
)

except Exception as e:
return self._handle_error(e, context)


@register_metric
class ResponseSpeedMetric(_ResponseSpeedBase):
    """Overall response-speed metric across all assistant turns.

    Uses Pipecat's UserBotLatencyObserver measurements: each latency is the
    elapsed time between the end of the user's utterance and the start of
    the assistant's response. Values are reported as raw seconds with no
    normalization.

    Diagnostic only — surfaces model performance issues and is not part of
    final evaluation scores.
    """

    name = "response_speed"
    description = "Debug metric: latency between user utterance end and assistant response start"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        latencies = context.response_speed_latencies
        empty_error = "No response latencies available (UserBotLatencyObserver data missing)"
        return latencies, empty_error


@register_metric
class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase):
    """Response speed restricted to turns where the assistant made at least one tool call.

    Uses per_turn_latency from the turn_taking metric and filters to turns
    that contain a tool_call entry in the conversation trace.
    This is a diagnostic metric not used in final evaluation scores.
    """

    name = "response_speed_with_tool_calls"
    description = "Debug metric: response latency for turns that included a tool call"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        # Keep only the with-tool partition; the no-tool half is handled by
        # ResponseSpeedNoToolCallsMetric.
        with_tool, _ = _split_turn_taking_latencies_by_tool_calls(context)
        return with_tool, "No turns with tool calls found (or turn_taking latency data unavailable)"


@register_metric
class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase):
    """Response speed over assistant turns that involved no tool calls.

    Complements ResponseSpeedWithToolCallsMetric: the turn_taking metric's
    per_turn_latency values are partitioned via the conversation trace, and
    this metric keeps only the turns without a tool_call entry.
    Diagnostic only; not used in final evaluation scores.
    """

    name = "response_speed_no_tool_calls"
    description = "Debug metric: response latency for turns that did not include a tool call"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        no_tool = _split_turn_taking_latencies_by_tool_calls(context)[1]
        return no_tool, "No turns without tool calls found (or turn_taking latency data unavailable)"
19 changes: 17 additions & 2 deletions src/eva/metrics/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@

logger = get_logger(__name__)


def _resolve_path(stored: str | None, fallback: Path) -> str | Path:
"""Return *stored* if it exists on disk, otherwise *fallback*.

Allows metrics to re-run correctly when a run directory has been moved:
the stored path reflects the original location, but the file is now at
*fallback* (i.e. output_dir / filename).
"""
if stored and Path(stored).exists():
return stored
return fallback


# Elevenlabs audio user field → _ProcessorContext attribute name
AUDIO_ATTR = {
"pipecat_agent": "audio_timestamps_assistant_turns",
Expand Down Expand Up @@ -824,8 +837,10 @@ def _build_history(
Each entry: {timestamp_ms, source, event_type, data}.
"""
history = self._load_audit_log_transcript(output_dir)
history.extend(self._load_pipecat_logs(result.pipecat_logs_path))
history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path))
pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl")
history.extend(self._load_pipecat_logs(pipecat_path))
elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl")
history.extend(self._load_elevenlabs_logs(elevenlabs_path))

history.sort(key=lambda e: e["timestamp_ms"])
context.history = history
Expand Down
13 changes: 10 additions & 3 deletions src/eva/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
if not has_redacted:
continue
if name not in live_by_name:
raise ValueError(
f"Cannot restore secrets: deployment {name!r} not found in "
f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
active_llm = getattr(self.model, "llm", None)
if name == active_llm:
raise ValueError(
f"Cannot restore secrets: deployment {name!r} not found in "
f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
)
logger.warning(
f"Deployment {name!r} has redacted secrets but is not in the current "
f"EVA_MODEL_LIST — skipping (not used in this run)."
)
continue
live_params = live_by_name[name].get("litellm_params", {})
for key, value in saved_params.items():
if value == "***" and key in live_params:
Expand Down
Loading
Loading