Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def _build_metric_group_map() -> dict[str, str]:
"Other": "#AAAAAA",
}

_NON_NORMALIZED_METRICS = {"response_speed"}
_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"}

# EVA composite scores to show in the bar chart
_EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"]
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"]
simulation_version = "0.1.0"
# Bump when metrics pipeline changes (metrics code, judge prompts, pricing,
# postprocessor). Old metric results become stale — cheap to recompute.
metrics_version = "0.1.1"
metrics_version = "0.1.2"

[tool.mypy]
python_version = "3.11"
Expand Down
2 changes: 1 addition & 1 deletion src/eva/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@

# Bump metrics_version when changes affect metric computation (metrics code,
# judge prompts, pricing tables, postprocessor).
metrics_version = "0.1.1"
metrics_version = "0.1.2"
150 changes: 123 additions & 27 deletions src/eva/metrics/diagnostic/response_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,54 +4,93 @@
final evaluation scores.
"""

import json
from abc import abstractmethod
from pathlib import Path

from eva.metrics.base import CodeMetric, MetricContext
from eva.metrics.registry import register_metric
from eva.models.results import MetricScore


@register_metric
class ResponseSpeedMetric(CodeMetric):
"""Response speed metric.
def _split_turn_taking_latencies_by_tool_calls(
context: MetricContext,
) -> tuple[list[float], list[float]]:
"""Partition turn_taking per_turn_latency values into (with_tool_calls, no_tool_calls).

Measures the elapsed time between the end of the user's utterance
and the beginning of the assistant's response.
Reads metrics/turn_taking/details/per_turn_latency from the record's
metrics.json, then checks conversation_trace to determine which turn_ids
had at least one tool call.

Reports raw latency values in seconds — no normalization applied.
Returns:
(with_tool_latencies, no_tool_latencies)
"""
if not context.output_dir:
return [], []

This is a diagnostic metric used for diagnosing model performance issues.
It is not directly used in final evaluation scores.
metrics_path = Path(context.output_dir) / "metrics.json"
if not metrics_path.exists():
return [], []

with open(metrics_path) as f:
data = json.load(f)

per_turn_latency: dict[str, float] = (
data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {})
)
if not per_turn_latency:
return [], []

tool_call_turn_ids = {
entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call"
}

with_tool: list[float] = []
no_tool: list[float] = []
for turn_id_str, latency in per_turn_latency.items():
if int(turn_id_str) in tool_call_turn_ids:
with_tool.append(latency)
else:
no_tool.append(latency)

return with_tool, no_tool


class _ResponseSpeedBase(CodeMetric):
"""Base class for response-speed metrics.

Subclasses implement `_get_latencies` to return the subset of latencies
to compute over; everything else is shared.
"""

name = "response_speed"
description = "Debug metric: latency between user utterance end and assistant response start"
category = "diagnostic"
exclude_from_pass_at_k = True

@abstractmethod
def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
"""Return (latencies, error_if_empty) for this metric variant."""

async def compute(self, context: MetricContext) -> MetricScore:
"""Compute response speed from Pipecat's UserBotLatencyObserver measurements."""
try:
# Check if we have response speed latencies from UserBotLatencyObserver
if not context.response_speed_latencies:
latencies, empty_error = self._get_latencies(context)

if not latencies:
return MetricScore(
name=self.name,
score=0.0,
normalized_score=None,
error="No response latencies available (UserBotLatencyObserver data missing)",
error=empty_error,
)

# Use latencies measured by Pipecat's UserBotLatencyObserver
# These measure the time from user stopped speaking to assistant started speaking
speeds = []
per_turn_speeds = []

for response_speed in context.response_speed_latencies:
# Filter out invalid values (negative or extremely large)
if 0 < response_speed < 1000: # Sanity check: under 1000 seconds
speeds.append(response_speed)
per_turn_speeds.append(round(response_speed, 3))
for latency in latencies:
if 0 < latency < 1000:
speeds.append(latency)
per_turn_speeds.append(round(latency, 3))
else:
self.logger.warning(
f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds"
f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds"
)

if not speeds:
Expand All @@ -63,19 +102,76 @@ async def compute(self, context: MetricContext) -> MetricScore:
)

mean_speed = sum(speeds) / len(speeds)
max_speed = max(speeds)

return MetricScore(
name=self.name,
score=round(mean_speed, 3), # Mean response speed in seconds
normalized_score=None, # Raw latency in seconds; not normalizable to [0,1]
score=round(mean_speed, 3),
normalized_score=None,
details={
"mean_speed_seconds": round(mean_speed, 3),
"max_speed_seconds": round(max_speed, 3),
"max_speed_seconds": round(max(speeds), 3),
"num_turns": len(speeds),
"per_turn_speeds": per_turn_speeds,
},
)

except Exception as e:
return self._handle_error(e, context)


@register_metric
class ResponseSpeedMetric(_ResponseSpeedBase):
    """Overall response-speed metric across all assistant turns.

    Uses Pipecat's UserBotLatencyObserver measurements: each latency is the
    elapsed time between the end of the user's utterance and the start of
    the assistant's response. Values are reported as raw seconds with no
    normalization.

    Diagnostic only — surfaces model performance issues and is not part of
    final evaluation scores.
    """

    name = "response_speed"
    description = "Debug metric: latency between user utterance end and assistant response start"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        latencies = context.response_speed_latencies
        empty_error = "No response latencies available (UserBotLatencyObserver data missing)"
        return latencies, empty_error


@register_metric
class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase):
    """Response speed restricted to turns where the assistant made at least one tool call.

    Uses per_turn_latency from the turn_taking metric and filters to turns
    that contain a tool_call entry in the conversation trace.
    This is a diagnostic metric not used in final evaluation scores.
    """

    name = "response_speed_with_tool_calls"
    description = "Debug metric: response latency for turns that included a tool call"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        # Keep only the with-tool partition; the no-tool half is handled by
        # ResponseSpeedNoToolCallsMetric.
        with_tool, _ = _split_turn_taking_latencies_by_tool_calls(context)
        return with_tool, "No turns with tool calls found (or turn_taking latency data unavailable)"


@register_metric
class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase):
    """Response speed over assistant turns that involved no tool calls.

    Complements ResponseSpeedWithToolCallsMetric: the turn_taking metric's
    per_turn_latency values are partitioned via the conversation trace, and
    this metric keeps only the turns without a tool_call entry.
    Diagnostic only; not used in final evaluation scores.
    """

    name = "response_speed_no_tool_calls"
    description = "Debug metric: response latency for turns that did not include a tool call"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        no_tool = _split_turn_taking_latencies_by_tool_calls(context)[1]
        return no_tool, "No turns without tool calls found (or turn_taking latency data unavailable)"
19 changes: 17 additions & 2 deletions src/eva/metrics/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,19 @@

logger = get_logger(__name__)


def _resolve_path(stored: str | None, fallback: Path) -> str | Path:
"""Return *stored* if it exists on disk, otherwise *fallback*.

Allows metrics to re-run correctly when a run directory has been moved:
the stored path reflects the original location, but the file is now at
*fallback* (i.e. output_dir / filename).
"""
if stored and Path(stored).exists():
return stored
return fallback


# Elevenlabs audio user field → _ProcessorContext attribute name
AUDIO_ATTR = {
"pipecat_agent": "audio_timestamps_assistant_turns",
Expand Down Expand Up @@ -824,8 +837,10 @@ def _build_history(
Each entry: {timestamp_ms, source, event_type, data}.
"""
history = self._load_audit_log_transcript(output_dir)
history.extend(self._load_pipecat_logs(result.pipecat_logs_path))
history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path))
pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl")
history.extend(self._load_pipecat_logs(pipecat_path))
elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl")
history.extend(self._load_elevenlabs_logs(elevenlabs_path))

history.sort(key=lambda e: e["timestamp_ms"])
context.history = history
Expand Down
13 changes: 10 additions & 3 deletions src/eva/models/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None:
if not has_redacted:
continue
if name not in live_by_name:
raise ValueError(
f"Cannot restore secrets: deployment {name!r} not found in "
f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
active_llm = getattr(self.model, "llm", None)
if name == active_llm:
raise ValueError(
f"Cannot restore secrets: deployment {name!r} not found in "
f"current EVA_MODEL_LIST (available: {list(live_by_name)})"
)
logger.warning(
f"Deployment {name!r} has redacted secrets but is not in the current "
f"EVA_MODEL_LIST — skipping (not used in this run)."
)
continue
live_params = live_by_name[name].get("litellm_params", {})
for key, value in saved_params.items():
if value == "***" and key in live_params:
Expand Down
Loading
Loading