diff --git a/apps/analysis.py b/apps/analysis.py index 4e651752..9bb15520 100644 --- a/apps/analysis.py +++ b/apps/analysis.py @@ -75,7 +75,7 @@ def _build_metric_group_map() -> dict[str, str]: "Other": "#AAAAAA", } -_NON_NORMALIZED_METRICS = {"response_speed"} +_NON_NORMALIZED_METRICS = {"response_speed", "response_speed_with_tool_calls", "response_speed_no_tool_calls"} # EVA composite scores to show in the bar chart _EVA_BAR_COMPOSITES = ["EVA-A_pass", "EVA-X_pass", "EVA-A_mean", "EVA-X_mean"] diff --git a/pyproject.toml b/pyproject.toml index 561cba2d..47827e98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ ignore = ["D203", "D206", "D213", "D400", "D401", "D413", "D415", "E1", "E501"] simulation_version = "0.1.0" # Bump when metrics pipeline changes (metrics code, judge prompts, pricing, # postprocessor). Old metric results become stale — cheap to recompute. -metrics_version = "0.1.1" +metrics_version = "0.1.2" [tool.mypy] python_version = "3.11" diff --git a/src/eva/__init__.py b/src/eva/__init__.py index 6796f4aa..03f1f13b 100644 --- a/src/eva/__init__.py +++ b/src/eva/__init__.py @@ -11,4 +11,4 @@ # Bump metrics_version when changes affect metric computation (metrics code, # judge prompts, pricing tables, postprocessor). -metrics_version = "0.1.1" +metrics_version = "0.1.2" diff --git a/src/eva/metrics/diagnostic/response_speed.py b/src/eva/metrics/diagnostic/response_speed.py index 0dd4fb53..224ebc49 100644 --- a/src/eva/metrics/diagnostic/response_speed.py +++ b/src/eva/metrics/diagnostic/response_speed.py @@ -4,54 +4,93 @@ final evaluation scores. """ +import json +from abc import abstractmethod +from pathlib import Path + from eva.metrics.base import CodeMetric, MetricContext from eva.metrics.registry import register_metric from eva.models.results import MetricScore -@register_metric -class ResponseSpeedMetric(CodeMetric): - """Response speed metric. 
+def _split_turn_taking_latencies_by_tool_calls( + context: MetricContext, +) -> tuple[list[float], list[float]]: + """Partition turn_taking per_turn_latency values into (with_tool_calls, no_tool_calls). - Measures the elapsed time between the end of the user's utterance - and the beginning of the assistant's response. + Reads metrics/turn_taking/details/per_turn_latency from the record's + metrics.json, then checks conversation_trace to determine which turn_ids + had at least one tool call. - Reports raw latency values in seconds — no normalization applied. + Returns: + (with_tool_latencies, no_tool_latencies) + """ + if not context.output_dir: + return [], [] - This is a diagnostic metric used for diagnosing model performance issues. - It is not directly used in final evaluation scores. + metrics_path = Path(context.output_dir) / "metrics.json" + if not metrics_path.exists(): + return [], [] + + with open(metrics_path) as f: + data = json.load(f) + + per_turn_latency: dict[str, float] = ( + data.get("metrics", {}).get("turn_taking", {}).get("details", {}).get("per_turn_latency", {}) + ) + if not per_turn_latency: + return [], [] + + tool_call_turn_ids = { + entry["turn_id"] for entry in (context.conversation_trace or []) if entry.get("type") == "tool_call" + } + + with_tool: list[float] = [] + no_tool: list[float] = [] + for turn_id_str, latency in per_turn_latency.items(): + if int(turn_id_str) in tool_call_turn_ids: + with_tool.append(latency) + else: + no_tool.append(latency) + + return with_tool, no_tool + + +class _ResponseSpeedBase(CodeMetric): + """Base class for response-speed metrics. + + Subclasses implement `_get_latencies` to return the subset of latencies + to compute over; everything else is shared. 
""" - name = "response_speed" - description = "Debug metric: latency between user utterance end and assistant response start" category = "diagnostic" exclude_from_pass_at_k = True + @abstractmethod + def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]: + """Return (latencies, error_if_empty) for this metric variant.""" + async def compute(self, context: MetricContext) -> MetricScore: - """Compute response speed from Pipecat's UserBotLatencyObserver measurements.""" try: - # Check if we have response speed latencies from UserBotLatencyObserver - if not context.response_speed_latencies: + latencies, empty_error = self._get_latencies(context) + + if not latencies: return MetricScore( name=self.name, score=0.0, normalized_score=None, - error="No response latencies available (UserBotLatencyObserver data missing)", + error=empty_error, ) - # Use latencies measured by Pipecat's UserBotLatencyObserver - # These measure the time from user stopped speaking to assistant started speaking speeds = [] per_turn_speeds = [] - - for response_speed in context.response_speed_latencies: - # Filter out invalid values (negative or extremely large) - if 0 < response_speed < 1000: # Sanity check: under 1000 seconds - speeds.append(response_speed) - per_turn_speeds.append(round(response_speed, 3)) + for latency in latencies: + if 0 < latency < 1000: + speeds.append(latency) + per_turn_speeds.append(round(latency, 3)) else: self.logger.warning( - f"[{context.record_id}] Unusual response speed detected and dropped: {response_speed} seconds" + f"[{context.record_id}] Unusual response speed detected and dropped: {latency} seconds" ) if not speeds: @@ -63,15 +102,14 @@ async def compute(self, context: MetricContext) -> MetricScore: ) mean_speed = sum(speeds) / len(speeds) - max_speed = max(speeds) return MetricScore( name=self.name, - score=round(mean_speed, 3), # Mean response speed in seconds - normalized_score=None, # Raw latency in seconds; not normalizable to [0,1] 
@register_metric
class ResponseSpeedMetric(_ResponseSpeedBase):
    """Response speed metric.

    Measures the elapsed time between the end of the user's utterance
    and the beginning of the assistant's response, using Pipecat's
    UserBotLatencyObserver measurements.

    Reports raw latency values in seconds — no normalization applied.

    This is a diagnostic metric used for diagnosing model performance issues.
    It is not directly used in final evaluation scores.
    """

    name = "response_speed"
    description = "Debug metric: latency between user utterance end and assistant response start"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        """Return the latencies stored on the context and the error text for the empty case."""
        # Copy into a fresh list: the attribute may be empty or unset, and the
        # declared return type promises a list either way.
        return (
            list(context.response_speed_latencies or []),
            "No response latencies available (UserBotLatencyObserver data missing)",
        )


@register_metric
class ResponseSpeedWithToolCallsMetric(_ResponseSpeedBase):
    """Response speed restricted to turns where the assistant made at least one tool call.

    Uses per_turn_latency from the turn_taking metric and filters to turns
    that contain a tool_call entry in the conversation trace.
    This is a diagnostic metric not used in final evaluation scores.
    """

    name = "response_speed_with_tool_calls"
    description = "Debug metric: response latency for turns that included a tool call"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        """Return the tool-call bucket of the split and the error text for the empty case."""
        with_tool, _ = _split_turn_taking_latencies_by_tool_calls(context)
        return with_tool, "No turns with tool calls found (or turn_taking latency data unavailable)"


@register_metric
class ResponseSpeedNoToolCallsMetric(_ResponseSpeedBase):
    """Response speed restricted to turns where the assistant made no tool calls.

    Uses per_turn_latency from the turn_taking metric and filters to turns
    that contain no tool_call entry in the conversation trace.
    This is a diagnostic metric not used in final evaluation scores.
    """

    name = "response_speed_no_tool_calls"
    description = "Debug metric: response latency for turns that did not include a tool call"

    def _get_latencies(self, context: MetricContext) -> tuple[list[float], str]:
        """Return the no-tool-call bucket of the split and the error text for the empty case."""
        _, no_tool = _split_turn_taking_latencies_by_tool_calls(context)
        return no_tool, "No turns without tool calls found (or turn_taking latency data unavailable)"


# --- src/eva/metrics/processor.py ---


def _resolve_path(stored: str | None, fallback: Path) -> str | Path:
    """Return *stored* if it names an existing file, otherwise *fallback*.

    Allows metrics to re-run correctly when a run directory has been moved:
    the stored path reflects the original location, but the file is now at
    *fallback* (i.e. output_dir / filename).

    Args:
        stored: Path recorded at run time; may be ``None`` or empty.
        fallback: Location to use when *stored* is unusable.

    Returns:
        *stored* unchanged (as a string) when it points at a file on disk,
        else *fallback*.
    """
    # is_file() rather than exists(): a stale path that now resolves to a
    # directory is not a readable log file, so fall back in that case too.
    if stored and Path(stored).is_file():
        return stored
    return fallback
""" history = self._load_audit_log_transcript(output_dir) - history.extend(self._load_pipecat_logs(result.pipecat_logs_path)) - history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path)) + pipecat_path = _resolve_path(result.pipecat_logs_path, output_dir / "pipecat_logs.jsonl") + history.extend(self._load_pipecat_logs(pipecat_path)) + elevenlabs_path = _resolve_path(result.elevenlabs_logs_path, output_dir / "elevenlabs_events.jsonl") + history.extend(self._load_elevenlabs_logs(elevenlabs_path)) history.sort(key=lambda e: e["timestamp_ms"]) context.history = history diff --git a/src/eva/models/config.py b/src/eva/models/config.py index e08783bd..f3885c54 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -648,10 +648,17 @@ def apply_env_overrides(self, live: "RunConfig") -> None: if not has_redacted: continue if name not in live_by_name: - raise ValueError( - f"Cannot restore secrets: deployment {name!r} not found in " - f"current EVA_MODEL_LIST (available: {list(live_by_name)})" + active_llm = getattr(self.model, "llm", None) + if name == active_llm: + raise ValueError( + f"Cannot restore secrets: deployment {name!r} not found in " + f"current EVA_MODEL_LIST (available: {list(live_by_name)})" + ) + logger.warning( + f"Deployment {name!r} has redacted secrets but is not in the current " + f"EVA_MODEL_LIST — skipping (not used in this run)." 
def _write_metrics_json(tmp_path, per_turn_latency: dict) -> None:
    """Write a minimal metrics.json with turn_taking per_turn_latency data.

    Args:
        tmp_path: Directory (a path object supporting ``/``) to write into.
        per_turn_latency: Mapping stored under
            ``metrics -> turn_taking -> details -> per_turn_latency``.
    """
    data = {
        "metrics": {
            "turn_taking": {
                "details": {
                    "per_turn_latency": per_turn_latency,
                }
            }
        }
    }
    # Explicit encoding keeps the fixture independent of the platform default.
    (tmp_path / "metrics.json").write_text(json.dumps(data), encoding="utf-8")


def _make_trace(tool_call_turn_ids: set[int], all_turn_ids: set[int]) -> list[dict]:
    """Build a minimal conversation_trace with the given turn structure.

    Every turn gets a ``transcribed`` entry; turns listed in
    *tool_call_turn_ids* additionally get a ``tool_call`` and a
    ``tool_response`` entry. Turns are emitted in ascending id order.

    Raises:
        ValueError: If *tool_call_turn_ids* names a turn missing from
            *all_turn_ids* (such a turn would otherwise be dropped silently).
    """
    unknown = set(tool_call_turn_ids) - set(all_turn_ids)
    if unknown:
        raise ValueError(f"tool_call_turn_ids not present in all_turn_ids: {sorted(unknown)}")

    trace: list[dict] = []
    for tid in sorted(all_turn_ids):
        trace.append({"turn_id": tid, "type": "transcribed", "content": "user utterance"})
        if tid in tool_call_turn_ids:
            trace.append({"turn_id": tid, "type": "tool_call", "tool_name": "some_tool"})
            trace.append({"turn_id": tid, "type": "tool_response", "tool_name": "some_tool"})
    return trace
# ---------------------------------------------------------------------------
# ResponseSpeedWithToolCallsMetric
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
class TestResponseSpeedWithToolCallsMetric:
    """Tests for the response-speed variant limited to tool-call turns."""

    async def test_no_output_dir(self):
        """A default context (no output_dir passed) produces an error score."""
        outcome = await ResponseSpeedWithToolCallsMetric().compute(make_metric_context())

        assert outcome.score == 0.0
        assert outcome.error is not None

    async def test_missing_metrics_json(self, tmp_path):
        """An output_dir without metrics.json produces an error score."""
        outcome = await ResponseSpeedWithToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path),
        )

        assert outcome.score == 0.0
        assert outcome.error is not None

    async def test_missing_turn_taking_data(self, tmp_path):
        """A metrics.json lacking the turn_taking entry produces an error score."""
        empty_payload = json.dumps({"metrics": {}})
        (tmp_path / "metrics.json").write_text(empty_payload)

        outcome = await ResponseSpeedWithToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path),
        )

        assert outcome.score == 0.0
        assert outcome.error is not None

    async def test_no_turns_with_tool_calls(self, tmp_path):
        """With no tool-call turns in the trace, the 'not found' error is reported."""
        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
        turns = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3})

        outcome = await ResponseSpeedWithToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path, conversation_trace=turns),
        )

        assert outcome.score == 0.0
        assert outcome.error is not None
        assert "No turns with tool calls" in outcome.error

    async def test_mixed_turns(self, tmp_path):
        """Only the latencies of tool-call turns (2 and 4) are aggregated."""
        _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0})
        turns = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4})

        outcome = await ResponseSpeedWithToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path, conversation_trace=turns),
        )

        assert outcome.error is None
        assert outcome.details["num_turns"] == 2
        assert outcome.score == pytest.approx((5.0 + 7.0) / 2)
        assert outcome.details["max_speed_seconds"] == pytest.approx(7.0)
        assert outcome.details["per_turn_speeds"] == [5.0, 7.0]

    async def test_all_turns_have_tool_calls(self, tmp_path):
        """If every turn has a tool call, every latency is aggregated."""
        _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0})
        turns = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2})

        outcome = await ResponseSpeedWithToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path, conversation_trace=turns),
        )

        assert outcome.error is None
        assert outcome.details["num_turns"] == 2
        assert outcome.score == pytest.approx(3.0)

    async def test_filters_invalid_latency_values(self, tmp_path):
        """The sanity filter (0 < x < 1000) is applied to per_turn_latency values."""
        _write_metrics_json(tmp_path, {"1": -1.0, "2": 5.0, "3": 2000.0, "4": 3.0})
        every_turn = {1, 2, 3, 4}
        turns = _make_trace(tool_call_turn_ids=every_turn, all_turn_ids=every_turn)

        outcome = await ResponseSpeedWithToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path, conversation_trace=turns),
        )

        assert outcome.error is None
        assert outcome.details["num_turns"] == 2  # only 5.0 and 3.0 pass
        assert outcome.score == pytest.approx((5.0 + 3.0) / 2)
# ---------------------------------------------------------------------------
# ResponseSpeedNoToolCallsMetric
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
class TestResponseSpeedNoToolCallsMetric:
    """Tests for the response-speed variant limited to turns without tool calls."""

    async def test_no_output_dir(self):
        """A default context (no output_dir passed) produces an error score."""
        outcome = await ResponseSpeedNoToolCallsMetric().compute(make_metric_context())

        assert outcome.score == 0.0
        assert outcome.error is not None

    async def test_missing_metrics_json(self, tmp_path):
        """An output_dir without metrics.json produces an error score."""
        outcome = await ResponseSpeedNoToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path),
        )

        assert outcome.score == 0.0
        assert outcome.error is not None

    async def test_all_turns_have_tool_calls(self, tmp_path):
        """If every turn has a tool call, the no-tool bucket is empty and an error is reported."""
        _write_metrics_json(tmp_path, {"1": 2.0, "2": 4.0})
        turns = _make_trace(tool_call_turn_ids={1, 2}, all_turn_ids={1, 2})

        outcome = await ResponseSpeedNoToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path, conversation_trace=turns),
        )

        assert outcome.score == 0.0
        assert outcome.error is not None
        assert "No turns without tool calls" in outcome.error

    async def test_mixed_turns(self, tmp_path):
        """Only the latencies of turns without tool calls (1 and 3) are aggregated."""
        _write_metrics_json(tmp_path, {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0})
        turns = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4})

        outcome = await ResponseSpeedNoToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path, conversation_trace=turns),
        )

        assert outcome.error is None
        assert outcome.details["num_turns"] == 2
        assert outcome.score == pytest.approx((1.0 + 3.0) / 2)
        assert outcome.details["max_speed_seconds"] == pytest.approx(3.0)
        assert outcome.details["per_turn_speeds"] == [1.0, 3.0]

    async def test_no_turns_with_tool_calls(self, tmp_path):
        """With no tool-call turns in the trace, every latency is aggregated."""
        _write_metrics_json(tmp_path, {"1": 1.0, "2": 2.0, "3": 3.0})
        turns = _make_trace(tool_call_turn_ids=set(), all_turn_ids={1, 2, 3})

        outcome = await ResponseSpeedNoToolCallsMetric().compute(
            make_metric_context(output_dir=tmp_path, conversation_trace=turns),
        )

        assert outcome.error is None
        assert outcome.details["num_turns"] == 3
        assert outcome.score == pytest.approx(2.0)

    async def test_with_and_no_tool_split_is_exhaustive(self, tmp_path):
        """The two variants' per-turn speeds together equal all per_turn_latency values."""
        latencies_by_turn = {"1": 1.0, "2": 5.0, "3": 3.0, "4": 7.0, "5": 2.0}
        _write_metrics_json(tmp_path, latencies_by_turn)
        turns = _make_trace(tool_call_turn_ids={2, 4}, all_turn_ids={1, 2, 3, 4, 5})

        context_for_with = make_metric_context(output_dir=tmp_path, conversation_trace=turns)
        context_for_without = make_metric_context(output_dir=tmp_path, conversation_trace=turns)

        with_outcome = await ResponseSpeedWithToolCallsMetric().compute(context_for_with)
        without_outcome = await ResponseSpeedNoToolCallsMetric().compute(context_for_without)

        merged = with_outcome.details["per_turn_speeds"] + without_outcome.details["per_turn_speeds"]
        assert sorted(merged) == sorted(latencies_by_turn.values())