diff --git a/docs/metric_context.md b/docs/metric_context.md index dbcabb06..744d57f0 100644 --- a/docs/metric_context.md +++ b/docs/metric_context.md @@ -181,7 +181,7 @@ Counts and flags computed during benchmark execution. - `"transfer"`: Assistant transferred to live agent - `"error"`: An error occurred - **`duration_seconds: float`** - Total duration of the conversation in seconds. -- **`is_audio_native: bool`** - Whether this conversation used an audio-native architecture. Metrics should check this flag to adjust behavior (e.g., audio-native uses intended user text in conversation_trace). +- **`pipeline_type: PipelineType`** - The pipeline architecture used (`CASCADE`, `AUDIO_LLM`, or `S2S`). Access `context.is_audio_native` for a convenience boolean that returns `True` for both `AUDIO_LLM` and `S2S`. - **`response_speed_latencies: list[float]`** - List of response latencies in seconds (time from user speech end to assistant speech start). ### File Paths @@ -212,11 +212,11 @@ The LLM processes **transcribed text**, so `transcribed_user_turns` reflects wha The model processes **raw audio**. The audit log may contain a transcript from the service's own secondary STT, but this is **not what the model used** — it's just for reference. This is why `transcribed_user_turns` is unreliable for audio-native models and `intended_user_turns` should be used instead. -Check `context.is_audio_native` (audio-native) to determine which mode was used. +Check `context.pipeline_type` to determine which mode was used, or `context.is_audio_native` for a boolean grouping of `S2S` and `AUDIO_LLM`. ### Writing Audio-Native-Aware Metrics -If your metric needs user text directly (rather than via `conversation_trace`, which handles this automatically), branch on `context.is_audio_native` (audio-native): +If your metric needs user text directly (rather than via `conversation_trace`, which handles this automatically), branch on `context.is_audio_native`: ```python async def compute(self, context: MetricContext) -> MetricScore: diff --git a/src/eva/metrics/base.py b/src/eva/metrics/base.py index ade4722b..865b10ad 100644 --- a/src/eva/metrics/base.py +++ b/src/eva/metrics/base.py @@ -19,6 +19,7 @@ resolve_turn_id, validate_rating, ) +from eva.models.config import PipelineType from eva.models.results import MetricScore from eva.utils.llm_client import LLMClient from eva.utils.logging import get_logger @@ -84,7 +85,7 @@ def __init__( response_speed_latencies: list[float] | None = None, assistant_interrupted_turns: set[int] | None = None, user_interrupted_turns: set[int] | None = None, - is_audio_native: bool = False, + pipeline_type: PipelineType = PipelineType.CASCADE, ): self.record_id = record_id @@ -134,7 +135,11 @@ def __init__( self.response_speed_latencies = response_speed_latencies or [] self.assistant_interrupted_turns = assistant_interrupted_turns or set() self.user_interrupted_turns = user_interrupted_turns or set() - self.is_audio_native = is_audio_native + self.pipeline_type = pipeline_type + + @property + def is_audio_native(self) -> bool: + return self.pipeline_type in (PipelineType.S2S, PipelineType.AUDIO_LLM) def to_dict(self) -> dict[str, Any]: """Convert MetricContext to a serializable dictionary.""" diff --git a/src/eva/metrics/processor.py b/src/eva/metrics/processor.py index 660e7ce9..0a9e7375 100644 --- a/src/eva/metrics/processor.py +++ b/src/eva/metrics/processor.py @@ -6,6 +6,7 @@ from pathlib import Path from eva.assistant.agentic.system import GENERIC_ERROR +from eva.models.config import PipelineType from eva.models.results import ConversationResult from eva.utils.log_processing import ( AnnotationLabel, @@ -138,7 +139,7 @@ def _process_user_speech( state: _TurnExtractionState, context: "_ProcessorContext", conversation_trace: list[dict], - is_audio_native: bool, + pipeline_type: PipelineType, ) -> None: """Process a single user_speech event into intended_user_turns (and audio-native trace).""" turn_idx = state.last_user_audio_turn @@ -150,7 +151,7 @@ def _process_user_speech( append_turn_text(context.intended_user_turns, turn_idx, user_text, sep) state.user_speech_in_session = True # For audio-native models, use intended user text in the conversation trace - if is_audio_native: + if pipeline_type in (PipelineType.S2S, PipelineType.AUDIO_LLM): trace_entry = { "role": "user", "content": user_text, @@ -186,7 +187,7 @@ def _handle_audit_log_event( state: "_TurnExtractionState", context: "_ProcessorContext", conversation_trace: list[dict], - is_audio_native: bool, + pipeline_type: PipelineType, ) -> None: """Process a single audit_log source event into turn variables and conversation trace.""" if event["event_type"] == "user": @@ -212,12 +213,14 @@ def _handle_audit_log_event( entry["content"] = f"{AnnotationLabel.USER_INTERRUPTS} {entry['content']}" state.pending_user_interrupts_label = False # For audio-native models, user trace entries come from ElevenLabs user_speech instead - if not is_audio_native: + if pipeline_type == PipelineType.CASCADE: conversation_trace.append(entry) sep = _user_transcript_separator(existing, turn, state) append_turn_text(context.transcribed_user_turns, turn, entry["content"], sep) elif event["event_type"] == "assistant": + if pipeline_type == PipelineType.S2S: + return turn = state.turn_num content = event["data"] # Apply interruption prefix if this is the first assistant entry in a turn where assistant barged in @@ -226,7 +229,7 @@ def _handle_audit_log_event( has_prior = any(e.get("role") == "assistant" and e.get("turn_id") == turn for e in conversation_trace) if not has_prior: content = f"{AnnotationLabel.ASSISTANT_INTERRUPTS} {content}" - user_entry_type = "intended" if is_audio_native else "transcribed" + user_entry_type = "transcribed" if pipeline_type == PipelineType.CASCADE else "intended" annotate_last_entry( conversation_trace, turn, "user", user_entry_type, AnnotationLabel.CUT_OFF_BY_ASSISTANT ) @@ -303,7 +306,7 @@ def _handle_audio_start( state: "_TurnExtractionState", context: "_ProcessorContext", conversation_trace: list[dict], - is_audio_native: bool, + pipeline_type: PipelineType, ) -> None: """Process an ElevenLabs audio_start event, advancing the turn counter if needed.""" role = event["data"]["user"] @@ -345,7 +348,7 @@ def _handle_audio_start( # Replay any buffered user_speech that arrived before this audio_start — now we know the correct turn. if state.buffered_user_speech: for buffered in state.buffered_user_speech: - _process_user_speech(buffered, state, context, conversation_trace, is_audio_native) + _process_user_speech(buffered, state, context, conversation_trace, pipeline_type) state.buffered_user_speech.clear() elif role == "pipecat_agent": @@ -394,7 +397,7 @@ def _handle_elevenlabs_event( state: "_TurnExtractionState", context: "_ProcessorContext", conversation_trace: list[dict], - is_audio_native: bool, + pipeline_type: PipelineType, ) -> bool: """Process a single elevenlabs source event. Returns True if the caller should continue.""" if event["event_type"] == "assistant_speech": @@ -403,7 +406,7 @@ def _handle_elevenlabs_event( turn = state.last_assistant_audio_turn # Only mark "assistant spoke" if the speech belongs to the current turn; late transcripts from a # previous turn must not trigger a spurious turn advance. - if turn == state.turn_num: + if turn == state.turn_num or pipeline_type == PipelineType.S2S: state.assistant_spoke_in_turn = True existing = context.transcribed_assistant_turns.get(turn, "") sep = _assistant_speech_separator(existing, turn, state) @@ -411,6 +414,17 @@ def _handle_elevenlabs_event( if not existing and turn in state.assistant_interrupted_turns: text = f"{AnnotationLabel.ASSISTANT_INTERRUPTS} {text}" append_turn_text(context.transcribed_assistant_turns, turn, text, sep) + # For S2S, assistant trace entries come from EL (audit log assistant entries are skipped) + if pipeline_type == PipelineType.S2S: + conversation_trace.append( + { + "role": "assistant", + "content": text, + "timestamp": event["timestamp_ms"], + "type": "transcribed", + "turn_id": turn, + } + ) elif event["event_type"] == "user_speech": # Buffer user_speech when it cannot be paired with the current user audio session. This happens when: @@ -431,10 +445,10 @@ def _handle_elevenlabs_event( if raw_text in state.buffered_user_speech_texts: state.buffered_user_speech_texts.discard(raw_text) return False - _process_user_speech(event, state, context, conversation_trace, is_audio_native) + _process_user_speech(event, state, context, conversation_trace, pipeline_type) elif event["event_type"] == "audio_start": - _handle_audio_start(event, state, context, conversation_trace, is_audio_native) + _handle_audio_start(event, state, context, conversation_trace, pipeline_type) elif event["event_type"] == "audio_end": _handle_audio_end(event, state) @@ -550,7 +564,10 @@ def _finalize_extraction( context.tool_params, context.tool_responses = extract_tool_params_and_responses(conversation_trace) context.tool_called = [t["tool_name"].lower() for t in context.tool_params] context.num_tool_calls = len(context.tool_params) - context.num_assistant_turns = len(context.intended_assistant_turns) + if context.pipeline_type == PipelineType.S2S: + context.num_assistant_turns = len(context.transcribed_assistant_turns) + else: + context.num_assistant_turns = len(context.intended_assistant_turns) context.num_user_turns = len(context.transcribed_user_turns) _warn_turn_misalignment(context) @@ -592,11 +609,12 @@ def _ensure_greeting_is_first(context: "_ProcessorContext") -> None: if greeting_idx is not None: greeting = context.conversation_trace.pop(greeting_idx) else: - # Cascade: greeting not in audit log — create from pipecat text. + # Greeting not in audit log — create from pipecat text (cascade) or transcribed text (S2S). + greeting_text = context.intended_assistant_turns.get(0) or context.transcribed_assistant_turns.get(0) greeting = { "role": "assistant", - "content": context.intended_assistant_turns.get(0), - "type": "intended", + "content": greeting_text, + "type": "intended" if context.intended_assistant_turns.get(0) else "transcribed", "turn_id": 0, } context.conversation_trace.insert(0, greeting) @@ -639,8 +657,9 @@ def _label_trailing_assistant_turn(context: "_ProcessorContext", last_entry: dic {"role": "assistant", "content": labeled, "type": "intended", "turn_id": trailing_turn_id} ) - # Sync intended + transcribed - context.intended_assistant_turns[trailing_turn_id] = labeled + # Sync intended + transcribed (skip intended for S2S — no intended text exists) + if context.pipeline_type != PipelineType.S2S: + context.intended_assistant_turns[trailing_turn_id] = labeled if not context.transcribed_assistant_turns.get(trailing_turn_id): context.transcribed_assistant_turns[trailing_turn_id] = labeled else: @@ -687,7 +706,7 @@ def __init__(self): # Conversation metadata self.conversation_finished: bool = False self.conversation_ended_reason: str | None = None - self.is_audio_native: bool = False + self.pipeline_type: PipelineType = PipelineType.CASCADE # Response latencies from Pipecat's UserBotLatencyObserver self.response_speed_latencies: list[float] = [] @@ -703,14 +722,14 @@ def process_record( self, result: ConversationResult, output_dir: Path, - is_audio_native: bool = False, + pipeline_type: PipelineType = PipelineType.CASCADE, ) -> _ProcessorContext | None: """Process a single conversation record to create metric context. Args: result: ConversationResult object output_dir: Path to the output directory containing logs - is_audio_native: Whether the model is audio-native + pipeline_type: The type of voice pipeline used Returns: _ProcessorContext object with all processed variables, or None if processing failed @@ -720,7 +739,7 @@ def process_record( context.audio_assistant_path = result.audio_assistant_path context.audio_user_path = result.audio_user_path context.audio_mixed_path = result.audio_mixed_path - context.is_audio_native = is_audio_native + context.pipeline_type = pipeline_type try: self._build_history(context, output_dir, result) @@ -824,7 +843,8 @@ def _build_history( Each entry: {timestamp_ms, source, event_type, data}. """ history = self._load_audit_log_transcript(output_dir) - history.extend(self._load_pipecat_logs(result.pipecat_logs_path)) + if context.pipeline_type != PipelineType.S2S: + history.extend(self._load_pipecat_logs(result.pipecat_logs_path)) history.extend(self._load_elevenlabs_logs(result.elevenlabs_logs_path)) history.sort(key=lambda e: e["timestamp_ms"]) @@ -859,18 +879,22 @@ def _extract_turns_from_history(context: _ProcessorContext) -> None: conversation_trace: list[dict] = [] for event in context.history: if event["source"] == "audit_log": - _handle_audit_log_event(event, state, context, conversation_trace, context.is_audio_native) + _handle_audit_log_event(event, state, context, conversation_trace, context.pipeline_type) elif event["source"] == "pipecat": _handle_pipecat_event(event, state, context, conversation_trace) elif event["source"] == "elevenlabs": - if _handle_elevenlabs_event(event, state, context, conversation_trace, context.is_audio_native): + if _handle_elevenlabs_event(event, state, context, conversation_trace, context.pipeline_type): continue if not state.session_end_ts: state.session_end_ts = context.history[-1].get("timestamp_ms") / 1000.0 _pair_audio_segments(state, context) - validated_trace = _validate_conversation_trace(conversation_trace, context) + if context.pipeline_type == PipelineType.S2S: + # S2S has no pipecat segments to validate against — trace entries come from EL directly + validated_trace = conversation_trace + else: + validated_trace = _validate_conversation_trace(conversation_trace, context) context.conversation_trace = group_consecutive_turns(validated_trace) _fix_interruption_labels(context, state) _finalize_extraction(context, state, conversation_trace) diff --git a/src/eva/metrics/runner.py b/src/eva/metrics/runner.py index 2d808d7a..289ce8cb 100644 --- a/src/eva/metrics/runner.py +++ b/src/eva/metrics/runner.py @@ -13,7 +13,7 @@ from eva.metrics.base import BaseMetric, MetricContext from eva.metrics.processor import MetricsContextProcessor from eva.metrics.registry import MetricRegistry, get_global_registry -from eva.models.config import is_audio_native_pipeline +from eva.models.config import PipelineType, get_pipeline_type from eva.models.record import EvaluationRecord from eva.models.results import ConversationResult, MetricScore, PassAtKResult, RecordMetrics from eva.utils.hash_utils import get_dict_hash @@ -130,7 +130,7 @@ def _load_agent_config(self) -> dict[str, Any]: # Determine pipeline type from config (fallback to False for legacy runs) model_data = config_data.get("model", {}) - self._is_audio_native = is_audio_native_pipeline(model_data) if model_data else False + self._pipeline_type = get_pipeline_type(model_data) if model_data else PipelineType.CASCADE agent_config_path = config_data.get("agent_config_path") @@ -429,9 +429,7 @@ def _load_context(self, record_id: str, record_dir: Path) -> MetricContext: result = ConversationResult(**result_data) # Use postprocessor to process logs and create enriched context - metrics_context = self.metrics_processor.process_record( - result, record_dir, is_audio_native=self._is_audio_native - ) + metrics_context = self.metrics_processor.process_record(result, record_dir, pipeline_type=self._pipeline_type) # Get agent instructions and tools from config agent_instructions = self._agent_config["instructions"] diff --git a/src/eva/models/config.py b/src/eva/models/config.py index e08783bd..02fc7b54 100644 --- a/src/eva/models/config.py +++ b/src/eva/models/config.py @@ -15,6 +15,7 @@ import copy import logging from datetime import UTC, datetime +from enum import StrEnum from pathlib import Path from typing import Annotated, Any, ClassVar, Literal @@ -171,6 +172,14 @@ def pipeline_parts(self) -> dict[str, str]: _AUDIO_LLM_FIELDS = {"audio_llm", "audio_llm_params", "tts", "tts_params"} +class PipelineType(StrEnum): + """Type of voice pipeline.""" + + CASCADE = "cascade" + AUDIO_LLM = "audio_llm" + S2S = "s2s" + + def _model_config_discriminator(data: Any) -> str: """Discriminate which pipeline config type to use based on unique fields.""" if isinstance(data, dict): @@ -186,21 +195,22 @@ def _model_config_discriminator(data: Any) -> str: return "pipeline" -def is_audio_native_pipeline(model_data: dict | Any) -> bool: - """Return True if the model config represents an audio-native pipeline (S2S or AudioLLM). +def get_pipeline_type(model_data: dict | Any) -> PipelineType: + """Return the pipeline type for the given model config. Works with both raw dicts (e.g. from config.json) and parsed model config objects. Also handles legacy configs where ``realtime_model`` was stored alongside ``llm_model`` in a flat dict (before the discriminated-union refactor). - Returns False for configs missing the ``model`` key. """ mode = _model_config_discriminator(model_data) - if mode in ("s2s", "audio_llm"): - return True + if mode == "s2s": + return PipelineType.S2S + if mode == "audio_llm": + return PipelineType.AUDIO_LLM # Legacy: realtime_model was a sibling of llm_model before the union split if isinstance(model_data, dict) and model_data.get("realtime_model"): - return True - return False + return PipelineType.S2S + return PipelineType.CASCADE def _strip_other_mode_fields(data: dict) -> dict: diff --git a/tests/fixtures/processor_histories.json b/tests/fixtures/processor_histories.json index 21133a90..dbc15119 100644 --- a/tests/fixtures/processor_histories.json +++ b/tests/fixtures/processor_histories.json @@ -504,27 +504,20 @@ { "id": "s2s_assistant_speak_tool_speak", "description": "S2S pipeline (llm_response): greeting at turn 0, user reply at turn 1, assistant speak\u2192tool_call\u2192speak at turn 2. Tests that: (1) user entries use ElevenLabs intended text, (2) assistant entries come from audit_log with proper tool call boundaries, (3) greeting is moved to front when ElevenLabs timestamps arrive before audit_log.", - "is_audio_native": true, + "pipeline_type": "s2s", "history": [ - {"timestamp_ms": 4784, "source": "pipecat", "event_type": "turn_start", "data": {}}, - {"timestamp_ms": 7000, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Hello, thank you for calling SkyWay Airlines. How can I help you?"}}, {"timestamp_ms": 7188, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 7.188}}, {"timestamp_ms": 13227, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 13.227}}, {"timestamp_ms": 14602, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, thank you for calling Skyway Airlines. How can I help you?"}}}, {"timestamp_ms": 15013, "source": "audit_log", "event_type": "assistant", "data": "Hello, thank you for calling SkyWay Airlines. How can I help you?"}, - {"timestamp_ms": 15412, "source": "pipecat", "event_type": "turn_end", "data": {}}, - {"timestamp_ms": 15412, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 14608, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 14.608}}, {"timestamp_ms": 14832, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I need to change my flight to March 25th."}}}, {"timestamp_ms": 18028, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 18.028}}, {"timestamp_ms": 15414, "source": "audit_log", "event_type": "user", "data": "Hi. I need to change my flight to March 25."}, - {"timestamp_ms": 19000, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Sure! Could you provide your confirmation number and last name?"}}, {"timestamp_ms": 19590, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 19.59}}, {"timestamp_ms": 30175, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 30.175}}, {"timestamp_ms": 32912, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure! Could you provide your confirmation number and last name?"}}}, {"timestamp_ms": 32044, "source": "audit_log", "event_type": "assistant", "data": "Sure! Could you provide your confirmation number and last name?"}, - {"timestamp_ms": 34059, "source": "pipecat", "event_type": "turn_end", "data": {}}, - {"timestamp_ms": 34059, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 32916, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 32.916}}, {"timestamp_ms": 33960, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Confirmation is ZK3FFW, last name Rodriguez."}}}, {"timestamp_ms": 40596, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 40.596}}, @@ -535,12 +528,9 @@ {"timestamp_ms": 66393, "source": "audit_log", "event_type": "tool_call", "data": {"tool": "search_flights", "parameters": {"origin": "AUS", "destination": "ORD", "date": "2026-03-25"}}}, {"timestamp_ms": 66393, "source": "audit_log", "event_type": "tool_response", "data": {"tool": "search_flights", "response": {"status": "success", "count": 3}}}, {"timestamp_ms": 66396, "source": "audit_log", "event_type": "assistant", "data": "I found your reservation. You are booked on flight SK621 from Austin to Chicago."}, - {"timestamp_ms": 41000, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Thank you! Let me pull up your booking.\nI found your reservation. You are booked on flight SK621 from Austin to Chicago."}}, {"timestamp_ms": 41880, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 41.88}}, {"timestamp_ms": 64809, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 64.809}}, {"timestamp_ms": 69138, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Thank you. Let me pull up your booking. I found your reservation. You are booked on flight SK 621 from Austin to Chicago."}}}, - {"timestamp_ms": 70314, "source": "pipecat", "event_type": "turn_end", "data": {}}, - {"timestamp_ms": 70314, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 69150, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 69.15}}, {"timestamp_ms": 69751, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "That is not right. I am Austin to LAX."}}}, {"timestamp_ms": 76172, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 76.172}}, @@ -550,22 +540,21 @@ "expected": { "transcribed_assistant_turns": {"0": "Hello, thank you for calling Skyway Airlines. How can I help you?", "1": "Sure! Could you provide your confirmation number and last name?", "2": "Thank you. Let me pull up your booking. I found your reservation. You are booked on flight SK 621 from Austin to Chicago."}, "transcribed_user_turns": {"1": "Hi. I need to change my flight to March 25.", "2": "Sure. Confirmation is ZK3FFW, and my last name is Rodriguez.", "3": "No. That is not right. I am Austin to LAX on March 20."}, - "intended_assistant_turns": {"0": "Hello, thank you for calling SkyWay Airlines. How can I help you?", "1": "Sure! Could you provide your confirmation number and last name?", "2": "Thank you! Let me pull up your booking.\nI found your reservation. You are booked on flight SK621 from Austin to Chicago."}, + "intended_assistant_turns": {"0": "", "1": "", "2": ""}, "intended_user_turns": {"1": "I need to change my flight to March 25th.", "2": "Confirmation is ZK3FFW, last name Rodriguez.", "3": "That is not right. I am Austin to LAX."}, "num_assistant_turns": 3, "num_user_turns": 3, "num_tool_calls": 2, "conversation_trace": [ - {"role": "assistant", "content": "Hello, thank you for calling SkyWay Airlines. How can I help you?", "type": "intended", "turn_id": 0}, + {"role": "assistant", "content": "Hello, thank you for calling Skyway Airlines. How can I help you?", "type": "transcribed", "turn_id": 0}, {"role": "user", "content": "I need to change my flight to March 25th.", "type": "intended", "turn_id": 1}, - {"role": "assistant", "content": "Sure! Could you provide your confirmation number and last name?", "type": "intended", "turn_id": 1}, + {"role": "assistant", "content": "Sure! Could you provide your confirmation number and last name?", "type": "transcribed", "turn_id": 1}, {"role": "user", "content": "Confirmation is ZK3FFW, last name Rodriguez.", "type": "intended", "turn_id": 2}, {"tool_name": "get_reservation", "parameters": {"confirmation_number": "ZK3FFW", "last_name": "Rodriguez"}, "type": "tool_call", "turn_id": 2}, {"tool_name": "get_reservation", "tool_response": {"status": "success", "reservation": {"confirmation_number": "ZK3FFW"}}, "type": "tool_response", "turn_id": 2}, - {"role": "assistant", "content": "Thank you! Let me pull up your booking.", "type": "intended", "turn_id": 2}, {"tool_name": "search_flights", "parameters": {"origin": "AUS", "destination": "ORD", "date": "2026-03-25"}, "type": "tool_call", "turn_id": 2}, {"tool_name": "search_flights", "tool_response": {"status": "success", "count": 3}, "type": "tool_response", "turn_id": 2}, - {"role": "assistant", "content": "I found your reservation. You are booked on flight SK621 from Austin to Chicago.", "type": "intended", "turn_id": 2}, + {"role": "assistant", "content": "Thank you. Let me pull up your booking. I found your reservation. You are booked on flight SK 621 from Austin to Chicago.", "type": "transcribed", "turn_id": 2}, {"role": "user", "content": "That is not right. I am Austin to LAX.", "type": "intended", "turn_id": 3} ], "assistant_interrupted_turns": [], @@ -798,32 +787,26 @@ { "id": "s2s_audit_truncated_to_spoken", "description": "S2S pipeline (llm_response): audit_log/assistant has the full LLM response but llm_response only covers the spoken prefix. Tests that _truncate_to_spoken truncates audit_log text for S2S conversations.", - "is_audio_native": true, + "pipeline_type": "s2s", "history": [ - {"timestamp_ms": 1000, "source": "pipecat", "event_type": "turn_start", "data": {}}, - {"timestamp_ms": 1100, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Welcome to SkyWay Airlines!"}}, {"timestamp_ms": 1200, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.8}}, {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Welcome to SkyWay Airlines!"}}}, {"timestamp_ms": 2100, "source": "audit_log", "event_type": "assistant", "data": "Welcome to SkyWay Airlines!"}, - {"timestamp_ms": 2200, "source": "pipecat", "event_type": "turn_end", "data": {}}, - {"timestamp_ms": 2200, "source": "pipecat", "event_type": "turn_start", "data": {}}, {"timestamp_ms": 2300, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.3}}, {"timestamp_ms": 2500, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "I need to change my flight date."}}}, {"timestamp_ms": 3000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "elevenlabs_user", "audio_timestamp": 3.0}}, {"timestamp_ms": 3100, "source": "audit_log", "event_type": "user", "data": "I need to change my flight date."}, - {"timestamp_ms": 3500, "source": "pipecat", "event_type": "llm_response", "data": {"frame": "Sure, I can help you change your flight."}}, {"timestamp_ms": 3550, "source": "audit_log", "event_type": "assistant", "data": "Sure, I can help you change your flight. What is your confirmation number and last name so I can pull up the reservation?"}, {"timestamp_ms": 3600, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 3.6}}, {"timestamp_ms": 4200, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Sure, I can help you change your flight."}}}, {"timestamp_ms": 4500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 4.5}}, - {"timestamp_ms": 4600, "source": "pipecat", "event_type": "turn_end", "data": {}}, {"timestamp_ms": 5000, "source": "elevenlabs", "event_type": "connection_state", "data": {"data": {"state": "session_ended"}}} ], "expected": { "transcribed_assistant_turns": {"0": "Welcome to SkyWay Airlines!", "1": "Sure, I can help you change your flight. [speaker likely cut itself off]"}, "transcribed_user_turns": {"1": "I need to change my flight date."}, - "intended_assistant_turns": {"0": "Welcome to SkyWay Airlines!", "1": "Sure, I can help you change your flight. [speaker likely cut itself off]"}, + "intended_assistant_turns": {"0": "", "1": ""}, "intended_user_turns": {"1": "I need to change my flight date."}, "audio_timestamps_assistant_turns": {"0": [[1.0, 1.8]], "1": [[3.6, 4.5]]}, "audio_timestamps_user_turns": {"1": [[2.3, 3.0]]}, @@ -831,9 +814,9 @@ "num_user_turns": 1, "num_tool_calls": 0, "conversation_trace": [ - {"role": "assistant", "content": "Welcome to SkyWay Airlines!", "type": "intended", "turn_id": 0}, + {"role": "assistant", "content": "Welcome to SkyWay Airlines!", "type": "transcribed", "turn_id": 0}, {"role": "user", "content": "I need to change my flight date.", "type": "intended", "turn_id": 1}, - {"role": "assistant", "content": "Sure, I can help you change your flight. [speaker likely cut itself off]", "type": "intended", "turn_id": 1} + {"role": "assistant", "content": "Sure, I can help you change your flight. [speaker likely cut itself off]", "type": "transcribed", "turn_id": 1} ], "assistant_interrupted_turns": [], "user_interrupted_turns": [] @@ -989,7 +972,6 @@ {"timestamp_ms": 1100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 1.0}}, {"timestamp_ms": 1500, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "Hello, how can I help you today?"}}}, {"timestamp_ms": 1800, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 1.8}}, - {"timestamp_ms": 2000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 2.0}}, {"timestamp_ms": 2100, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "My flight got canceled and I want a full refund."}}}, {"timestamp_ms": 2200, "source": "audit_log", "event_type": "user", "data": "My flight got cancelled and I want a full refund."}, @@ -999,7 +981,6 @@ {"timestamp_ms": 2550, "source": "audit_log", "event_type": "assistant", "data": "I'm sorry to hear that. Could you provide your confirmation number?"}, {"timestamp_ms": 3000, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 3.0}}, {"timestamp_ms": 3100, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I'm sorry to hear that. Could you provide your confirmation number?"}}}, - {"timestamp_ms": 4000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 4.0}}, {"timestamp_ms": 4100, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Confirmation code is Z5OROH and last name is White."}}}, {"timestamp_ms": 4200, "source": "audit_log", "event_type": "user", "data": "Confirmation code is Z5OROH."}, @@ -1012,7 +993,6 @@ {"timestamp_ms": 5100, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "pipecat_agent", "audio_timestamp": 5.1}}, {"timestamp_ms": 5500, "source": "elevenlabs", "event_type": "audio_end", "data": {"user": "pipecat_agent", "audio_timestamp": 5.5}}, {"timestamp_ms": 5600, "source": "elevenlabs", "event_type": "assistant_speech", "data": {"data": {"text": "I found your reservation. Your flight was canceled, so you are eligible for a full refund. Shall I proceed?"}}}, - {"timestamp_ms": 6000, "source": "elevenlabs", "event_type": "audio_start", "data": {"user": "elevenlabs_user", "audio_timestamp": 6.0}}, {"timestamp_ms": 6100, "source": "elevenlabs", "event_type": "user_speech", "data": {"data": {"text": "Yes, go ahead."}}}, {"timestamp_ms": 6200, "source": "audit_log", "event_type": "user", "data": "Yes, go ahead."}, diff --git a/tests/integration/test_processor_real_artifacts.py b/tests/integration/test_processor_real_artifacts.py index 84b7d20f..65160435 100644 --- a/tests/integration/test_processor_real_artifacts.py +++ b/tests/integration/test_processor_real_artifacts.py @@ -81,7 +81,7 @@ def test_tools_called(self, processor_context, expected_context): def test_conversation_metadata(self, processor_context, expected_context): assert processor_context.conversation_finished == expected_context["conversation_finished"] assert processor_context.conversation_ended_reason == expected_context["conversation_ended_reason"] - assert processor_context.is_audio_native == expected_context["is_audio_native"] + assert processor_context.pipeline_type.value == expected_context.get("pipeline_type", "cascade") def test_transcribed_assistant_turns(self, processor_context, expected_context): expected = _convert_expected_value( diff --git a/tests/unit/metrics/test_faithfulness.py b/tests/unit/metrics/test_faithfulness.py index e2bf8af1..e893fbcf 100644 --- a/tests/unit/metrics/test_faithfulness.py +++ b/tests/unit/metrics/test_faithfulness.py @@ -23,7 +23,7 @@ def test_get_prompt_variables_cascade(self): agent_role="Assistant", agent_tools=[{"name": "search"}], current_date_time="2026-01-01", - is_audio_native=False, + pipeline_type="cascade", ) variables = self.metric.get_prompt_variables(ctx, "User: hi\nBot: hello") assert variables["agent_instructions"] == "Be helpful" @@ -33,7 +33,7 @@ def test_get_prompt_variables_cascade(self): assert "speech-to-text" in variables["disambiguation_context"] def test_get_prompt_variables_s2s(self): - ctx = make_metric_context(is_audio_native=True) + ctx = make_metric_context(pipeline_type="s2s") variables = self.metric.get_prompt_variables(ctx, "transcript") assert "speech-to-speech" in variables["user_turns_disclaimer"] assert "raw audio" in variables["disambiguation_context"] diff --git a/tests/unit/metrics/test_processor_histories.py b/tests/unit/metrics/test_processor_histories.py index b03a175c..52deb409 100644 --- a/tests/unit/metrics/test_processor_histories.py +++ b/tests/unit/metrics/test_processor_histories.py @@ -10,6 +10,7 @@ import pytest from eva.metrics.processor import MetricsContextProcessor, _ProcessorContext +from eva.models.config import PipelineType FIXTURES_PATH = Path(__file__).parent.parent.parent / "fixtures" / "processor_histories.json" @@ -53,7 +54,8 @@ def test_expected_outputs(self, case): ctx = _ProcessorContext() ctx.record_id = case["id"] ctx.history = case["history"] - ctx.is_audio_native = case.get("is_audio_native", False) + pipeline_type_str = case.get("pipeline_type", "cascade") + ctx.pipeline_type = PipelineType(pipeline_type_str) MetricsContextProcessor._extract_turns_from_history(ctx) MetricsContextProcessor._reconcile_transcript_with_tools(ctx) diff --git a/tests/unit/metrics/test_user_behavioral_fidelity.py b/tests/unit/metrics/test_user_behavioral_fidelity.py index f689a957..803d0608 100644 --- a/tests/unit/metrics/test_user_behavioral_fidelity.py +++ b/tests/unit/metrics/test_user_behavioral_fidelity.py @@ -25,7 +25,7 @@ def test_get_prompt_variables_cascade(self): {"name": "search_flights", "tool_type": "read"}, {"name": "book_flight", "tool_type": "write"}, ], - is_audio_native=False, + pipeline_type="cascade", intended_user_turns={0: "Hi, I need to book a flight"}, ) variables = self.metric.get_prompt_variables(ctx, "User: hi\nBot: hello") @@ -44,7 +44,7 @@ def test_get_prompt_variables_s2s(self): user_goal="Book a flight", user_persona="Friendly traveler", agent_tools=[], - is_audio_native=True, + pipeline_type="s2s", intended_user_turns={0: "Hi"}, ) variables = self.metric.get_prompt_variables(ctx, "transcript text")