Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -1727,8 +1727,13 @@ def _on_input_speech_stopped(self, ev: llm.InputSpeechStoppedEvent) -> None:
)

def _on_input_audio_transcription_completed(self, ev: llm.InputTranscriptionCompleted) -> None:
# forward item_id so every interim/final transcript of the same utterance shares a
# stable id, letting consumers correlate them and dedup per-utterance on the
# provider-agnostic event surface (see UserInputTranscribedEvent.item_id)
self._session._user_input_transcribed(
UserInputTranscribedEvent(transcript=ev.transcript, is_final=ev.is_final)
UserInputTranscribedEvent(
transcript=ev.transcript, is_final=ev.is_final, item_id=ev.item_id
)
)

if ev.is_final:
Expand Down
8 changes: 8 additions & 0 deletions livekit-agents/livekit/agents/voice/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,14 @@ class UserInputTranscribedEvent(BaseModel):
is_final: bool
speaker_id: str | None = None
language: LanguageCode | None = None
item_id: str | None = None
"""Stable id of the transcribed input item this transcript belongs to.

For realtime models, every interim and final ``UserInputTranscribedEvent`` of a single
user utterance shares the same ``item_id``, allowing consumers to correlate the partial
transcripts and react exactly once per utterance (e.g. render a placeholder once) without
dropping down to provider-specific events. ``None`` when no upstream item id is available
(e.g. the STT pipeline or the empty end-of-speech placeholder)."""
Comment thread
ByteMaster-1 marked this conversation as resolved.
created_at: float = Field(default_factory=time.time)


Expand Down
51 changes: 51 additions & 0 deletions tests/test_agent_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
UserStateChangedEvent,
function_tool,
inference,
llm,
vad,
)
from livekit.agents.llm import (
Expand Down Expand Up @@ -1705,3 +1706,53 @@ async def test_pipeline_multi_segment_interrupted() -> None:
assert len(assistant_msgs) == 1
assert assistant_msgs[0].interrupted is True
assert "How are you?" not in (assistant_msgs[0].text_content or "")


def _make_activity() -> MagicMock:
    """Build a stand-in ``AgentActivity`` for calling its handlers unbound.

    Returns:
        A ``MagicMock`` spec'd on ``AgentActivity`` with ``_session`` and ``_agent``
        replaced by plain mocks.
    """
    activity = MagicMock(spec=AgentActivity)
    # the is_final branch upserts a chat message onto the agent's chat_ctx
    activity._agent = MagicMock()
    # the handler reports transcripts through the session, so tests inspect this mock
    activity._session = MagicMock()
    return activity


def _emitted_event(act: MagicMock) -> UserInputTranscribedEvent:
    """Return the one event handed to ``act._session._user_input_transcribed``.

    Fails if the session hook was not called exactly once, or if that call did not
    carry exactly one positional argument.
    """
    recorder = act._session._user_input_transcribed
    recorder.assert_called_once()
    positional = recorder.call_args.args
    # single-element unpack doubles as a check that nothing else was passed positionally
    (event,) = positional
    return event


def test_item_id_forwarded_from_realtime_transcription() -> None:
    """The high-level event must carry the item_id of the realtime transcription event."""
    activity = _make_activity()
    interim = llm.InputTranscriptionCompleted(
        item_id="item_123",
        transcript="hello",
        is_final=False,
    )

    # invoke the handler unbound so the mock takes the place of ``self``
    AgentActivity._on_input_audio_transcription_completed(activity, interim)

    emitted = _emitted_event(activity)
    assert (emitted.item_id, emitted.transcript) == ("item_123", "hello")
    assert emitted.is_final is False


def test_item_id_shared_across_interim_and_final() -> None:
    """Interim and final transcripts of one utterance must expose the same item_id."""
    activity = _make_activity()

    # (transcript, is_final) pairs for a single utterance, in arrival order
    utterance = (("hel", False), ("hello", True))
    for text, final in utterance:
        AgentActivity._on_input_audio_transcription_completed(
            activity,
            llm.InputTranscriptionCompleted(item_id="item_abc", transcript=text, is_final=final),
        )

    seen_ids = set()
    for recorded in activity._session._user_input_transcribed.call_args_list:
        seen_ids.add(recorded.args[0].item_id)
    assert seen_ids == {"item_abc"}


def test_item_id_defaults_to_none() -> None:
    """An event built without item_id (STT, end-of-speech placeholder) reports ``None``."""
    event = UserInputTranscribedEvent(transcript="", is_final=False)
    assert event.item_id is None
Loading