Skip to content

Commit 89fd60b

Browse files
committed
Add flags to signal capabilities and requirements in LLM
1 parent 5059336 commit 89fd60b

File tree

3 files changed

+56
-59
lines changed

3 files changed

+56
-59
lines changed

agents-core/vision_agents/core/agents/agents.py

Lines changed: 43 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,7 @@ async def _handle_output_text_delta(event: LLMResponseChunkEvent):
340340

341341
async def _setup_speech_events(self):
342342
self.logger.info("_setup_speech_events")
343+
343344
@self.events.subscribe
344345
async def on_error(event: STTErrorEvent):
345346
self.logger.error("stt error event %s", event)
@@ -413,8 +414,8 @@ async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
413414

414415
@self.events.subscribe
415416
async def on_stt_transcript_event_create_response(event: STTTranscriptEvent):
416-
if self.realtime_mode or not self.llm:
417-
# when running in realtime mode, there is no need to send the response to the LLM
417+
if self.llm.handles_audio:
418+
# There is no need to send the response to the LLM if it handles audio itself.
418419
return
419420

420421
user_id = event.user_id()
@@ -538,7 +539,6 @@ async def finish(self):
538539
)
539540
return
540541

541-
542542
with self.span("agent.finish"):
543543
# If connection is None or already closed, return immediately
544544
if not self._connection:
@@ -788,7 +788,7 @@ async def on_audio_received(event: AudioReceivedEvent):
788788

789789
# Always listen to remote video tracks so we can forward frames to Realtime providers
790790
@self.edge.events.subscribe
791-
async def on_track(event: TrackAddedEvent):
791+
async def on_video_track_added(event: TrackAddedEvent):
792792
track_id = event.track_id
793793
track_type = event.track_type
794794
user = event.user
@@ -802,7 +802,7 @@ async def on_track(event: TrackAddedEvent):
802802
f"🎥 Track re-added: {track_type_name} ({track_id}), switching to it"
803803
)
804804

805-
if self.realtime_mode and isinstance(self.llm, Realtime):
805+
if self.llm.handles_video:
806806
# Get the existing forwarder and switch to this track
807807
_, _, forwarder = self._active_video_tracks[track_id]
808808
track = self.edge.add_track_subscriber(track_id)
@@ -818,7 +818,7 @@ async def on_track(event: TrackAddedEvent):
818818
task.add_done_callback(_log_task_exception)
819819

820820
@self.edge.events.subscribe
821-
async def on_track_removed(event: TrackRemovedEvent):
821+
async def on_video_track_removed(event: TrackRemovedEvent):
822822
track_id = event.track_id
823823
track_type = event.track_type
824824
if not track_id:
@@ -836,11 +836,7 @@ async def on_track_removed(event: TrackRemovedEvent):
836836
self._active_video_tracks.pop(track_id, None)
837837

838838
# If this was the active track, switch to any other available track
839-
if (
840-
track_id == self._current_video_track_id
841-
and self.realtime_mode
842-
and isinstance(self.llm, Realtime)
843-
):
839+
if self.llm.handles_video and track_id == self._current_video_track_id:
844840
self.logger.info(
845841
"🎥 Active video track removed, switching to next available"
846842
)
@@ -866,7 +862,7 @@ async def _reply_to_audio(
866862
)
867863

868864
# when in Realtime mode call the Realtime directly (non-blocking)
869-
if self.realtime_mode and isinstance(self.llm, Realtime):
865+
if self.llm.handles_audio:
870866
# TODO: this behaviour should be easy to change in the agent class
871867
asyncio.create_task(
872868
self.llm.simple_audio_response(pcm_data, participant)
@@ -967,7 +963,7 @@ async def recv(self):
967963
# If Realtime provider supports video, switch to this new track
968964
track_type_name = TrackType.Name(track_type)
969965

970-
if self.realtime_mode:
966+
if self.llm.handles_video:
971967
if self._video_track:
972968
# We have a video publisher (e.g., YOLO processor)
973969
# Create a separate forwarder for the PROCESSED video track
@@ -1089,8 +1085,8 @@ async def recv(self):
10891085

10901086
async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None:
10911087
"""Handle turn detection events."""
1092-
# In realtime mode, the LLM handles turn detection, interruption, and responses itself
1093-
if self.realtime_mode:
1088+
# Skip the turn event handling if the model doesn't require TTS or STT itself.
1089+
if not (self.llm.needs_tts and self.llm.needs_stt):
10941090
return
10951091

10961092
if isinstance(event, TurnStartedEvent):
@@ -1105,71 +1101,63 @@ async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None
11051101
except Exception as e:
11061102
self.logger.error(f"Error stopping TTS: {e}")
11071103
else:
1108-
participant_id = event.participant.user_id if event.participant else "unknown"
1104+
participant_id = (
1105+
event.participant.user_id if event.participant else "unknown"
1106+
)
11091107
self.logger.info(
11101108
f"👉 Turn started - participant speaking {participant_id} : {event.confidence}"
11111109
)
11121110
else:
11131111
# Agent itself started speaking - this is normal
1114-
participant_id = event.participant.user_id if event.participant else "unknown"
1115-
self.logger.debug(
1116-
f"👉 Turn started - agent speaking {participant_id}"
1112+
participant_id = (
1113+
event.participant.user_id if event.participant else "unknown"
11171114
)
1115+
self.logger.debug(f"👉 Turn started - agent speaking {participant_id}")
11181116
elif isinstance(event, TurnEndedEvent):
1119-
participant_id = event.participant.user_id if event.participant else "unknown"
1117+
participant_id = (
1118+
event.participant.user_id if event.participant else "unknown"
1119+
)
11201120
self.logger.info(
11211121
f"👉 Turn ended - participant {participant_id} finished (confidence: {event.confidence})"
11221122
)
1123+
if not event.participant or event.participant.user_id == self.agent_user.id:
1124+
# Exit early if the event is triggered by the model response.
1125+
return
11231126

1124-
# When turn detection is enabled, trigger LLM response when user's turn ends
1127+
# When turn detection is enabled, trigger LLM response when user's turn ends.
11251128
# This is the signal that the user has finished speaking and expects a response
1126-
if event.participant and event.participant.user_id != self.agent_user.id:
1127-
# Get the accumulated transcript for this speaker
1128-
transcript = self._pending_user_transcripts.get(
1129-
event.participant.user_id, ""
1129+
transcript = self._pending_user_transcripts.get(
1130+
event.participant.user_id, ""
1131+
)
1132+
if transcript.strip():
1133+
self.logger.info(
1134+
f"🤖 Triggering LLM response after turn ended for {event.participant.user_id}"
11301135
)
11311136

1132-
if transcript and transcript.strip():
1133-
self.logger.info(
1134-
f"🤖 Triggering LLM response after turn ended for {event.participant.user_id}"
1135-
)
1136-
1137-
# Create participant object if we have metadata
1138-
participant = None
1139-
if hasattr(event, "custom") and event.custom:
1140-
# Try to extract participant info from custom metadata
1141-
participant = event.custom.get("participant")
1137+
# Create participant object if we have metadata
1138+
participant = None
1139+
if hasattr(event, "custom") and event.custom:
1140+
# Try to extract participant info from custom metadata
1141+
participant = event.custom.get("participant")
11421142

1143-
# Trigger LLM response with the complete transcript
1144-
if self.llm:
1145-
await self.simple_response(transcript, participant)
1143+
# Trigger LLM response with the complete transcript
1144+
await self.simple_response(transcript, participant)
11461145

1147-
# Clear the pending transcript for this speaker
1148-
self._pending_user_transcripts[event.participant.user_id] = ""
1146+
# Clear the pending transcript for this speaker
1147+
self._pending_user_transcripts[event.participant.user_id] = ""
11491148

11501149
async def _on_stt_error(self, error):
11511150
"""Handle STT service errors."""
11521151
self.logger.error(f"❌ STT Error: {error}")
11531152

1154-
@property
1155-
def realtime_mode(self) -> bool:
1156-
"""Check if the agent is in Realtime mode.
1157-
1158-
Returns:
1159-
True if `llm` is a `Realtime` implementation; otherwise False.
1160-
"""
1161-
if self.llm is not None and isinstance(self.llm, Realtime):
1162-
return True
1163-
return False
1164-
11651153
@property
11661154
def publish_audio(self) -> bool:
11671155
"""Whether the agent should publish an outbound audio track.
11681156
11691157
Returns:
11701158
True if TTS is configured or when in Realtime mode.
11711159
"""
1172-
if self.tts is not None or self.realtime_mode:
1160+
if self.tts is not None or self.llm.handles_audio:
11731161
return True
11741162
return False
11751163

@@ -1203,9 +1191,7 @@ def _needs_audio_or_video_input(self) -> bool:
12031191
# Video input needed for:
12041192
# - Video processors (for frame analysis)
12051193
# - Realtime mode with video (multimodal LLMs)
1206-
needs_video = len(self.video_processors) > 0 or (
1207-
self.realtime_mode and isinstance(self.llm, Realtime)
1208-
)
1194+
needs_video = len(self.video_processors) > 0 or self.llm.handles_video
12091195

12101196
return needs_audio or needs_video
12111197

@@ -1256,7 +1242,7 @@ def image_processors(self) -> List[Any]:
12561242

12571243
def _validate_configuration(self):
12581244
"""Validate the agent configuration."""
1259-
if self.realtime_mode:
1245+
if self.llm.handles_audio:
12601246
# Realtime mode - should not have separate STT/TTS
12611247
if self.stt or self.tts:
12621248
self.logger.warning(
@@ -1293,7 +1279,7 @@ def _prepare_rtc(self):
12931279

12941280
# Set up audio track if TTS is available
12951281
if self.publish_audio:
1296-
if self.realtime_mode and isinstance(self.llm, Realtime):
1282+
if self.llm.handles_audio:
12971283
self._audio_track = self.llm.output_track
12981284
self.logger.info("🎵 Using Realtime provider output track for audio")
12991285
else:

agents-core/vision_agents/core/llm/llm.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,12 @@ def __init__(self, original: T, text: str, exception: Optional[Exception] = None
3434

3535

3636
class LLM(abc.ABC):
37-
# if we want to use realtime/ sts behaviour
38-
sts: bool = False
37+
# Instruct the Agent that this model requires STT and TTS services, and does not handle audio or video
38+
# on its own.
39+
needs_stt: bool = True
40+
needs_tts: bool = True
41+
handles_audio: bool = False
42+
handles_video: bool = False
3943

4044
before_response_listener: BeforeCb
4145
after_response_listener: AfterCb

agents-core/vision_agents/core/llm/realtime.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ class Realtime(LLM, abc.ABC):
4242
fps: int = 1
4343
session_id: str # UUID to identify this session
4444

45+
# Instruct the Agent that this model can handle audio and video
46+
# without additional STT and TTS services.
47+
handles_audio: bool = True
48+
handles_video: bool = True
49+
needs_stt = False
50+
needs_tts = False
51+
4552
def __init__(
4653
self,
4754
fps: int = 1, # the number of video frames per second to send (for implementations that support setting fps)

0 commit comments

Comments
 (0)