@@ -340,6 +340,7 @@ async def _handle_output_text_delta(event: LLMResponseChunkEvent):
340340
341341 async def _setup_speech_events (self ):
342342 self .logger .info ("_setup_speech_events" )
343+
343344 @self .events .subscribe
344345 async def on_error (event : STTErrorEvent ):
345346 self .logger .error ("stt error event %s" , event )
@@ -413,8 +414,8 @@ async def _on_tts_audio_write_to_output(event: TTSAudioEvent):
413414
414415 @self .events .subscribe
415416 async def on_stt_transcript_event_create_response (event : STTTranscriptEvent ):
416- if self .realtime_mode or not self . llm :
417- # when running in realtime mode, there is no need to send the response to the LLM
417+ if self .llm . handles_audio :
418+ # There is no need to send the response to the LLM if it handles audio itself.
418419 return
419420
420421 user_id = event .user_id ()
@@ -538,7 +539,6 @@ async def finish(self):
538539 )
539540 return
540541
541-
542542 with self .span ("agent.finish" ):
543543 # If connection is None or already closed, return immediately
544544 if not self ._connection :
@@ -788,7 +788,7 @@ async def on_audio_received(event: AudioReceivedEvent):
788788
789789 # Always listen to remote video tracks so we can forward frames to Realtime providers
790790 @self .edge .events .subscribe
791- async def on_track (event : TrackAddedEvent ):
791+ async def on_video_track_added (event : TrackAddedEvent ):
792792 track_id = event .track_id
793793 track_type = event .track_type
794794 user = event .user
@@ -802,7 +802,7 @@ async def on_track(event: TrackAddedEvent):
802802 f"🎥 Track re-added: { track_type_name } ({ track_id } ), switching to it"
803803 )
804804
805- if self .realtime_mode and isinstance ( self . llm , Realtime ) :
805+ if self .llm . handles_video :
806806 # Get the existing forwarder and switch to this track
807807 _ , _ , forwarder = self ._active_video_tracks [track_id ]
808808 track = self .edge .add_track_subscriber (track_id )
@@ -818,7 +818,7 @@ async def on_track(event: TrackAddedEvent):
818818 task .add_done_callback (_log_task_exception )
819819
820820 @self .edge .events .subscribe
821- async def on_track_removed (event : TrackRemovedEvent ):
821+ async def on_video_track_removed (event : TrackRemovedEvent ):
822822 track_id = event .track_id
823823 track_type = event .track_type
824824 if not track_id :
@@ -836,11 +836,7 @@ async def on_track_removed(event: TrackRemovedEvent):
836836 self ._active_video_tracks .pop (track_id , None )
837837
838838 # If this was the active track, switch to any other available track
839- if (
840- track_id == self ._current_video_track_id
841- and self .realtime_mode
842- and isinstance (self .llm , Realtime )
843- ):
839+ if self .llm .handles_video and track_id == self ._current_video_track_id :
844840 self .logger .info (
845841 "🎥 Active video track removed, switching to next available"
846842 )
@@ -866,7 +862,7 @@ async def _reply_to_audio(
866862 )
867863
868864 # when in Realtime mode call the Realtime directly (non-blocking)
869- if self .realtime_mode and isinstance ( self . llm , Realtime ) :
865+ if self .llm . handles_audio :
870866 # TODO: this behaviour should be easy to change in the agent class
871867 asyncio .create_task (
872868 self .llm .simple_audio_response (pcm_data , participant )
@@ -967,7 +963,7 @@ async def recv(self):
967963 # If Realtime provider supports video, switch to this new track
968964 track_type_name = TrackType .Name (track_type )
969965
970- if self .realtime_mode :
966+ if self .llm . handles_video :
971967 if self ._video_track :
972968 # We have a video publisher (e.g., YOLO processor)
973969 # Create a separate forwarder for the PROCESSED video track
@@ -1089,8 +1085,8 @@ async def recv(self):
10891085
10901086 async def _on_turn_event (self , event : TurnStartedEvent | TurnEndedEvent ) -> None :
10911087 """Handle turn detection events."""
1092- # In realtime mode, the LLM handles turn detection, interruption, and responses itself
1093- if self .realtime_mode :
1088+ # Skip the turn event handling if the model doesn't require TTS or SST audio itself.
1089+ if not ( self .llm . needs_tts and self . llm . needs_stt ) :
10941090 return
10951091
10961092 if isinstance (event , TurnStartedEvent ):
@@ -1105,71 +1101,63 @@ async def _on_turn_event(self, event: TurnStartedEvent | TurnEndedEvent) -> None
11051101 except Exception as e :
11061102 self .logger .error (f"Error stopping TTS: { e } " )
11071103 else :
1108- participant_id = event .participant .user_id if event .participant else "unknown"
1104+ participant_id = (
1105+ event .participant .user_id if event .participant else "unknown"
1106+ )
11091107 self .logger .info (
11101108 f"👉 Turn started - participant speaking { participant_id } : { event .confidence } "
11111109 )
11121110 else :
11131111 # Agent itself started speaking - this is normal
1114- participant_id = event .participant .user_id if event .participant else "unknown"
1115- self .logger .debug (
1116- f"👉 Turn started - agent speaking { participant_id } "
1112+ participant_id = (
1113+ event .participant .user_id if event .participant else "unknown"
11171114 )
1115+ self .logger .debug (f"👉 Turn started - agent speaking { participant_id } " )
11181116 elif isinstance (event , TurnEndedEvent ):
1119- participant_id = event .participant .user_id if event .participant else "unknown"
1117+ participant_id = (
1118+ event .participant .user_id if event .participant else "unknown"
1119+ )
11201120 self .logger .info (
11211121 f"👉 Turn ended - participant { participant_id } finished (confidence: { event .confidence } )"
11221122 )
1123+ if not event .participant or event .participant .user_id == self .agent_user .id :
1124+ # Exit early if the event is triggered by the model response.
1125+ return
11231126
1124- # When turn detection is enabled, trigger LLM response when user's turn ends
1127+ # When turn detection is enabled, trigger LLM response when user's turn ends.
11251128 # This is the signal that the user has finished speaking and expects a response
1126- if event .participant and event .participant .user_id != self .agent_user .id :
1127- # Get the accumulated transcript for this speaker
1128- transcript = self ._pending_user_transcripts .get (
1129- event .participant .user_id , ""
1129+ transcript = self ._pending_user_transcripts .get (
1130+ event .participant .user_id , ""
1131+ )
1132+ if transcript .strip ():
1133+ self .logger .info (
1134+ f"🤖 Triggering LLM response after turn ended for { event .participant .user_id } "
11301135 )
11311136
1132- if transcript and transcript .strip ():
1133- self .logger .info (
1134- f"🤖 Triggering LLM response after turn ended for { event .participant .user_id } "
1135- )
1136-
1137- # Create participant object if we have metadata
1138- participant = None
1139- if hasattr (event , "custom" ) and event .custom :
1140- # Try to extract participant info from custom metadata
1141- participant = event .custom .get ("participant" )
1137+ # Create participant object if we have metadata
1138+ participant = None
1139+ if hasattr (event , "custom" ) and event .custom :
1140+ # Try to extract participant info from custom metadata
1141+ participant = event .custom .get ("participant" )
11421142
1143- # Trigger LLM response with the complete transcript
1144- if self .llm :
1145- await self .simple_response (transcript , participant )
1143+ # Trigger LLM response with the complete transcript
1144+ await self .simple_response (transcript , participant )
11461145
1147- # Clear the pending transcript for this speaker
1148- self ._pending_user_transcripts [event .participant .user_id ] = ""
1146+ # Clear the pending transcript for this speaker
1147+ self ._pending_user_transcripts [event .participant .user_id ] = ""
11491148
11501149 async def _on_stt_error (self , error ):
11511150 """Handle STT service errors."""
11521151 self .logger .error (f"❌ STT Error: { error } " )
11531152
1154- @property
1155- def realtime_mode (self ) -> bool :
1156- """Check if the agent is in Realtime mode.
1157-
1158- Returns:
1159- True if `llm` is a `Realtime` implementation; otherwise False.
1160- """
1161- if self .llm is not None and isinstance (self .llm , Realtime ):
1162- return True
1163- return False
1164-
11651153 @property
11661154 def publish_audio (self ) -> bool :
11671155 """Whether the agent should publish an outbound audio track.
11681156
11691157 Returns:
11701158 True if TTS is configured or when in Realtime mode.
11711159 """
1172- if self .tts is not None or self .realtime_mode :
1160+ if self .tts is not None or self .llm . handles_audio :
11731161 return True
11741162 return False
11751163
@@ -1203,9 +1191,7 @@ def _needs_audio_or_video_input(self) -> bool:
12031191 # Video input needed for:
12041192 # - Video processors (for frame analysis)
12051193 # - Realtime mode with video (multimodal LLMs)
1206- needs_video = len (self .video_processors ) > 0 or (
1207- self .realtime_mode and isinstance (self .llm , Realtime )
1208- )
1194+ needs_video = len (self .video_processors ) > 0 or self .llm .handles_video
12091195
12101196 return needs_audio or needs_video
12111197
@@ -1256,7 +1242,7 @@ def image_processors(self) -> List[Any]:
12561242
12571243 def _validate_configuration (self ):
12581244 """Validate the agent configuration."""
1259- if self .realtime_mode :
1245+ if self .llm . handles_audio :
12601246 # Realtime mode - should not have separate STT/TTS
12611247 if self .stt or self .tts :
12621248 self .logger .warning (
@@ -1293,7 +1279,7 @@ def _prepare_rtc(self):
12931279
12941280 # Set up audio track if TTS is available
12951281 if self .publish_audio :
1296- if self .realtime_mode and isinstance ( self . llm , Realtime ) :
1282+ if self .llm . handles_audio :
12971283 self ._audio_track = self .llm .output_track
12981284 self .logger .info ("🎵 Using Realtime provider output track for audio" )
12991285 else :
0 commit comments