diff --git a/_posts/2024-05-09-speech-conversational-llms.md b/_posts/2024-05-09-speech-conversational-llms.md new file mode 100644 index 00000000..32ee3e5a --- /dev/null +++ b/_posts/2024-05-09-speech-conversational-llms.md @@ -0,0 +1,67 @@ +--- +title: Speech LLMs for Conversations +date: 2024-05-09 +tags: [llm, speech, conversations] +categories: [Machine Learning] +layout: post +authors: [Shangeth, lepisma] +latex: True +--- + +With LLMs, making conversational systems has become easier. You no longer need to +focus on the low-level details of categorizing semantics and designing +responses. Instead, you can concentrate on controlling high-level behaviors via +an LLM. This is the trend that we see most of the world moving towards as +products are using vendor combinations of ASR, LLM, and TTS with some dialog +management stitched in between. While this is going to be the norm soon, we want +to keep exploring areas from where the next set of quality improvements will +come. + +[Earlier](/speech-first-conversational-ai-revisited/) we discussed how spoken +conversations are richer than pure text and how the gap would not be bridged by +LLMs purely working on transcriptions. In one of our recent experiments we built +an efficient multi-modal LLM that takes speech directly to provide a better +conversational experience. For production usage, the constraint here is that +this should happen without losing the flexibility that you get in a text-only +LLM around writing prompts, making changes, evaluating, and debugging. + +Below is a conversation with our recent in-house Speech LLM based conversational +system. Notice that, because of the extra information in speech, some +micro-personalizations can happen, like the usage of gendered pronouns[^1]. You also get +a lower impact of transcription errors and, in general, better responses to +non-speech signals. 
With access to both speech and text domains, the model +allows for more fluent turn-taking, though this is not demonstrated in the current +conversation. In addition, our approach reduces the combined model size +(<2B) for taking speech to response, leading to lower compute latency as +compared to larger systems. + + + +
+ + + +The model above doesn't yet control speech synthesis beyond the textual markers +it can generate, but that's something to be added soon (you might have noticed +erratic pitch shifts in the call above since TTS vendors don't contextualize +based on past conversations). Stay tuned for more details on how we take this +and similar research areas forward. + +[^1]: Of course, concerns around paralinguistic prediction accuracies are + extremely important to address before taking something like this to production. diff --git a/assets/audios/posts/speech-conversational-llms/audio.m4a b/assets/audios/posts/speech-conversational-llms/audio.m4a new file mode 100644 index 00000000..a9739a88 Binary files /dev/null and b/assets/audios/posts/speech-conversational-llms/audio.m4a differ diff --git a/assets/audios/posts/speech-conversational-llms/metadata.vtt b/assets/audios/posts/speech-conversational-llms/metadata.vtt new file mode 100644 index 00000000..0fa14575 --- /dev/null +++ b/assets/audios/posts/speech-conversational-llms/metadata.vtt @@ -0,0 +1,34 @@ +WEBVTT + +00:00:04.940 --> 00:00:06.880 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "female, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:00:17.530 --> 00:00:20.840 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "female, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:00:29.080 --> 00:00:33.400 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", 
"hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:00:47.420 --> 00:00:52.020 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:01:07.190 --> 00:01:11.510 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, oceania", "title_alt": "", "subjects_alt": ""} + +00:01:20.900 --> 00:01:22.530 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:01:32.440 --> 00:01:37.700 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:01:44.770 --> 00:01:48.200 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:01:52.680 
--> 00:01:53.790 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:02:01.050 --> 00:02:09.710 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} + +00:02:33.770 --> 00:02:36.590 +{"keywords_alt": "", "gpspoints": {"gps_zoom": "", "gps_text_alt": "", "gps_text": "", "": ""}, "synopsis": "", "subjects": "", "hyperlinks": {"hyperlink_text_alt": "", "hyperlink_text": "", "hyperlink": ""}, "synopsis_alt": "", "title": "", "keywords": "male, middle-aged, neutral, america", "title_alt": "", "subjects_alt": ""} \ No newline at end of file diff --git a/assets/audios/posts/speech-conversational-llms/transcript.vtt b/assets/audios/posts/speech-conversational-llms/transcript.vtt new file mode 100644 index 00000000..12eeaa68 --- /dev/null +++ b/assets/audios/posts/speech-conversational-llms/transcript.vtt @@ -0,0 +1,75 @@ +WEBVTT +Kind: captions +Language: en + +00:00:00.000 --> 00:00:03.040 +