From 84d8fb4e38a7e84376bb0c2f5e8e11d48342fec8 Mon Sep 17 00:00:00 2001 From: kercre123 Date: Tue, 7 May 2024 22:52:54 -0500 Subject: [PATCH] Only process the chunks that have active frames in them (vosk) --- chipper/go.mod | 2 ++ chipper/go.sum | 4 ++++ chipper/pkg/wirepod/preqs/stream_houndify.go | 2 +- chipper/pkg/wirepod/speechrequest/speechrequest.go | 11 +++++++---- chipper/pkg/wirepod/stt/coqui/Coqui.go | 2 +- chipper/pkg/wirepod/stt/houndify/Houndify.go | 2 +- chipper/pkg/wirepod/stt/leopard/Leopard.go | 2 +- chipper/pkg/wirepod/stt/vosk/Vosk.go | 8 ++++---- chipper/pkg/wirepod/stt/whisper.cpp/WhisperCpp.go | 2 +- chipper/pkg/wirepod/stt/whisper/Whisper.go | 2 +- 10 files changed, 23 insertions(+), 14 deletions(-) diff --git a/chipper/go.mod b/chipper/go.mod index 8606bd00..128497e2 100644 --- a/chipper/go.mod +++ b/chipper/go.mod @@ -37,6 +37,7 @@ require ( github.com/akavel/rsrc v0.10.2 // indirect github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc // indirect github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect + github.com/alphacep/vosk-api/go v0.3.50 // indirect github.com/cenkalti/backoff v2.2.1+incompatible // indirect github.com/currantlabs/ble v0.0.0-20171229162446-c1d21c164cf8 // indirect github.com/dchest/jsmin v0.0.0-20220218165748-59f39799265f // indirect @@ -49,6 +50,7 @@ require ( github.com/hashicorp/hcl v1.0.0 // indirect github.com/jamesruan/sodium v0.0.0-20181216154042-9620b83ffeae // indirect github.com/josephspurrier/goversioninfo v1.4.0 // indirect + github.com/kercre123/vosk-api v1.0.1 // indirect github.com/kr/text v0.2.0 // indirect github.com/magiconair/properties v1.8.1 // indirect github.com/mattn/go-colorable v0.1.8 // indirect diff --git a/chipper/go.sum b/chipper/go.sum index e1a78acf..cf494332 100644 --- a/chipper/go.sum +++ b/chipper/go.sum @@ -55,6 +55,8 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5Vpd github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf h1:qet1QNfXsQxTZqLG4oE62mJzwPIB8+Tee4RNCL9ulrY= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alphacep/vosk-api/go v0.3.50 h1:2vSN41RCU1WdHEqBrhKtTggfKL6Yu5Dmj+urVszwiuw= +github.com/alphacep/vosk-api/go v0.3.50/go.mod h1:9X8IJsHnFk/b1xyvjlZifo+ZL5VTAx3LW+JQce/eRcA= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= @@ -306,6 +308,8 @@ github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7V github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= github.com/karrick/godirwalk v1.8.0/go.mod h1:H5KPZjojv4lE+QYImBI8xVtrBRgYrIVsaRPx4tDPEn4= github.com/karrick/godirwalk v1.10.3/go.mod h1:RoGL9dQei4vP9ilrpETWE8CLOZ1kiN0LhBygSwrAsHA= +github.com/kercre123/vosk-api v1.0.1 h1:D5CeAMNHPj87M9fKrqP+a2gEQefq7sJCpaiuRscbiJY= +github.com/kercre123/vosk-api v1.0.1/go.mod h1:mJlLhtYS207jVY9QffYGxhX6Up0UfSQ3p0uNbXsf3Zc= github.com/kercre123/vosk-api/go v1.0.2 h1:NDJUNv2ddw128amiVZ2xE2gKfKHeBRRhboSh+yiH6Wg= github.com/kercre123/vosk-api/go v1.0.2/go.mod h1:oVZG/VFmg23uNDzjShcw7UhZHWYG2zXgBm5FqioE2Ao= github.com/kercre123/zeroconf v1.0.1 h1:Mbd8mN6xnNWYIqBN38x3jJjiPP2RmK4orzbGZsa1EOY= diff --git a/chipper/pkg/wirepod/preqs/stream_houndify.go b/chipper/pkg/wirepod/preqs/stream_houndify.go index 5e4f9fde..0fa49dd5 100644 --- a/chipper/pkg/wirepod/preqs/stream_houndify.go +++ b/chipper/pkg/wirepod/preqs/stream_houndify.go @@ -28,7 +28,7 @@ func StreamAudioToHoundify(sreq sr.SpeechRequest, client houndify.Client) string default: var chunk []byte chunk, err = sreq.GetNextStreamChunkOpus() - speechDone = sreq.DetectEndOfSpeech() + speechDone, _ = sreq.DetectEndOfSpeech() if err != nil { fmt.Println("End of stream") return diff --git a/chipper/pkg/wirepod/speechrequest/speechrequest.go b/chipper/pkg/wirepod/speechrequest/speechrequest.go index c6f504c4..4b3a2557 100755 --- a/chipper/pkg/wirepod/speechrequest/speechrequest.go +++ b/chipper/pkg/wirepod/speechrequest/speechrequest.go @@ -101,7 +101,7 @@ func BytesToIntVAD(stream opus.OggStream, data []byte, die bool, isOpus bool) [] } // Uses VAD to detect when the user stops speaking -func (req *SpeechRequest) DetectEndOfSpeech() bool { +func (req *SpeechRequest) DetectEndOfSpeech() (bool, bool) { // changes InactiveFrames and ActiveFrames in req inactiveNumMax := 23 vad := req.VADInst @@ -111,7 +111,7 @@ func (req *SpeechRequest) DetectEndOfSpeech() bool { if err != nil { logger.Println("VAD err:") logger.Println(err) - return true + return true, false } if active { req.ActiveFrames = req.ActiveFrames + 1 @@ -121,10 +121,13 @@ func (req *SpeechRequest) DetectEndOfSpeech() bool { } if req.InactiveFrames >= inactiveNumMax && req.ActiveFrames > 18 { logger.Println("(Bot " + req.Device + ") End of speech detected.") - return true + return true, true } } - return false + if req.ActiveFrames < 5 { + return false, false + } + return false, true } // Converts a vtt.*Request to a SpeechRequest, which allows functions like DetectEndOfSpeech to work diff --git a/chipper/pkg/wirepod/stt/coqui/Coqui.go b/chipper/pkg/wirepod/stt/coqui/Coqui.go index 757f788f..47dfb995 100755 --- a/chipper/pkg/wirepod/stt/coqui/Coqui.go +++ b/chipper/pkg/wirepod/stt/coqui/Coqui.go @@ -70,7 +70,7 @@ func STT(req sr.SpeechRequest) (string, error) { return "", err } coquiStream.FeedAudioContent(sr.BytesToSamples(chunk)) - speechIsDone = req.DetectEndOfSpeech() + speechIsDone, _ = req.DetectEndOfSpeech() if speechIsDone { break } diff --git a/chipper/pkg/wirepod/stt/houndify/Houndify.go b/chipper/pkg/wirepod/stt/houndify/Houndify.go index 43827059..26eefb3e 100755 --- a/chipper/pkg/wirepod/stt/houndify/Houndify.go +++ b/chipper/pkg/wirepod/stt/houndify/Houndify.go @@ -58,7 +58,7 @@ func STT(sreq sr.SpeechRequest) (string, error) { default: var chunk []byte chunk, err = sreq.GetNextStreamChunkOpus() - speechDone = sreq.DetectEndOfSpeech() + speechDone, _ = sreq.DetectEndOfSpeech() if err != nil { fmt.Println("End of stream") return diff --git a/chipper/pkg/wirepod/stt/leopard/Leopard.go b/chipper/pkg/wirepod/stt/leopard/Leopard.go index 0e192281..d946e85c 100755 --- a/chipper/pkg/wirepod/stt/leopard/Leopard.go +++ b/chipper/pkg/wirepod/stt/leopard/Leopard.go @@ -77,7 +77,7 @@ func STT(req sr.SpeechRequest) (transcribedText string, err error) { BotNumMu.Unlock() return "", err } - speechIsDone = req.DetectEndOfSpeech() + speechIsDone, _ = req.DetectEndOfSpeech() if speechIsDone { break } diff --git a/chipper/pkg/wirepod/stt/vosk/Vosk.go b/chipper/pkg/wirepod/stt/vosk/Vosk.go index 4405efc1..286c3203 100755 --- a/chipper/pkg/wirepod/stt/vosk/Vosk.go +++ b/chipper/pkg/wirepod/stt/vosk/Vosk.go @@ -185,7 +185,6 @@ func getRec(withGrm bool) (*vosk.VoskRecognizer, int) { func STT(req sr.SpeechRequest) (string, error) { logger.Println("(Bot " + req.Device + ", Vosk) Processing...") - speechIsDone := false var withGrm bool if (vars.APIConfig.Knowledge.IntentGraph || req.IsKG) || !GrammerEnable { logger.Println("Using general recognizer") @@ -203,9 +202,10 @@ func STT(req sr.SpeechRequest) (string, error) { if err != nil { return "", err } - rec.AcceptWaveform(chunk) - // has to be split into 320 []byte chunks for VAD - speechIsDone = req.DetectEndOfSpeech() + speechIsDone, doProcess := req.DetectEndOfSpeech() + if doProcess { + rec.AcceptWaveform(chunk) + } if speechIsDone { break } diff --git a/chipper/pkg/wirepod/stt/whisper.cpp/WhisperCpp.go b/chipper/pkg/wirepod/stt/whisper.cpp/WhisperCpp.go index 55425d3f..b4305ba1 100644 --- a/chipper/pkg/wirepod/stt/whisper.cpp/WhisperCpp.go +++ b/chipper/pkg/wirepod/stt/whisper.cpp/WhisperCpp.go @@ -65,7 +65,7 @@ func STT(req sr.SpeechRequest) (string, error) { return "", err } // has to be split into 320 []byte chunks for VAD - speechIsDone = req.DetectEndOfSpeech() + speechIsDone, _ = req.DetectEndOfSpeech() if speechIsDone { break } diff --git a/chipper/pkg/wirepod/stt/whisper/Whisper.go b/chipper/pkg/wirepod/stt/whisper/Whisper.go index e5dd67a6..96e985b1 100755 --- a/chipper/pkg/wirepod/stt/whisper/Whisper.go +++ b/chipper/pkg/wirepod/stt/whisper/Whisper.go @@ -120,7 +120,7 @@ func STT(req sr.SpeechRequest) (string, error) { return "", err } // has to be split into 320 []byte chunks for VAD - speechIsDone = req.DetectEndOfSpeech() + speechIsDone, _ = req.DetectEndOfSpeech() if speechIsDone { break }