"""
Simple implementation of a websocket interface to faster-whisper or
whisper.cpp, intended to act like the "kaldi-gstreamer" voice recognition
server for use with Konele app clients from Android phones.
This uses faster-whisper, which can be installed like this:
pip install faster-whisper
This version keeps all received audio data in memory rather than in a temp file.
This version also incorporates some simple regex-based corrections. For
example, if you are talking to someone named "Conor" and whisper keeps
writing this as "Conner" or "Connor", you can have the transcription
corrected automatically. See "corrections" below.
Example of an open from konele. Note the user-id in the URL, which we send
back with responses:
"GET /client/ws/speech?lang=und&user-agent=RecognizerIntentActivity%2F1.8.14%3B+Google%2Fbarbet%2Flineage_barbet-userdebug+13+TQ1A.230105.001.A2+48648b48b6%3B+null%2Fnull&calling-package=null&user-id=066be6e0-3e5e-4d58-b2e5-fabdd7b549d3&partial=true&content-type=audio%2Fx-raw%2C+layout%3D%28string%29interleaved%2C+rate%3D%28int%2916000%2C+format%3D%28string%29S16LE%2C+channels%3D%28int%291 HTTP/1.1"
What library can we use to parse that URL? (See the urllib.parse sketch after
on_connect below.)
client.py is a program in the kaldi-gstreamer github (link: FIXME) that
is used to test the server.
When client.py connects, this is written by this program (or a sub-library) to stderr:
192.168.1.211 - - [30/Jan/2023 10:42:33] "GET /client/ws/speech?content-type=audio%2Fx-raw%2C+layout%3D%28string%29interleaved%2C+rate%3D%28int%2916000%2C+format%3D%28string%29S16LE%2C+channels%3D%28int%291 HTTP/1.1" 101 46
When konele connects, this is written by this program (or a sub-library) to stderr:
192.168.1.144 - - [30/Jan/2023 10:44:37] "GET /client/ws/speech?lang=und&user-agent=RecognizerIntentActivity%2F1.8.14%3B+Google%2Fbarbet%2Flineage_barbet-userdebug+13+TQ1A.230105.001.A2+48648b48b6%3B+null%2Fnull&calling-package=null&user-id=066be6e0-3e5e-4d58-b2e5-fabdd7b549d3&partial=true&content-type=audio%2Fx-raw%2C+layout%3D%28string%29interleaved%2C+rate%3D%28int%2916000%2C+format%3D%28string%29S16LE%2C+channels%3D%28int%291 HTTP/1.1" 101 46
SO: konele is sending x-raw audio, interleaved, rate 16000, S16LE, channels 1.
At that rate the stream is 16000 samples/second at 16 bits per sample,
i.e. about 32 KB/s of audio data.
The content-type is present in both requests, but only konele sends the
"user-id" info, which is presumably meant to be sent back by the server in each response?
WHO is actually writing that request line to stderr?
limitations:
audio must be 16k samples/second, each sample 16 bits signed, so about 32 KB/s of data
only one request at a time
no incremental results yet
no authentication on connections
"""
from wsocket import WSocketApp, WebSocketError, run
import re
import numpy as np
from faster_whisper import WhisperModel
__author__ = "Rich Drewes"
__version__ = "1.0"
__license__ = "MIT"
model_size = "base.en"
# for some reason the "tiny" model duplicates recognition output on my test file;
# the "base" model does not have this problem
#logger.setLevel(10) # for debugging
# RPD: later versions of wsocket want the "client" argument to on_close,
# but wsocket installed from pip right now does not want that arg
#def on_close(self, message, client):
def on_close(self, message):
    #print(repr(client) + " : " + message)
    print("close, message:", message)
# does this override the on_connect in wsocket, or does it run in addition to it (+=)?
def on_connect(client):
    # RPD: how do we print the full URL of the connect?
    print("RPD: on_connect in minimal-fw-mem.py, client:", repr(client) + " connected")
    print("RPD: on_connect in minimal-fw-mem.py client path:", client.path)
    #exit()
    client.abuf = []  # list of binary audio segments that will be joined at EOS
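
# A sketch answering the docstring's question about parsing the request URL: the standard
# library's urllib.parse can pull out the "user-id" and "content-type" fields that konele
# sends. Illustrative only; it is not wired into on_connect above.
def parse_speech_url(path):
    from urllib.parse import urlparse, parse_qs
    query = parse_qs(urlparse(path).query)
    user_id = query.get("user-id", [None])[0]
    content_type = query.get("content-type", [None])[0]
    return user_id, content_type
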
model = WhisperModel(model_size, device="cpu", compute_type="int8")
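# An aside, not tested in this file: on a machine with a CUDA GPU, faster-whisper can
# instead be loaded as e.g. WhisperModel(model_size, device="cuda", compute_type="float16").
# The int8 CPU configuration above is the conservative default this server uses.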
# always make the following user-specific corrections
# for example, 99% of the time when I say Conor Drewes I am referring to
# my son Conor Drewes not Connor Drews!
corrections = [
    (r"Conner", "Conor"),
    (r"Connor", "Conor"),
    (r"Collin", "Colin"),
    (r"Drews", "Drewes")
]
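
# A sketch, not used by on_message below: precompiling the correction patterns and anchoring
# them on word boundaries would avoid rewriting substrings of longer words (e.g. "Connorsville"
# would be left alone). Purely illustrative.
compiled_corrections = [(re.compile(r"\b" + pat + r"\b"), rep) for pat, rep in corrections]

def apply_corrections(text):
    for pattern, replacement in compiled_corrections:
        text = pattern.sub(replacement, text)
    return text
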
def on_message(message, client):
    #print(repr(client) + " : " + repr(message))
    # audio messages arrive as class 'bytearray'; the final "EOS" message arrives as class 'str'
    #print(type(message))
    try:
        if isinstance(message, str): # "EOS" message
            # what if this takes so long the client gives up? should send incremental results,
            # or at least pretend results, while we process
            audio_data = b"".join(client.abuf)
            print(f"audio buffer had {len(client.abuf)} segments, final length {len(audio_data)}")
            # faster-whisper expects 16 kHz mono float32 samples in [-1.0, 1.0]
            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            segments, _ = model.transcribe(audio_np)
            rr = []
            for segment in segments:
                rr.append(segment.text)
            m = " ".join(rr)
            print("recognition result before strip:", m)
            m = m.strip()
            # sometimes whisper puts in commentary like [soft music] and we strip that out
            # (non-greedy, so text between two separate bracketed annotations survives):
            print("recognition result before bracket re:", m)
            m = re.sub(r"\[.*?\]", "", m)
            print("recognition result after bracket re:", m)
            # was having problems with things like "He said hello" which became He said, "Hello"
            m = re.sub(r'"', '\\"', m) # escape " as \" for the hand-built JSON below
            print("recognition result after quoting:", m)
            for a, b in corrections:
                m = re.sub(a, b, m)
            print("recognition result after site-local corrections:", m)
            # send json result to konele client
            # (note: the id is hard-coded here; the user-id from the request URL could
            # presumably be echoed back instead, as the docstring suggests):
            msg = f'{{"status": 0, "segment": 0, "result": {{"hypotheses": [{{"transcript": "{m}"}}], "final": true}}, "id": "1aacc69d-3674-438a-b3c5-fc0ed51769a5"}}'
            print("msg is", msg)
            client.send(msg)
            client.close()
        else: # hopefully message is <class 'bytearray'>, and we just accumulate audio data
            client.abuf.append(message)
            print(f"audio buf now has {len(client.abuf)} segments")
    except WebSocketError:
        print("WebSocketError")
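
# A sketch of an alternative way to build the reply (the field names and the hard-coded id are
# carried over from the f-string above, not from any konele documentation): json.dumps handles
# quoting and escaping automatically, so the manual \" substitution would be unnecessary.
import json

def build_result_message(transcript, final=True, segment=0,
                         request_id="1aacc69d-3674-438a-b3c5-fc0ed51769a5"):
    # per the docstring, the user-id from the request URL could presumably be passed in
    # as request_id instead of the fixed value
    return json.dumps({
        "status": 0,
        "segment": segment,
        "result": {"hypotheses": [{"transcript": transcript}], "final": final},
        "id": request_id,
    })
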
app = WSocketApp()
app.onconnect += on_connect
app.onmessage += on_message
app.onclose += on_close
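
# A sketch of a minimal test client along the lines of kaldi-gstreamer's client.py, using the
# third-party websocket-client package (pip install websocket-client). It assumes a file of raw
# 16 kHz, 16-bit signed little-endian, mono PCM. Defined for reference only; run it from a
# separate process, since run() below blocks.
def example_test_client(raw_pcm_path, url="ws://localhost:9002/client/ws/speech"):
    import time
    import websocket  # from the websocket-client package
    ws = websocket.create_connection(url)
    with open(raw_pcm_path, "rb") as f:
        while True:
            chunk = f.read(8000)  # about 0.25 s of audio at 32 KB/s
            if not chunk:
                break
            ws.send_binary(chunk)
            time.sleep(0.25)  # roughly real-time pacing
    ws.send("EOS")
    print(ws.recv())  # JSON transcription result from the server
    ws.close()
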
run(app, host="0.0.0.0", port=9002)