@@ -56,7 +56,7 @@ def parse_response(response):
56
56
return output
57
57
58
58
59
- def transcribe_chunks (client , audio_chunks , model : str , language_code : str , raw : bool = False , word_level : bool = False ):
59
+ def transcribe_chunks_streaming (client , audio_chunks , model : str , language_code : str , raw : bool = False , word_level : bool = False ):
60
60
"""
61
61
Transcribe the given audio chunks
62
62
"""
@@ -66,7 +66,6 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
66
66
67
67
try :
68
68
if raw :
69
- print ('streaming raw' )
70
69
config = lambda chunk_len : RecognitionConfig (
71
70
sample_rate_hertz = SR ,
72
71
encoding = encoding ,
@@ -80,7 +79,6 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
80
79
audio_params = [(config (len (chunk )), RecognitionAudio (content = chunk )) for chunk in audio_chunks ]
81
80
response = client .streaming_recognize_raw (audio_params , uuid = "" )
82
81
else :
83
- print ('streaming with headers' )
84
82
audio = (RecognitionAudio (content = chunk ) for chunk in audio_chunks )
85
83
config = RecognitionConfig (
86
84
sample_rate_hertz = SR ,
@@ -97,6 +95,38 @@ def transcribe_chunks(client, audio_chunks, model: str, language_code: str, raw:
97
95
98
96
pprint (parse_response (response ))
99
97
98
def transcribe_chunks_bidi_streaming(client, audio_chunks, model: str, language_code: str, word_level: bool = False):
    """
    Transcribe the given audio chunks over a bidirectional stream.

    Builds one RecognitionConfig per chunk (raw mode, with the chunk's
    byte length in ``data_bytes``), lazily pairs each config with its
    RecognitionAudio, feeds the pairs to the client's bidi streaming
    endpoint, and pretty-prints each parsed response as it arrives.

    :param client: speech client exposing ``bidi_streaming_recognize_raw``
    :param audio_chunks: iterable of raw audio byte chunks
    :param model: model name forwarded to RecognitionConfig
    :param language_code: language code forwarded to RecognitionConfig
    :param word_level: request word-level results when True
    """
    encoding = RecognitionConfig.AudioEncoding.LINEAR16

    def make_config(chunk_len):
        # A fresh config per chunk: the server needs the exact byte count.
        return RecognitionConfig(
            sample_rate_hertz=SR,
            encoding=encoding,
            language_code=language_code,
            max_alternatives=10,
            model=model,
            raw=True,
            word_level=word_level,
            data_bytes=chunk_len
        )

    def audio_params_gen(chunks):
        # Lazy: chunks may come from a live source (e.g. the microphone).
        for chunk in chunks:
            yield make_config(len(chunk)), RecognitionAudio(content=chunk)

    try:
        response_gen = client.bidi_streaming_recognize_raw(audio_params_gen(audio_chunks), uuid="")
    except Exception as e:
        traceback.print_exc()
        print(f'error: {str(e)}')
        # Bail out: without a response generator there is nothing to consume.
        # (The original fell through here and raised NameError on response_gen,
        # masking the real failure.)
        return

    for response in response_gen:
        pprint(parse_response(response))
100
130
101
131
def decode_files (client , audio_paths : List [str ], model : str , language_code : str , raw : bool = False , pcm : bool = False , word_level : bool = False ):
102
132
"""
@@ -105,7 +135,7 @@ def decode_files(client, audio_paths: List[str], model: str, language_code: str,
105
135
chunked_audios = [chunks_from_file (x , chunk_size = random .randint (1 , 3 ), raw = raw , pcm = pcm ) for x in audio_paths ]
106
136
107
137
threads = [
108
- threading .Thread (target = transcribe_chunks , args = (client , chunks , model , language_code , raw , word_level ))
138
+ threading .Thread (target = transcribe_chunks_streaming , args = (client , chunks , model , language_code , raw , word_level ))
109
139
for chunks in chunked_audios
110
140
]
111
141
@@ -126,6 +156,6 @@ def decode_files(client, audio_paths: List[str], model: str, language_code: str,
126
156
word_level = args ["--word-level" ]
127
157
128
158
if args ["mic" ]:
129
- transcribe_chunks (client , chunks_from_mic (int (args ["--n-secs" ]), SR , 1 ), model , language_code , raw , word_level )
159
+ transcribe_chunks_bidi_streaming (client , chunks_from_mic (int (args ["--n-secs" ]), SR , 1 ), model , language_code , word_level )
130
160
else :
131
161
decode_files (client , args ["<file>" ], model , language_code , raw , pcm , word_level )
0 commit comments