diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3cbe4fd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/SenseVoiceSmall/
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/SenseVoice-Real-Time.iml b/.idea/SenseVoice-Real-Time.iml
new file mode 100644
index 0000000..d0876a7
--- /dev/null
+++ b/.idea/SenseVoice-Real-Time.iml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..2400551
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,20 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..fd14ed8
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..d56a259
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c937120
--- /dev/null
+++ b/README.md
@@ -0,0 +1,59 @@
+# SenseVoice-Real-Time
+
+This is a small learning project. It implements a very simple, energy-based voice activity detection (VAD) — no model involved, which is certainly something that could be improved — plus speech transcription with SenseVoice and speaker verification (a "voiceprint lock") with CAM++. Nothing sophisticated, just something to play with; if you somehow found your way here, consider it fate.
+
+
+
+Steps to run:
+
+1. **Install a CUDA build of torch; any version <= 2.3 will do (the download may be slow)**
+
+   ```
+   pip install torch==2.2.1+cu118 torchaudio==2.2.1+cu118 --index-url https://download.pytorch.org/whl/cu118
+   ```
+
+2. **Install the dependencies in requirements.txt**
+
+   ```
+   pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
+   ```
+
+3. **Download `SenseVoiceSmall` from `modelscope`**
+
+   Link: https://www.modelscope.cn/models/iic/SenseVoiceSmall/files
+
+   Or clone it directly with git lfs:
+
+   ```
+   git clone https://www.modelscope.cn/iic/SenseVoiceSmall.git
+   ```
+
+   After downloading, put it into the `SenseVoiceSmall` folder in the project root.
+
+4. **Download `iic/speech_campplus_sv_zh_en_16k-common_advanced` from `modelscope`**
+   Link: https://www.modelscope.cn/models/iic/speech_campplus_sv_zh_en_16k-common_advanced/files
+   Or clone it directly with git lfs:
+
+   ```
+   git clone https://www.modelscope.cn/iic/speech_campplus_sv_zh_en_16k-common_advanced.git
+   ```
+
+   After downloading, put it into the `speech_campplus_sv_zh_en_16k-common_advanced` folder in the project root.
+
+5. **Record your own audio (any quick phone recording will do) and convert its sample rate with the provided script**
+
+   First, put the recording into the `speakers` folder.
+   The project ships with `audio_convert.py`, which converts the audio to WAV and resamples it to 16 kHz, because the `speech_campplus` model only handles 16 kHz audio.
+   If you rename your audio file, remember to update the file name in `demo_record_natural_voice_lock.py` as well:
+
+   ```
+   def main():
+       # Create the output directory if it does not exist
+       save_directory = "audio_logs"
+       os.makedirs(save_directory, exist_ok=True)
+       # Load the voiceprint-lock reference audio; change this if you renamed your file
+       reference_audio = "speakers/speaker_mine_converted.wav"
+   ```
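+
+Optional sanity check: before wiring up the microphone loop, you can confirm that both downloaded models load and run on a single file. The snippet below is only a sketch and is not one of the project scripts; it assumes the folder layout from the steps above and reuses the calls already used in `demo1.py` and `demo_record_natural_voice_lock.py`. `speakers/speaker_mine_converted.wav` here stands for whatever converted recording you produced in step 5.
+
+```
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+from modelscope.pipelines import pipeline
+
+wav = "speakers/speaker_mine_converted.wav"  # your converted 16 kHz recording
+
+# SenseVoiceSmall transcription, same arguments as demo1.py
+asr = AutoModel(
+    model="./SenseVoiceSmall",
+    trust_remote_code=True,
+    remote_code="./model.py",
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cuda:0",
+)
+res = asr.generate(
+    input=wav,
+    cache={},
+    language="auto",
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,
+    merge_length_s=15,
+)
+print("ASR:", rich_transcription_postprocess(res[0]["text"]))
+
+# CAM++ speaker verification, same pipeline as demo_record_natural_voice_lock.py
+sv = pipeline(
+    task="speaker-verification",
+    model="./speech_campplus_sv_zh_en_16k-common_advanced",
+    model_revision="v1.0.0",
+)
+print("SV:", sv([wav, wav]))  # comparing a file with itself should score close to 1.0
+```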
+
+
diff --git a/audio_convert.py b/audio_convert.py
new file mode 100644
index 0000000..c8a0f52
--- /dev/null
+++ b/audio_convert.py
@@ -0,0 +1,11 @@
+from pydub import AudioSegment
+
+# Read the audio file; change the path and file name to your own recording
+audio = AudioSegment.from_file("./speakers/speaker_mine.mp3")
+
+# Convert to mono and adjust the sample rate
+audio = audio.set_channels(1)
+audio = audio.set_frame_rate(16000)  # resample to 16 kHz if needed
+
+# Save the converted audio file
+audio.export("./speakers/speaker_mine_converted.wav", format="wav")
diff --git a/demo1.py b/demo1.py
new file mode 100644
index 0000000..b3a8391
--- /dev/null
+++ b/demo1.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+# Copyright FunASR (https://github.com/FunAudioLLM/SenseVoice). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+model_dir = "./SenseVoiceSmall"
+
+model = AutoModel(
+    model=model_dir,
+    trust_remote_code=True,
+    remote_code="./model.py",
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cuda:0",
+)
+
+# en
+res = model.generate(
+    input=f"{model.model_path}/example/en.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# zh
+res = model.generate(
+    input=f"{model.model_path}/example/zh.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# yue
+res = model.generate(
+    input=f"{model.model_path}/example/yue.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# ja
+res = model.generate(
+    input=f"{model.model_path}/example/ja.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# ko
+res = model.generate(
+    input=f"{model.model_path}/example/ko.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
diff --git a/demo_record_natural_voice_lock.py b/demo_record_natural_voice_lock.py
new file mode 100644
index 0000000..a46c078
--- /dev/null
+++ b/demo_record_natural_voice_lock.py
@@ -0,0 +1,186 @@
+import glob
+import os
+
+import pyaudio
+import wave
+import numpy as np
+import time
+from modelscope.pipelines import pipeline
+from model import SenseVoiceSmall
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+# 音频参数
+CHUNK = 1024
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RATE = 16000
+MAX_TIME = 60  # 
最大录音时间(秒) + +# VAD 参数 +THRESHOLD = 500 +SILENCE_LIMIT = 2 + +# 声纹识别参数 +SIMILARITY_THRESHOLD = 0.1 # 相似度阈值,可以根据需要调整 + +# 初始化声纹识别模型 +sv_pipeline = pipeline( + task='speaker-verification', + model='./speech_campplus_sv_zh_en_16k-common_advanced', + model_revision='v1.0.0' +) + +# 初始化 SenseVoiceSmall 模型 +model_dir = "./SenseVoiceSmall" +m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir, device="cuda:0") +m.eval() + + +def is_silent(data_chunk): + return max(data_chunk) < THRESHOLD + + +def record_audio(): + p = pyaudio.PyAudio() + stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) + + print("开始监听...") + audio_buffer = [] + silence_start = None + is_recording = False + + while True: + data = stream.read(CHUNK) + audio_buffer.append(data) + + if len(audio_buffer) > RATE / CHUNK * MAX_TIME: + audio_buffer.pop(0) + + if not is_recording: + if not is_silent(np.frombuffer(data, dtype=np.int16)): + print("检测到声音,开始录音...") + is_recording = True + silence_start = None + else: + if is_silent(np.frombuffer(data, dtype=np.int16)): + if silence_start is None: + silence_start = time.time() + elif time.time() - silence_start > SILENCE_LIMIT: + print("检测到静音,停止录音") + break + else: + silence_start = None + + stream.stop_stream() + stream.close() + p.terminate() + + return b''.join(audio_buffer) + + +def save_audio(data, filename): + wf = wave.open(filename, 'wb') + wf.setnchannels(CHANNELS) + wf.setsampwidth(pyaudio.PyAudio().get_sample_size(FORMAT)) + wf.setframerate(RATE) + wf.writeframes(data) + wf.close() + + +def verify_voice(reference_audio, input_audio): + result = sv_pipeline([reference_audio, input_audio]) + print(result) + return result['score'] # 返回相似度得分 + + +def transcribe_audio(audio_file): + res = m.inference( + data_in=audio_file, + language="auto", + use_itn=False, + ban_emo_unk=False, + **kwargs, + ) + + text = rich_transcription_postprocess(res[0][0]["text"]) + return text + + +# 手动热词 +def replace_diy_hotword(sentence): + # 定义一个Map,用于存储高频错词和对应的替换词 + error_hotkey_map = { + '你冇': '蕾姆', + '我冇': '蕾姆', + '雷母': '蕾姆', + '雷姆': '蕾姆', + '蕾母': '蕾姆', + '雷冇': '蕾姆', + '蕾冇': '蕾姆', + '人母': '蕾姆', + '你悟': '蕾姆', + '你姆': '蕾姆', + '人冇': '蕾姆', + '人姆': '蕾姆', + '李慕': '蕾姆', + # 添加更多的错词和替换词 + } + for wrong_word, correct_word in error_hotkey_map.items(): + # 每次替换都更新 sentence + sentence = sentence.replace(wrong_word, correct_word) + # print("corrected_text:{}".format(sentence)) + return sentence + + +def main(): + # 创建保存目录(如果目录不存在) + save_directory = "audio_logs" + os.makedirs(save_directory, exist_ok=True) + # 加载声纹锁示例音频 + reference_audio = "speakers/speaker_mine_converted.wav" + + max_files = 10 # 最多保留的文件数量 + + while True: + audio_data = record_audio() + # 使用 os.path.join 来拼接保存路径 + output_filename = os.path.join(save_directory, f"recorded_audio_{int(time.time())}.wav") + save_audio(audio_data, output_filename) + + # 检查并删除最旧的文件(如果文件数量超过 max_files) + existing_files = glob.glob(os.path.join(save_directory, "*.wav")) + if len(existing_files) > max_files: + # 按文件的修改时间排序 + existing_files.sort(key=os.path.getmtime) + # 删除最旧的文件 + os.remove(existing_files[0]) + + print("正在进行声纹验证...") + similarity = verify_voice(reference_audio, output_filename) + + if similarity >= SIMILARITY_THRESHOLD: + print(f"声纹验证通过 (相似度: {similarity:.2f})") + print("正在进行语音识别...") + transcribed_text = replace_diy_hotword(transcribe_audio(output_filename)) + print("识别结果:") + print(transcribed_text) + + log_dir = 'speak_log' + if not os.path.exists(log_dir): + os.makedirs(log_dir) + with 
open(os.path.join(log_dir, 'log.txt'), 'a', encoding='utf-8') as f: + if transcribed_text is not None or transcribed_text != "": + f.write(transcribed_text) + f.write("\n") + f.flush() # 刷新缓冲区,确保数据写入磁盘 + else: + print(f"声纹验证失败 (相似度: {similarity:.2f})") + + print("\n准备进行下一次录音,按 Ctrl+C 退出程序") + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + print("程序已退出") diff --git a/model.py b/model.py new file mode 100644 index 0000000..0aee119 --- /dev/null +++ b/model.py @@ -0,0 +1,896 @@ + +import time +import torch +from torch import nn +import torch.nn.functional as F +from typing import Iterable, Optional + +from funasr.register import tables +from funasr.models.ctc.ctc import CTC +from funasr.utils.datadir_writer import DatadirWriter +from funasr.models.paraformer.search import Hypothesis +from funasr.train_utils.device_funcs import force_gatherable +from funasr.losses.label_smoothing_loss import LabelSmoothingLoss +from funasr.metrics.compute_acc import compute_accuracy, th_accuracy +from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank + + +class SinusoidalPositionEncoder(torch.nn.Module): + """ """ + + def __int__(self, d_model=80, dropout_rate=0.1): + pass + + def encode( + self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32 + ): + batch_size = positions.size(0) + positions = positions.type(dtype) + device = positions.device + log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / ( + depth / 2 - 1 + ) + inv_timescales = torch.exp( + torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment) + ) + inv_timescales = torch.reshape(inv_timescales, [batch_size, -1]) + scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape( + inv_timescales, [1, 1, -1] + ) + encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2) + return encoding.type(dtype) + + def forward(self, x): + batch_size, timesteps, input_dim = x.size() + positions = torch.arange(1, timesteps + 1, device=x.device)[None, :] + position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device) + + return x + position_encoding + + +class PositionwiseFeedForward(torch.nn.Module): + """Positionwise feed forward layer. + + Args: + idim (int): Input dimenstion. + hidden_units (int): The number of hidden units. + dropout_rate (float): Dropout rate. + + """ + + def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()): + """Construct an PositionwiseFeedForward object.""" + super(PositionwiseFeedForward, self).__init__() + self.w_1 = torch.nn.Linear(idim, hidden_units) + self.w_2 = torch.nn.Linear(hidden_units, idim) + self.dropout = torch.nn.Dropout(dropout_rate) + self.activation = activation + + def forward(self, x): + """Forward function.""" + return self.w_2(self.dropout(self.activation(self.w_1(x)))) + + +class MultiHeadedAttentionSANM(nn.Module): + """Multi-Head Attention layer. + + Args: + n_head (int): The number of heads. + n_feat (int): The number of features. + dropout_rate (float): Dropout rate. 
+ + """ + + def __init__( + self, + n_head, + in_feat, + n_feat, + dropout_rate, + kernel_size, + sanm_shfit=0, + lora_list=None, + lora_rank=8, + lora_alpha=16, + lora_dropout=0.1, + ): + """Construct an MultiHeadedAttention object.""" + super().__init__() + assert n_feat % n_head == 0 + # We assume d_v always equals d_k + self.d_k = n_feat // n_head + self.h = n_head + # self.linear_q = nn.Linear(n_feat, n_feat) + # self.linear_k = nn.Linear(n_feat, n_feat) + # self.linear_v = nn.Linear(n_feat, n_feat) + + self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3) + self.attn = None + self.dropout = nn.Dropout(p=dropout_rate) + + self.fsmn_block = nn.Conv1d( + n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False + ) + # padding + left_padding = (kernel_size - 1) // 2 + if sanm_shfit > 0: + left_padding = left_padding + sanm_shfit + right_padding = kernel_size - 1 - left_padding + self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0) + + def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None): + b, t, d = inputs.size() + if mask is not None: + mask = torch.reshape(mask, (b, -1, 1)) + if mask_shfit_chunk is not None: + mask = mask * mask_shfit_chunk + inputs = inputs * mask + + x = inputs.transpose(1, 2) + x = self.pad_fn(x) + x = self.fsmn_block(x) + x = x.transpose(1, 2) + x += inputs + x = self.dropout(x) + if mask is not None: + x = x * mask + return x + + def forward_qkv(self, x): + """Transform query, key and value. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + + Returns: + torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k). + torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k). + torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k). + + """ + b, t, d = x.size() + q_k_v = self.linear_q_k_v(x) + q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) + q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time1, d_k) + k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time2, d_k) + v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose( + 1, 2 + ) # (batch, head, time2, d_k) + + return q_h, k_h, v_h, v + + def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None): + """Compute attention context vector. + + Args: + value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k). + scores (torch.Tensor): Attention score (#batch, n_head, time1, time2). + mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2). + + Returns: + torch.Tensor: Transformed value (#batch, time1, d_model) + weighted by the attention score (#batch, time1, time2). 
+ + """ + n_batch = value.size(0) + if mask is not None: + if mask_att_chunk_encoder is not None: + mask = mask * mask_att_chunk_encoder + + mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) + + min_value = -float( + "inf" + ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) + scores = scores.masked_fill(mask, min_value) + self.attn = torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + + p_attn = self.dropout(self.attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). + + """ + q_h, k_h, v_h, v = self.forward_qkv(x) + fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) + return att_outs + fsmn_memory + + def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): + """Compute scaled dot product attention. + + Args: + query (torch.Tensor): Query tensor (#batch, time1, size). + key (torch.Tensor): Key tensor (#batch, time2, size). + value (torch.Tensor): Value tensor (#batch, time2, size). + mask (torch.Tensor): Mask tensor (#batch, 1, time2) or + (#batch, time1, time2). + + Returns: + torch.Tensor: Output tensor (#batch, time1, d_model). 
+ + """ + q_h, k_h, v_h, v = self.forward_qkv(x) + if chunk_size is not None and look_back > 0 or look_back == -1: + if cache is not None: + k_h_stride = k_h[:, :, : -(chunk_size[2]), :] + v_h_stride = v_h[:, :, : -(chunk_size[2]), :] + k_h = torch.cat((cache["k"], k_h), dim=2) + v_h = torch.cat((cache["v"], v_h), dim=2) + + cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2) + cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2) + if look_back != -1: + cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]) :, :] + cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]) :, :] + else: + cache_tmp = { + "k": k_h[:, :, : -(chunk_size[2]), :], + "v": v_h[:, :, : -(chunk_size[2]), :], + } + cache = cache_tmp + fsmn_memory = self.forward_fsmn(v, None) + q_h = q_h * self.d_k ** (-0.5) + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + att_outs = self.forward_attention(v_h, scores, None) + return att_outs + fsmn_memory, cache + + +class LayerNorm(nn.LayerNorm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, input): + output = F.layer_norm( + input.float(), + self.normalized_shape, + self.weight.float() if self.weight is not None else None, + self.bias.float() if self.bias is not None else None, + self.eps, + ) + return output.type_as(input) + + +def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device=None): + if maxlen is None: + maxlen = lengths.max() + row_vector = torch.arange(0, maxlen, 1).to(lengths.device) + matrix = torch.unsqueeze(lengths, dim=-1) + mask = row_vector < matrix + mask = mask.detach() + + return mask.type(dtype).to(device) if device is not None else mask.type(dtype) + + +class EncoderLayerSANM(nn.Module): + def __init__( + self, + in_size, + size, + self_attn, + feed_forward, + dropout_rate, + normalize_before=True, + concat_after=False, + stochastic_depth_rate=0.0, + ): + """Construct an EncoderLayer object.""" + super(EncoderLayerSANM, self).__init__() + self.self_attn = self_attn + self.feed_forward = feed_forward + self.norm1 = LayerNorm(in_size) + self.norm2 = LayerNorm(size) + self.dropout = nn.Dropout(dropout_rate) + self.in_size = in_size + self.size = size + self.normalize_before = normalize_before + self.concat_after = concat_after + if self.concat_after: + self.concat_linear = nn.Linear(size + size, size) + self.stochastic_depth_rate = stochastic_depth_rate + self.dropout_rate = dropout_rate + + def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). + + """ + skip_layer = False + # with stochastic depth, residual connection `x + f(x)` becomes + # `x <- x + 1 / (1 - p) * f(x)` at training time. 
+ stoch_layer_coeff = 1.0 + if self.training and self.stochastic_depth_rate > 0: + skip_layer = torch.rand(1).item() < self.stochastic_depth_rate + stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate) + + if skip_layer: + if cache is not None: + x = torch.cat([cache, x], dim=1) + return x, mask + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if self.concat_after: + x_concat = torch.cat( + ( + x, + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ), + ), + dim=-1, + ) + if self.in_size == self.size: + x = residual + stoch_layer_coeff * self.concat_linear(x_concat) + else: + x = stoch_layer_coeff * self.concat_linear(x_concat) + else: + if self.in_size == self.size: + x = residual + stoch_layer_coeff * self.dropout( + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ) + ) + else: + x = stoch_layer_coeff * self.dropout( + self.self_attn( + x, + mask, + mask_shfit_chunk=mask_shfit_chunk, + mask_att_chunk_encoder=mask_att_chunk_encoder, + ) + ) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm2(x) + + return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder + + def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): + """Compute encoded features. + + Args: + x_input (torch.Tensor): Input tensor (#batch, time, size). + mask (torch.Tensor): Mask tensor for the input (#batch, time). + cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size). + + Returns: + torch.Tensor: Output tensor (#batch, time, size). + torch.Tensor: Mask tensor (#batch, time). 
+ + """ + + residual = x + if self.normalize_before: + x = self.norm1(x) + + if self.in_size == self.size: + attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back) + x = residual + attn + else: + x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back) + + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + x = residual + self.feed_forward(x) + if not self.normalize_before: + x = self.norm2(x) + + return x, cache + + +@tables.register("encoder_classes", "SenseVoiceEncoderSmall") +class SenseVoiceEncoderSmall(nn.Module): + """ + Author: Speech Lab of DAMO Academy, Alibaba Group + SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition + https://arxiv.org/abs/2006.01713 + """ + + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + tp_blocks: int = 0, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + stochastic_depth_rate: float = 0.0, + input_layer: Optional[str] = "conv2d", + pos_enc_class=SinusoidalPositionEncoder, + normalize_before: bool = True, + concat_after: bool = False, + positionwise_layer_type: str = "linear", + positionwise_conv_kernel_size: int = 1, + padding_idx: int = -1, + kernel_size: int = 11, + sanm_shfit: int = 0, + selfattention_layer_type: str = "sanm", + **kwargs, + ): + super().__init__() + self._output_size = output_size + + self.embed = SinusoidalPositionEncoder() + + self.normalize_before = normalize_before + + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + ) + + encoder_selfattn_layer = MultiHeadedAttentionSANM + encoder_selfattn_layer_args0 = ( + attention_heads, + input_size, + output_size, + attention_dropout_rate, + kernel_size, + sanm_shfit, + ) + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + output_size, + attention_dropout_rate, + kernel_size, + sanm_shfit, + ) + + self.encoders0 = nn.ModuleList( + [ + EncoderLayerSANM( + input_size, + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args0), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + ) + for i in range(1) + ] + ) + self.encoders = nn.ModuleList( + [ + EncoderLayerSANM( + output_size, + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + ) + for i in range(num_blocks - 1) + ] + ) + + self.tp_encoders = nn.ModuleList( + [ + EncoderLayerSANM( + output_size, + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + dropout_rate, + ) + for i in range(tp_blocks) + ] + ) + + self.after_norm = LayerNorm(output_size) + + self.tp_norm = LayerNorm(output_size) + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs_pad: torch.Tensor, + ilens: torch.Tensor, + ): + """Embed positions in tensor.""" + masks = sequence_mask(ilens, device=ilens.device)[:, None, :] + + xs_pad *= self.output_size() ** 0.5 + + xs_pad = self.embed(xs_pad) + + # forward encoder1 + for layer_idx, encoder_layer in enumerate(self.encoders0): + encoder_outs = encoder_layer(xs_pad, masks) + xs_pad, masks = encoder_outs[0], encoder_outs[1] + + for layer_idx, encoder_layer in enumerate(self.encoders): + encoder_outs = encoder_layer(xs_pad, masks) + xs_pad, 
masks = encoder_outs[0], encoder_outs[1] + + xs_pad = self.after_norm(xs_pad) + + # forward encoder2 + olens = masks.squeeze(1).sum(1).int() + + for layer_idx, encoder_layer in enumerate(self.tp_encoders): + encoder_outs = encoder_layer(xs_pad, masks) + xs_pad, masks = encoder_outs[0], encoder_outs[1] + + xs_pad = self.tp_norm(xs_pad) + return xs_pad, olens + + +@tables.register("model_classes", "SenseVoiceSmall") +class SenseVoiceSmall(nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + + def __init__( + self, + specaug: str = None, + specaug_conf: dict = None, + normalize: str = None, + normalize_conf: dict = None, + encoder: str = None, + encoder_conf: dict = None, + ctc_conf: dict = None, + input_size: int = 80, + vocab_size: int = -1, + ignore_id: int = -1, + blank_id: int = 0, + sos: int = 1, + eos: int = 2, + length_normalized_loss: bool = False, + **kwargs, + ): + + super().__init__() + + if specaug is not None: + specaug_class = tables.specaug_classes.get(specaug) + specaug = specaug_class(**specaug_conf) + if normalize is not None: + normalize_class = tables.normalize_classes.get(normalize) + normalize = normalize_class(**normalize_conf) + encoder_class = tables.encoder_classes.get(encoder) + encoder = encoder_class(input_size=input_size, **encoder_conf) + encoder_output_size = encoder.output_size() + + if ctc_conf is None: + ctc_conf = {} + ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf) + + self.blank_id = blank_id + self.sos = sos if sos is not None else vocab_size - 1 + self.eos = eos if eos is not None else vocab_size - 1 + self.vocab_size = vocab_size + self.ignore_id = ignore_id + self.specaug = specaug + self.normalize = normalize + self.encoder = encoder + self.error_calculator = None + + self.ctc = ctc + + self.length_normalized_loss = length_normalized_loss + self.encoder_output_size = encoder_output_size + + self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13} + self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13} + self.textnorm_dict = {"withitn": 14, "woitn": 15} + self.textnorm_int_dict = {25016: 14, 25017: 15} + self.embed = torch.nn.Embedding(7 + len(self.lid_dict) + len(self.textnorm_dict), input_size) + self.emo_dict = {"unk": 25009, "happy": 25001, "sad": 25002, "angry": 25003, "neutral": 25004} + + self.criterion_att = LabelSmoothingLoss( + size=self.vocab_size, + padding_idx=self.ignore_id, + smoothing=kwargs.get("lsm_weight", 0.0), + normalize_length=self.length_normalized_loss, + ) + + @staticmethod + def from_pretrained(model:str=None, **kwargs): + from funasr import AutoModel + model, kwargs = AutoModel.build_model(model=model, trust_remote_code=True, **kwargs) + + return model, kwargs + + def forward( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + text: torch.Tensor, + text_lengths: torch.Tensor, + **kwargs, + ): + """Encoder + Decoder + Calc loss + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + """ + # import pdb; + # pdb.set_trace() + if len(text_lengths.size()) > 1: + text_lengths = text_lengths[:, 0] + if len(speech_lengths.size()) > 1: + speech_lengths = speech_lengths[:, 0] + + batch_size = speech.shape[0] + + # 1. 
Encoder + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, text) + + loss_ctc, cer_ctc = None, None + loss_rich, acc_rich = None, None + stats = dict() + + loss_ctc, cer_ctc = self._calc_ctc_loss( + encoder_out[:, 4:, :], encoder_out_lens - 4, text[:, 4:], text_lengths - 4 + ) + + loss_rich, acc_rich = self._calc_rich_ce_loss( + encoder_out[:, :4, :], text[:, :4] + ) + + loss = loss_ctc + loss_rich + # Collect total loss stats + stats["loss_ctc"] = torch.clone(loss_ctc.detach()) if loss_ctc is not None else None + stats["loss_rich"] = torch.clone(loss_rich.detach()) if loss_rich is not None else None + stats["loss"] = torch.clone(loss.detach()) if loss is not None else None + stats["acc_rich"] = acc_rich + + # force_gatherable: to-device and to-tensor if scalar for DataParallel + if self.length_normalized_loss: + batch_size = int((text_lengths + 1).sum()) + loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device) + return loss, stats, weight + + def encode( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + text: torch.Tensor, + **kwargs, + ): + """Frontend + Encoder. Note that this method is used by asr_inference.py + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + ind: int + """ + + # Data augmentation + if self.specaug is not None and self.training: + speech, speech_lengths = self.specaug(speech, speech_lengths) + + # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN + if self.normalize is not None: + speech, speech_lengths = self.normalize(speech, speech_lengths) + + + lids = torch.LongTensor([[self.lid_int_dict[int(lid)] if torch.rand(1) > 0.2 and int(lid) in self.lid_int_dict else 0 ] for lid in text[:, 0]]).to(speech.device) + language_query = self.embed(lids) + + styles = torch.LongTensor([[self.textnorm_int_dict[int(style)]] for style in text[:, 3]]).to(speech.device) + style_query = self.embed(styles) + speech = torch.cat((style_query, speech), dim=1) + speech_lengths += 1 + + event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(speech.size(0), 1, 1) + input_query = torch.cat((language_query, event_emo_query), dim=1) + speech = torch.cat((input_query, speech), dim=1) + speech_lengths += 3 + + encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths) + + return encoder_out, encoder_out_lens + + def _calc_ctc_loss( + self, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + ys_pad: torch.Tensor, + ys_pad_lens: torch.Tensor, + ): + # Calc CTC loss + loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens) + + # Calc CER using CTC + cer_ctc = None + if not self.training and self.error_calculator is not None: + ys_hat = self.ctc.argmax(encoder_out).data + cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True) + return loss_ctc, cer_ctc + + def _calc_rich_ce_loss( + self, + encoder_out: torch.Tensor, + ys_pad: torch.Tensor, + ): + decoder_out = self.ctc.ctc_lo(encoder_out) + # 2. 
Compute attention loss + loss_rich = self.criterion_att(decoder_out, ys_pad.contiguous()) + acc_rich = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_pad.contiguous(), + ignore_label=self.ignore_id, + ) + + return loss_rich, acc_rich + + + def inference( + self, + data_in, + data_lengths=None, + key: list = ["wav_file_tmp_name"], + tokenizer=None, + frontend=None, + **kwargs, + ): + + + meta_data = {} + if ( + isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank" + ): # fbank + speech, speech_lengths = data_in, data_lengths + if len(speech.shape) < 3: + speech = speech[None, :, :] + if speech_lengths is None: + speech_lengths = speech.shape[1] + else: + # extract fbank feats + time1 = time.perf_counter() + audio_sample_list = load_audio_text_image_video( + data_in, + fs=frontend.fs, + audio_fs=kwargs.get("fs", 16000), + data_type=kwargs.get("data_type", "sound"), + tokenizer=tokenizer, + ) + time2 = time.perf_counter() + meta_data["load_data"] = f"{time2 - time1:0.3f}" + speech, speech_lengths = extract_fbank( + audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend + ) + time3 = time.perf_counter() + meta_data["extract_feat"] = f"{time3 - time2:0.3f}" + meta_data["batch_data_time"] = ( + speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 + ) + + speech = speech.to(device=kwargs["device"]) + speech_lengths = speech_lengths.to(device=kwargs["device"]) + + language = kwargs.get("language", "auto") + language_query = self.embed( + torch.LongTensor( + [[self.lid_dict[language] if language in self.lid_dict else 0]] + ).to(speech.device) + ).repeat(speech.size(0), 1, 1) + + use_itn = kwargs.get("use_itn", False) + textnorm = kwargs.get("text_norm", None) + if textnorm is None: + textnorm = "withitn" if use_itn else "woitn" + textnorm_query = self.embed( + torch.LongTensor([[self.textnorm_dict[textnorm]]]).to(speech.device) + ).repeat(speech.size(0), 1, 1) + speech = torch.cat((textnorm_query, speech), dim=1) + speech_lengths += 1 + + event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat( + speech.size(0), 1, 1 + ) + input_query = torch.cat((language_query, event_emo_query), dim=1) + speech = torch.cat((input_query, speech), dim=1) + speech_lengths += 3 + + # Encoder + encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths) + if isinstance(encoder_out, tuple): + encoder_out = encoder_out[0] + + # c. 
Passed the encoder result and the beam search + ctc_logits = self.ctc.log_softmax(encoder_out) + if kwargs.get("ban_emo_unk", False): + ctc_logits[:, :, self.emo_dict["unk"]] = -float("inf") + + results = [] + b, n, d = encoder_out.size() + if isinstance(key[0], (list, tuple)): + key = key[0] + if len(key) < b: + key = key * b + for i in range(b): + x = ctc_logits[i, : encoder_out_lens[i].item(), :] + yseq = x.argmax(dim=-1) + yseq = torch.unique_consecutive(yseq, dim=-1) + + ibest_writer = None + if kwargs.get("output_dir") is not None: + if not hasattr(self, "writer"): + self.writer = DatadirWriter(kwargs.get("output_dir")) + ibest_writer = self.writer[f"1best_recog"] + + mask = yseq != self.blank_id + token_int = yseq[mask].tolist() + + # Change integer-ids to tokens + text = tokenizer.decode(token_int) + + result_i = {"key": key[i], "text": text} + results.append(result_i) + + if ibest_writer is not None: + ibest_writer["text"][key[i]] = text + + return results, meta_data + + def export(self, **kwargs): + from export_meta import export_rebuild_model + + if "max_seq_len" not in kwargs: + kwargs["max_seq_len"] = 512 + models = export_rebuild_model(model=self, **kwargs) + return models diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e16fbcc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +modelscope +huggingface +huggingface_hub +funasr>=1.1.3 +numpy<=1.26.4 +gradio +pyaudio +addict +datasets +simplejson +sortedcontainers \ No newline at end of file diff --git a/speak_log/log.txt b/speak_log/log.txt new file mode 100644 index 0000000..a0160dc --- /dev/null +++ b/speak_log/log.txt @@ -0,0 +1 @@ +窗外的麻雀在电线杆上多嘴
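
A note on the tail end of `SenseVoiceSmall.inference` in model.py: the output text comes from a greedy CTC decode — take the argmax symbol per frame, merge consecutive repeats with `torch.unique_consecutive`, drop the blank id, then hand the remaining token ids to `tokenizer.decode`. The standalone sketch below walks through just that collapse step; the vocabulary and logits are invented for illustration and are not from the real model, which uses its own tokenizer.

```python
import torch

blank_id = 0  # CTC blank, same role as SenseVoiceSmall.blank_id
vocab = {1: "he", 2: "llo", 3: " world"}  # made-up toy vocabulary

# Made-up per-frame logits: 7 frames over 4 symbols (blank + 3 tokens).
ctc_logits = torch.tensor([
    [9.0, 0.1, 0.1, 0.1],  # blank
    [0.1, 9.0, 0.1, 0.1],  # "he"
    [0.1, 9.0, 0.1, 0.1],  # "he" again (repeated frame, will be merged)
    [0.1, 0.1, 9.0, 0.1],  # "llo"
    [9.0, 0.1, 0.1, 0.1],  # blank
    [0.1, 0.1, 0.1, 9.0],  # " world"
    [0.1, 0.1, 0.1, 9.0],  # " world" again (repeated frame, will be merged)
])

yseq = ctc_logits.argmax(dim=-1)               # best symbol per frame
yseq = torch.unique_consecutive(yseq, dim=-1)  # collapse consecutive repeats
token_int = yseq[yseq != blank_id].tolist()    # drop CTC blanks
print("".join(vocab[t] for t in token_int))    # -> "hello world"
```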