
Commit 087f51f

feat: switch the default acoustic model to m251bn
1 parent f19b207 commit 087f51f

File tree: 7 files changed, +36 / -28 lines


README.md

Lines changed: 1 addition & 1 deletion

@@ -101,7 +101,7 @@ $ python3 client_http.py
 
 Note that after starting the API server, you need to use the client software matching this ASRT project to perform speech recognition. See the Wiki document [Download the ASRT speech recognition client SDK and Demo](https://wiki.ailemon.net/docs/asrt-doc/download).
 
-To train and use a model other than the 251 version, modify the corresponding `import speech_model_zoo` locations in the code.
+To train and use a model other than the 251bn version, modify the corresponding `import speech_model_zoo` locations in the code.
 
 Deploy ASRT directly with docker:
 ```shell
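The README note above says that using an acoustic model other than the new default 251bn version only requires editing the places where `speech_model_zoo` is imported. A minimal sketch of that switch, assuming `SpeechModel251` is still exported by `speech_model_zoo` with the constructor arguments shown in the removed lines of this commit:

```python
# Sketch only: swap the default SpeechModel251BN back to SpeechModel251.
# All names and arguments come from the lines removed in this commit.
from speech_model import ModelSpeech
from speech_model_zoo import SpeechModel251  # instead of SpeechModel251BN
from speech_features import Spectrogram

AUDIO_LENGTH = 1600
AUDIO_FEATURE_LENGTH = 200
CHANNELS = 1
OUTPUT_SIZE = 1428  # 1427 pinyin labels + 1 blank

sm251 = SpeechModel251(
    input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
    output_size=OUTPUT_SIZE)
ms = ModelSpeech(sm251, Spectrogram(), max_label_length=64)
# Each model saves its weights under its own name, so the load path changes too.
ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
```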

README_EN.md

Lines changed: 1 addition & 1 deletion

@@ -97,7 +97,7 @@ To test whether the call to the API service interface succeeds:
 $ python3 client_http.py
 ```
 
-If you want to train and use another model (not Model 251), make changes at the corresponding `import speech_model_zoo` positions in the code files.
+If you want to train and use another model (not Model 251bn), make changes at the corresponding `import speech_model_zoo` positions in the code files.
 
 If any problem arises while running or using the program, please raise it promptly in an issue and I will reply as soon as possible.
 
asrserver.py

Lines changed: 4 additions & 4 deletions

@@ -26,7 +26,7 @@
 import http.server
 import socket
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage
 
@@ -35,13 +35,13 @@
 CHANNELS = 1
 # The default pinyin output size is 1428: 1427 pinyin labels plus 1 blank
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 
 ml = ModelLanguage('model_language')
 ml.LoadModel()
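The same construct-and-load block now appears in asrserver.py, asrserver_http.py, evaluate_speech_model.py, predict_speech_file.py and train_speech_model.py. A sketch of one way to avoid repeating it, using only the classes and arguments already shown in this commit; the helper name `build_default_model` is hypothetical and not part of the project:

```python
# Hypothetical helper, not part of ASRT: builds the default 251bn acoustic model
# and optionally loads its saved weights, mirroring the block repeated in this commit.
from speech_model import ModelSpeech
from speech_model_zoo import SpeechModel251BN
from speech_features import Spectrogram

AUDIO_LENGTH = 1600
AUDIO_FEATURE_LENGTH = 200
CHANNELS = 1
OUTPUT_SIZE = 1428  # 1427 pinyin labels + 1 blank

def build_default_model(load_weights=True):
    sm251bn = SpeechModel251BN(
        input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
        output_size=OUTPUT_SIZE)
    ms = ModelSpeech(sm251bn, Spectrogram(), max_label_length=64)
    if load_weights:
        ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
    return ms
```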

asrserver_http.py

Lines changed: 15 additions & 7 deletions

@@ -23,37 +23,43 @@
 ASRT speech recognition API server program based on the HTTP protocol
 """
 
+import argparse
 import base64
 import json
 from flask import Flask, Response, request
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage
 from utils.ops import decode_wav_bytes
 
 API_STATUS_CODE_OK = 200000 # OK
 API_STATUS_CODE_CLIENT_ERROR = 400000
 API_STATUS_CODE_CLIENT_ERROR_FORMAT = 400001 # request data format error
-API_STATUS_CODE_CLIENT_ERROR_FORMAT = 400002 # request data configuration not supported
+API_STATUS_CODE_CLIENT_ERROR_CONFIG = 400002 # request data configuration not supported
 API_STATUS_CODE_SERVER_ERROR = 500000
 API_STATUS_CODE_SERVER_ERROR_RUNNING = 500001 # error while the server is running
 
+parser = argparse.ArgumentParser(description='ASRT HTTP+Json RESTful API Service')
+parser.add_argument('--listen', default='0.0.0.0', type=str, help='the network to listen')
+parser.add_argument('--port', default='20001', type=str, help='the port to listen')
+args = parser.parse_args()
+
 app = Flask("ASRT API Service")
 
 AUDIO_LENGTH = 1600
 AUDIO_FEATURE_LENGTH = 200
 CHANNELS = 1
 # The default pinyin output size is 1428: 1427 pinyin labels plus 1 blank
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 
 ml = ModelLanguage('model_language')
 ml.LoadModel()

@@ -149,7 +155,7 @@ def recognition_post(level):
         json_data = AsrtApiResponse(API_STATUS_CODE_OK, 'all level')
         json_data.result = result
         buffer = json_data.to_json()
-        print('output:', buffer)
+        print('ASRT Result:', result, 'output:', buffer)
         return Response(buffer, mimetype='application/json')
     else:
         request_data = request.get_json()

@@ -165,6 +171,8 @@ def recognition_post(level):
         # request_data['samples'][-100:])
         json_data = AsrtApiResponse(API_STATUS_CODE_SERVER_ERROR, str(except_general))
         buffer = json_data.to_json()
+        #print("input:", request_data, "\n", "output:", buffer)
+        print("output:", buffer, "error:", except_general)
         return Response(buffer, mimetype='application/json')
 
 
@@ -173,4 +181,4 @@ def recognition_post(level):
     #app.run(host='0.0.0.0', port=20001)
     # for production env
     import waitress
-    waitress.serve(app, host='0.0.0.0', port=20001)
+    waitress.serve(app, host=args.listen, port=args.port)
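With the new argparse flags, the listen address and port are configurable at launch (for example `--listen 0.0.0.0 --port 20001`, which are also the defaults). A sketch of calling the resulting HTTP+JSON API from Python follows; the `/all` endpoint path and the request fields (`samples`, `sample_rate`, `channels`, `byte_width`) are assumptions inferred from the `decode_wav_bytes` import and the client demo referenced in the README, so treat `client_http.py` in the repository as the authoritative request format:

```python
# Hypothetical client sketch for the HTTP+JSON API started by asrserver_http.py.
# Field names, encoding, and the /all endpoint are assumptions; see client_http.py.
import base64
import requests  # third-party HTTP client, assumed to be installed

with open('filename.wav', 'rb') as f:
    wav_bytes = f.read()

payload = {
    'channels': 1,            # assumed: mono, 16 kHz, 16-bit PCM input
    'sample_rate': 16000,
    'byte_width': 2,
    'samples': base64.b64encode(wav_bytes).decode('utf-8'),
}
# --listen/--port default to 0.0.0.0:20001 in this commit.
resp = requests.post('http://127.0.0.1:20001/all', json=payload)
print(resp.json())  # expected to carry a status_code (e.g. 200000) and a result field
```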

evaluate_speech_model.py

Lines changed: 4 additions & 4 deletions

@@ -26,7 +26,7 @@
 import os
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from data_loader import DataLoader
 from speech_features import Spectrogram
 
@@ -37,14 +37,14 @@
 CHANNELS = 1
 # The default pinyin output size is 1428: 1427 pinyin labels plus 1 blank
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
 feat = Spectrogram()
 evalue_data = DataLoader('dev')
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
 
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 ms.evaluate_model(data_loader=evalue_data, data_count=-1,
     out_report=True, show_ratio=True, show_per_step=100)

predict_speech_file.py

Lines changed: 4 additions & 4 deletions

@@ -26,7 +26,7 @@
 import os
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from speech_features import Spectrogram
 from LanguageModel2 import ModelLanguage
 
@@ -37,14 +37,14 @@
 CHANNELS = 1
 # The default pinyin output size is 1428: 1427 pinyin labels plus 1 blank
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
 feat = Spectrogram()
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
 
-ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 res = ms.recognize_speech_from_file('filename.wav')
 print('*[提示] 声学模型语音识别结果:\n', res)
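`recognize_speech_from_file` returns the acoustic model's pinyin output for a single file. A small sketch of running it over a whole directory of WAV files, reusing the `ms` instance built in predict_speech_file.py; the directory name is a placeholder:

```python
# Hypothetical batch variant of predict_speech_file.py: reuse the loaded ModelSpeech
# instance for every .wav file in a directory instead of a single hard-coded path.
import os

wav_dir = 'wavs'  # placeholder directory
for name in sorted(os.listdir(wav_dir)):
    if not name.endswith('.wav'):
        continue
    res = ms.recognize_speech_from_file(os.path.join(wav_dir, name))
    print(name, '->', res)
```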

train_speech_model.py

Lines changed: 7 additions & 7 deletions

@@ -28,9 +28,9 @@
 from tensorflow.keras.optimizers import Adam
 
 from speech_model import ModelSpeech
-from speech_model_zoo import SpeechModel251
+from speech_model_zoo import SpeechModel251BN
 from data_loader import DataLoader
-from speech_features import Spectrogram
+from speech_features import SpecAugment
 
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 
@@ -39,16 +39,16 @@
 CHANNELS = 1
 # The default pinyin output size is 1428: 1427 pinyin labels plus 1 blank
 OUTPUT_SIZE = 1428
-sm251 = SpeechModel251(
+sm251bn = SpeechModel251BN(
     input_shape=(AUDIO_LENGTH, AUDIO_FEATURE_LENGTH, CHANNELS),
     output_size=OUTPUT_SIZE
     )
-feat = Spectrogram()
+feat = SpecAugment()
 train_data = DataLoader('train')
 opt = Adam(lr = 0.0001, beta_1 = 0.9, beta_2 = 0.999, decay = 0.0, epsilon = 10e-8)
-ms = ModelSpeech(sm251, feat, max_label_length=64)
+ms = ModelSpeech(sm251bn, feat, max_label_length=64)
 
-#ms.load_model('save_models/' + sm251.get_model_name() + '.model.h5')
+#ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
 ms.train_model(optimizer=opt, data_loader=train_data,
     epochs=50, save_step=1, batch_size=16, last_epoch=0)
-ms.save_model('save_models/' + sm251.get_model_name())
+ms.save_model('save_models/' + sm251bn.get_model_name())
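The commented-out `load_model` line together with the `last_epoch` argument indicates how training can be resumed from saved weights. A sketch of that variant, reusing the `sm251bn`, `feat`, `opt` and `train_data` objects from train_speech_model.py; the assumption that `last_epoch` is simply the number of epochs already completed is mine, not stated in the commit:

```python
# Hypothetical resume-training variant of train_speech_model.py: load the weights
# saved by an earlier run and continue from the epoch where it stopped.
EPOCHS_ALREADY_DONE = 20  # assumed value for illustration

ms = ModelSpeech(sm251bn, feat, max_label_length=64)
ms.load_model('save_models/' + sm251bn.get_model_name() + '.model.h5')
ms.train_model(optimizer=opt, data_loader=train_data,
               epochs=50, save_step=1, batch_size=16,
               last_epoch=EPOCHS_ALREADY_DONE)
ms.save_model('save_models/' + sm251bn.get_model_name())
```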
