Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create unit tests for speech recognition #36

Merged
merged 1 commit into from
Jul 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,15 @@ Upon initiation, Transcribe will begin transcribing microphone input and speaker

The --api flag will use the whisper api for transcriptions. This significantly enhances transcription speed and accuracy, and it works in most languages (rather than just English without the flag). However, keep in mind, using the Whisper API consumes OpenAI credits than using the local model. This increased cost is attributed to the advanced features and capabilities that the Whisper API provides. Despite the additional expense, the substantial improvements in speed and transcription accuracy may make it a worthwhile for your use case.

### Crating Windows installs
### 🎬 Running Transcribe

Unit Tests

```
python -m unittest discover --verbose .\tests
```

### Creating Windows installs

Install Winrar from https://www.win-rar.com/.

Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ pyinstaller==5.13.0
torch
pyperclip
PyYAML
numpy
soundfile
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# placeholder file to make this folder a module - this allows tests in this folder to be discovered by `python -m unittest discover`
Binary file added tests/audio-mono-16-bit-44100Hz.aiff
Binary file not shown.
Binary file added tests/audio-mono-16-bit-44100Hz.flac
Binary file not shown.
Binary file added tests/audio-mono-24-bit-44100Hz.flac
Binary file not shown.
Binary file added tests/audio-stereo-16-bit-44100Hz.aiff
Binary file not shown.
Binary file added tests/audio-stereo-16-bit-44100Hz.flac
Binary file not shown.
Binary file added tests/audio-stereo-24-bit-44100Hz.flac
Binary file not shown.
Binary file added tests/chinese.flac
Binary file not shown.
Binary file added tests/french.aiff
Binary file not shown.
146 changes: 146 additions & 0 deletions tests/test_audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
#!/usr/bin/env python3

import unittest
from os import path

import custom_speech_recognition as sr


class TestAudioFile(unittest.TestCase):
def assertSimilar(self, bytes_1, bytes_2):
for i, (byte_1, byte_2) in enumerate(zip(bytes_1, bytes_2)):
if abs(byte_1 - byte_2) > 2:
raise AssertionError("{} is really different from {} at index {}".format(bytes_1, bytes_2, i))

def test_get_segment(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-32-bit-44100Hz.wav")) as source: audio = r.record(source)
self.assertEqual(audio.get_raw_data(), audio.get_segment().get_raw_data())
self.assertEqual(audio.get_raw_data()[8:], audio.get_segment(0.022675738 * 2).get_raw_data())
self.assertEqual(audio.get_raw_data()[:16], audio.get_segment(None, 0.022675738 * 4).get_raw_data())
self.assertEqual(audio.get_raw_data()[8:16], audio.get_segment(0.022675738 * 2, 0.022675738 * 4).get_raw_data())

def test_wav_mono_8_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-8-bit-44100Hz.wav")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 1)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\xff\x00\xff\x00\xff\xff\x00\xff\x00\xff\x00\xff\x00\x00\xff\x00\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\xff")

def test_wav_mono_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.wav")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xff\xff\x01\x00\xff\xff\x00\x00\x01\x00\xfe\xff\x01\x00\xfe\xff\x04\x00\xfc\xff\x04\x00\xfe\xff\xff\xff\x03\x00\xfe\xff")

def test_wav_mono_24_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-24-bit-44100Hz.wav")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
if audio.sample_width == 3:
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\xff\xff\x00\x01\x00\x00\xff\xff\x00\x00\x00\x00\x01\x00\x00\xfe\xff\x00\x01\x00\x00\xfe\xff\x00\x04\x00\x00\xfb")
else:
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x01\x00\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\xfe\xff\x00\x00\x01\x00")

def test_wav_mono_32_bit(self):
r = sr.Recognizer()
audio_file_path = path.join(path.dirname(path.realpath(__file__)), "audio-mono-32-bit-44100Hz.wav")
with sr.AudioFile(audio_file_path) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 4)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x01\x00\x00\x00\xff\xff\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\xfe\xff\x00\x00\x01\x00")

def test_wav_stereo_8_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-8-bit-44100Hz.wav")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 1)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\xff\x00\xff\x00\x00\xff\x7f\x7f\x00\xff\x00\xff\x00\x00\xff\x00\x7f\x7f\x7f\x00\x00\xff\x00\xff\x00\xff\x00\x7f\x7f\x7f\x7f")

def test_wav_stereo_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.wav")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x02\x00\xfb\xff\x04\x00\xfe\xff\xfe\xff\x07\x00\xf6\xff\x07\x00\xf9\xff\t\x00\xf5\xff\x0c\x00\xf8\xff\x02\x00\x04\x00\xfa\xff")

def test_wav_stereo_24_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-24-bit-44100Hz.wav")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
if audio.sample_width == 3:
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\xfe\xff\x00\x02\x00\x00\xfe\xff\x00\x00\x00\x00\x02\x00\x00\xfc\xff\x00\x02\x00\x00\xfc\xff\x00\x08\x00\x00\xf6")
else:
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x02\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\xfc\xff\x00\x00\x02\x00")

def test_wav_stereo_32_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-32-bit-44100Hz.wav")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 4)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x02\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\xfc\xff\x00\x00\x02\x00")

def test_aiff_mono_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.aiff")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\xff\xff\x01\x00\xff\xff\x01\x00\xfe\xff\x02\x00\xfd\xff\x04\x00\xfc\xff\x03\x00\x00\x00\xfe\xff\x03\x00\xfd\xff")

def test_aiff_stereo_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.aiff")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xfe\xff\x02\x00\xfe\xff\xff\xff\x04\x00\xfa\xff\x04\x00\xfa\xff\t\x00\xf6\xff\n\x00\xfa\xff\xff\xff\x08\x00\xf5\xff")

def test_flac_mono_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-16-bit-44100Hz.flac")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\xff\xff\x01\x00\xff\xff\x00\x00\x01\x00\xfe\xff\x02\x00\xfc\xff\x06\x00\xf9\xff\x06\x00\xfe\xff\xfe\xff\x05\x00\xfa\xff")

def test_flac_mono_24_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-mono-24-bit-44100Hz.flac")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
if audio.sample_width == 3:
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\xff\xfe\xff\x02\x01\x00\xfd\xfe\xff\x04\x00\x00\xfc\x00\x00\x04\xfe\xff\xfb\x00\x00\x05\xfe\xff\xfc\x03\x00\x04\xfb")
else:
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\xff\xfe\xff\x00\x02\x01\x00\x00\xfd\xfe\xff\x00\x04\x00\x00\x00\xfc\x00\x00\x00\x04\xfe\xff\x00\xfb\x00\x00")

def test_flac_stereo_16_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-16-bit-44100Hz.flac")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
self.assertEqual(audio.sample_width, 2)
self.assertSimilar(audio.get_raw_data()[:32], b"\xff\xff\xff\xff\x02\x00\xfe\xff\x00\x00\x01\x00\xfd\xff\x01\x00\xff\xff\x04\x00\xfa\xff\x05\x00\xff\xff\xfd\xff\x08\x00\xf6\xff")

def test_flac_stereo_24_bit(self):
r = sr.Recognizer()
with sr.AudioFile(path.join(path.dirname(path.realpath(__file__)), "audio-stereo-24-bit-44100Hz.flac")) as source: audio = r.record(source)
self.assertIsInstance(audio, sr.AudioData)
self.assertEqual(audio.sample_rate, 44100)
if audio.sample_width == 3:
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\xfe\xff\x00\x02\x00\x00\xfe\xff\x00\x00\x00\xff\x01\x00\x02\xfc\xff\xfe\x01\x00\x02\xfc\xff\xfe\x07\x00\x01\xf6")
else:
self.assertSimilar(audio.get_raw_data()[:32], b"\x00\x00\x00\x00\x00\x00\xfe\xff\x00\x00\x02\x00\x00\x00\xfe\xff\x00\x00\x00\x00\x00\xff\x01\x00\x00\x02\xfc\xff\x00\xfe\x01\x00")


if __name__ == "__main__":
unittest.main()
102 changes: 102 additions & 0 deletions tests/test_recognition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import unittest

import custom_speech_recognition as sr


class TestRecognition(unittest.TestCase):
def setUp(self):
self.AUDIO_FILE_EN = os.path.join(os.path.dirname(os.path.realpath(__file__)), "english.wav")
self.AUDIO_FILE_FR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "french.aiff")
self.AUDIO_FILE_ZH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "chinese.flac")
self.WHISPER_CONFIG = {"temperature": 0}

# def test_sphinx_english(self):
# r = sr.Recognizer()
# with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
# self.assertEqual(r.recognize_sphinx(audio), "one two three")

def test_google_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
result = r.recognize_google(audio)
self.assertIn(result, ["1 2"])

def test_google_french(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
self.assertEqual(r.recognize_google(audio, language="fr-FR"), u"et c'est la dictée numéro 1")

def test_google_chinese(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_google(audio, language="zh-CN"), u"砸自己的脚")

@unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable")
def test_wit_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_wit(audio, key=os.environ["WIT_AI_KEY"]), "one two three")

@unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable")
def test_bing_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"]), "123.")

@unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable")
def test_bing_french(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"], language="fr-FR"), u"Essaye la dictée numéro un.")

@unittest.skipUnless("BING_KEY" in os.environ, "requires Microsoft Bing Voice Recognition key to be specified in BING_KEY environment variable")
def test_bing_chinese(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_bing(audio, key=os.environ["BING_KEY"], language="zh-CN"), u"砸自己的脚。")

@unittest.skipUnless("HOUNDIFY_CLIENT_ID" in os.environ and "HOUNDIFY_CLIENT_KEY" in os.environ, "requires Houndify client ID and client key to be specified in HOUNDIFY_CLIENT_ID and HOUNDIFY_CLIENT_KEY environment variables")
def test_houndify_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_houndify(audio, client_id=os.environ["HOUNDIFY_CLIENT_ID"], client_key=os.environ["HOUNDIFY_CLIENT_KEY"]), "one two three")

@unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables")
def test_ibm_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"]), "one two three ")

@unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables")
def test_ibm_french(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"], language="fr-FR"), u"si la dictée numéro un ")

@unittest.skipUnless("IBM_USERNAME" in os.environ and "IBM_PASSWORD" in os.environ, "requires IBM Speech to Text username and password to be specified in IBM_USERNAME and IBM_PASSWORD environment variables")
def test_ibm_chinese(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_ibm(audio, username=os.environ["IBM_USERNAME"], password=os.environ["IBM_PASSWORD"], language="zh-CN"), u"砸 自己 的 脚 ")

def test_whisper_english(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)
self.assertEqual(r.recognize_whisper(audio, language="english", **self.WHISPER_CONFIG), " 1, 2, 3.")

def test_whisper_french(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source)
self.assertEqual(r.recognize_whisper(audio, language="french", **self.WHISPER_CONFIG), " et c'est la dictée numéro 1.")

def test_whisper_chinese(self):
r = sr.Recognizer()
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳")

if __name__ == "__main__":
unittest.main()