
Commit

Merge pull request #124 from lotka/models-support/openai-api-whisper
Added support for the OpenAI Whisper API
abdeladim-s committed Mar 30, 2024
2 parents eb3797e + 50086f5 commit b2f3665
Showing 4 changed files with 163 additions and 1 deletion.
2 changes: 2 additions & 0 deletions README.md
@@ -43,6 +43,8 @@
>- 🗣️ VAD preprocessing, reduces hallucination & batching with no WER degradation.
* [x] :new: [jianfch/stable-ts](https://github.com/jianfch/stable-ts)
* >**Stabilizing Timestamps for Whisper**: This library modifies [Whisper](https://github.com/openai/whisper) to produce more reliable timestamps and extends its functionality.
* [x] [API/openai/whisper](https://platform.openai.com/docs/guides/speech-to-text)
* > OpenAI Whisper via their API
* Web UI
* Fully offline, no third party services
3 changes: 2 additions & 1 deletion requirements.txt
@@ -10,4 +10,5 @@ pywhispercpp==1.1.1
dl_translate==0.3.0
faster_whisper
whisperx @ git+https://github.com/m-bain/whisperx.git
stable-ts
openai
7 changes: 7 additions & 0 deletions src/subsai/configs.py
@@ -15,6 +15,7 @@
from subsai.models.whispercpp_model import WhisperCppModel
from subsai.utils import get_available_devices, available_translation_models
from subsai.models.stable_ts_model import StableTsModel
from subsai.models.whisper_api_model import WhisperAPIModel

AVAILABLE_MODELS = {
'openai/whisper': {
@@ -62,6 +63,12 @@
'url': 'https://github.com/jianfch/stable-ts',
'config_schema': StableTsModel.config_schema,
},
'API/openai/whisper': {
'class': WhisperAPIModel,
'description': 'API for the OpenAI large-v2 Whisper model, requires an API key.',
'url': 'https://platform.openai.com/docs/guides/speech-to-text',
'config_schema': WhisperAPIModel.config_schema,
},
}

BASIC_TOOLS_CONFIGS = {
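With this entry in AVAILABLE_MODELS the new backend becomes selectable by name. A minimal sketch, assuming the project's usual SubsAI().create_model(...) / transcribe(...) interface; the media path and API key below are placeholders:

from subsai import SubsAI

subs_ai = SubsAI()
model = subs_ai.create_model('API/openai/whisper', {'api_key': 'sk-...', 'language': 'en'})
subs = subs_ai.transcribe('media.mp4', model)
subs.save('media.srt')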
152 changes: 152 additions & 0 deletions src/subsai/models/whisper_api_model.py
@@ -0,0 +1,152 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Whisper API Model
See [openai/whisper](https://platform.openai.com/docs/guides/speech-to-text)
"""

import os
import ffmpeg
import tempfile
from subsai.models.abstract_model import AbstractModel
from subsai.utils import _load_config
from openai import OpenAI
from pysubs2 import SSAFile
from pydub import AudioSegment

TMPDIR = tempfile.gettempdir()
OPENAI_API_SIZE_LIMIT_MB = 24

def split_filename(filepath):
path, full_filename = os.path.split(filepath)
filename, ext = os.path.splitext(full_filename)
return path, filename, ext

def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"):
# Construct the output file name in the system temporary directory
path, filename, ext = split_filename(video_file)
output_file = os.path.join(TMPDIR, f"{filename}.{output_ext}")

print('Saving audio to {} with ffmpeg...'.format(output_file))
# Execute the ffmpeg conversion
(
ffmpeg
.input(video_file)
.output(output_file)
.overwrite_output()
.run(quiet=True)
)
return output_file
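
# For example, convert_video_to_audio_ffmpeg('/videos/lecture.mp4') (illustrative path) extracts the audio
# track with ffmpeg and returns something like os.path.join(TMPDIR, 'lecture.mp3'); the output always lands
# in the system temporary directory, regardless of where the input file lives.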

class WhisperAPIModel(AbstractModel):
model_name = 'openai/whisper'
config_schema = {
# load model config
'model_type': {
'type': list,
'description': "OpenAI Whisper API. Currently only supports large-v2, which is exposed as 'whisper-1'. \
There is a 25 MB upload limit, so audio is chunked locally; this may lead to lower performance.",
'options': ['whisper-1'],
'default': 'whisper-1'
},
'api_key': {
'type': str,
'description': "Your OpenAI API key",
'options': None,
'default': os.environ.get('OPENAI_KEY', None)
},
'language': {
'type': str,
'description': "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.",
'options': None,
'default': None
},
'prompt': {
'type': str,
'description': "An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.",
'options': None,
'default': None
},
'temperature': {
'type': float,
'description': "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use log probability to automatically increase the temperature until certain thresholds are hit.",
'options': None,
'default': 0
}
}
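
# A model_config dict for this class might look like the following (hypothetical values); keys that are
# omitted presumably fall back to the schema defaults via _load_config:
# {'model_type': 'whisper-1', 'api_key': 'sk-...', 'language': 'en', 'temperature': 0}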

def __init__(self, model_config):
# config
self.model_type = _load_config('model_type', model_config, self.config_schema)
self.api_key = _load_config('api_key', model_config, self.config_schema)
self.language = _load_config('language', model_config, self.config_schema)
self.prompt = _load_config('prompt', model_config, self.config_schema)
self.temperature = _load_config('temperature', model_config, self.config_schema)

self.client = OpenAI(api_key=self.api_key)

def chunk_audio(self, audio_file_path) -> list:
# Load the audio file
audio = AudioSegment.from_mp3(audio_file_path)

# Desired chunk size (the API upload limit), converted to bits
chunk_size_bits = OPENAI_API_SIZE_LIMIT_MB * 1024 * 1024 * 8
# frame_width is bytes per frame, so the raw PCM bitrate in bits per second is frame_rate * frame_width * 8
bitrate = audio.frame_rate * audio.frame_width * 8
# Duration (ms) of raw audio that fits in one chunk; the exported MP3 chunks will be smaller than this limit
chunk_duration_ms = (chunk_size_bits / bitrate) * 1000

chunks = []

# Split the audio into chunks
current_ms = 0
while current_ms < len(audio):
# Calculate the end of the current chunk
end_ms = current_ms + chunk_duration_ms
# Create a chunk from the current position to the end position
chunk = audio[int(current_ms):int(end_ms)]
# Add the chunk to the list of chunks, together with its start offset in milliseconds
chunks.append((chunk, int(current_ms)))
# Update the current position
current_ms = end_ms

return chunks
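
# Rough sanity check of the arithmetic above, assuming 44.1 kHz stereo 16-bit audio (frame_width = 4):
# bitrate = 44100 * 4 * 8 = 1,411,200 bits/s, so chunk_duration_ms ≈ (24 * 1024 * 1024 * 8) / 1,411,200 * 1000
# ≈ 142,000 ms, i.e. roughly 2.4 minutes of audio per chunk before MP3 compression shrinks it further.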


def transcribe(self, media_file) -> SSAFile:

audio_file_path = convert_video_to_audio_ffmpeg(media_file)

chunks = self.chunk_audio(audio_file_path)

results = ''

for i, (chunk, offset) in enumerate(chunks):
chunk_path = os.path.join(TMPDIR, f'chunk_{i}.mp3')
print('Transcribing audio chunk {}/{}'.format(i + 1, len(chunks)))
chunk.export(chunk_path, format='mp3')
audio_file = open(chunk_path, "rb")

# Use OpenAI Whisper API
result = self.client.audio.transcriptions.create(
model=self.model_type,
language=self.language,
prompt=self.prompt,
temperature=self.temperature,
file=audio_file,
response_format="srt"
)
audio_file.close()

# Save the raw chunk transcription alongside the temporary chunk audio file
with open(chunk_path + '.srt', 'w') as f:
f.write(result)

# shift subtitles by the chunk's offset and append them to the combined result
result = SSAFile.from_string(result)
result.shift(ms=offset)
results += result.to_string('srt')

return SSAFile.from_string(results)
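
Taken together, a minimal standalone sketch of driving this model class directly (the media path and API key are placeholders, and options left out of the config presumably fall back to the schema defaults):

from subsai.models.whisper_api_model import WhisperAPIModel

model = WhisperAPIModel({'api_key': 'sk-...', 'language': 'en'})
subs = model.transcribe('media.mp4')  # returns a pysubs2 SSAFile
subs.save('media.srt')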
