feat: sync with parent repo (#2)

* local vad model * move model to assets * Remove typo in error message * Fix link in README.md * Added Romanian phoneme-based ASR model (m-bain#791) Co-authored-by: Barabazs <[email protected]> * feat: add new align models (m-bain#922) Co-authored-by: Barabazs <[email protected]> * feat: update Norwegian models (m-bain#687) Updated Norwegian Bokmål and Norwegian Nynorsk models Co-authored-by: Barabazs <[email protected]> * fix: Force ctranslate to version 4.4.0 Force ctranslate to version 4.4.0 due libcudnn_ops_infer.so.8: SYSTRAN/faster-whisper#729 Co-authored-by: Icaro Bombonato <[email protected]> * Update MANIFEST.in to include necessary files * chore: bump version * feat: update faster-whisper to 1.0.2 (m-bain#814) * Update faster-whisper to 1.0.2 to enable model distil-large-v3 * feat: add hotwords option to default_asr_options --------- Co-authored-by: Barabazs <[email protected]> * feat: add support for faster-whisper 1.0.3 (m-bain#875) --------- Co-authored-by: Barabazs <[email protected]> * feat: update versions for pyannote:3.3.2 and faster-whisper:1.1.0 (m-bain#936) * chore: bump faster-whisper to 1.1.0 * chore: bump pyannote to 3.3.2 * feat: add multilingual option in load_model function --------- Co-authored-by: Barabazs <[email protected]> * feat: add verbose output (m-bain#759) --------- Co-authored-by: Abhishek Sharma <[email protected]> Co-authored-by: Barabazs <[email protected]> * feat: add local_files_only option on whisperx.load_model for offline mode (m-bain#867) Adds the parameter local_files_only (default False for consistency) to whisperx.load_model so that the user can avoid downloading the file and return the path to the local cached file if it exists. 
--------- Co-authored-by: Barabazs <[email protected]> * feat: use model_dir as cache_dir for wav2vec2 (m-bain#681) * feat: add Python compatibility testing workflow feat: restrict Python versions to 3.9 - 3.12 * feat: add build and release workflow * chore: clean up MANIFEST.in by removing unnecessary asset inclusions * chore: update gitignore * fix: update README image source and enhance setup.py for long description * docs: update installation instructions in README * chore: update license in setup.py * fix: add UTF-8 encoding when reading README.md * chore: update ctranslate2 version to restrict <4.5.0 * chore: bump whisperX to 3.3.0 * fix: update import statement for conjunctions module * refactor: simplify imports for better type inference * refactor: add type hints * feat: include speaker information in WriteTXT when diarizing * refactor: replace NamedTuple with TranscriptionOptions in FasterWhisperPipeline --------- Co-authored-by: Max Bain <[email protected]> Co-authored-by: Max Bain <[email protected]> Co-authored-by: Alex Zamoshchin <[email protected]> Co-authored-by: Jim O’Regan <[email protected]> Co-authored-by: Ruhollah Majdoddin <[email protected]> Co-authored-by: Barabazs <[email protected]> Co-authored-by: Ismael Ruiz Ranz <[email protected]> Co-authored-by: pere <[email protected]> Co-authored-by: Icaro Bombonato <[email protected]> Co-authored-by: Frost Ming <[email protected]> Co-authored-by: moritzbrantner <[email protected]> Co-authored-by: Hasan Naseer <[email protected]> Co-authored-by: Abhishek Sharma <[email protected]> Co-authored-by: Abhishek Sharma <[email protected]> Co-authored-by: Roque Giordano <[email protected]> Co-authored-by: bnitsan <[email protected]> Co-authored-by: Philippe Anel <[email protected]>
opus-pro · Jan 6, 2025 · fbe8e8e · fbe8e8e
1 parent 6e405a8
commit fbe8e8e
Show file tree

Hide file tree

Showing 11 changed files with 178 additions and 82 deletions.
diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@
 </p>
 
 
-<img width="1216" align="center" alt="whisperx-arch" src="figures/pipeline.png">
+<img width="1216" align="center" alt="whisperx-arch" src="https://raw.githubusercontent.com/m-bain/whisperX/refs/heads/main/figures/pipeline.png">
 
 
 <!-- <p align="left">Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy + quality via forced phoneme alignment and voice-activity based batching for fast inference.</p> -->
@@ -80,21 +80,40 @@ GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be inst
 
 See other methods [here.](https://pytorch.org/get-started/previous-versions/#v200)
 
-### 3. Install this repo
+### 3. Install WhisperX
 
-`pip install git+https://github.com/m-bain/whisperx.git`
+You have several installation options:
 
-If already installed, update package to most recent commit
+#### Option A: Stable Release (recommended)
+Install the latest stable version from PyPI:
 
-`pip install git+https://github.com/m-bain/whisperx.git --upgrade`
+```bash
+pip install whisperx
+```
+
+#### Option B: Development Version
+Install the latest development version directly from GitHub (may be unstable):
 
-If wishing to modify this package, clone and install in editable mode:
+```bash
+pip install git+https://github.com/m-bain/whisperx.git
 ```
-$ git clone https://github.com/m-bain/whisperX.git
-$ cd whisperX
-$ pip install -e .
+
+If already installed, update to the most recent commit:
+
+```bash
+pip install git+https://github.com/m-bain/whisperx.git --upgrade
 ```
 
+#### Option C: Development Mode
+If you wish to modify the package, clone and install in editable mode:
+```bash
+git clone https://github.com/m-bain/whisperX.git
+cd whisperX
+pip install -e .
+```
+
+> **Note**: The development version may contain experimental features and bugs. Use the stable PyPI release for production environments.
+
 You may also need to install ffmpeg, Rust, etc. Follow the OpenAI instructions here: https://github.com/openai/whisper#setup.
 
 ### Speaker Diarization

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 torch>=2
 torchaudio>=2
 faster-whisper==1.1.0
-ctranslate2==4.4.0
+ctranslate2<4.5.0
 transformers
 pandas
 setuptools>=65

diff --git a/setup.py b/setup.py
@@ -1,19 +1,22 @@
 import os
-import platform
 
 import pkg_resources
 from setuptools import find_packages, setup
 
+with open("README.md", "r", encoding="utf-8") as f:
+    long_description = f.read()
+
 setup(
     name="whisperx",
     py_modules=["whisperx"],
-    version="3.2.0",
+    version="3.3.0",
     description="Time-Accurate Automatic Speech Recognition using Whisper.",
-    readme="README.md",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
     python_requires=">=3.9, <3.13",
     author="Max Bain",
     url="https://github.com/m-bain/whisperx",
-    license="MIT",
+    license="BSD-2-Clause",
     packages=find_packages(exclude=["tests*"]),
     install_requires=[
         str(r)

diff --git a/whisperx/SubtitlesProcessor.py b/whisperx/SubtitlesProcessor.py
@@ -1,5 +1,5 @@
 import math
-from conjunctions import get_conjunctions, get_comma
+from .conjunctions import get_conjunctions, get_comma
 from typing import TextIO
 
 def normal_round(n):

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
@@ -3,7 +3,7 @@
 C. Max Bain
 """
 from dataclasses import dataclass
-from typing import Iterable, Union, List
+from typing import Iterable, Optional, Union, List
 
 import numpy as np
 import pandas as pd
@@ -65,7 +65,7 @@
 }
 
 
-def load_align_model(language_code, device, model_name=None, model_dir=None):
+def load_align_model(language_code: str, device: str, model_name: Optional[str] = None, model_dir=None):
     if model_name is None:
         # use default model
         if language_code in DEFAULT_ALIGN_MODELS_TORCH:

diff --git a/whisperx/asr.py b/whisperx/asr.py
@@ -1,17 +1,20 @@
 import os
 import warnings
-from typing import List, Union, Optional, NamedTuple
+from typing import List, NamedTuple, Optional, Union
 
 import ctranslate2
 import faster_whisper
 import numpy as np
 import torch
+from faster_whisper.tokenizer import Tokenizer
+from faster_whisper.transcribe import TranscriptionOptions, get_ctranslate2_storage
 from transformers import Pipeline
 from transformers.pipelines.pt_utils import PipelineIterator
 
 from .audio import N_SAMPLES, SAMPLE_RATE, load_audio, log_mel_spectrogram
-from .vad import load_vad_model, merge_chunks
-from .types import TranscriptionResult, SingleSegment
+from .types import SingleSegment, TranscriptionResult
+from .vad import VoiceActivitySegmentation, load_vad_model, merge_chunks
+
 
 def find_numeral_symbol_tokens(tokenizer):
     numeral_symbol_tokens = []
@@ -28,7 +31,13 @@ class WhisperModel(faster_whisper.WhisperModel):
     Currently only works in non-timestamp mode and fixed prompt for all samples in batch.
     '''
 
-    def generate_segment_batched(self, features: np.ndarray, tokenizer: faster_whisper.tokenizer.Tokenizer, options: faster_whisper.transcribe.TranscriptionOptions, encoder_output = None):
+    def generate_segment_batched(
+        self,
+        features: np.ndarray,
+        tokenizer: Tokenizer,
+        options: TranscriptionOptions,
+        encoder_output=None,
+    ):
         batch_size = features.shape[0]
         all_tokens = []
         prompt_reset_since = 0
@@ -81,7 +90,7 @@ def encode(self, features: np.ndarray) -> ctranslate2.StorageView:
         # unsqueeze if batch size = 1
         if len(features.shape) == 2:
             features = np.expand_dims(features, 0)
-        features = faster_whisper.transcribe.get_ctranslate2_storage(features)
+        features = get_ctranslate2_storage(features)
 
         return self.model.encode(features, to_cpu=to_cpu)
 
@@ -94,17 +103,17 @@ class FasterWhisperPipeline(Pipeline):
     # - add support for custom inference kwargs
 
     def __init__(
-            self,
-            model,
-            vad,
-            vad_params: dict,
-            options : NamedTuple,
-            tokenizer=None,
-            device: Union[int, str, "torch.device"] = -1,
-            framework = "pt",
-            language : Optional[str] = None,
-            suppress_numerals: bool = False,
-            **kwargs
+        self,
+        model: WhisperModel,
+        vad: VoiceActivitySegmentation,
+        vad_params: dict,
+        options: TranscriptionOptions,
+        tokenizer: Optional[Tokenizer] = None,
+        device: Union[int, str, "torch.device"] = -1,
+        framework="pt",
+        language: Optional[str] = None,
+        suppress_numerals: bool = False,
+        **kwargs,
     ):
         self.model = model
         self.tokenizer = tokenizer
@@ -156,7 +165,13 @@ def postprocess(self, model_outputs):
         return model_outputs
 
     def get_iterator(
-        self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
+        self,
+        inputs,
+        num_workers: int,
+        batch_size: int,
+        preprocess_params: dict,
+        forward_params: dict,
+        postprocess_params: dict,
     ):
         dataset = PipelineIterator(inputs, self.preprocess, preprocess_params)
         if "TOKENIZERS_PARALLELISM" not in os.environ:
@@ -171,7 +186,16 @@ def stack(items):
         return final_iterator
 
     def transcribe(
-        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, chunk_size=30, print_progress = False, combined_progress=False, verbose=False
+        self,
+        audio: Union[str, np.ndarray],
+        batch_size: Optional[int] = None,
+        num_workers=0,
+        language: Optional[str] = None,
+        task: Optional[str] = None,
+        chunk_size=30,
+        print_progress=False,
+        combined_progress=False,
+        verbose=False,
     ) -> TranscriptionResult:
         if isinstance(audio, str):
             audio = load_audio(audio)
@@ -193,17 +217,23 @@ def data(audio, segments):
         if self.tokenizer is None:
             language = language or self.detect_language(audio)
             task = task or "transcribe"
-            self.tokenizer = faster_whisper.tokenizer.Tokenizer(self.model.hf_tokenizer,
-                                                                self.model.model.is_multilingual, task=task,
-                                                                language=language)
+            self.tokenizer = Tokenizer(
+                self.model.hf_tokenizer,
+                self.model.model.is_multilingual,
+                task=task,
+                language=language,
+            )
         else:
             language = language or self.tokenizer.language_code
             task = task or self.tokenizer.task
             if task != self.tokenizer.task or language != self.tokenizer.language_code:
-                self.tokenizer = faster_whisper.tokenizer.Tokenizer(self.model.hf_tokenizer,
-                                                                    self.model.model.is_multilingual, task=task,
-                                                                    language=language)
-
+                self.tokenizer = Tokenizer(
+                    self.model.hf_tokenizer,
+                    self.model.model.is_multilingual,
+                    task=task,
+                    language=language,
+                )
+
         if self.suppress_numerals:
             previous_suppress_tokens = self.options.suppress_tokens
             numeral_symbol_tokens = find_numeral_symbol_tokens(self.tokenizer)
@@ -243,8 +273,7 @@ def data(audio, segments):
 
         return {"segments": segments, "language": language}
 
-
-    def detect_language(self, audio: np.ndarray):
+    def detect_language(self, audio: np.ndarray) -> str:
         if audio.shape[0] < N_SAMPLES:
             print("Warning: audio is shorter than 30s, language detection may be inaccurate.")
         model_n_mels = self.model.feat_kwargs.get("feature_size")
@@ -258,33 +287,36 @@ def detect_language(self, audio: np.ndarray):
         print(f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio...")
         return language
 
-def load_model(whisper_arch,
-               device,
-               device_index=0,
-               compute_type="float16",
-               asr_options=None,
-               language : Optional[str] = None,
-               vad_model=None,
-               vad_options=None,
-               model : Optional[WhisperModel] = None,
-               task="transcribe",
-               download_root=None,
-               local_files_only=False,
-               threads=4):
-    '''Load a Whisper model for inference.
+
+def load_model(
+    whisper_arch: str,
+    device: str,
+    device_index=0,
+    compute_type="float16",
+    asr_options: Optional[dict] = None,
+    language: Optional[str] = None,
+    vad_model: Optional[VoiceActivitySegmentation] = None,
+    vad_options: Optional[dict] = None,
+    model: Optional[WhisperModel] = None,
+    task="transcribe",
+    download_root: Optional[str] = None,
+    local_files_only=False,
+    threads=4,
+) -> FasterWhisperPipeline:
+    """Load a Whisper model for inference.
     Args:
-        whisper_arch: str - The name of the Whisper model to load.
-        device: str - The device to load the model on.
-        compute_type: str - The compute type to use for the model.
-        options: dict - A dictionary of options to use for the model.
-        language: str - The language of the model. (use English for now)
-        model: Optional[WhisperModel] - The WhisperModel instance to use.
-        download_root: Optional[str] - The root directory to download the model to.
-        local_files_only: bool - If `True`, avoid downloading the file and return the path to the local cached file if it exists.
-        threads: int - The number of cpu threads to use per worker, e.g. will be multiplied by num workers.
+        whisper_arch - The name of the Whisper model to load.
+        device - The device to load the model on.
+        compute_type - The compute type to use for the model.
+        asr_options - A dictionary of options to use for the model.
+        language - The language of the model. (use English for now)
+        model - The WhisperModel instance to use.
+        download_root - The root directory to download the model to.
+        local_files_only - If `True`, avoid downloading the file and return the path to the local cached file if it exists.
+        threads - The number of CPU threads to use per worker, i.e. the total is multiplied by the number of workers.
     Returns:
         A Whisper pipeline.
-    '''
+    """
 
     if whisper_arch.endswith(".en"):
         language = "en"
@@ -297,7 +329,7 @@ def load_model(whisper_arch,
                          local_files_only=local_files_only,
                          cpu_threads=threads)
     if language is not None:
-        tokenizer = faster_whisper.tokenizer.Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language)
+        tokenizer = Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language)
     else:
         print("No language specified, language will be first be detected for each audio file (increases inference time).")
         tokenizer = None
@@ -338,7 +370,7 @@ def load_model(whisper_arch,
     suppress_numerals = default_asr_options["suppress_numerals"]
     del default_asr_options["suppress_numerals"]
 
-    default_asr_options = faster_whisper.transcribe.TranscriptionOptions(**default_asr_options)
+    default_asr_options = TranscriptionOptions(**default_asr_options)
 
     default_vad_options = {
         "vad_onset": 0.500,

diff --git a/whisperx/audio.py b/whisperx/audio.py
@@ -22,7 +22,7 @@
 TOKENS_PER_SECOND = exact_div(SAMPLE_RATE, N_SAMPLES_PER_TOKEN)  # 20ms per audio token
 
 
-def load_audio(file: str, sr: int = SAMPLE_RATE):
+def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
     """
     Open an audio file and read as mono waveform, resampling as necessary
 

diff --git a/whisperx/conjunctions.py b/whisperx/conjunctions.py
@@ -1,5 +1,8 @@
 # conjunctions.py
 
+from typing import Set
+
+
 conjunctions_by_language = {
     'en': {'and', 'whether', 'or', 'as', 'but', 'so', 'for', 'nor', 'which', 'yet', 'although', 'since', 'unless', 'when', 'while', 'because', 'if', 'how', 'that', 'than', 'who', 'where', 'what', 'near', 'before', 'after', 'across', 'through', 'until', 'once', 'whereas', 'even', 'both', 'either', 'neither', 'though'},
     'fr': {'et', 'ou', 'mais', 'parce', 'bien', 'pendant', 'quand', 'où', 'comme', 'si', 'que', 'avant', 'après', 'aussitôt', 'jusqu’à', 'à', 'malgré', 'donc', 'tant', 'puisque', 'ni', 'soit', 'bien', 'encore', 'dès', 'lorsque'},
@@ -36,8 +39,9 @@
     'ur': '،'  
 }
 
-def get_conjunctions(lang_code):
+def get_conjunctions(lang_code: str) -> Set[str]:
     return conjunctions_by_language.get(lang_code, set())
 
-def get_comma(lang_code):
-    return commas_by_language.get(lang_code, ',')
+
+def get_comma(lang_code: str) -> str:
+    return commas_by_language.get(lang_code, ",")