+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Hugging Face Model
+
+See [automatic-speech-recognition](https://huggingface.co/tasks/automatic-speech-recognition)
+"""
+
+import pysubs2
+from pysubs2 import SSAFile, SSAEvent
+from subsai.models.abstract_model import AbstractModel
+from subsai.utils import _load_config, get_available_devices
+
+from transformers import pipeline
+
+
+devices = get_available_devices()
+
+class HuggingFaceModel(AbstractModel):
+    model_name = 'HuggingFaceModel'
+    config_schema = {
+        # load model config
+        'model_id': {
+            'type': str,
+            'description': 'The model id from the Hugging Face Hub.',
+            'options': None,
+            'default': 'openai/whisper-tiny'
+        },
+        'device': {
+            'type': list,
+            'description': 'PyTorch device',
+            'options': devices,
+            'default': devices[0]
+        },
+        'segment_type': {
+            'type': list,
+            'description': 'Sentence-level or word-level timestamps',
+            'options': ['sentence', 'word'],
+            'default': 'sentence'
+        },
+        'chunk_length_s': {
+            'type': float,
+            'description': 'Length in seconds of each audio chunk for long-form transcription. '
+                           'If `chunk_length_s = 0`, chunking is disabled.',
+            'options': None,
+            'default': 30
+        }
+    }
+
+    def __init__(self, model_config):
+        super(HuggingFaceModel, self).__init__(model_config=model_config,
+                                               model_name=self.model_name)
+        # config
+        self._model_id = _load_config('model_id', model_config, self.config_schema)
+        self._device = _load_config('device', model_config, self.config_schema)
+        self.segment_type = _load_config('segment_type', model_config, self.config_schema)
+        self._chunk_length_s = _load_config('chunk_length_s', model_config, self.config_schema)
+
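+        # build the ASR pipeline; weights for `model_id` are fetched from the Hub
+        # (or read from the local cache) and placed on the configured device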
+        self.model = pipeline(
+            "automatic-speech-recognition",
+            model=self._model_id,
+            device=self._device,
+        )
+
+    def transcribe(self, media_file):
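+        # return_timestamps=True yields sentence/segment-level timestamps,
+        # while 'word' yields word-level timestamps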
+        results = self.model(
+            media_file,
+            chunk_length_s=self._chunk_length_s,
+            return_timestamps=True if self.segment_type == 'sentence' else 'word',
+        )
+        subs = SSAFile()
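+        # each chunk is of the form {'text': ..., 'timestamp': (start_s, end_s)};
+        # pysubs2.make_time converts the second-based timestamps to milliseconds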
+        for chunk in results['chunks']:
+            event = SSAEvent(start=pysubs2.make_time(s=chunk['timestamp'][0]),
+                             end=pysubs2.make_time(s=chunk['timestamp'][1]))
+            event.plaintext = chunk['text']
+            subs.append(event)
+        return subs
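+
+
+# Minimal usage sketch (an illustration, not part of the library's public docs):
+# it assumes a local audio file 'audio.mp3'; the config keys mirror `config_schema`
+# above, and keys omitted from the dict are expected to fall back to their defaults.
+#
+#   model = HuggingFaceModel({'model_id': 'openai/whisper-tiny',
+#                             'device': 'cpu',
+#                             'segment_type': 'sentence'})
+#   subs = model.transcribe('audio.mp3')
+#   subs.save('audio.srt')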