diff --git a/openadapt/alembic/versions/98c8851a5321_add_audio_info.py b/openadapt/alembic/versions/98c8851a5321_add_audio_info.py
new file mode 100644
index 000000000..a3db85869
--- /dev/null
+++ b/openadapt/alembic/versions/98c8851a5321_add_audio_info.py
@@ -0,0 +1,53 @@
+"""add_audio_info
+
+Revision ID: 98c8851a5321
+Revises: d714cc86fce8
+Create Date: 2024-05-29 16:56:25.832333
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+import openadapt
+
+# revision identifiers, used by Alembic.
+revision = "98c8851a5321"
+down_revision = "d714cc86fce8"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "audio_info",  # table backing the AudioInfo model
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column(
+            "timestamp",  # when the audio capture started
+            openadapt.models.ForceFloat(precision=10, scale=2, asdecimal=False),
+            nullable=True,
+        ),
+        sa.Column("flac_data", sa.LargeBinary(), nullable=True),  # FLAC audio bytes
+        sa.Column("transcribed_text", sa.String(), nullable=True),  # full transcript
+        sa.Column(
+            "recording_timestamp",  # timestamp of the parent recording
+            openadapt.models.ForceFloat(precision=10, scale=2, asdecimal=False),
+            nullable=True,
+        ),
+        sa.Column("recording_id", sa.Integer(), nullable=True),  # FK, see below
+        sa.Column("sample_rate", sa.Integer(), nullable=True),  # samples per second
+        sa.Column("words_with_timestamps", sa.Text(), nullable=True),  # JSON text
+        sa.ForeignKeyConstraint(
+            ["recording_id"],
+            ["recording.id"],
+            name=op.f("fk_audio_info_recording_id_recording"),
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_audio_info")),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("audio_info")  # undoes upgrade(); stored audio rows are lost
+    # ### end Alembic commands ###
diff --git a/openadapt/app/dashboard/api/recordings.py b/openadapt/app/dashboard/api/recordings.py
index 79ee5b9e0..2f349650e 100644
--- a/openadapt/app/dashboard/api/recordings.py
+++ b/openadapt/app/dashboard/api/recordings.py
@@ -1,5 +1,7 @@
 """API endpoints for recordings."""
 
+import json
+
 from fastapi import APIRouter, WebSocket
 from loguru import logger
 
@@ -80,6 +82,22 @@ async def get_recording_detail(websocket: WebSocket, recording_id: int) -> None:
                 {"type": "num_events", "value": len(action_events)}
             )
 
+            try:
+                # TODO: change to use recording_id once scrubbing PR is merged
+                audio_info = crud.get_audio_info(session, recording.timestamp)[0]
+                words_with_timestamps = json.loads(audio_info.words_with_timestamps)
+                words_with_timestamps = [
+                    {
+                        "word": word["word"],
+                        "start": word["start"] + action_events[0].timestamp,
+                        "end": word["end"] + action_events[0].timestamp,
+                    }
+                    for word in words_with_timestamps
+                ]
+            except IndexError:
+                words_with_timestamps = []
+            word_index = 0
+
             def convert_to_str(event_dict: dict) -> dict:
                 """Convert the keys to strings."""
                 if "key" in event_dict:
@@ -104,7 +122,18 @@ def convert_to_str(event_dict: dict) -> dict:
                     width, height = 0, 0
                 event_dict["screenshot"] = image
                 event_dict["dimensions"] = {"width": width, "height": height}
-
+                words = []
+                # each word in words_with_timestamps is a dict of word, start, end
+                # we want to add the word to the event_dict if the start is
+                # before the event timestamp
+                while (
+                    word_index < len(words_with_timestamps)
+                    and words_with_timestamps[word_index]["start"]
+                    < event_dict["timestamp"]
+                ):
+                    words.append(words_with_timestamps[word_index]["word"])
+                    word_index += 1
+                event_dict["words"] = words
                 convert_to_str(event_dict)
                 await websocket.send_json({"type": "action_event", "value": event_dict})
 
diff --git a/openadapt/app/dashboard/components/ActionEvent/ActionEvent.tsx b/openadapt/app/dashboard/components/ActionEvent/ActionEvent.tsx
index 1b54a5273..dd05f6be2 100644
--- a/openadapt/app/dashboard/components/ActionEvent/ActionEvent.tsx
+++ b/openadapt/app/dashboard/components/ActionEvent/ActionEvent.tsx
@@ -122,6 +122,12 @@ export const ActionEvent = ({
                                 <TableCellWithBorder>{event.parent_id}</TableCellWithBorder>
                             </TableRowWithBorder>
                         )}
+                        {event.words && event.words.length > 0 && (
+                            <TableRowWithBorder>
+                                <TableCellWithBorder>transcription</TableCellWithBorder>
+                                <TableCellWithBorder>{event.words.join(' ')}</TableCellWithBorder>
+                            </TableRowWithBorder>
+                        )}
                         <TableRowWithBorder>
                             <TableCellWithBorder>children</TableCellWithBorder>
                             <TableCellWithBorder>{event.children?.length || 0}</TableCellWithBorder>
diff --git a/openadapt/app/dashboard/types/action-event.ts b/openadapt/app/dashboard/types/action-event.ts
index 86b358189..c7a8faef6 100644
--- a/openadapt/app/dashboard/types/action-event.ts
+++ b/openadapt/app/dashboard/types/action-event.ts
@@ -26,4 +26,5 @@ export type ActionEvent = {
     mask: string | null;
     dimensions?: { width: number, height: number };
     children?: ActionEvent[];
+    words?: string[];
 }
diff --git a/openadapt/app/tray.py b/openadapt/app/tray.py
index 3b235a2c1..7e803a446 100644
--- a/openadapt/app/tray.py
+++ b/openadapt/app/tray.py
@@ -76,10 +76,6 @@ def __init__(self) -> None:
 
         self.app.setQuitOnLastWindowClosed(False)
 
-        # since the lock is a file, delete it when starting the app so that
-        # new instances can start even if the previous one crashed
-        crud.release_db_lock(raise_exception=False)
-
         # currently required for pyqttoast
         # TODO: remove once https://github.com/niklashenning/pyqt-toast/issues/9
         # is addressed
diff --git a/openadapt/config.defaults.json b/openadapt/config.defaults.json
index 1e06afee0..742b02fec 100644
--- a/openadapt/config.defaults.json
+++ b/openadapt/config.defaults.json
@@ -19,6 +19,7 @@
     "RECORD_READ_ACTIVE_ELEMENT_STATE": false,
     "REPLAY_STRIP_ELEMENT_STATE": true,
     "RECORD_VIDEO": true,
+    "RECORD_AUDIO": true,
     "RECORD_FULL_VIDEO": false,
     "RECORD_IMAGES": false,
     "LOG_MEMORY": false,
diff --git a/openadapt/config.py b/openadapt/config.py
index a0cdc3813..c739f1f7c 100644
--- a/openadapt/config.py
+++ b/openadapt/config.py
@@ -29,6 +29,7 @@
 PERFORMANCE_PLOTS_DIR_PATH = (DATA_DIR_PATH / "performance").absolute()
 CAPTURE_DIR_PATH = (DATA_DIR_PATH / "captures").absolute()
 VIDEO_DIR_PATH = DATA_DIR_PATH / "videos"
+DATABASE_LOCK_FILE_PATH = DATA_DIR_PATH / "openadapt.db.lock"
 
 STOP_STRS = [
     "oa.stop",
@@ -136,6 +137,7 @@ class SegmentationAdapter(str, Enum):
     RECORD_WINDOW_DATA: bool = False
     RECORD_READ_ACTIVE_ELEMENT_STATE: bool = False
     RECORD_VIDEO: bool
+    RECORD_AUDIO: bool
     # if false, only write video events corresponding to screenshots
     RECORD_FULL_VIDEO: bool
     RECORD_IMAGES: bool
diff --git a/openadapt/db/crud.py b/openadapt/db/crud.py
index 6af5a8ef5..39b97f7bb 100644
--- a/openadapt/db/crud.py
+++ b/openadapt/db/crud.py
@@ -11,13 +11,15 @@
 
 from loguru import logger
 from sqlalchemy.orm import Session as SaSession
+import psutil
 import sqlalchemy as sa
 
 from openadapt import utils
-from openadapt.config import DATA_DIR_PATH, config
+from openadapt.config import DATABASE_LOCK_FILE_PATH, config
 from openadapt.db.db import Session, get_read_only_session_maker
 from openadapt.models import (
     ActionEvent,
+    AudioInfo,
     MemoryStat,
     PerformanceStat,
     Recording,
@@ -618,6 +620,56 @@ def update_video_start_time(
     )
 
 
+def insert_audio_info(
+    session: SaSession,
+    audio_data: bytes,
+    transcribed_text: str,
+    recording: Recording,
+    timestamp: float,
+    sample_rate: int,
+    word_list: list,
+) -> None:
+    """Create an AudioInfo entry in the database.
+
+    Args:
+        session (sa.orm.Session): The database session.
+        audio_data (bytes): The audio data.
+        transcribed_text (str): The transcribed text.
+        recording (Recording): The recording object.
+        timestamp (float): The timestamp of the audio.
+        sample_rate (int): The sample rate of the audio.
+        word_list (list): A list of words with timestamps.
+    """
+    audio_info = AudioInfo(
+        flac_data=audio_data,
+        transcribed_text=transcribed_text,
+        recording_timestamp=recording.timestamp,
+        recording_id=recording.id,
+        timestamp=timestamp,
+        sample_rate=sample_rate,
+        words_with_timestamps=json.dumps(word_list),
+    )
+    session.add(audio_info)
+    session.commit()
+
+
+# TODO: change to use recording_id once scrubbing PR is merged
+def get_audio_info(
+    session: SaSession,
+    recording_timestamp: float,
+) -> list[AudioInfo]:
+    """Get the audio info rows stored for a given recording.
+
+    Args:
+        session (sa.orm.Session): The database session.
+        recording_timestamp (float): Timestamp identifying the recording.
+
+    Returns:
+        list[AudioInfo]: Rows returned by `_get`; may be empty - TODO confirm.
+    """
+    return _get(session, AudioInfo, recording_timestamp)
+
+
 def post_process_events(session: SaSession, recording: Recording) -> None:
     """Post-process events.
 
@@ -764,11 +816,17 @@ def acquire_db_lock(timeout: int = 60) -> bool:
         if timeout > 0 and time.time() - start > timeout:
             logger.error("Failed to acquire database lock.")
             return False
-        if os.path.exists(DATA_DIR_PATH / "database.lock"):
-            logger.info("Database is locked. Waiting...")
-            time.sleep(1)
+        if os.path.exists(DATABASE_LOCK_FILE_PATH):
+            with open(DATABASE_LOCK_FILE_PATH, "r") as lock_file:
+                lock_info = json.load(lock_file)
+            # check if the process is still running
+            if psutil.pid_exists(lock_info["pid"]):
+                logger.info("Database is locked. Waiting...")
+                time.sleep(1)
+            else:
+                release_db_lock(raise_exception=False)
         else:
-            with open(DATA_DIR_PATH / "database.lock", "w") as lock_file:
+            with open(DATABASE_LOCK_FILE_PATH, "w") as lock_file:
                 lock_file.write(json.dumps({"pid": os.getpid(), "time": time.time()}))
                 logger.info("Database lock acquired.")
             break
@@ -778,7 +836,7 @@ def acquire_db_lock(timeout: int = 60) -> bool:
 def release_db_lock(raise_exception: bool = True) -> None:
     """Release the database lock."""
     try:
-        os.remove(DATA_DIR_PATH / "database.lock")
+        os.remove(DATABASE_LOCK_FILE_PATH)
     except Exception as e:
         if raise_exception:
             logger.error("Failed to release database lock.")
diff --git a/openadapt/models.py b/openadapt/models.py
index 9133b041a..ac299ff56 100644
--- a/openadapt/models.py
+++ b/openadapt/models.py
@@ -81,6 +81,7 @@ class Recording(db.Base):
         "ScrubbedRecording",
         back_populates="recording",
     )
+    audio_info = sa.orm.relationship("AudioInfo", back_populates="recording")
 
     _processed_action_events = None
 
@@ -723,6 +724,23 @@ def convert_png_to_binary(self, image: Image.Image) -> bytes:
         return buffer.getvalue()
 
 
+class AudioInfo(db.Base):
+    """Class representing the audio from a recording in the database."""
+
+    __tablename__ = "audio_info"
+
+    id = sa.Column(sa.Integer, primary_key=True)
+    timestamp = sa.Column(ForceFloat)  # when the audio capture started
+    flac_data = sa.Column(sa.LargeBinary)  # FLAC-compressed audio bytes
+    transcribed_text = sa.Column(sa.String)  # full transcription text
+    recording_timestamp = sa.Column(ForceFloat)  # parent recording's timestamp
+    recording_id = sa.Column(sa.ForeignKey("recording.id"))
+    sample_rate = sa.Column(sa.Integer)  # sample rate of the stored audio
+    words_with_timestamps = sa.Column(sa.Text)  # JSON-encoded list of word dicts
+
+    recording = sa.orm.relationship("Recording", back_populates="audio_info")
+
+
 class PerformanceStat(db.Base):
     """Class representing a performance statistic in the database."""
 
diff --git a/openadapt/record.py b/openadapt/record.py
index 485b39ee9..093ec0346 100644
--- a/openadapt/record.py
+++ b/openadapt/record.py
@@ -31,7 +31,11 @@
     from tqdm import tqdm
     import fire
 
+import numpy as np
 import psutil
+import sounddevice
+import soundfile
+import whisper
 
 from openadapt import utils, video, window
 from openadapt.config import config
@@ -988,6 +992,107 @@ def read_mouse_events(
     mouse_listener.stop()
 
 
+def record_audio(
+    recording: Recording,
+    terminate_processing: multiprocessing.Event,
+    started_counter: multiprocessing.Value,
+) -> None:
+    """Record audio narration during the recording and store data in database.
+
+    Args:
+        recording: The recording object.
+        terminate_processing: An event to signal the termination of the process.
+        started_counter: Value to increment once started.
+    """
+    utils.configure_logging(logger, LOG_LEVEL)
+    utils.set_start_time(recording.timestamp)
+
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+    audio_frames = []  # to store audio frames
+
+    def audio_callback(
+        indata: np.ndarray, frames: int, time: Any, status: sounddevice.CallbackFlags
+    ) -> None:
+        """Callback function used when new audio frames are recorded.
+
+        Note: time is of type cffi.FFI.CData, but since we don't use this argument
+        and we also don't use the cffi library, the Any type annotation is used.
+        """
+        # called whenever there is new audio frames
+        audio_frames.append(indata.copy())
+
+    # open InputStream and start recording while ActionEvents are recorded
+    audio_stream = sounddevice.InputStream(
+        callback=audio_callback, samplerate=16000, channels=1
+    )
+    logger.info("Audio recording started.")
+    start_timestamp = utils.get_timestamp()
+    audio_stream.start()
+
+    # NOTE: listener may not have actually started by now
+    # TODO: handle race condition, e.g. by sending synthetic events from main thread
+    with started_counter.get_lock():
+        started_counter.value += 1
+
+    terminate_processing.wait()
+    audio_stream.stop()
+    audio_stream.close()
+
+    # Concatenate into one Numpy array
+    concatenated_audio = np.concatenate(audio_frames, axis=0)
+    # convert concatenated_audio to format expected by whisper
+    converted_audio = concatenated_audio.flatten().astype(np.float32)
+
+    # Convert audio to text using OpenAI's Whisper
+    logger.info("Transcribing audio...")
+    model = whisper.load_model("base")
+    result_info = model.transcribe(converted_audio, word_timestamps=True, fp16=False)
+    logger.info(f"The narrated text is: {result_info['text']}")
+    # empty word_list if the user didn't say anything
+    word_list = []
+    # segments could be empty
+    if len(result_info["segments"]) > 0:
+        # there won't be a 'words' list if the user didn't say anything
+        if "words" in result_info["segments"][0]:
+            word_list = result_info["segments"][0]["words"]
+
+    # compress and convert to bytes to save to database
+    logger.info(
+        "Size of uncompressed audio data: {} bytes".format(converted_audio.nbytes)
+    )
+    # Create an in-memory file-like object
+    file_obj = io.BytesIO()
+    # Write the audio data using lossless compression
+    soundfile.write(
+        file_obj, converted_audio, int(audio_stream.samplerate), format="FLAC"
+    )
+    # Get the compressed audio data as bytes
+    compressed_audio_bytes = file_obj.getvalue()
+
+    logger.info(
+        "Size of compressed audio data: {} bytes".format(len(compressed_audio_bytes))
+    )
+
+    file_obj.close()
+
+    # To decompress the audio and restore it to its original form:
+    # restored_audio, restored_samplerate = sf.read(
+    # io.BytesIO(compressed_audio_bytes))
+
+    with crud.get_new_session(read_and_write=True) as session:
+        # Create AudioInfo entry
+        crud.insert_audio_info(
+            session,
+            compressed_audio_bytes,
+            result_info["text"],
+            recording,
+            start_timestamp,
+            int(audio_stream.samplerate),
+            word_list,
+        )
+
+
 @logger.catch
 @utils.trace(logger)
 def record(
@@ -1159,6 +1264,18 @@ def record(
         )
         video_writer.start()
 
+    if config.RECORD_AUDIO:
+        expected_starts += 1
+        audio_recorder = multiprocessing.Process(
+            target=record_audio,
+            args=(
+                recording,
+                terminate_processing,
+                started_counter,
+            ),
+        )
+        audio_recorder.start()
+
     terminate_perf_event = multiprocessing.Event()
     perf_stat_writer = multiprocessing.Process(
         target=performance_stats_writer,
@@ -1232,6 +1349,8 @@ def record(
     window_event_writer.join()
     if config.RECORD_VIDEO:
         video_writer.join()
+    if config.RECORD_AUDIO:
+        audio_recorder.join()
     terminate_perf_event.set()
 
     if PLOT_PERFORMANCE:
diff --git a/openadapt/strategies/mixins/openai.py b/openadapt/strategies/mixins/openai.py
index 80fd8a13a..a2abd3e5b 100644
--- a/openadapt/strategies/mixins/openai.py
+++ b/openadapt/strategies/mixins/openai.py
@@ -188,50 +188,3 @@ def _get_completion(prompt: str) -> str:
     logger.debug(f"appending assistant_message=\n{pformat(assistant_message)}")
     messages.append(assistant_message)
     return messages
-
-
-# XXX TODO not currently in use
-# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
-def num_tokens_from_messages(messages: list, model: str = "gpt-3.5-turbo-0301") -> int:
-    """Returns the number of tokens used by a list of messages."""
-    try:
-        encoding = tiktoken.encoding_for_model(model)
-    except KeyError:
-        logger.info("Warning: model not found. Using cl100k_base encoding.")
-        encoding = tiktoken.get_encoding("cl100k_base")
-    if model == "gpt-3.5-turbo":
-        logger.info(
-            "Warning: gpt-3.5-turbo may change over time. Returning num tokens "
-            "assuming gpt-3.5-turbo-0301."
-        )
-        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
-    elif model == "gpt-4":
-        logger.info(
-            "Warning: gpt-4 may change over time. Returning num tokens "
-            "assuming gpt-4-0314."
-        )
-        return num_tokens_from_messages(messages, model="gpt-4-0314")
-    elif model == "gpt-3.5-turbo-0301":
-        tokens_per_message = (
-            4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
-        )
-        tokens_per_name = -1  # if there's a name, the role is omitted
-    elif model == "gpt-4-0314":
-        tokens_per_message = 3
-        tokens_per_name = 1
-    else:
-        raise NotImplementedError(
-            f"""num_tokens_from_messages() is not implemented for model "
-            "{model}. See "
-            "https://github.com/openai/openai-python/blob/main/chatml.md for "
-            information on how messages are converted to tokens."""
-        )
-    num_tokens = 0
-    for message in messages:
-        num_tokens += tokens_per_message
-        for key, value in message.items():
-            num_tokens += len(encoding.encode(value))
-            if key == "name":
-                num_tokens += tokens_per_name
-    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
-    return num_tokens
diff --git a/openadapt/visualize.py b/openadapt/visualize.py
index 06b8bb439..2859ee6b9 100644
--- a/openadapt/visualize.py
+++ b/openadapt/visualize.py
@@ -186,6 +186,10 @@ def main(
     logger.info(f"{recording=}")
     logger.info(f"{diff_video=}")
 
+    audio_info = row2dict(crud.get_audio_info(recording))
+    # don't display the FLAC data
+    del audio_info["flac_data"]
+
     if diff_video:
         assert recording.config[
             "RECORD_VIDEO"
diff --git a/poetry.lock b/poetry.lock
index 051b8f7d5..cdbed4072 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2757,6 +2757,39 @@ files = [
 [package.dependencies]
 rapidfuzz = ">=2.3.0,<4.0.0"
 
+[[package]]
+name = "llvmlite"
+version = "0.40.1rc1"
+description = "lightweight wrapper around basic LLVM functionality"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "llvmlite-0.40.1rc1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:31b606ae4923a897fe7122fe9a75fa39713279e796b335b83cb929c5d9e8661b"},
+    {file = "llvmlite-0.40.1rc1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d08de4135dd8652f46de42e795b744dcad8cc11de3b6044a7326a61636887655"},
+    {file = "llvmlite-0.40.1rc1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94043bb283395963b48fa964c776670889084be5117cbc4f831ab357005365d1"},
+    {file = "llvmlite-0.40.1rc1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:659b400cf61e567c5c30159f60eac8562133bf5e497f481388d22e6b5dd00044"},
+    {file = "llvmlite-0.40.1rc1-cp310-cp310-win32.whl", hash = "sha256:84f5c569fdcc503a7ce5018d2115ebac3a385743774ed22c6cc8dade673eae33"},
+    {file = "llvmlite-0.40.1rc1-cp310-cp310-win_amd64.whl", hash = "sha256:a775e87d6ee6f6fcdae5ead0dec171243719002fc39c500c4813babb3609f6d9"},
+    {file = "llvmlite-0.40.1rc1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:239eaeef72267566538b9f4cba8a41fb3e39ac99881c2a9a8100aff60c645edb"},
+    {file = "llvmlite-0.40.1rc1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2b8ceb9c436acdc87c3f5ab2dd6e3d003cf938abf55d3470d059abd99dee63d3"},
+    {file = "llvmlite-0.40.1rc1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0475c5107334cf528c607275e0e1cd7836c31fe07c6e45994cd02dd45a95e3b1"},
+    {file = "llvmlite-0.40.1rc1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3bc15e54522695ef16b5225cb40e89ef7f80d2d37cb0a8ddf3ffe3200fa238ff"},
+    {file = "llvmlite-0.40.1rc1-cp311-cp311-win_amd64.whl", hash = "sha256:957c5f18726362fd2426f39997b9090c88a6a1cb11d4330b50b4946fa0c857a7"},
+    {file = "llvmlite-0.40.1rc1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:38bcca23eb8919279619bebb4db6946f0d3dfedd879dfe9f741041789c83e36b"},
+    {file = "llvmlite-0.40.1rc1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:753359a969f0606c30d3ef38988ae46c65ef2d3bcc7afb4ada0c37a2f4416a68"},
+    {file = "llvmlite-0.40.1rc1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fae6c6b04ec4d83b5bd3437dd4ef7a9e6d4461437e615fa0895ac355709b6f10"},
+    {file = "llvmlite-0.40.1rc1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f095b6c6e94fcb26705614d9da2267c739118f1e97ba6bb3ea5c9fbc77764171"},
+    {file = "llvmlite-0.40.1rc1-cp38-cp38-win32.whl", hash = "sha256:92918a7c60bacebf72297b4caeca2bcf2a6cffb50362e915cc1dc202ac556586"},
+    {file = "llvmlite-0.40.1rc1-cp38-cp38-win_amd64.whl", hash = "sha256:2585ea726f6cd012279ea5a0d84d999e436061dc7df67bdaea1cbae998a16f9f"},
+    {file = "llvmlite-0.40.1rc1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d5ad3dd8a0c600533650e14cc908874c2dbeca5ea749acfc262564f15586dc94"},
+    {file = "llvmlite-0.40.1rc1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2daa1de68d1bc0fd78757bb01a96c434373dca83d28460ff16b1accb1f171aff"},
+    {file = "llvmlite-0.40.1rc1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e9930a222cd98487dd4e8f916c3c92c0311ea294136fc4f3cd0bab6265e28b0"},
+    {file = "llvmlite-0.40.1rc1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86e9060e28796c0b38a73f59802d0f6af31a1bb7c6e3b766cb96237d862fe26c"},
+    {file = "llvmlite-0.40.1rc1-cp39-cp39-win32.whl", hash = "sha256:b047d0e35b61dcbeaa1a86afac696c2dd9ca48430cb63638417e837cc1f0e60a"},
+    {file = "llvmlite-0.40.1rc1-cp39-cp39-win_amd64.whl", hash = "sha256:da0b97219fa1053ab9a964e4703fcfca4ef6077614e7dce21de71bbbe6e4a4e9"},
+    {file = "llvmlite-0.40.1rc1.tar.gz", hash = "sha256:8a6465075a0449fd802c9274130abb4f4ccf926972e84e8eac365769b7ec48fc"},
+]
+
 [[package]]
 name = "loguru"
 version = "0.6.0"
@@ -3180,6 +3213,17 @@ files = [
     {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
 ]
 
+[[package]]
+name = "more-itertools"
+version = "10.2.0"
+description = "More routines for operating on iterables, beyond itertools"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "more-itertools-10.2.0.tar.gz", hash = "sha256:8fccb480c43d3e99a00087634c06dd02b0d50fbf088b380de5a41a015ec239e1"},
+    {file = "more_itertools-10.2.0-py3-none-any.whl", hash = "sha256:686b06abe565edfab151cb8fd385a05651e1fdf8f0a14191e4439283421f8684"},
+]
+
 [[package]]
 name = "moviepy"
 version = "1.0.3"
@@ -3503,38 +3547,78 @@ files = [
 [package.dependencies]
 setuptools = "*"
 
+[[package]]
+name = "numba"
+version = "0.57.0"
+description = "compiling Python code using LLVM"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "numba-0.57.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2e2c14c411545e80bf0f1a33232fb0bd6aa3368f86e56eeffc7f6d3ac16ea3fd"},
+    {file = "numba-0.57.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6b3382c56d805ffcdc7b46eb69a906be733dd35b84be14abba8e5fd27d7916b2"},
+    {file = "numba-0.57.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:133cba9b5002bf67f6f73d9b3050d919c1be91326bbdcccfdf3259bcfb1cec0e"},
+    {file = "numba-0.57.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d92a17ee849574665c5d94e9c9b862e469e1231d3dbb9e58e58b30b4bb0cbce9"},
+    {file = "numba-0.57.0-cp310-cp310-win32.whl", hash = "sha256:abc90c3d303a67ae5194770a6f0d0a83edf076683b8a426349a27b91d98e00d1"},
+    {file = "numba-0.57.0-cp310-cp310-win_amd64.whl", hash = "sha256:430f43c96f866ca4fe6008d8aa28bb608911d908ff94f879e0dbad7768ef9869"},
+    {file = "numba-0.57.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:069f7d8fddad4c0eb1d7534c2a18098fe50473dc77832b409176002e9011b96f"},
+    {file = "numba-0.57.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:79daa130fc9e4ebd1eea0a594d1de86d8a4366989f5fab93c482246b502520db"},
+    {file = "numba-0.57.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:274f4db4814ebd5ec81697acfc36df04a865b86610d7714905185b753f3f9baf"},
+    {file = "numba-0.57.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0106ee441e3f69cc6f17cb470c4fcccd592e0606567d43245635d72b071ab88e"},
+    {file = "numba-0.57.0-cp311-cp311-win_amd64.whl", hash = "sha256:a5d31b4d95000d86ffa9652ab5bcfa0ea30e6c3fc40e610147d4f2f00116703d"},
+    {file = "numba-0.57.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3e0b8de39bf17519435937b53276dfb02e2eb8bc27cd211c8eeb01ffed1cab6b"},
+    {file = "numba-0.57.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:18d90fa6fcd5b796999392a8ea67f2fbccecf8dabcea726e2e721c79f40566a6"},
+    {file = "numba-0.57.0-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d4f62528c7c8c5f97e9689fd788e420b68c67ee0a1a9a7715a57fd584b7aef1e"},
+    {file = "numba-0.57.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fd12cf0b431676c08057685e229ea5daaa1ec8efba2506c38671734ace49c2d7"},
+    {file = "numba-0.57.0-cp38-cp38-win32.whl", hash = "sha256:e5f11b1d435fb4d1d1b68fa68ff456d632dc4bfd40b18825ff80d6081d1afb26"},
+    {file = "numba-0.57.0-cp38-cp38-win_amd64.whl", hash = "sha256:5810ed2d6d22eb3c48bedfac2187fc44bb90e05f02d47fd31059e69207ae4106"},
+    {file = "numba-0.57.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eddba74493d4003a42cd61ff7feca4928a94a45553d1fddac77a5cc339f6f4f9"},
+    {file = "numba-0.57.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:110be5e1213d0a3d5fc691e921a000779500620196d99cee9908fce83d1e48df"},
+    {file = "numba-0.57.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f949018ab9c467d38f14fe17db4df0d4a1c664be802189e2d6c5a434d9ffd4f6"},
+    {file = "numba-0.57.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9fc0cd4ec93a1e3877985e10ed5837ed2991c83aa4b7ca574caae5c8b448cc4b"},
+    {file = "numba-0.57.0-cp39-cp39-win32.whl", hash = "sha256:83d4f21c98eed3001e9896a43a1ce9c825999c03f7eb39ddd1c2d07a76708392"},
+    {file = "numba-0.57.0-cp39-cp39-win_amd64.whl", hash = "sha256:9173d00c6753212b68e4fd319cfa96c21b2263949452c97b034e78ce09539dee"},
+    {file = "numba-0.57.0.tar.gz", hash = "sha256:2af6d81067a5bdc13960c6d2519dbabbf4d5d597cf75d640c5aeaefd48c6420a"},
+]
+
+[package.dependencies]
+llvmlite = "==0.40.*"
+numpy = ">=1.21,<1.25"
+
 [[package]]
 name = "numpy"
-version = "1.25.2"
+version = "1.24.4"
 description = "Fundamental package for array computing in Python"
 optional = false
-python-versions = ">=3.9"
+python-versions = ">=3.8"
 files = [
-    {file = "numpy-1.25.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:db3ccc4e37a6873045580d413fe79b68e47a681af8db2e046f1dacfa11f86eb3"},
-    {file = "numpy-1.25.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:90319e4f002795ccfc9050110bbbaa16c944b1c37c0baeea43c5fb881693ae1f"},
-    {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfe4a913e29b418d096e696ddd422d8a5d13ffba4ea91f9f60440a3b759b0187"},
-    {file = "numpy-1.25.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f08f2e037bba04e707eebf4bc934f1972a315c883a9e0ebfa8a7756eabf9e357"},
-    {file = "numpy-1.25.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bec1e7213c7cb00d67093247f8c4db156fd03075f49876957dca4711306d39c9"},
-    {file = "numpy-1.25.2-cp310-cp310-win32.whl", hash = "sha256:7dc869c0c75988e1c693d0e2d5b26034644399dd929bc049db55395b1379e044"},
-    {file = "numpy-1.25.2-cp310-cp310-win_amd64.whl", hash = "sha256:834b386f2b8210dca38c71a6e0f4fd6922f7d3fcff935dbe3a570945acb1b545"},
-    {file = "numpy-1.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c5462d19336db4560041517dbb7759c21d181a67cb01b36ca109b2ae37d32418"},
-    {file = "numpy-1.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5652ea24d33585ea39eb6a6a15dac87a1206a692719ff45d53c5282e66d4a8f"},
-    {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d60fbae8e0019865fc4784745814cff1c421df5afee233db6d88ab4f14655a2"},
-    {file = "numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60e7f0f7f6d0eee8364b9a6304c2845b9c491ac706048c7e8cf47b83123b8dbf"},
-    {file = "numpy-1.25.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:bb33d5a1cf360304754913a350edda36d5b8c5331a8237268c48f91253c3a364"},
-    {file = "numpy-1.25.2-cp311-cp311-win32.whl", hash = "sha256:5883c06bb92f2e6c8181df7b39971a5fb436288db58b5a1c3967702d4278691d"},
-    {file = "numpy-1.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:5c97325a0ba6f9d041feb9390924614b60b99209a71a69c876f71052521d42a4"},
-    {file = "numpy-1.25.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b79e513d7aac42ae918db3ad1341a015488530d0bb2a6abcbdd10a3a829ccfd3"},
-    {file = "numpy-1.25.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eb942bfb6f84df5ce05dbf4b46673ffed0d3da59f13635ea9b926af3deb76926"},
-    {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0746410e73384e70d286f93abf2520035250aad8c5714240b0492a7302fdca"},
-    {file = "numpy-1.25.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7806500e4f5bdd04095e849265e55de20d8cc4b661b038957354327f6d9b295"},
-    {file = "numpy-1.25.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8b77775f4b7df768967a7c8b3567e309f617dd5e99aeb886fa14dc1a0791141f"},
-    {file = "numpy-1.25.2-cp39-cp39-win32.whl", hash = "sha256:2792d23d62ec51e50ce4d4b7d73de8f67a2fd3ea710dcbc8563a51a03fb07b01"},
-    {file = "numpy-1.25.2-cp39-cp39-win_amd64.whl", hash = "sha256:76b4115d42a7dfc5d485d358728cdd8719be33cc5ec6ec08632a5d6fca2ed380"},
-    {file = "numpy-1.25.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1a1329e26f46230bf77b02cc19e900db9b52f398d6722ca853349a782d4cff55"},
-    {file = "numpy-1.25.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c3abc71e8b6edba80a01a52e66d83c5d14433cbcd26a40c329ec7ed09f37901"},
-    {file = "numpy-1.25.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1b9735c27cea5d995496f46a8b1cd7b408b3f34b6d50459d9ac8fe3a20cc17bf"},
-    {file = "numpy-1.25.2.tar.gz", hash = "sha256:fd608e19c8d7c55021dffd43bfe5492fab8cc105cc8986f813f8c3c048b38760"},
+    {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"},
+    {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"},
+    {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"},
+    {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"},
+    {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"},
+    {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"},
+    {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"},
+    {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"},
+    {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"},
+    {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"},
+    {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"},
+    {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"},
+    {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"},
+    {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"},
+    {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"},
+    {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"},
+    {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"},
+    {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"},
+    {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"},
+    {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"},
+    {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"},
+    {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"},
+    {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"},
+    {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"},
+    {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"},
+    {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"},
+    {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"},
+    {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"},
 ]
 
 [[package]]
@@ -3646,6 +3730,32 @@ dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", "pytest-moc
 embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"]
 wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"]
 
+[[package]]
+name = "openai-whisper"
+version = "20230314"
+description = "Robust Speech Recognition via Large-Scale Weak Supervision"
+optional = false
+python-versions = ">=3.8"
+files = []
+develop = false
+
+[package.dependencies]
+more-itertools = "*"
+numba = "*"
+numpy = "*"
+tiktoken = "0.3.3"
+torch = "*"
+tqdm = "*"
+
+[package.extras]
+dev = ["black", "flake8", "isort", "pytest", "scipy"]
+
+[package.source]
+type = "git"
+url = "https://github.com/openai/whisper.git"
+reference = "HEAD"
+resolved_reference = "ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab"
+
 [[package]]
 name = "opencv-python"
 version = "4.9.0.80"
@@ -6300,6 +6410,49 @@ files = [
     {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"},
 ]
 
+[[package]]
+name = "sounddevice"
+version = "0.4.6"
+description = "Play and Record Sound with Python"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "sounddevice-0.4.6-py3-none-any.whl", hash = "sha256:5de768ba6fe56ad2b5aaa2eea794b76b73e427961c95acad2ee2ed7f866a4b20"},
+    {file = "sounddevice-0.4.6-py3-none-macosx_10_6_x86_64.macosx_10_6_universal2.whl", hash = "sha256:8b0b806c205dd3e3cd5a97262b2482624fd21db7d47083b887090148a08051c8"},
+    {file = "sounddevice-0.4.6-py3-none-win32.whl", hash = "sha256:e3ba6e674ffa8f79a591d744a1d4ab922fe5bdfd4faf8b25069a08e051010b7b"},
+    {file = "sounddevice-0.4.6-py3-none-win_amd64.whl", hash = "sha256:7830d4f8f8570f2e5552942f81d96999c5fcd9a0b682d6fc5d5c5529df23be2c"},
+    {file = "sounddevice-0.4.6.tar.gz", hash = "sha256:3236b78f15f0415bdf006a620cef073d0c0522851d66f4a961ed6d8eb1482fe9"},
+]
+
+[package.dependencies]
+CFFI = ">=1.0"
+
+[package.extras]
+numpy = ["NumPy"]
+
+[[package]]
+name = "soundfile"
+version = "0.12.1"
+description = "An audio library based on libsndfile, CFFI and NumPy"
+optional = false
+python-versions = "*"
+files = [
+    {file = "soundfile-0.12.1-py2.py3-none-any.whl", hash = "sha256:828a79c2e75abab5359f780c81dccd4953c45a2c4cd4f05ba3e233ddf984b882"},
+    {file = "soundfile-0.12.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:d922be1563ce17a69582a352a86f28ed8c9f6a8bc951df63476ffc310c064bfa"},
+    {file = "soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:bceaab5c4febb11ea0554566784bcf4bc2e3977b53946dda2b12804b4fe524a8"},
+    {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2dc3685bed7187c072a46ab4ffddd38cef7de9ae5eb05c03df2ad569cf4dacbc"},
+    {file = "soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:074247b771a181859d2bc1f98b5ebf6d5153d2c397b86ee9e29ba602a8dfe2a6"},
+    {file = "soundfile-0.12.1-py2.py3-none-win32.whl", hash = "sha256:59dfd88c79b48f441bbf6994142a19ab1de3b9bb7c12863402c2bc621e49091a"},
+    {file = "soundfile-0.12.1-py2.py3-none-win_amd64.whl", hash = "sha256:0d86924c00b62552b650ddd28af426e3ff2d4dc2e9047dae5b3d8452e0a49a77"},
+    {file = "soundfile-0.12.1.tar.gz", hash = "sha256:e8e1017b2cf1dda767aef19d2fd9ee5ebe07e050d430f77a0a7c66ba08b8cdae"},
+]
+
+[package.dependencies]
+cffi = ">=1.0"
+
+[package.extras]
+numpy = ["numpy"]
+
 [[package]]
 name = "spacy"
 version = "3.7.4"
@@ -7000,40 +7153,40 @@ all = ["defusedxml", "fsspec", "imagecodecs (>=2023.8.12)", "lxml", "matplotlib"
 
 [[package]]
 name = "tiktoken"
-version = "0.4.0"
+version = "0.3.3"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "tiktoken-0.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:176cad7f053d2cc82ce7e2a7c883ccc6971840a4b5276740d0b732a2b2011f8a"},
-    {file = "tiktoken-0.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:450d504892b3ac80207700266ee87c932df8efea54e05cefe8613edc963c1285"},
-    {file = "tiktoken-0.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00d662de1e7986d129139faf15e6a6ee7665ee103440769b8dedf3e7ba6ac37f"},
-    {file = "tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5727d852ead18b7927b8adf558a6f913a15c7766725b23dbe21d22e243041b28"},
-    {file = "tiktoken-0.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c06cd92b09eb0404cedce3702fa866bf0d00e399439dad3f10288ddc31045422"},
-    {file = "tiktoken-0.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9ec161e40ed44e4210d3b31e2ff426b4a55e8254f1023e5d2595cb60044f8ea6"},
-    {file = "tiktoken-0.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:1e8fa13cf9889d2c928b9e258e9dbbbf88ab02016e4236aae76e3b4f82dd8288"},
-    {file = "tiktoken-0.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bb2341836b725c60d0ab3c84970b9b5f68d4b733a7bcb80fb25967e5addb9920"},
-    {file = "tiktoken-0.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ca30367ad750ee7d42fe80079d3092bd35bb266be7882b79c3bd159b39a17b0"},
-    {file = "tiktoken-0.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3dc3df19ddec79435bb2a94ee46f4b9560d0299c23520803d851008445671197"},
-    {file = "tiktoken-0.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d980fa066e962ef0f4dad0222e63a484c0c993c7a47c7dafda844ca5aded1f3"},
-    {file = "tiktoken-0.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:329f548a821a2f339adc9fbcfd9fc12602e4b3f8598df5593cfc09839e9ae5e4"},
-    {file = "tiktoken-0.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:b1a038cee487931a5caaef0a2e8520e645508cde21717eacc9af3fbda097d8bb"},
-    {file = "tiktoken-0.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:08efa59468dbe23ed038c28893e2a7158d8c211c3dd07f2bbc9a30e012512f1d"},
-    {file = "tiktoken-0.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f3020350685e009053829c1168703c346fb32c70c57d828ca3742558e94827a9"},
-    {file = "tiktoken-0.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ba16698c42aad8190e746cd82f6a06769ac7edd415d62ba027ea1d99d958ed93"},
-    {file = "tiktoken-0.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c15d9955cc18d0d7ffcc9c03dc51167aedae98542238b54a2e659bd25fe77ed"},
-    {file = "tiktoken-0.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64e1091c7103100d5e2c6ea706f0ec9cd6dc313e6fe7775ef777f40d8c20811e"},
-    {file = "tiktoken-0.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e87751b54eb7bca580126353a9cf17a8a8eaadd44edaac0e01123e1513a33281"},
-    {file = "tiktoken-0.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e063b988b8ba8b66d6cc2026d937557437e79258095f52eaecfafb18a0a10c03"},
-    {file = "tiktoken-0.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:9c6dd439e878172dc163fced3bc7b19b9ab549c271b257599f55afc3a6a5edef"},
-    {file = "tiktoken-0.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8d1d97f83697ff44466c6bef5d35b6bcdb51e0125829a9c0ed1e6e39fb9a08fb"},
-    {file = "tiktoken-0.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1b6bce7c68aa765f666474c7c11a7aebda3816b58ecafb209afa59c799b0dd2d"},
-    {file = "tiktoken-0.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a73286c35899ca51d8d764bc0b4d60838627ce193acb60cc88aea60bddec4fd"},
-    {file = "tiktoken-0.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0394967d2236a60fd0aacef26646b53636423cc9c70c32f7c5124ebe86f3093"},
-    {file = "tiktoken-0.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:dae2af6f03ecba5f679449fa66ed96585b2fa6accb7fd57d9649e9e398a94f44"},
-    {file = "tiktoken-0.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:55e251b1da3c293432179cf7c452cfa35562da286786be5a8b1ee3405c2b0dd2"},
-    {file = "tiktoken-0.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:c835d0ee1f84a5aa04921717754eadbc0f0a56cf613f78dfc1cf9ad35f6c3fea"},
-    {file = "tiktoken-0.4.0.tar.gz", hash = "sha256:59b20a819969735b48161ced9b92f05dc4519c17be4015cfb73b65270a243620"},
+    {file = "tiktoken-0.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d1f37fa75ba70c1bc7806641e8ccea1fba667d23e6341a1591ea333914c226a9"},
+    {file = "tiktoken-0.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3d7296c38392a943c2ccc0b61323086b8550cef08dcf6855de9949890dbc1fd3"},
+    {file = "tiktoken-0.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3c84491965e139a905280ac28b74baaa13445b3678e07f96767089ad1ef5ee7b"},
+    {file = "tiktoken-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65970d77ea85ce6c7fce45131da9258cd58a802ffb29ead8f5552e331c025b2b"},
+    {file = "tiktoken-0.3.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bd3f72d0ba7312c25c1652292121a24c8f1711207b63c6d8dab21afe4be0bf04"},
+    {file = "tiktoken-0.3.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:719c9e13432602dc496b24f13e3c3ad3ec0d2fbdb9aace84abfb95e9c3a425a4"},
+    {file = "tiktoken-0.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:dc00772284c94e65045b984ed7e9f95d000034f6b2411df252011b069bd36217"},
+    {file = "tiktoken-0.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4db2c40f79f8f7a21a9fdbf1c6dee32dea77b0d7402355dc584a3083251d2e15"},
+    {file = "tiktoken-0.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e3c0f2231aa3829a1a431a882201dc27858634fd9989898e0f7d991dbc6bcc9d"},
+    {file = "tiktoken-0.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48c13186a479de16cfa2c72bb0631fa9c518350a5b7569e4d77590f7fee96be9"},
+    {file = "tiktoken-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6674e4e37ab225020135cd66a392589623d5164c6456ba28cc27505abed10d9e"},
+    {file = "tiktoken-0.3.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4a0c1357f6191211c544f935d5aa3cb9d7abd118c8f3c7124196d5ecd029b4af"},
+    {file = "tiktoken-0.3.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2e948d167fc3b04483cbc33426766fd742e7cefe5346cd62b0cbd7279ef59539"},
+    {file = "tiktoken-0.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:5dca434c8680b987eacde2dbc449e9ea4526574dbf9f3d8938665f638095be82"},
+    {file = "tiktoken-0.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:984758ebc07cd8c557345697c234f1f221bd730b388f4340dd08dffa50213a01"},
+    {file = "tiktoken-0.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:891012f29e159a989541ae47259234fb29ff88c22e1097567316e27ad33a3734"},
+    {file = "tiktoken-0.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:210f8602228e4c5d706deeb389da5a152b214966a5aa558eec87b57a1969ced5"},
+    {file = "tiktoken-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd783564f80d4dc44ff0a64b13756ded8390ed2548549aefadbe156af9188307"},
+    {file = "tiktoken-0.3.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:03f64bde9b4eb8338bf49c8532bfb4c3578f6a9a6979fc176d939f9e6f68b408"},
+    {file = "tiktoken-0.3.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:1ac369367b6f5e5bd80e8f9a7766ac2a9c65eda2aa856d5f3c556d924ff82986"},
+    {file = "tiktoken-0.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:94600798891f78db780e5aa9321456cf355e54a4719fbd554147a628de1f163f"},
+    {file = "tiktoken-0.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e59db6fca8d5ccea302fe2888917364446d6f4201a25272a1a1c44975c65406a"},
+    {file = "tiktoken-0.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:19340d8ba4d6fd729b2e3a096a547ded85f71012843008f97475f9db484869ee"},
+    {file = "tiktoken-0.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:542686cbc9225540e3a10f472f82fa2e1bebafce2233a211dee8459e95821cfd"},
+    {file = "tiktoken-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a43612b2a09f4787c050163a216bf51123851859e9ab128ad03d2729826cde9"},
+    {file = "tiktoken-0.3.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a11674f0275fa75fb59941b703650998bd4acb295adbd16fc8af17051aaed19d"},
+    {file = "tiktoken-0.3.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:65fc0a449630bab28c30b4adec257442a4706d79cffc2337c1d9df3e91825cdd"},
+    {file = "tiktoken-0.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:0b9a7a9a8b781a50ee9289e85e28771d7e113cc0c656eadfb6fc6d3a106ff9bb"},
+    {file = "tiktoken-0.3.3.tar.gz", hash = "sha256:97b58b7bfda945791ec855e53d166e8ec20c6378942b93851a6c919ddf9d0496"},
 ]
 
 [package.dependencies]
@@ -8092,4 +8245,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "3.10.x"
-content-hash = "e96d0e3b1ea3d3882bd117ea1aca380e47ce9658099e11602da584fe0eea46da"
+content-hash = "03253442817b8bd8e787d5dbaaa99894c86a60c3ab865d78f9f6e03d55d4f5e6"
diff --git a/pyproject.toml b/pyproject.toml
index 0542d0795..10fa77ed9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,6 @@ rapidocr-onnxruntime = "1.2.3"
 scikit-learn = "1.2.2"
 scipy = "1.9.3"
 sqlalchemy = "1.4.43"
-tiktoken = "0.4.0"
 torch = "^2.0.0"
 tqdm = "4.64.0"
 transformers = "4.29.2"
@@ -100,6 +99,11 @@ imagehash = "^4.3.1"
 pydantic-settings = "^2.2.1"
 pyqt-toast-notification = "^1.1.0"
 pudb = "^2024.1"
+llvmlite = "0.40.1rc1"
+numba = "0.57.0"
+openai-whisper = {git = "https://github.com/openai/whisper.git"}
+sounddevice = "^0.4.6"
+soundfile = "^0.12.1"
 
 [tool.pytest.ini_options]
 filterwarnings = [