Skip to content

Commit

Permalink
Brings back original VAD parameters naming (#1181)
Browse files Browse the repository at this point in the history
  • Loading branch information
Purfview authored Dec 1, 2024
1 parent 22a5238 commit 8327d8c
Showing 1 changed file with 13 additions and 13 deletions.
26 changes: 13 additions & 13 deletions faster_whisper/vad.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ class VadOptions:
"""VAD options.
Attributes:
onset: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
probabilities ABOVE this value are considered as SPEECH. It is better to tune this
parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
offset: Silence threshold for determining the end of speech. If a probability is lower than
the offset, it is always considered silence. Values higher than offset are only considered
speech if the previous sample was classified as speech; otherwise, they are treated as
silence. This parameter helps refine the detection of speech transitions, ensuring smoother
segment boundaries.
neg_threshold: Silence threshold for determining the end of speech. If a probability is lower
than neg_threshold, it is always considered silence. Values higher than neg_threshold
are only considered speech if the previous sample was classified as speech; otherwise,
they are treated as silence. This parameter helps refine the detection of speech
transitions, ensuring smoother segment boundaries. Note: as declared, the default is
evaluated once from the default threshold (0.5 - 0.15), so it does not follow a custom
threshold; set neg_threshold explicitly when changing threshold.
min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
than max_speech_duration_s will be split at the timestamp of the last silence that
Expand All @@ -34,8 +34,8 @@ class VadOptions:
speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side.
"""

onset: float = 0.5
offset: float = onset - 0.15
threshold: float = 0.5
neg_threshold: float = threshold - 0.15
min_speech_duration_ms: int = 0
max_speech_duration_s: float = float("inf")
min_silence_duration_ms: int = 2000
Expand All @@ -62,7 +62,7 @@ def get_speech_timestamps(
if vad_options is None:
vad_options = VadOptions(**kwargs)

onset = vad_options.onset
threshold = vad_options.threshold
min_speech_duration_ms = vad_options.min_speech_duration_ms
max_speech_duration_s = vad_options.max_speech_duration_s
min_silence_duration_ms = vad_options.min_silence_duration_ms
Expand Down Expand Up @@ -90,20 +90,20 @@ def get_speech_timestamps(
triggered = False
speeches = []
current_speech = {}
offset = vad_options.offset
neg_threshold = vad_options.neg_threshold

# to save potential segment end (and tolerate some silence)
temp_end = 0
# to save potential segment limits in case of maximum segment size reached
prev_end = next_start = 0

for i, speech_prob in enumerate(speech_probs):
if (speech_prob >= onset) and temp_end:
if (speech_prob >= threshold) and temp_end:
temp_end = 0
if next_start < prev_end:
next_start = window_size_samples * i

if (speech_prob >= onset) and not triggered:
if (speech_prob >= threshold) and not triggered:
triggered = True
current_speech["start"] = window_size_samples * i
continue
Expand All @@ -130,7 +130,7 @@ def get_speech_timestamps(
triggered = False
continue

if (speech_prob < offset) and triggered:
if (speech_prob < neg_threshold) and triggered:
if not temp_end:
temp_end = window_size_samples * i
# condition to avoid cutting in very short silence
Expand Down

0 comments on commit 8327d8c

Please sign in to comment.