Skip to content

Commit

Permalink
🩹 Do not split Sentences on "i.e. " and "\d. "
Browse files Browse the repository at this point in the history
  • Loading branch information
pajowu committed Aug 7, 2023
1 parent d45d20b commit 18fb74d
Show file tree
Hide file tree
Showing 3 changed files with 291 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
{
"input": [
{
"type": "paragraph",
"speaker": null,
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "im ",
"start": 0.82,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "19. ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Jahrhundert. ",
"start": 1.65,
"end": 2.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
},
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "zum ",
"start": 0.82,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "letzten ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Token.",
"start": 1.65,
"end": 2.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
],
"lang": "de"
}
],
"expected": [
{
"type": "paragraph",
"speaker": null,
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "im ",
"start": 0.82,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "19. ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Jahrhundert. ",
"start": 1.65,
"end": 2.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
],
"lang": "de"
},
{
"type": "paragraph",
"speaker": null,
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "zum ",
"start": 0.82,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "letzten ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Token.",
"start": 1.65,
"end": 2.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
],
"lang": "de"
}
]
}
139 changes: 139 additions & 0 deletions worker/tests/data/test_strict_sentence_paragraphs-special-case-zB.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
{
"input": [
{
"type": "paragraph",
"speaker": null,
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "im ",
"start": 0.82,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "z.B. ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Jahrhundert. ",
"start": 1.65,
"end": 2.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
},
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "zum ",
"start": 0.82,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "letzten ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Token.",
"start": 1.65,
"end": 2.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
],
"lang": "de"
}
],
"expected": [
{
"type": "paragraph",
"speaker": null,
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "im ",
"start": 0.82,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "z.B. ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Jahrhundert. ",
"start": 1.65,
"end": 2.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
],
"lang": "de"
},
{
"type": "paragraph",
"speaker": null,
"children": [
{
"text": "Willkommen ",
"start": 0.0,
"end": 0.82,
"conf": 0.4493102431297302,
"conf_ts": 0.0
},
{
"text": "zum ",
"start": 0.82,
"end": 1.07,
"conf": 0.9744400978088379,
"conf_ts": 0.005903157405555248
},
{
"text": "letzten ",
"start": 1.07,
"end": 1.65,
"conf": 0.9838394522666931,
"conf_ts": 0.01149927917867899
},
{
"text": "Token.",
"start": 1.65,
"end": 2.06,
"conf": 0.9566531777381897,
"conf_ts": 0.0096774036064744
}
],
"lang": "de"
}
]
}
14 changes: 13 additions & 1 deletion worker/transcribee_worker/whisper_transcribe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import re
from typing import TYPE_CHECKING, Any, AsyncIterator, List, Optional

import requests
Expand All @@ -14,6 +15,14 @@
else:
from icu import BreakIterator, Locale

# Regexes that prevent the sentence splitting logic from breaking here.
# Each pattern is applied with `search` to the text accumulated so far for the
# current paragraph; if any pattern matches, a break proposed at the end of
# that text is not taken.
DONT_SPLIT_HERE_RES = [
    # Two-part dotted abbreviations such as "e.g.", "i.e." or "z.B.".
    # The abbreviation must either start the text or follow whitespace, so it
    # is also recognised when it is the very first token of the accumulated
    # text (e.g. a sentence that begins with "Z.B. ").
    re.compile(r"(?:^|\s)\S\.\S\.\s?$"),
    # A numeral followed by a dot, e.g. the ordinal in "during the 20. century".
    # No leading ".*" is needed because the pattern is used with `search`.
    re.compile(r"\d\.\s?$"),
]


def get_model_file(model_name: str):
whisper_models_dir = settings.MODELS_DIR / "whisper"
Expand Down Expand Up @@ -266,7 +275,10 @@ async def strict_sentence_paragraphs(
)
for atom in paragraph.children:
acc_paragraph.children.append(atom)
if offset + len(acc_paragraph.text()) in breaks:
text = acc_paragraph.text()
if offset + len(text) in breaks and not any(
regex.search(text) for regex in DONT_SPLIT_HERE_RES
):
yield acc_paragraph
offset += len(acc_paragraph.text())
acc_paragraph = Paragraph(
Expand Down

0 comments on commit 18fb74d

Please sign in to comment.