🩹 Do not split sentences on "i.e. " and "\d. "

bugbakery · Aug 7, 2023 · 18fb74d · 18fb74d
1 parent d45d20b
commit 18fb74d
Show file tree

Hide file tree

Showing 3 changed files with 291 additions and 1 deletion.
diff --git a/worker/tests/data/test_strict_sentence_paragraphs-special-case-number-text.json b/worker/tests/data/test_strict_sentence_paragraphs-special-case-number-text.json
@@ -0,0 +1,139 @@
+{
+  "input": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "im ",
+          "start": 0.82,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "19. ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Jahrhundert. ",
+          "start": 1.65,
+          "end": 2.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "zum ",
+          "start": 0.82,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "letzten ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Token.",
+          "start": 1.65,
+          "end": 2.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ],
+      "lang": "de"
+    }
+  ],
+  "expected": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "im ",
+          "start": 0.82,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "19. ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Jahrhundert. ",
+          "start": 1.65,
+          "end": 2.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ],
+      "lang": "de"
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "zum ",
+          "start": 0.82,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "letzten ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Token.",
+          "start": 1.65,
+          "end": 2.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ],
+      "lang": "de"
+    }
+  ]
+}
diff --git a/worker/tests/data/test_strict_sentence_paragraphs-special-case-zB.json b/worker/tests/data/test_strict_sentence_paragraphs-special-case-zB.json
@@ -0,0 +1,139 @@
+{
+  "input": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "im ",
+          "start": 0.82,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "z.B. ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Jahrhundert. ",
+          "start": 1.65,
+          "end": 2.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        },
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "zum ",
+          "start": 0.82,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "letzten ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Token.",
+          "start": 1.65,
+          "end": 2.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ],
+      "lang": "de"
+    }
+  ],
+  "expected": [
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "im ",
+          "start": 0.82,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "z.B. ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Jahrhundert. ",
+          "start": 1.65,
+          "end": 2.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ],
+      "lang": "de"
+    },
+    {
+      "type": "paragraph",
+      "speaker": null,
+      "children": [
+        {
+          "text": "Willkommen ",
+          "start": 0.0,
+          "end": 0.82,
+          "conf": 0.4493102431297302,
+          "conf_ts": 0.0
+        },
+        {
+          "text": "zum ",
+          "start": 0.82,
+          "end": 1.07,
+          "conf": 0.9744400978088379,
+          "conf_ts": 0.005903157405555248
+        },
+        {
+          "text": "letzten ",
+          "start": 1.07,
+          "end": 1.65,
+          "conf": 0.9838394522666931,
+          "conf_ts": 0.01149927917867899
+        },
+        {
+          "text": "Token.",
+          "start": 1.65,
+          "end": 2.06,
+          "conf": 0.9566531777381897,
+          "conf_ts": 0.0096774036064744
+        }
+      ],
+      "lang": "de"
+    }
+  ]
+}
diff --git a/worker/transcribee_worker/whisper_transcribe.py b/worker/transcribee_worker/whisper_transcribe.py
@@ -1,4 +1,5 @@
 import logging
+import re
 from typing import TYPE_CHECKING, Any, AsyncIterator, List, Optional
 
 import requests
@@ -14,6 +15,14 @@
 else:
     from icu import BreakIterator, Locale
 
+# If any of these regexes matches the accumulated text, no sentence break is made there
+DONT_SPLIT_HERE_RES = [
+    re.compile(r"\s\S\.\S\.\s?$"),  # Prevent splitting on "e.g.", "i.e.", "z.B."
+    re.compile(
+        r".*\d\.\s?$"
+    ),  # Don't split on numerals followed by a dot, e.g. "during the 20. century"
+]
+
 
 def get_model_file(model_name: str):
     whisper_models_dir = settings.MODELS_DIR / "whisper"
@@ -266,7 +275,10 @@ async def strict_sentence_paragraphs(
             )
         for atom in paragraph.children:
             acc_paragraph.children.append(atom)
-            if offset + len(acc_paragraph.text()) in breaks:
+            text = acc_paragraph.text()
+            if offset + len(text) in breaks and not any(
+                regex.search(text) for regex in DONT_SPLIT_HERE_RES
+            ):
                 yield acc_paragraph
                 offset += len(acc_paragraph.text())
                 acc_paragraph = Paragraph(