Segment boundaries (#61)

joeferraratonic · web-flow · commit 0238ee43c760 · 2025-06-24T14:58:20.000-04:00
* handle repeat words at segment boundaries

* correct overlapping logic

* increment version

* cover case of empty list
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tonic-textual"
-version = "3.10.5"
+version = "3.10.6"
 description = "Wrappers around the Tonic Textual API"
 authors = ["Adam Kamor <adam@tonic.ai>", "Joe Ferrara <joeferrara@tonic.ai>", "Ander Steele <ander@tonic.ai>", "Ethan Philpott <ephilpott@tonic.ai>", "Lyon Van Voorhis <lyon@tonic.ai>", "Kirill Medvedev <kirill@tonic.ai>", "Travis Matthews <travis@tonic.ai>"]
 license = "MIT"
diff --git a/tonic_textual/__init__.py b/tonic_textual/__init__.py
@@ -1 +1 @@
-__version__ = "3.10.5"
+__version__ = "3.10.6"
diff --git a/tonic_textual/helpers/redact_audio_file_helper.py b/tonic_textual/helpers/redact_audio_file_helper.py
@@ -113,7 +113,12 @@ def get_intervals_to_redact(
     """
     transcript_words = []
     for segment in transcript_segments:
-        transcript_words.extend(segment.words)
+        # sometimes the last word of the previous segment matches the first word of the new segment
+        # this causes issues later on if repeats are not removed
+        if len(transcript_words) > 0 and len(segment.words[0]) > 0 and segment.words[0] == transcript_words[-1]:
+            transcript_words.extend(segment.words[1:])
+        else:
+            transcript_words.extend(segment.words)
     enriched_transcript_words = add_character_indices_to_words(
         transcript_text, transcript_words
     )
@@ -129,7 +134,7 @@ def get_intervals_to_redact(
             # this beeps entire word when span is part of a word
             if word_start < span_end and word_start >= span_start:
                 intersecting_words.append(word_obj)
-            elif word_end >= span_start and word_end < span_end:
+            elif word_end > span_start and word_end <= span_end:
                 intersecting_words.append(word_obj)
             elif word_start > span_end: # done
                 break

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "3.10.5"`
	`1`	`+__version__ = "3.10.6"`