feat: Add transcribed text to dashboard visualisation

KIRA009 · KIRA009 · commit 4f99f001b92c · 2024-05-20T18:05:21.000+05:30
diff --git a/openadapt/alembic/versions/98c8851a5321_add_audio_info.py b/openadapt/alembic/versions/98c8851a5321_add_audio_info.py
@@ -29,6 +29,11 @@ def upgrade() -> None:
             openadapt.models.ForceFloat(precision=10, scale=2, asdecimal=False),
             nullable=True,
         ),
+        sa.Column(
+            "timestamp",
+            openadapt.models.ForceFloat(precision=10, scale=2, asdecimal=False),
+            nullable=True,
+        ),
         sa.Column("sample_rate", sa.Integer(), nullable=True),
         sa.Column("words_with_timestamps", sa.Text(), nullable=True),
         sa.ForeignKeyConstraint(
diff --git a/openadapt/app/dashboard/api/recordings.py b/openadapt/app/dashboard/api/recordings.py
@@ -1,5 +1,7 @@
 """API endpoints for recordings."""
 
+import json
+
 from fastapi import FastAPI, WebSocket
 from loguru import logger
 
@@ -84,6 +86,22 @@ def convert_to_str(event_dict: dict) -> dict:
                         for child in event_dict["children"]:
                             convert_to_str(child)
 
+                try:
+                    # TODO: change to use recording_id once scrubbing PR is merged
+                    audio_info = crud.get_audio_info(recording.timestamp, session)[0]
+                    words_with_timestamps = json.loads(audio_info.words_with_timestamps)
+                    words_with_timestamps = [
+                        {
+                            "word": word["word"],
+                            "start": word["start"] + action_events[0].timestamp,
+                            "end": word["end"] + action_events[0].timestamp,
+                        }
+                        for word in words_with_timestamps
+                    ]
+                except IndexError:
+                    words_with_timestamps = []
+                word_index = 0
+
                 for action_event in action_events:
                     event_dict = row2dict(action_event)
                     try:
@@ -96,6 +114,18 @@ def convert_to_str(event_dict: dict) -> dict:
                         width, height = 0, 0
                     event_dict["screenshot"] = image
                     event_dict["dimensions"] = {"width": width, "height": height}
+                    words = []
+                    # each word in words_with_timestamps is a dict of word, start, end
+                    # we want to add the word to the event_dict if the start is
+                    # before the event timestamp
+                    while (
+                        word_index < len(words_with_timestamps)
+                        and words_with_timestamps[word_index]["start"]
+                        < event_dict["timestamp"]
+                    ):
+                        words.append(words_with_timestamps[word_index]["word"])
+                        word_index += 1
+                    event_dict["words"] = words
                     convert_to_str(event_dict)
                     await websocket.send_json(
                         {"type": "action_event", "value": event_dict}
diff --git a/openadapt/app/dashboard/components/ActionEvent/ActionEvent.tsx b/openadapt/app/dashboard/components/ActionEvent/ActionEvent.tsx
@@ -122,6 +122,12 @@ export const ActionEvent = ({
                                 <TableCellWithBorder>{event.parent_id}</TableCellWithBorder>
                             </TableRowWithBorder>
                         )}
+                        {event.words && event.words.length > 0 && (
+                            <TableRowWithBorder>
+                                <TableCellWithBorder>transcription</TableCellWithBorder>
+                                <TableCellWithBorder>{event.words.join(' ')}</TableCellWithBorder>
+                            </TableRowWithBorder>
+                        )}
                         <TableRowWithBorder>
                             <TableCellWithBorder>children</TableCellWithBorder>
                             <TableCellWithBorder>{event.children?.length || 0}</TableCellWithBorder>
diff --git a/openadapt/app/dashboard/types/action-event.ts b/openadapt/app/dashboard/types/action-event.ts
@@ -26,4 +26,5 @@ export type ActionEvent = {
     mask: string | null;
     dimensions?: { width: number, height: number };
     children?: ActionEvent[];
+    words?: string[];
 }
diff --git a/openadapt/db/crud.py b/openadapt/db/crud.py
@@ -515,6 +515,7 @@ def insert_audio_info(
     audio_data: bytes,
     transcribed_text: str,
     recording_timestamp: float,
+    timestamp: float,
     sample_rate: int,
     word_list: list,
 ) -> None:
@@ -523,16 +524,21 @@ def insert_audio_info(
         flac_data=audio_data,
         transcribed_text=transcribed_text,
         recording_timestamp=recording_timestamp,
+        timestamp=timestamp,
         sample_rate=sample_rate,
         words_with_timestamps=json.dumps(word_list),
     )
     db.add(audio_info)
     db.commit()
 
 
-def get_audio_info(recording_timestamp: float) -> list[AudioInfo]:
+# TODO: change to use recording_id once scrubbing PR is merged
+def get_audio_info(
+    recording_timestamp: float, session: sa.orm.Session = None
+) -> list[AudioInfo]:
     """Get the audio info for a given recording."""
-    return _get(AudioInfo, recording_timestamp)
+    _db = session or db
+    return _get(AudioInfo, recording_timestamp, _db)
 
 
 async def acquire_db_lock() -> bool:
diff --git a/openadapt/models.py b/openadapt/models.py
@@ -662,6 +662,7 @@ class AudioInfo(db.Base):
     __tablename__ = "audio_info"
 
     id = sa.Column(sa.Integer, primary_key=True)
+    timestamp = sa.Column(ForceFloat)
     flac_data = sa.Column(sa.LargeBinary)
     transcribed_text = sa.Column(sa.String)
     recording_timestamp = sa.Column(sa.ForeignKey("recording.timestamp"))
diff --git a/openadapt/record.py b/openadapt/record.py
@@ -1067,6 +1067,7 @@ def audio_callback(
         callback=audio_callback, samplerate=16000, channels=1
     )
     logger.info("Audio recording started.")
+    start_timestamp = utils.get_timestamp()
     audio_stream.start()
 
     # NOTE: listener may not have actually started by now
@@ -1124,6 +1125,7 @@ def audio_callback(
         compressed_audio_bytes,
         result_info["text"],
         recording_timestamp,
+        start_timestamp,
         int(audio_stream.samplerate),
         word_list,
     )

Original file line number	Diff line number	Diff line change
`@@ -26,4 +26,5 @@ export type ActionEvent = {`
`26`	`26`	`mask: string \| null;`
`27`	`27`	`dimensions?: { width: number, height: number };`
`28`	`28`	`children?: ActionEvent[];`
	`29`	`+ words?: string[];`
`29`	`30`	`}`
Original file line number	Diff line number	Diff line change
`@@ -1067,6 +1067,7 @@ def audio_callback(`
`1067`	`1067`	`callback=audio_callback, samplerate=16000, channels=1`
`1068`	`1068`	`)`
`1069`	`1069`	`logger.info("Audio recording started.")`
	`1070`	`+ start_timestamp = utils.get_timestamp()`
`1070`	`1071`	`audio_stream.start()`
`1071`	`1072`
`1072`	`1073`	`# NOTE: listener may not have actually started by now`
`@@ -1124,6 +1125,7 @@ def audio_callback(`
`1124`	`1125`	`compressed_audio_bytes,`
`1125`	`1126`	`result_info["text"],`
`1126`	`1127`	`recording_timestamp,`
	`1128`	`+ start_timestamp,`
`1127`	`1129`	`int(audio_stream.samplerate),`
`1128`	`1130`	`word_list,`
`1129`	`1131`	`)`