use gpt 4o-mini

adrienpoly · Jul 18, 2024 · 7161d1d · 7161d1d
1 parent 5c915fa
commit 7161d1d
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 25 deletions.
diff --git a/app/models/cue.rb b/app/models/cue.rb
@@ -30,4 +30,8 @@ def time_string_to_seconds(time_string)
     seconds = parts[2]
     (hours + minutes + seconds).to_i
   end
+
+  def sound_descriptor?
+    text.match?(/\[(music|sound|audio|applause|laughter|speech|voice|speeches|voices)\]/i)
+  end
 end
diff --git a/app/models/talk/transcript_commands.rb b/app/models/talk/transcript_commands.rb
@@ -23,12 +23,13 @@ def fetch_and_update_raw_transcript!
   def enhance_transcript!
     response = client.chat(
       parameters: {
-        model: "gpt-4o", # Required.
+        model: "gpt-4o-mini", # Required.
         response_format: {type: "json_object"},
         messages: messages
       }
     )
-    enhanced_json_transcript = JSON.parse(response.dig("choices", 0, "message", "content")).dig("transcript")
+    raw_response = JSON.repair(response.dig("choices", 0, "message", "content"))
+    enhanced_json_transcript = JSON.parse(raw_response).dig("transcript")
     update!(enhanced_transcript: Transcript.create_from_json(enhanced_json_transcript))
   end
 
@@ -43,20 +44,45 @@ def messages
 
   def prompt
     <<~PROMPT
-      Here is a raw VTT transcript output.
-      Correct and improve the entire text and format the improved transcript into a JSON structure with the specified schema
+      You are tasked with improving and formatting a raw VTT transcript. Your goal is to correct and enhance the text, organize it into paragraphs, and format it into a specific JSON structure. Follow these instructions carefully to complete the task.
 
-      To help, the metadata for this transcript are:
-      - title: #{title}
-      - desciption: #{description}
-      - speaker name: #{speakers.map(&:name).to_sentence}
-      - event name: #{event_name}
+      First, here is the metadata for the transcript:
+        - title: #{title}
+        - description: #{description}
+        - speaker name: #{speakers.map(&:name).to_sentence}
+        - event name: #{event_name}
 
-      json schema:
-      [{start_time: "00:00:00", end_time: "00:00:05", text: "Hello, world!"},....]
+      Now, let's process the raw VTT transcript. Here's what you need to do:
 
-      Raw VTT Transcript:
+      1. Read through the entire raw transcript carefully.
+
+      2. Correct any spelling, grammar, or punctuation errors you find in the text.
+
+      3. Improve the overall readability and coherence of the text without changing its meaning.
+
+      4. Group related sentences into paragraphs. Each paragraph should contain a complete thought or topic.
+
+      5. For each paragraph, use the start time of its first sentence as the paragraph's start time, and the end time of its last sentence as the paragraph's end time.
+
+      6. Format the improved transcript into a JSON structure using this schema:
+        [{start_time: "00:00:00", end_time: "00:00:05", text: "Hello, world!"},...]
+
+      Here is the raw VTT transcript to process:
+
+      <raw_transcript>
       #{raw_transcript.to_vtt}
+      </raw_transcript>
+
+      To complete this task, follow these steps:
+
+      1. Read through the entire raw transcript.
+      2. Make necessary corrections to spelling, grammar, and punctuation.
+      3. Improve the text for clarity and coherence.
+      4. Group related sentences into paragraphs.
+      5. Determine the start and end times for each paragraph.
+      6. Format the improved transcript into the specified JSON structure.
+
+      Remember to preserve the original meaning of the content while making improvements. Ensure that each JSON object in the array represents a paragraph with its corresponding start time, end time, and improved text.
     PROMPT
   end
 

diff --git a/app/views/talks/_transcript.html.erb b/app/views/talks/_transcript.html.erb
@@ -1,15 +1,13 @@
 <%# locals: talk: -%>
 
-
-<% if talk.transcript %>
-  <div class="flex flex-col gap-2">
-    <% talk.transcript.cues.each do |cue| %>
-      <div class="flex gap-4">
-        <span class="cursor-pointer" data-action="click->video-player#seekTo" data-video-player-time-param="<%= cue.start_time_in_seconds %>">
-          <%= cue.start_time %>
-        </span>
-        <span><%= cue.text %></span>
-      </div>
-    <% end %>
-  </div>
-<% end %>
+<div class="flex flex-col gap-2">
+  <% talk.transcript.cues.each do |cue| %>
+    <% next if cue.sound_descriptor? %>
+    <div class="flex gap-4">
+      <span class="cursor-pointer" data-action="click->video-player#seekTo" data-video-player-time-param="<%= cue.start_time_in_seconds %>">
+        <%= cue.start_time %>
+      </span>
+      <span><%= cue.text %></span>
+    </div>
+  <% end %>
+</div>