Skip to content

Commit

Permalink
refactor transcript serializer
Browse files Browse the repository at this point in the history
  • Loading branch information
adrienpoly committed Jul 8, 2024
1 parent 0059582 commit 1d7f4de
Show file tree
Hide file tree
Showing 11 changed files with 123 additions and 144 deletions.
2 changes: 0 additions & 2 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,4 @@ gem "dry-types", "~> 1.7"

gem "google-protobuf", require: false

gem "webvtt-ruby"

gem "active_job-performs", "~> 0.3.1"
2 changes: 0 additions & 2 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,6 @@ GEM
websocket-driver (0.7.6)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
webvtt-ruby (0.4.2)
xpath (3.2.0)
nokogiri (~> 1.8)
zeitwerk (2.6.16)
Expand Down Expand Up @@ -501,7 +500,6 @@ DEPENDENCIES
vite_rails
web-console
webmock
webvtt-ruby

RUBY VERSION
ruby 3.3.1p55
Expand Down
35 changes: 4 additions & 31 deletions app/clients/youtube/transcript.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ module Youtube
class Transcript
attr_reader :response

def get_vtt(video_id)
def get(video_id)
message = {one: "asr", two: "en"}
typedef = MessageType
two = get_base64_protobuf(message, typedef)
Expand All @@ -25,11 +25,11 @@ def get_vtt(video_id)
}

@response = HTTParty.post(url, headers: headers, body: body.to_json)
convert_to_vtt(JSON.parse(response.body))
JSON.parse(@response.body)
end

def self.get_vtt(video_id)
new.get_vtt(video_id)
def self.get(video_id)
new.get(video_id)
end

private
Expand All @@ -43,32 +43,5 @@ def get_base64_protobuf(message, typedef)
encoded_data = encode_message(message, typedef)
Base64.encode64(encoded_data).delete("\n")
end

def convert_to_vtt(transcript)
vtt_content = "WEBVTT\n\n"
events = transcript.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer", "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer", "initialSegments")
if events
events.each_with_index do |event, index|
segment = event["transcriptSegmentRenderer"]
start_time = format_time(segment["startMs"].to_i)
end_time = format_time(segment["endMs"].to_i)
text = segment.dig("snippet", "runs")&.map { |run| run["text"] }&.join || ""
vtt_content += "#{index + 1}\n"
vtt_content += "#{start_time} --> #{end_time}\n"
vtt_content += "#{text}\n\n"
end
else
vtt_content += "NOTE No transcript data available\n"
end
vtt_content
end

def format_time(ms)
hours = ms / (1000 * 60 * 60)
minutes = (ms % (1000 * 60 * 60)) / (1000 * 60)
seconds = (ms % (1000 * 60)) / 1000
milliseconds = ms % 1000
format("%02d:%02d:%02d.%03d", hours, minutes, seconds, milliseconds)
end
end
end
21 changes: 21 additions & 0 deletions app/models/cue.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# A single timed caption entry: a start timestamp, an end timestamp and the
# caption text shown between them.
class Cue
  attr_reader :start_time, :end_time, :text

  # @param start_time [Object] timestamp at which the cue begins
  # @param end_time [Object] timestamp at which the cue ends
  # @param text [Object] caption text for the cue
  def initialize(start_time, end_time, text)
    @start_time, @end_time, @text = start_time, end_time, text
  end

  # Renders the cue as a "start --> end" timing line followed by the text on
  # the next line.
  #
  # @return [String]
  def to_s
    format("%s --> %s\n%s", start_time, end_time, text)
  end

  # @return [Hash] the cue's three attributes keyed by symbol
  def to_h
    {start_time:, end_time:, text:}
  end
end
5 changes: 3 additions & 2 deletions app/models/talk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Talk < ApplicationRecord
has_many :speaker_talks, dependent: :destroy, inverse_of: :talk, foreign_key: :talk_id
has_many :speakers, through: :speaker_talks

serialize :transcript, coder: WebVTTSerializer
serialize :transcript, coder: TranscriptSerializer

# validations
validates :title, presence: true
Expand Down Expand Up @@ -129,6 +129,7 @@ def related_talks(limit: 6)
end

def update_transcript!
update!(transcript: Youtube::Transcript.get_vtt(video_id))
youtube_transcript = Youtube::Transcript.get(video_id)
update!(transcript: Transcript.create_from_youtube_transcript(youtube_transcript))
end
end
61 changes: 61 additions & 0 deletions app/models/transcript.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# An ordered collection of Cue objects that can be rendered as JSON or as a
# WebVTT document. Enumerable is mixed in, so the transcript can be iterated
# cue by cue.
class Transcript
  include Enumerable

  attr_reader :cues

  def initialize
    @cues = []
  end

  # Appends a cue to the end of the transcript.
  #
  # @param cue [Cue]
  # @return [Array<Cue>] the updated list of cues
  def add_cue(cue)
    @cues << cue
  end

  # NOTE: despite the name, this returns an Array (one Hash per cue), not a
  # Hash. The shape is what `to_json` serializes.
  #
  # @return [Array<Hash>]
  def to_h
    @cues.map(&:to_h)
  end

  # Serializes the cues as a JSON array.
  #
  # Accepts and forwards any arguments so the method also works when it is
  # invoked by a JSON generator (e.g. `JSON.generate([transcript])`), which
  # calls `to_json(state)` on nested objects. A zero-arity `to_json` raises
  # ArgumentError in that situation.
  #
  # @return [String]
  def to_json(*args)
    to_h.to_json(*args)
  end

  # Renders the transcript as WebVTT text: the "WEBVTT" header followed by one
  # block per cue, each made of a 1-based identifier line, the cue itself
  # (via Cue#to_s) and a blank line.
  #
  # @return [String]
  def to_vtt
    blocks = @cues.each.with_index(1).map { |cue, number| "#{number}\n#{cue}\n\n" }
    "WEBVTT\n\n#{blocks.join}"
  end

  # Yields each cue in order (required by Enumerable).
  def each(&)
    @cues.each(&)
  end

  class << self
    # Key path to the list of segments inside the parsed YouTube response.
    YOUTUBE_SEGMENTS_PATH = [
      "actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
      "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
      "initialSegments"
    ].freeze

    # Builds a Transcript from the parsed JSON of a YouTube transcript
    # response.
    #
    # When the segment list is missing from the payload, the transcript
    # contains a single zero-length placeholder cue saying that no transcript
    # data is available.
    #
    # @param youtube_transcript [Hash] parsed response body
    # @return [Transcript]
    def create_from_youtube_transcript(youtube_transcript)
      transcript = Transcript.new
      events = youtube_transcript.dig(*YOUTUBE_SEGMENTS_PATH)

      unless events
        transcript.add_cue(Cue.new("00:00:00.000", "00:00:00.000", "NOTE No transcript data available"))
        return transcript
      end

      events.each do |event|
        segment = event["transcriptSegmentRenderer"]
        # Entries without a segment renderer would otherwise crash on
        # `nil["startMs"]`; skip them. (Presumably these are non-caption items
        # such as section headers -- verify against live payloads.)
        next unless segment

        transcript.add_cue(cue_from_segment(segment))
      end
      transcript
    end

    # Formats a duration in milliseconds as "HH:MM:SS.mmm".
    #
    # @param ms [Integer] duration in milliseconds
    # @return [String]
    def format_time(ms)
      hours = ms / (1000 * 60 * 60)
      minutes = (ms % (1000 * 60 * 60)) / (1000 * 60)
      seconds = (ms % (1000 * 60)) / 1000
      milliseconds = ms % 1000
      format("%02d:%02d:%02d.%03d", hours, minutes, seconds, milliseconds)
    end

    private

    # Converts one "transcriptSegmentRenderer" hash into a Cue. The text is the
    # concatenation of every run's "text"; it is empty when there are no runs.
    def cue_from_segment(segment)
      text = segment.dig("snippet", "runs")&.map { |run| run["text"] }&.join || ""
      Cue.new(format_time(segment["startMs"].to_i), format_time(segment["endMs"].to_i), text)
    end
  end
end
16 changes: 16 additions & 0 deletions app/serializers/transcript_serializer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Coder that converts between a Transcript and its stored JSON form (an array
# of cue hashes). Exposes the `dump`/`load` pair expected of a coder passed to
# `serialize ..., coder:`.
class TranscriptSerializer
  # Serializes the transcript by delegating to its `to_json`.
  #
  # @param transcript [Transcript, nil]
  # @return [String] JSON text (the literal "null" when given nil)
  def self.dump(transcript)
    transcript.to_json
  end

  # Rebuilds a Transcript from its stored JSON form.
  #
  # @param transcript_json [String, nil] JSON array of cue hashes
  # @return [Transcript, nil] nil when there is nothing stored (nil, empty
  #   string, or the JSON literal "null")
  # @raise [JSON::ParserError] when the stored text is not valid JSON
  def self.load(transcript_json)
    return nil if transcript_json.nil? || transcript_json.empty?

    cues_array = JSON.parse(transcript_json, symbolize_names: true)
    # `dump(nil)` produces "null", which parses back to nil; treat it as "no
    # transcript" instead of failing on `nil.each`, so dump/load round-trips.
    return nil if cues_array.nil?

    cues_array.each_with_object(Transcript.new) do |cue_hash, transcript|
      transcript.add_cue(Cue.new(*cue_hash.values_at(:start_time, :end_time, :text)))
    end
  end
end
32 changes: 0 additions & 32 deletions app/serializers/webvtt_serializer.rb

This file was deleted.

1 change: 0 additions & 1 deletion config/initializers/inflections.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,4 @@

# These inflection rules are supported but not enabled by default:
ActiveSupport::Inflector.inflections(:en) do |inflect|
inflect.acronym "WebVTT"
end
46 changes: 12 additions & 34 deletions test/clients/youtube/transcript_test.rb
Original file line number Diff line number Diff line change
@@ -1,42 +1,20 @@
require "test_helper"
require "webvtt"

module Youtube
class TranscriptTest < ActiveSupport::TestCase
def setup
@client = Youtube::Transcript.new
end

test "fetch the trasncript from a video in vtt format" do
video_id = "9LfmrkyP81M"

VCR.use_cassette("youtube_video_transcript", match_requests_on: [:method]) do
transcript = @client.get_vtt(video_id)
assert_not_nil transcript

# Save the VTT content to a temporary file to parse it using WebVTT gem
Tempfile.create(["transcript", ".vtt"]) do |file|
file.write(transcript)
file.rewind

# Parse the VTT file
webvtt = WebVTT.read(file.path)
class Youtube::TranscriptTest < ActiveSupport::TestCase
def setup
@client = Youtube::Transcript.new
end

# Ensure it has the correct headers
assert_match(/^WEBVTT/, transcript)
test "fetch the trasncript from a video in vtt format" do
video_id = "9LfmrkyP81M"

# Ensure it has at least one cue
assert_not_empty webvtt.cues
VCR.use_cassette("youtube_video_transcript", match_requests_on: [:method]) do
transcript = @client.get(video_id)
assert_not_nil transcript

# Validate each cue
webvtt.cues.each do |cue|
assert_not_nil cue.start
assert_not_nil cue.end
assert_not_nil cue.text
assert_match(/^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}$/, "#{cue.start} --> #{cue.end}")
end
end
end
transcript = Transcript.create_from_youtube_transcript(transcript)
assert_not_empty transcript.cues
assert transcript.cues.first.is_a?(Cue)
end
end
end
46 changes: 6 additions & 40 deletions test/models/talk_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,48 +27,13 @@

class TalkTest < ActiveSupport::TestCase
include ActiveJob::TestHelper
test "should serialize and deserialize transcript correctly" do
vtt_string = <<~VTT
WEBVTT
00:00.000 --> 00:05.000
Welcome to the talk.
00:06.000 --> 00:10.000
Let's get started.
VTT

talk = Talk.new(title: "Sample Talk", transcript: vtt_string)
assert talk.save

loaded_talk = Talk.find(talk.id)
assert_equal WebVTTSerializer.load(vtt_string), loaded_talk.transcript
end

test "should convert transcript to WebVTT format correctly" do
vtt_string = <<~VTT
WEBVTT
00:00.000 --> 00:05.000
Welcome to the talk.
00:06.000 --> 00:10.000
Let's get started.
VTT

cues = WebVTTSerializer.load(vtt_string)
talk = Talk.new(title: "Sample Talk", transcript: cues)

expected_vtt = WebVTTSerializer.dump(talk.transcript)
assert_equal vtt_string.strip, expected_vtt.strip
end

test "should handle empty transcript" do
talk = Talk.new(title: "Sample Talk", transcript: [])
talk = Talk.new(title: "Sample Talk", transcript: Transcript.new)
assert talk.save

loaded_talk = Talk.find(talk.id)
assert_empty loaded_talk.transcript
assert_equal loaded_talk.transcript.cues, []
assert_equal "Sample Talk", loaded_talk.title
end

test "should update transcript" do
Expand All @@ -80,7 +45,8 @@ class TalkTest < ActiveSupport::TestCase
end
end

assert_not_empty @talk.transcript
assert @talk.transcript.length > 100
assert @talk.transcript.is_a?(Transcript)
assert @talk.transcript.cues.first.is_a?(Cue)
assert @talk.transcript.cues.length > 100
end
end

0 comments on commit 1d7f4de

Please sign in to comment.