-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
0059582
commit 1d7f4de
Showing
11 changed files
with
123 additions
and
144 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# A single timed caption: a start timestamp, an end timestamp and the
# caption text. The timestamps are stored exactly as given; this class does
# not parse or validate them.
class Cue
  attr_reader :start_time, :end_time, :text

  # @param start_time [Object] timestamp at which the caption begins
  # @param end_time [Object] timestamp at which the caption ends
  # @param text [Object] caption text
  def initialize(start_time, end_time, text)
    @start_time = start_time
    @end_time = end_time
    @text = text
  end

  # Renders the cue as a timing line ("start --> end") followed by the
  # caption text on the next line.
  #
  # @return [String]
  def to_s
    "#{start_time} --> #{end_time}\n#{text}"
  end

  # @return [Hash] the three attributes keyed by :start_time, :end_time, :text
  def to_h
    {
      start_time: start_time,
      end_time: end_time,
      text: text
    }
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# An ordered, enumerable collection of Cue objects that can be exported as
# JSON or as WebVTT text, and built from a YouTube transcript payload.
class Transcript
  include Enumerable

  # Key path (for Hash#dig / Array#dig) from the root of the YouTube
  # transcript payload down to the list of transcript segments.
  INITIAL_SEGMENTS_PATH = [
    "actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
    "content", "transcriptSearchPanelRenderer", "body",
    "transcriptSegmentListRenderer", "initialSegments"
  ].freeze

  attr_reader :cues

  def initialize
    @cues = []
  end

  # Appends a cue to the transcript.
  #
  # @param cue [Cue]
  # @return [Array<Cue>] the internal cue list, after the append
  def add_cue(cue)
    @cues << cue
  end

  # NOTE: despite the name this returns an Array (one Hash per cue), not a
  # Hash; #to_json relies on that shape.
  #
  # @return [Array<Hash>]
  def to_h
    @cues.map(&:to_h)
  end

  # Serializes the cue hashes as a JSON array.
  #
  # Accepts and forwards the optional generator arguments so the transcript
  # can also be serialized when nested inside another structure (the JSON
  # generator calls +to_json(state)+ on nested objects).
  #
  # @return [String]
  def to_json(*args)
    to_h.to_json(*args)
  end

  # Renders the transcript as WebVTT text: the "WEBVTT" header followed by
  # one block per cue, each made of a 1-based identifier line, the cue's
  # string form and a blank line.
  #
  # @return [String]
  def to_vtt
    blocks = @cues.each_with_index.map { |cue, index| "#{index + 1}\n#{cue}\n\n" }
    "WEBVTT\n\n#{blocks.join}"
  end

  # Yields each cue in insertion order (this is what powers Enumerable).
  def each(&)
    @cues.each(&)
  end

  class << self
    # Builds a transcript from a parsed YouTube transcript payload.
    #
    # When the payload has no segment list at INITIAL_SEGMENTS_PATH the
    # result contains a single zero-length placeholder cue. Entries of the
    # segment list that carry no "transcriptSegmentRenderer" are skipped
    # rather than raising NoMethodError on nil.
    #
    # @param youtube_transcript [Hash] parsed payload with String keys
    # @return [Transcript]
    def create_from_youtube_transcript(youtube_transcript)
      transcript = new
      events = youtube_transcript.dig(*INITIAL_SEGMENTS_PATH)

      unless events
        transcript.add_cue(Cue.new("00:00:00.000", "00:00:00.000", "NOTE No transcript data available"))
        return transcript
      end

      events.each do |event|
        segment = event["transcriptSegmentRenderer"]
        next unless segment

        start_time = format_time(segment["startMs"].to_i)
        end_time = format_time(segment["endMs"].to_i)
        # A segment without snippet runs becomes a cue with empty text.
        text = segment.dig("snippet", "runs")&.map { |run| run["text"] }&.join || ""
        transcript.add_cue(Cue.new(start_time, end_time, text))
      end

      transcript
    end

    # Formats a millisecond offset as "HH:MM:SS.mmm".
    #
    # @param ms [Integer] offset in milliseconds
    # @return [String]
    def format_time(ms)
      total_seconds, milliseconds = ms.divmod(1000)
      total_minutes, seconds = total_seconds.divmod(60)
      hours, minutes = total_minutes.divmod(60)
      format("%02d:%02d:%02d.%03d", hours, minutes, seconds, milliseconds)
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Converts a Transcript to and from its JSON string representation via a
# dump/load pair.
class TranscriptSerializer
  # @param transcript [#to_json] object to serialize
  # @return [String] whatever the object's #to_json returns
  def self.dump(transcript)
    transcript.to_json
  end

  # Rebuilds a Transcript from a JSON array of cue hashes with
  # start_time / end_time / text keys.
  #
  # @param transcript_json [String, nil]
  # @return [Transcript, nil] nil when the input is nil, empty, or the JSON
  #   literal null
  def self.load(transcript_json)
    return nil if transcript_json.nil? || transcript_json.empty?

    cue_hashes = JSON.parse(transcript_json, symbolize_names: true)
    # dump(nil) produces "null"; map it back to nil instead of calling
    # #each on nil.
    return nil if cue_hashes.nil?

    cue_hashes.each_with_object(Transcript.new) do |cue_hash, transcript|
      transcript.add_cue(Cue.new(cue_hash[:start_time], cue_hash[:end_time], cue_hash[:text]))
    end
  end
end
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,20 @@ | ||
require "test_helper" | ||
require "webvtt" | ||
|
||
module Youtube | ||
class TranscriptTest < ActiveSupport::TestCase | ||
def setup | ||
@client = Youtube::Transcript.new | ||
end | ||
|
||
test "fetch the trasncript from a video in vtt format" do | ||
video_id = "9LfmrkyP81M" | ||
|
||
VCR.use_cassette("youtube_video_transcript", match_requests_on: [:method]) do | ||
transcript = @client.get_vtt(video_id) | ||
assert_not_nil transcript | ||
|
||
# Save the VTT content to a temporary file to parse it using WebVTT gem | ||
Tempfile.create(["transcript", ".vtt"]) do |file| | ||
file.write(transcript) | ||
file.rewind | ||
|
||
# Parse the VTT file | ||
webvtt = WebVTT.read(file.path) | ||
class Youtube::TranscriptTest < ActiveSupport::TestCase | ||
def setup | ||
@client = Youtube::Transcript.new | ||
end | ||
|
||
# Ensure it has the correct headers | ||
assert_match(/^WEBVTT/, transcript) | ||
test "fetch the trasncript from a video in vtt format" do | ||
video_id = "9LfmrkyP81M" | ||
|
||
# Ensure it has at least one cue | ||
assert_not_empty webvtt.cues | ||
VCR.use_cassette("youtube_video_transcript", match_requests_on: [:method]) do | ||
transcript = @client.get(video_id) | ||
assert_not_nil transcript | ||
|
||
# Validate each cue | ||
webvtt.cues.each do |cue| | ||
assert_not_nil cue.start | ||
assert_not_nil cue.end | ||
assert_not_nil cue.text | ||
assert_match(/^\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}$/, "#{cue.start} --> #{cue.end}") | ||
end | ||
end | ||
end | ||
transcript = Transcript.create_from_youtube_transcript(transcript) | ||
assert_not_empty transcript.cues | ||
assert transcript.cues.first.is_a?(Cue) | ||
end | ||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters