Skip to content

Commit

Permalink
adds the ability to fetch transcript from youtube
Browse files Browse the repository at this point in the history
  • Loading branch information
adrienpoly committed Jul 7, 2024
1 parent 6681fb9 commit 126ca2a
Show file tree
Hide file tree
Showing 18 changed files with 27,329 additions and 21 deletions.
12 changes: 5 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
name: linters

on:
pull_request:
branches:
- "*"
push:
branches:
- main
on: [push]

concurrency: ci-${{ github.ref }}

Expand Down Expand Up @@ -43,6 +37,7 @@ jobs:
runs-on: ubuntu-latest
env:
RAILS_ENV: test
MEILI_MASTER_KEY: masterKey
steps:
- uses: actions/checkout@v4

Expand All @@ -61,6 +56,9 @@ jobs:
- name: Install dependencies
run: yarn install --frozen-lockfile

- name: Meilisearch setup with Docker
run: docker run -d -p 7700:7700 getmeili/meilisearch:v1.1 meilisearch --master-key=masterKey --no-analytics

- name: Build assets
run: bin/vite build --clear --mode=test

Expand Down
6 changes: 6 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,9 @@ gem "view_component", "~> 3.7"
gem "dry-initializer-rails"

gem "dry-types", "~> 1.7"

gem "google-protobuf", require: false

gem "webvtt-ruby"

gem "active_job-performs", "~> 0.3.1"
26 changes: 22 additions & 4 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ GEM
erubi (~> 1.11)
rails-dom-testing (~> 2.2)
rails-html-sanitizer (~> 1.6)
active_job-performs (0.3.1)
activejob (>= 6.1)
activejob (7.1.3.4)
activesupport (= 7.1.3.4)
globalid (>= 0.3.6)
Expand Down Expand Up @@ -170,6 +172,18 @@ GEM
raabro (~> 1.4)
globalid (1.2.1)
activesupport (>= 6.1)
google-protobuf (4.27.1-aarch64-linux)
bigdecimal
rake (>= 13)
google-protobuf (4.27.1-arm64-darwin)
bigdecimal
rake (>= 13)
google-protobuf (4.27.1-x86_64-darwin)
bigdecimal
rake (>= 13)
google-protobuf (4.27.1-x86_64-linux)
bigdecimal
rake (>= 13)
groupdate (6.4.0)
activesupport (>= 6.1)
hashdiff (1.1.0)
Expand All @@ -182,7 +196,7 @@ GEM
activesupport (>= 3.0)
nokogiri (>= 1.6)
io-console (0.7.2)
irb (1.13.1)
irb (1.14.0)
rdoc (>= 4.0.0)
reline (>= 0.4.2)
jbuilder (2.12.0)
Expand Down Expand Up @@ -223,7 +237,7 @@ GEM
actionpack (>= 6.0.0, < 7.2)
method_source (1.1.0)
mini_mime (1.1.5)
minitest (5.23.1)
minitest (5.24.1)
msgpack (1.7.2)
multi_xml (0.7.1)
bigdecimal (~> 3.1)
Expand Down Expand Up @@ -266,7 +280,7 @@ GEM
nio4r (~> 2.0)
raabro (1.4.0)
racc (1.8.0)
rack (3.1.3)
rack (3.1.6)
rack-mini-profiler (3.3.1)
rack (>= 1.2.0)
rack-proxy (0.7.7)
Expand Down Expand Up @@ -427,9 +441,10 @@ GEM
websocket-driver (0.7.6)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
webvtt-ruby (0.4.2)
xpath (3.2.0)
nokogiri (~> 1.8)
zeitwerk (2.6.15)
zeitwerk (2.6.16)

PLATFORMS
aarch64-linux
Expand All @@ -439,6 +454,7 @@ PLATFORMS
x86_64-linux

DEPENDENCIES
active_job-performs (~> 0.3.1)
activerecord-enhancedsqlite3-adapter
ahoy_matey (~> 4.2)
annotate
Expand All @@ -456,6 +472,7 @@ DEPENDENCIES
dry-types (~> 1.7)
erb_lint (~> 0.4.0)
error_highlight (>= 0.4.0)
google-protobuf
groupdate (~> 6.2)
inline_svg (~> 1.9)
jbuilder
Expand Down Expand Up @@ -484,6 +501,7 @@ DEPENDENCIES
vite_rails
web-console
webmock
webvtt-ruby

RUBY VERSION
ruby 3.3.1p55
Expand Down
74 changes: 74 additions & 0 deletions app/clients/youtube/transcript.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
require "message_pb"

module Youtube
class Transcript
attr_reader :response

def get_vtt(video_id)
message = {one: "asr", two: "en"}
typedef = MessageType
two = get_base64_protobuf(message, typedef)

message = {one: video_id, two: two}
params = get_base64_protobuf(message, typedef)

url = "https://www.youtube.com/youtubei/v1/get_transcript"
headers = {"Content-Type" => "application/json"}
body = {
context: {
client: {
clientName: "WEB",
clientVersion: "2.20240313"
}
},
params: params
}

@response = HTTParty.post(url, headers: headers, body: body.to_json)
convert_to_vtt(JSON.parse(response.body))
end

def self.get_vtt(video_id)
new.get_vtt(video_id)
end

private

def encode_message(message, typedef)
encoded_message = typedef.new(message)
encoded_message.to_proto
end

def get_base64_protobuf(message, typedef)
encoded_data = encode_message(message, typedef)
Base64.encode64(encoded_data).delete("\n")
end

def convert_to_vtt(transcript)
vtt_content = "WEBVTT\n\n"
events = transcript.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer", "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer", "initialSegments")
if events
events.each_with_index do |event, index|
segment = event["transcriptSegmentRenderer"]
start_time = format_time(segment["startMs"].to_i)
end_time = format_time(segment["endMs"].to_i)
text = segment.dig("snippet", "runs")&.map { |run| run["text"] }&.join || ""
vtt_content += "#{index + 1}\n"
vtt_content += "#{start_time} --> #{end_time}\n"
vtt_content += "#{text}\n\n"
end
else
vtt_content += "NOTE No transcript data available\n"
end
vtt_content
end

def format_time(ms)
hours = ms / (1000 * 60 * 60)
minutes = (ms % (1000 * 60 * 60)) / (1000 * 60)
seconds = (ms % (1000 * 60)) / 1000
milliseconds = ms % 1000
format("%02d:%02d:%02d.%03d", hours, minutes, seconds, milliseconds)
end
end
end
4 changes: 2 additions & 2 deletions app/components/application_component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ class ApplicationComponent < ViewComponent::Base
attr_accessor :attributes
option :display, default: proc { true }

def initialize(*args, **options)
super(*args, **options)
# Runs the parent initializer with the caller's arguments, then stores every
# keyword option that is not a declared dry-initializer option in `attributes`.
def initialize(*, **options)
# Bare `super` forwards the positional and keyword arguments unchanged.
super
# Keys of the options declared on this component class
# (presumably the names callers pass — confirm against dry-initializer).
defined_option_keys = self.class.dry_initializer.options.map(&:source)
# Whatever remains is kept as free-form attributes.
self.attributes = options.except(*defined_option_keys)
end
Expand Down
13 changes: 13 additions & 0 deletions app/models/talk.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class Talk < ApplicationRecord
include Sluggable
include Suggestable
slug_from :title

# include MeiliSearch
include MeiliSearch::Rails
ActiveRecord_Relation.include Pagy::Meilisearch
extend Pagy::Meilisearch
Expand All @@ -36,12 +38,19 @@ class Talk < ApplicationRecord
has_many :speaker_talks, dependent: :destroy, inverse_of: :talk, foreign_key: :talk_id
has_many :speakers, through: :speaker_talks

serialize :transcript, coder: WebVTTSerializer

# validations
validates :title, presence: true

# delegates
delegate :name, to: :event, prefix: true, allow_nil: true

# jobs
# Declares a background job for #update_transcript! on the :low queue.
# The symbol must match the instance method name; the previous spelling
# (`:udpate_transcript!`) pointed at a method this model does not define.
performs :update_transcript!, queue_as: :low do
  # Retry on any StandardError, waiting polynomially longer between attempts.
  retry_on StandardError, wait: :polynomially_longer
end

# search
meilisearch do
attribute :title
Expand Down Expand Up @@ -118,4 +127,8 @@ def thumbnail_xl
# Returns up to +limit+ other talks in random order (never this talk itself).
def related_talks(limit: 6)
  other_talks = Talk.excluding(self)
  other_talks.order("RANDOM()").limit(limit)
end

# Fetches the WebVTT transcript for this talk's `video_id` through
# Youtube::Transcript and persists it, raising if the update fails.
def update_transcript!
  vtt = Youtube::Transcript.get_vtt(video_id)
  update!(transcript: vtt)
end
end
32 changes: 32 additions & 0 deletions app/serializers/webvtt_serializer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
require "webvtt"

# Coder that converts between WebVTT text and an array of cue hashes of the
# form `{start_time:, end_time:, text:}`.
class WebVTTSerializer
  # Serializes cues to WebVTT text.
  #
  # @param transcript [Array<Hash>, String, nil] cue hashes, or raw WebVTT text
  #   (which is parsed with .load first and thereby normalized)
  # @return [String] WebVTT text without cue identifiers, or "" for blank input
  def self.dump(transcript)
    return "" if transcript.blank?

    cues = transcript.is_a?(String) ? load(transcript) : transcript
    body = cues.map { |cue| "#{cue[:start_time]} --> #{cue[:end_time]}\n#{cue[:text]}\n\n" }.join
    "WEBVTT\n\n#{body}".strip
  end

  # Parses WebVTT text into cue hashes.
  #
  # Blocks are separated by blank lines. Within a block the timing line is the
  # first line containing "-->", so an optional cue identifier line before it
  # (such as the numeric ones emitted by Youtube::Transcript) is ignored
  # instead of being mistaken for the timing. Blocks without a timing line
  # (the "WEBVTT" header, NOTE comments) and cues without text are skipped.
  #
  # @param transcript [String, nil] raw WebVTT text
  # @return [Array<Hash>] cues as `{start_time:, end_time:, text:}`
  def self.load(transcript)
    return [] if transcript.blank?

    # Normalize CRLF so Windows line endings split the same way.
    blocks = transcript.gsub("\r\n", "\n").split(/\n{2,}/)
    blocks.filter_map do |block|
      lines = block.split("\n")
      timing_index = lines.index { |line| line.include?("-->") }
      next if timing_index.nil?

      text_lines = lines[(timing_index + 1)..]
      next if text_lines.empty?

      start_time, end_time = lines[timing_index].split(" --> ", 2)
      {start_time: start_time, end_time: end_time, text: text_lines.join("\n")}
    end
  end
end
6 changes: 3 additions & 3 deletions config/initializers/inflections.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@
# end

# These inflection rules are supported but not enabled by default:
# ActiveSupport::Inflector.inflections(:en) do |inflect|
# inflect.acronym "RESTful"
# end
# Register "WebVTT" as an acronym so the inflector preserves its capitalization
# (presumably so that webvtt_serializer.rb resolves to WebVTTSerializer — confirm).
ActiveSupport::Inflector.inflections(:en) do |inflect|
inflect.acronym "WebVTT"
end
3 changes: 2 additions & 1 deletion config/initializers/meilisearch.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
MeiliSearch::Rails.configuration = {
meilisearch_url: Rails.env.local? ? "http://localhost:7700" : "http://91.107.208.207:7700", # example: http://localhost:7700
meilisearch_api_key: ENV["MEILI_MASTER_KEY"]
meilisearch_api_key: ENV["MEILI_MASTER_KEY"],
per_environment: true
}
5 changes: 5 additions & 0 deletions db/migrate/20240611113918_add_transcript_to_talk.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Adds a non-null `transcript` text column to `talks`, defaulting to "".
class AddTranscriptToTalk < ActiveRecord::Migration[7.1]
  def change
    change_table :talks do |t|
      t.text :transcript, default: "", null: false
    end
  end
end
3 changes: 2 additions & 1 deletion db/schema.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

33 changes: 33 additions & 0 deletions lib/message_pb.rb

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 126ca2a

Please sign in to comment.