diff --git a/.env.sample b/.env.sample index 368edaf1..79b142bc 100644 --- a/.env.sample +++ b/.env.sample @@ -1,4 +1,4 @@ MEILI_MASTER_KEY=your local meili master key but in therory meilisearch can run without master key in development mode YOUTUBE_API_KEY=useful for scraping youtube videos but the app can run without it RUBYVIDEO_GITHUB_TOKEN=useful for the profile enhancement feature but the app can run without it - +OPENAI_ACCESS_TOKEN="change_me" diff --git a/.gitignore b/.gitignore index 9cf06003..d29f57da 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ *.local /data.ms +/data_meili /data_tmp /data_preparation/**/* !/data_preparation/organisations.yml diff --git a/Gemfile b/Gemfile index 389f25d7..dc2a5987 100644 --- a/Gemfile +++ b/Gemfile @@ -93,5 +93,6 @@ gem "meta-tags", "~> 2.18" gem "groupdate", "~> 6.2" gem "appsignal", "~> 3.4" +gem "ruby-openai" gem "chartkick", "~> 5.0" diff --git a/Gemfile.lock b/Gemfile.lock index b5fcc08c..a12b7046 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,6 @@ GIT remote: https://github.com/rails/rails.git - revision: 55412cd9257dc27a8a9175529857ce5f2d81f92f + revision: e10e35dd32aff48457a76b83e244e8baad34a9ec branch: main specs: actioncable (7.1.0.alpha) @@ -28,7 +28,7 @@ GIT net-imap net-pop net-smtp - rails-dom-testing (~> 2.2) + rails-dom-testing (~> 2.0) actionpack (7.1.0.alpha) actionview (= 7.1.0.alpha) activesupport (= 7.1.0.alpha) @@ -36,7 +36,7 @@ GIT rack (>= 2.2.4) rack-session (>= 1.0.1) rack-test (>= 0.6.3) - rails-dom-testing (~> 2.2) + rails-dom-testing (~> 2.0) rails-html-sanitizer (~> 1.6) actiontext (7.1.0.alpha) actionpack (= 7.1.0.alpha) @@ -49,7 +49,7 @@ GIT activesupport (= 7.1.0.alpha) builder (~> 3.1) erubi (~> 1.11) - rails-dom-testing (~> 2.2) + rails-dom-testing (~> 2.0) rails-html-sanitizer (~> 1.6) activejob (7.1.0.alpha) activesupport (= 7.1.0.alpha) @@ -59,7 +59,6 @@ GIT activerecord (7.1.0.alpha) activemodel (= 7.1.0.alpha) activesupport (= 7.1.0.alpha) - timeout (>= 
0.4.0) activestorage (7.1.0.alpha) actionpack (= 7.1.0.alpha) activejob (= 7.1.0.alpha) @@ -67,14 +66,10 @@ GIT activesupport (= 7.1.0.alpha) marcel (~> 1.0) activesupport (7.1.0.alpha) - base64 - bigdecimal concurrent-ruby (~> 1.0, >= 1.0.2) connection_pool (>= 2.2.5) - drb i18n (>= 1.6, < 2) minitest (>= 5.1) - mutex_m tzinfo (~> 2.0) rails (7.1.0.alpha) actioncable (= 7.1.0.alpha) @@ -102,7 +97,7 @@ GIT GEM remote: https://rubygems.org/ specs: - addressable (2.8.5) + addressable (2.8.4) public_suffix (>= 2.0.2, < 6.0) ahoy_matey (4.2.1) activesupport (>= 5.2) @@ -111,12 +106,11 @@ GEM annotate (3.2.0) activerecord (>= 3.2, < 8.0) rake (>= 10.4, < 14.0) - appsignal (3.4.5) + appsignal (3.4.4) rack ast (2.4.2) - authentication-zero (2.16.36) - base64 (0.1.1) - bcrypt (3.1.19) + authentication-zero (2.16.29) + bcrypt (3.1.18) better_html (2.0.2) actionview (>= 6.0) activesupport (>= 6.0) @@ -124,13 +118,12 @@ GEM erubi (~> 1.4) parser (>= 2.4) smart_properties - bigdecimal (3.1.4) bindex (0.8.1) bootsnap (1.16.0) msgpack (~> 1.2) builder (3.2.4) byebug (11.1.3) - capybara (3.39.2) + capybara (3.39.0) addressable matrix mini_mime (>= 0.1.3) @@ -139,7 +132,7 @@ GEM rack-test (>= 0.6.3) regexp_parser (>= 1.5, < 3.0) xpath (~> 3.2) - chartkick (5.0.2) + chartkick (5.0.4) concurrent-ruby (1.2.2) connection_pool (2.4.1) crack (0.4.5) @@ -150,14 +143,12 @@ GEM irb (>= 1.5.0) reline (>= 0.3.1) device_detector (1.1.0) - dockerfile-rails (1.5.2) + dockerfile-rails (1.2.5) rails dotenv (2.8.1) dotenv-rails (2.8.1) dotenv (= 2.8.1) railties (>= 3.2) - drb (2.1.1) - ruby2_keywords dry-cli (1.0.0) erb_lint (0.4.0) activesupport @@ -168,18 +159,17 @@ GEM smart_properties error_highlight (0.5.1) erubi (1.12.0) + faraday (2.7.10) + faraday-net_http (>= 2.0, < 3.1) + ruby2_keywords (>= 0.0.4) + faraday-multipart (1.0.4) + multipart-post (~> 2) + faraday-net_http (3.0.2) globalid (1.1.0) activesupport (>= 5.0) groupdate (6.2.1) activesupport (>= 5.2) - hanami-router (0.6.2) - 
hanami-utils (~> 0.7) - http_router (~> 0.11) - hanami-utils (0.9.2) hashdiff (1.0.1) - http_router (0.11.2) - rack (>= 1.0.0) - url_mount (~> 0.2.1) httparty (0.21.0) mini_mime (>= 1.0.0) multi_xml (>= 0.5.2) @@ -189,18 +179,14 @@ GEM activesupport (>= 3.0) nokogiri (>= 1.6) io-console (0.6.0) - irb (1.7.4) - reline (>= 0.3.6) + irb (1.7.0) + reline (>= 0.3.0) json (2.6.3) language_server-protocol (3.17.0.3) - lint_roller (1.1.0) - litestack (0.3.0) - erubi - hanami-router + lint_roller (1.0.0) + litestack (0.2.3) oj - rack sqlite3 - tilt loofah (2.21.3) crass (~> 1.0.2) nokogiri (>= 1.12.0) @@ -218,13 +204,13 @@ GEM meta-tags (2.18.0) actionpack (>= 3.2.0, < 7.1) mini_mime (1.1.5) - minitest (5.19.0) - msgpack (1.7.1) + minitest (5.18.0) + msgpack (1.7.0) multi_xml (0.6.0) - mutex_m (0.1.2) + multipart-post (2.3.0) net-http (0.3.2) uri - net-imap (0.3.7) + net-imap (0.3.6) date net-protocol net-pop (0.1.2) @@ -234,28 +220,25 @@ GEM net-smtp (0.3.3) net-protocol nio4r (2.5.9) - nokogiri (1.15.4-aarch64-linux) - racc (~> 1.4) - nokogiri (1.15.4-arm64-darwin) + nokogiri (1.15.2-aarch64-linux) racc (~> 1.4) - nokogiri (1.15.4-x86_64-darwin) + nokogiri (1.15.2-arm64-darwin) racc (~> 1.4) - nokogiri (1.15.4-x86_64-linux) + nokogiri (1.15.2-x86_64-linux) racc (~> 1.4) - oj (3.15.1) + oj (3.14.3) pagy (6.0.4) parallel (1.23.0) - parser (3.2.2.3) + parser (3.2.2.1) ast (~> 2.4.1) - racc prettier_print (1.2.1) propshaft (0.7.0) actionpack (>= 7.0.0) activesupport (>= 7.0.0) rack railties (>= 7.0.0) - public_suffix (5.0.3) - puma (6.3.0) + public_suffix (5.0.1) + puma (6.2.2) nio4r (~> 2.0) racc (1.7.1) rack (3.0.8) @@ -274,71 +257,70 @@ GEM actionpack (>= 5.0.1.rc1) actionview (>= 5.0.1.rc1) activesupport (>= 5.0.1.rc1) - rails-dom-testing (2.2.0) - activesupport (>= 5.0.0) - minitest + rails-dom-testing (2.0.3) + activesupport (>= 4.2.0) nokogiri (>= 1.6) rails-html-sanitizer (1.6.0) loofah (~> 2.21) nokogiri (~> 1.14) rainbow (3.1.1) rake (13.0.6) - regexp_parser (2.8.1) 
- reline (0.3.8) + regexp_parser (2.8.0) + reline (0.3.5) io-console (~> 0.5) - rexml (3.2.6) - rubocop (1.52.1) + rexml (3.2.5) + rubocop (1.50.2) json (~> 2.3) parallel (~> 1.10) - parser (>= 3.2.2.3) + parser (>= 3.2.0.0) rainbow (>= 2.2.2, < 4.0) regexp_parser (>= 1.8, < 3.0) rexml (>= 3.2.5, < 4.0) rubocop-ast (>= 1.28.0, < 2.0) ruby-progressbar (~> 1.7) unicode-display_width (>= 2.4.0, < 3.0) - rubocop-ast (1.29.0) + rubocop-ast (1.28.1) parser (>= 3.2.1.0) - rubocop-performance (1.18.0) + rubocop-performance (1.16.0) rubocop (>= 1.7.0, < 2.0) rubocop-ast (>= 0.4.0) ruby-lsp (0.5.1) language_server-protocol (~> 3.17.0) sorbet-runtime syntax_tree (>= 6.1.1, < 7) + ruby-openai (5.1.0) + faraday (>= 1) + faraday-multipart (>= 1) ruby-progressbar (1.13.0) ruby2_keywords (0.0.5) rubyzip (2.3.2) safely_block (0.4.0) - selenium-webdriver (4.11.0) + selenium-webdriver (4.9.1) rexml (~> 3.2, >= 3.2.5) rubyzip (>= 1.2.2, < 3.0) websocket (~> 1.0) smart_properties (1.17.0) - sorbet-runtime (0.5.10962) - sqlite3 (1.6.4-aarch64-linux) - sqlite3 (1.6.4-arm64-darwin) - sqlite3 (1.6.4-x86_64-darwin) - sqlite3 (1.6.4-x86_64-linux) - standard (1.30.1) + sorbet-runtime (0.5.10820) + sqlite3 (1.6.2-aarch64-linux) + sqlite3 (1.6.2-arm64-darwin) + sqlite3 (1.6.2-x86_64-linux) + standard (1.28.2) language_server-protocol (~> 3.17.0.2) lint_roller (~> 1.0) - rubocop (~> 1.52.0) + rubocop (~> 1.50.2) standard-custom (~> 1.0.0) - standard-performance (~> 1.1.0) - standard-custom (1.0.2) + standard-performance (~> 1.0.1) + standard-custom (1.0.0) lint_roller (~> 1.0) - rubocop (~> 1.50) - standard-performance (1.1.2) - lint_roller (~> 1.1) - rubocop-performance (~> 1.18.0) + standard-performance (1.0.1) + lint_roller (~> 1.0) + rubocop-performance (~> 1.16.0) standardrb (1.0.1) standard syntax_tree (6.1.1) prettier_print (>= 1.2.0) thor (1.2.2) - tilt (2.2.0) - timeout (0.4.0) + timeout (0.3.2) turbo-rails (1.4.0) actionpack (>= 6.0.0) activejob (>= 6.0.0) @@ -347,13 +329,11 @@ GEM 
concurrent-ruby (~> 1.0) unicode-display_width (2.4.2) uri (0.12.1) - url_mount (0.2.1) - rack - vcr (6.2.0) - vite_rails (3.0.15) + vcr (6.1.0) + vite_rails (3.0.14) railties (>= 5.1, < 8) vite_ruby (~> 3.0, >= 3.2.2) - vite_ruby (3.3.4) + vite_ruby (3.3.2) dry-cli (>= 0.7, < 2) rack-proxy (~> 0.6, >= 0.6.1) zeitwerk (~> 2.2) @@ -368,17 +348,16 @@ GEM hashdiff (>= 0.4.0, < 2.0.0) webrick (1.8.1) websocket (1.2.9) - websocket-driver (0.7.6) + websocket-driver (0.7.5) websocket-extensions (>= 0.1.0) websocket-extensions (0.1.5) xpath (3.2.0) nokogiri (~> 1.8) - zeitwerk (2.6.11) + zeitwerk (2.6.8) PLATFORMS aarch64-linux arm64-darwin-22 - x86_64-darwin-19 x86_64-linux DEPENDENCIES @@ -409,6 +388,7 @@ DEPENDENCIES rails! rails-controller-testing ruby-lsp (~> 0.5.1) + ruby-openai selenium-webdriver sqlite3 (~> 1.4) standardrb (~> 1.0) diff --git a/Procfile.dev b/Procfile.dev index 6f438902..806c4806 100644 --- a/Procfile.dev +++ b/Procfile.dev @@ -1,3 +1,3 @@ web: bin/rails server -p 3000 vite: bin/vite dev -search: meilisearch +search: docker rm -f rubyvideo-meilisearch && docker run --name rubyvideo-meilisearch -p 7700:7700 -v $(pwd)/data_meili:/meili_data getmeili/meilisearch:v1.3.2 meilisearch --env development --no-analytics --log-level=INFO \ No newline at end of file diff --git a/app/models/ai.rb b/app/models/ai.rb new file mode 100644 index 00000000..f475270d --- /dev/null +++ b/app/models/ai.rb @@ -0,0 +1,21 @@ +class Ai + # Returns the OpenAI embedding for the given texts, or nil when none is available. + # The inputs are joined with blank lines and sent as a single "text-embedding-ada-002" request. + # Returns nil (without calling OpenAI) when OPENAI_ACCESS_TOKEN is not set. + # Network failures are logged and turned into nil: Talk computes its embedding in a + # before_save callback, so an unreachable OpenAI API must not prevent saving a talk. + def self.embedding(*inputs) + return nil unless ENV["OPENAI_ACCESS_TOKEN"].present?
+ client = OpenAI::Client.new + response = client.embeddings( + parameters: { + model: "text-embedding-ada-002", + input: inputs.join("\n\n") + } + ) + response.dig("data", 0, "embedding") + rescue Faraday::Error => e + Rails.logger.error("OpenAI error: #{e.message}") + nil + end +end diff --git a/app/models/talk.rb b/app/models/talk.rb index 73896733..3f3d430b 100644 --- a/app/models/talk.rb +++ b/app/models/talk.rb @@ -4,25 +4,36 @@ # Table name: talks # # id :integer not null, primary key -# title :string default(""), not null +# date :date # description :text default(""), not null +# embedding :json +# like_count :integer # slug :string default(""), not null -# video_id :string default(""), not null -# video_provider :string default(""), not null -# thumbnail_sm :string default(""), not null -# thumbnail_md :string default(""), not null # thumbnail_lg :string default(""), not null +# thumbnail_md :string default(""), not null +# thumbnail_sm :string default(""), not null +# thumbnail_xl :string default(""), not null +# thumbnail_xs :string default(""), not null +# title :string default(""), not null +# video_provider :string default(""), not null +# view_count :integer # year :integer # created_at :datetime not null # updated_at :datetime not null # event_id :integer -# thumbnail_xs :string default(""), not null -# thumbnail_xl :string default(""), not null -# date :date -# like_count :integer -# view_count :integer +# video_id :string default(""), not null +# +# Indexes +# +# index_talks_on_date (date) +# index_talks_on_event_id (event_id) +# index_talks_on_slug (slug) +# index_talks_on_title (title) +# +# Foreign Keys +# +# event_id (event_id => events.id) # -# rubocop:enable Layout/LineLength class Talk < ApplicationRecord include Sluggable include Suggestable @@ -41,12 +52,17 @@ class Talk < ApplicationRecord # delegates delegate :name, to: :event, prefix: true, allow_nil: true + before_save :compute_embedding, if: :must_compute_embedding?
+ # search - meilisearch do + meilisearch primary_key: :id, enqueue: true, raise_on_failure: Rails.env.development? do attribute :title attribute :description attribute :slug - attribute :video_id + # ⚠️ This `video_id` attribute makes indexing (silently) fail with v1.3.2 of meilisearch. Error message from meilisearch (GET /tasks): + # "The primary key inference failed as the engine found 2 fields ending with `id` in their names: 'id' and 'video_id'. Please specify the primary key manually using the `primaryKey` query parameter" + # Adding a custom primary_key: :id above didn't make any difference, so removing this attribute for now. + # attribute :video_id attribute :video_provider attribute :thumbnail_sm attribute :thumbnail_md @@ -54,13 +70,38 @@ class Talk < ApplicationRecord attribute :speakers do speakers.pluck(:name) end + # ⚠️ This must return nil and not an empty array if no vector is available. + # Otherwise all other indexing tasks with non-zero vector arrays will silently fail, since the engine will expect all vectors to have the same length. + attribute :_vectors searchable_attributes [:title, :description] sortable_attributes [:title] + filterable_attributes [:id] attributes_to_highlight ["*"] end - meilisearch enqueue: true + # Recomputes embedding for all talks that don't have one yet. + # Processes talks whose `embedding` is nil (at most `limit` of them when given) in batches of 10, + # computing and saving each one, and sleeps `sleep_interval` after every batch. + # Returns true when no talk is left without an embedding, false otherwise. + def self.reembed!(sleep_interval: 2.seconds, limit: nil) + # required for querying vectors (not indexing) + MeiliSearch::Rails.client.http_patch "/experimental-features", {vectorStore: true} + + Talk.where(embedding: nil).limit(limit).in_batches(of: 10) do |talks| + talks.each do |talk| + talk.compute_embedding + talk.save! + end + # seems to help with not getting rate-limited by OpenAI + sleep sleep_interval + end + + if Talk.where(embedding: nil).exists? + Rails.logger.warn "Some talks are still missing their embedding. You should re-run the task" + false + else + Rails.logger.info "Good job, all talks have their embedding."
+ true + end + end def to_meta_tags { @@ -88,4 +129,27 @@ def thumbnail_lg def thumbnail_xl self[:thumbnail_xl].presence || "https://i.ytimg.com/vi/#{video_id}/maxresdefault.jpg" end + + # Searches the Meilisearch index using this talk's indexed `_vectors` as the query vector, + # excluding the talk itself and asking for at most `limit` results. + # Returns Talk.none when the indexed document has no vector, or when Meilisearch raises an ApiError (which is logged). + def neighbors(limit: 5) + current_talk = Talk.index.document(id) + query_vector = current_talk.fetch("_vectors", []) + return Talk.none if query_vector.blank? + Talk.search("", vector: query_vector, limit: limit, filter: "id != #{id}") + rescue MeiliSearch::ApiError => e + Rails.logger.error("MeiliSearch error: #{e.message}") + Talk.none + end + + # Value of the `_vectors` attribute declared in the meilisearch block: the stored embedding. + def _vectors + embedding + end + + # Assigns (without saving) the embedding that Ai.embedding returns for the title and description. + def compute_embedding + Rails.logger.info "Computing embedding for talk #{id}" + self.embedding = Ai.embedding(title, description) + end + + # Condition of the before_save callback: true when there is no embedding yet, + # or when the title or the description is about to change. + private def must_compute_embedding? + embedding.nil? || will_save_change_to_title? || will_save_change_to_description? + end end diff --git a/app/views/talks/_talk.html.erb b/app/views/talks/_talk.html.erb index e16e768f..6825a223 100644 --- a/app/views/talks/_talk.html.erb +++ b/app/views/talks/_talk.html.erb @@ -28,6 +28,16 @@ <%#= talk.event_edition %>

+ <% related_talks = talk.neighbors(limit: 5) %> + <% if related_talks.any? %> +

Other talks you might be interested in

+ + <% end %> + <% if action_name != "show" %> <%= link_to "Show this talk", talk, class: "rounded-lg py-3 px-5 bg-gray-100 inline-block font-medium" %> <%= link_to "Edit this talk", edit_talk_path(talk), class: "rounded-lg py-3 ml-2 px-5 bg-gray-100 inline-block font-medium" %> diff --git a/config/initializers/meilisearch.rb b/config/initializers/meilisearch.rb index 767818ed..f2680d2a 100644 --- a/config/initializers/meilisearch.rb +++ b/config/initializers/meilisearch.rb @@ -1,4 +1,4 @@ MeiliSearch::Rails.configuration = { meilisearch_url: Rails.env.local? ? "http://localhost:7700" : "http://91.107.208.207:7700", # example: http://localhost:7700 - meilisearch_api_key: ENV["MEILI_MASTER_KEY"] + meilisearch_api_key: Rails.env.local? ? nil : ENV["MEILI_MASTER_KEY"] } diff --git a/config/initializers/openai.rb b/config/initializers/openai.rb new file mode 100644 index 00000000..789e64d7 --- /dev/null +++ b/config/initializers/openai.rb @@ -0,0 +1,5 @@ +if ENV["OPENAI_ACCESS_TOKEN"].present? + OpenAI.configure do |config| + config.access_token = ENV.fetch("OPENAI_ACCESS_TOKEN") + end +end diff --git a/db/migrate/20230906073343_add_embedding_to_talks.rb b/db/migrate/20230906073343_add_embedding_to_talks.rb new file mode 100644 index 00000000..bea1479f --- /dev/null +++ b/db/migrate/20230906073343_add_embedding_to_talks.rb @@ -0,0 +1,5 @@ +class AddEmbeddingToTalks < ActiveRecord::Migration[7.1] + def change + add_column :talks, :embedding, :json, null: true + end +end diff --git a/db/schema.rb b/db/schema.rb index 57157f5e..04069374 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system.
-ActiveRecord::Schema[7.1].define(version: 2023_07_20_151537) do +ActiveRecord::Schema[7.1].define(version: 2023_09_06_073343) do create_table "ahoy_events", force: :cascade do |t| t.integer "visit_id" t.integer "user_id" @@ -153,6 +153,7 @@ t.date "date" t.integer "like_count" t.integer "view_count" + t.json "embedding" t.index ["date"], name: "index_talks_on_date" t.index ["event_id"], name: "index_talks_on_event_id" t.index ["slug"], name: "index_talks_on_slug"