From 56ba3555171c1d80d4b9347bc7f37f0d1faaabd7 Mon Sep 17 00:00:00 2001 From: maximerety Date: Sun, 3 Mar 2024 20:52:35 +0100 Subject: [PATCH] Rework in_batches to be more network efficient When `use_ranges: true` option is used, there is no need to return the whole list of ids for every range. This would waste to many network and time resources, as only the last id is needed. Instead we can use OFFSET to peek at the last id of the next batch, and return only this one. There's a trade-off though, since we need one additional query to confirm the size and last id for the very last batch. Unless you have only a handful of small batches, this is overall a winning strategy with less time and network resources spent to generate batches. There is a trade-off, however, as we need an additional query to confirm the size and last id of the very last batch. Unless you only have a handful of small batches, this strategy is a winner overall, as it reduces the time and network resources spent generating batches. --- .../lib/active_record/relation/batches.rb | 28 +++++++++++++------ activerecord/test/cases/batches_test.rb | 18 ++++++------ 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/activerecord/lib/active_record/relation/batches.rb b/activerecord/lib/active_record/relation/batches.rb index f9c26770b7e08..2443834225d76 100644 --- a/activerecord/lib/active_record/relation/batches.rb +++ b/activerecord/lib/active_record/relation/batches.rb @@ -373,32 +373,44 @@ def batch_on_unloaded_relation(relation:, start:, finish:, load:, order:, use_ra if load records = batch_relation.records ids = records.map(&:id) + ids_size = ids.size + ids_last = ids.last yielded_relation = where(primary_key => ids) yielded_relation.load_records(records) elsif (empty_scope && use_ranges != false) || use_ranges - ids = batch_relation.ids - finish = ids.last - if finish - yielded_relation = apply_finish_limit(batch_relation, finish, batch_orders) + # Efficiently peak at the last id for the next batch using offset and limit. + ids_size = batch_limit + ids_last = batch_relation.offset(batch_limit - 1).limit(1).ids.first + # If the last id is not found using offset, there is at most one more batch of size < batch_limit. + # We retry getting the list of ids so that we have the exact size and last id. + unless ids_last + ids = batch_relation.ids + ids_size = ids.size + ids_last = ids.last + end + if ids_last + yielded_relation = apply_finish_limit(batch_relation, ids_last, batch_orders) yielded_relation = yielded_relation.except(:limit, :order) yielded_relation.skip_query_cache!(false) end else ids = batch_relation.ids + ids_size = ids.size + ids_last = ids.last yielded_relation = where(primary_key => ids) end - break if ids.empty? + break if ids_size == 0 - primary_key_offset = ids.last + primary_key_offset = ids_last raise ArgumentError.new("Primary key not included in the custom select clause") unless primary_key_offset yield yielded_relation - break if ids.length < batch_limit + break if ids_size < batch_limit if limit_value - remaining -= ids.length + remaining -= ids_size if remaining == 0 # Saves a useless iteration when the limit is a multiple of the diff --git a/activerecord/test/cases/batches_test.rb b/activerecord/test/cases/batches_test.rb index f8df51b9755c5..08f310877ce3e 100644 --- a/activerecord/test/cases/batches_test.rb +++ b/activerecord/test/cases/batches_test.rb @@ -327,7 +327,7 @@ def test_in_batches_has_attribute_readers end def test_in_batches_should_yield_relation_if_block_given - assert_queries_count(6) do + assert_queries_count(7) do Post.in_batches(of: 2) do |relation| assert_kind_of ActiveRecord::Relation, relation end @@ -335,7 +335,7 @@ def test_in_batches_should_yield_relation_if_block_given end def test_in_batches_should_be_enumerable_if_no_block_given - assert_queries_count(6) do + assert_queries_count(7) do Post.in_batches(of: 2).each do |relation| assert_kind_of ActiveRecord::Relation, relation end @@ -370,7 +370,7 @@ def test_in_batches_each_record_should_be_ordered_by_id end def test_in_batches_update_all_affect_all_records - assert_queries_count(6 + 6) do # 6 selects, 6 updates + assert_queries_count(7 + 6) do # 7 selects, 6 updates Post.in_batches(of: 2).update_all(title: "updated-title") end assert_equal Post.all.pluck(:title), ["updated-title"] * Post.count @@ -416,7 +416,7 @@ def test_in_batches_should_be_loaded end def test_in_batches_if_not_loaded_executes_more_queries - assert_queries_count(@total + 1) do + assert_queries_count(@total + 2) do Post.in_batches(of: 1, load: false) do |relation| assert_not_predicate relation, :loaded? end @@ -490,7 +490,7 @@ def test_in_batches_when_loaded_can_return_an_enum end def test_in_batches_should_return_relations - assert_queries_count(@total + 1) do + assert_queries_count(@total + 2) do Post.in_batches(of: 1) do |relation| assert_kind_of ActiveRecord::Relation, relation end @@ -507,7 +507,7 @@ def test_in_batches_should_start_from_the_start_option def test_in_batches_should_end_at_the_finish_option post = Post.order("id DESC").where("id <= ?", 5).first - assert_queries_count(7) do + assert_queries_count(8) do relation = Post.in_batches(of: 1, finish: 5, load: true).reverse_each.first assert_equal post, relation.last end @@ -554,11 +554,11 @@ def test_in_batches_no_subqueries_for_whole_tables_batching end def test_in_batches_shouldnt_execute_query_unless_needed - assert_queries_count(2) do + assert_queries_count(3) do Post.in_batches(of: @total) { |relation| assert_kind_of ActiveRecord::Relation, relation } end - assert_queries_count(1) do + assert_queries_count(2) do Post.in_batches(of: @total + 1) { |relation| assert_kind_of ActiveRecord::Relation, relation } end end @@ -819,7 +819,7 @@ def test_find_in_batches_should_return_a_sized_enumerator test ".in_batches bypasses the query cache for its own queries" do Post.cache do - assert_queries_count(2) do + assert_queries_count(4) do Post.in_batches { } Post.in_batches { } end