Skip to content

Commit

Permalink
Rework in_batches to be more network efficient
Browse files Browse the repository at this point in the history
When `use_ranges: true` option is used, there is no need to return the
whole list of ids for every range. This would waste to many network and
time resources, as only the last id is needed.

Instead we can use OFFSET to peek at the last id of the next batch, and
return only this one.

There's a trade-off though, since we need one additional query to
confirm the size and last id for the very last batch. Unless you have
only a handful of small batches, this is overall a winning strategy with
less time and network resources spent to generate batches.

There is a trade-off, however, as we need an additional query to confirm
the size and last id of the very last batch. Unless you only have a
handful of small batches, this strategy is a winner overall, as it
reduces the time and network resources spent generating batches.
  • Loading branch information
maximerety committed Mar 3, 2024
1 parent 5d528ba commit 56ba355
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 17 deletions.
28 changes: 20 additions & 8 deletions activerecord/lib/active_record/relation/batches.rb
Original file line number Diff line number Diff line change
Expand Up @@ -373,32 +373,44 @@ def batch_on_unloaded_relation(relation:, start:, finish:, load:, order:, use_ra
if load
records = batch_relation.records
ids = records.map(&:id)
ids_size = ids.size
ids_last = ids.last
yielded_relation = where(primary_key => ids)
yielded_relation.load_records(records)
elsif (empty_scope && use_ranges != false) || use_ranges
ids = batch_relation.ids
finish = ids.last
if finish
yielded_relation = apply_finish_limit(batch_relation, finish, batch_orders)
# Efficiently peak at the last id for the next batch using offset and limit.
ids_size = batch_limit
ids_last = batch_relation.offset(batch_limit - 1).limit(1).ids.first
# If the last id is not found using offset, there is at most one more batch of size < batch_limit.
# We retry getting the list of ids so that we have the exact size and last id.
unless ids_last
ids = batch_relation.ids
ids_size = ids.size
ids_last = ids.last
end
if ids_last
yielded_relation = apply_finish_limit(batch_relation, ids_last, batch_orders)
yielded_relation = yielded_relation.except(:limit, :order)
yielded_relation.skip_query_cache!(false)
end
else
ids = batch_relation.ids
ids_size = ids.size
ids_last = ids.last
yielded_relation = where(primary_key => ids)
end

break if ids.empty?
break if ids_size == 0

primary_key_offset = ids.last
primary_key_offset = ids_last
raise ArgumentError.new("Primary key not included in the custom select clause") unless primary_key_offset

yield yielded_relation

break if ids.length < batch_limit
break if ids_size < batch_limit

if limit_value
remaining -= ids.length
remaining -= ids_size

if remaining == 0
# Saves a useless iteration when the limit is a multiple of the
Expand Down
18 changes: 9 additions & 9 deletions activerecord/test/cases/batches_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -327,15 +327,15 @@ def test_in_batches_has_attribute_readers
end

def test_in_batches_should_yield_relation_if_block_given
assert_queries_count(6) do
assert_queries_count(7) do
Post.in_batches(of: 2) do |relation|
assert_kind_of ActiveRecord::Relation, relation
end
end
end

def test_in_batches_should_be_enumerable_if_no_block_given
assert_queries_count(6) do
assert_queries_count(7) do
Post.in_batches(of: 2).each do |relation|
assert_kind_of ActiveRecord::Relation, relation
end
Expand Down Expand Up @@ -370,7 +370,7 @@ def test_in_batches_each_record_should_be_ordered_by_id
end

def test_in_batches_update_all_affect_all_records
assert_queries_count(6 + 6) do # 6 selects, 6 updates
assert_queries_count(7 + 6) do # 7 selects, 6 updates
Post.in_batches(of: 2).update_all(title: "updated-title")
end
assert_equal Post.all.pluck(:title), ["updated-title"] * Post.count
Expand Down Expand Up @@ -416,7 +416,7 @@ def test_in_batches_should_be_loaded
end

def test_in_batches_if_not_loaded_executes_more_queries
assert_queries_count(@total + 1) do
assert_queries_count(@total + 2) do
Post.in_batches(of: 1, load: false) do |relation|
assert_not_predicate relation, :loaded?
end
Expand Down Expand Up @@ -490,7 +490,7 @@ def test_in_batches_when_loaded_can_return_an_enum
end

def test_in_batches_should_return_relations
assert_queries_count(@total + 1) do
assert_queries_count(@total + 2) do
Post.in_batches(of: 1) do |relation|
assert_kind_of ActiveRecord::Relation, relation
end
Expand All @@ -507,7 +507,7 @@ def test_in_batches_should_start_from_the_start_option

def test_in_batches_should_end_at_the_finish_option
post = Post.order("id DESC").where("id <= ?", 5).first
assert_queries_count(7) do
assert_queries_count(8) do
relation = Post.in_batches(of: 1, finish: 5, load: true).reverse_each.first
assert_equal post, relation.last
end
Expand Down Expand Up @@ -554,11 +554,11 @@ def test_in_batches_no_subqueries_for_whole_tables_batching
end

def test_in_batches_shouldnt_execute_query_unless_needed
assert_queries_count(2) do
assert_queries_count(3) do
Post.in_batches(of: @total) { |relation| assert_kind_of ActiveRecord::Relation, relation }
end

assert_queries_count(1) do
assert_queries_count(2) do
Post.in_batches(of: @total + 1) { |relation| assert_kind_of ActiveRecord::Relation, relation }
end
end
Expand Down Expand Up @@ -819,7 +819,7 @@ def test_find_in_batches_should_return_a_sized_enumerator

test ".in_batches bypasses the query cache for its own queries" do
Post.cache do
assert_queries_count(2) do
assert_queries_count(4) do
Post.in_batches { }
Post.in_batches { }
end
Expand Down

0 comments on commit 56ba355

Please sign in to comment.