From 52a36f81553d396e5564953f749d4010f4f66121 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 3 Apr 2026 16:04:35 +0900 Subject: [PATCH] GH-49656: [Ruby] Add benchmark for writers Performance is important in Apache Arrow, so benchmarks are useful for developing an Apache Arrow implementation. * Add benchmarks for file and streaming writers. * Remove redundant type arguments from array constructors. Here are benchmark results in my environment. The pure Ruby implementation is about 2-2.5x slower than the release build C++ implementation but about 2-2.5x faster than the debug build C++ implementation. Release build C++/GLib: File format: ```console $ ruby -v -S benchmark-driver ruby/red-arrow-format/benchmark/file-writer.yaml ruby 4.1.0dev (2026-03-26T07:27:31Z master c5ab2114df) +PRISM [x86_64-linux] Warming up -------------------------------------- Arrow::Table#save 348.499 i/s - 374.000 times in 1.073175s (2.87ms/i) Arrow::RecordBatchFileWriter 353.426 i/s - 385.000 times in 1.089337s (2.83ms/i) ArrowFormat::FileWriter 133.293 i/s - 140.000 times in 1.050314s (7.50ms/i) Calculating ------------------------------------- Arrow::Table#save 336.984 i/s - 1.045k times in 3.101035s (2.97ms/i) Arrow::RecordBatchFileWriter 338.695 i/s - 1.060k times in 3.129655s (2.95ms/i) ArrowFormat::FileWriter 134.640 i/s - 399.000 times in 2.963462s (7.43ms/i) Comparison: Arrow::RecordBatchFileWriter: 338.7 i/s Arrow::Table#save: 337.0 i/s - 1.01x slower ArrowFormat::FileWriter: 134.6 i/s - 2.52x slower ``` Streaming format: ```console $ ruby -v -S benchmark-driver ruby/red-arrow-format/benchmark/streaming-writer.yaml ruby 4.1.0dev (2026-03-26T07:27:31Z master c5ab2114df) +PRISM [x86_64-linux] Warming up -------------------------------------- Arrow::Table#save 356.995 i/s - 385.000 times in 1.078447s (2.80ms/i) Arrow::RecordBatchStreamWriter 347.891 i/s - 374.000 times in 1.075050s (2.87ms/i) ArrowFormat::StreamingWriter 156.709 i/s - 160.000 times in 1.021004s (6.38ms/i) Calculating
------------------------------------- Arrow::Table#save 350.743 i/s - 1.070k times in 3.050665s (2.85ms/i) Arrow::RecordBatchStreamWriter 345.821 i/s - 1.043k times in 3.016011s (2.89ms/i) ArrowFormat::StreamingWriter 160.022 i/s - 470.000 times in 2.937090s (6.25ms/i) Comparison: Arrow::Table#save: 350.7 i/s Arrow::RecordBatchStreamWriter: 345.8 i/s - 1.01x slower ArrowFormat::StreamingWriter: 160.0 i/s - 2.19x slower ``` Debug build C++/GLib: File format: ```console $ ruby -v -S benchmark-driver ruby/red-arrow-format/benchmark/file-writer.yaml ruby 4.1.0dev (2026-03-26T07:27:31Z master c5ab2114df) +PRISM [x86_64-linux] Warming up -------------------------------------- Arrow::Table#save 63.290 i/s - 66.000 times in 1.042815s (15.80ms/i) Arrow::RecordBatchFileWriter 62.655 i/s - 66.000 times in 1.053389s (15.96ms/i) ArrowFormat::FileWriter 138.082 i/s - 140.000 times in 1.013891s (7.24ms/i) Calculating ------------------------------------- Arrow::Table#save 63.165 i/s - 189.000 times in 2.992143s (15.83ms/i) Arrow::RecordBatchFileWriter 61.773 i/s - 187.000 times in 3.027220s (16.19ms/i) ArrowFormat::FileWriter 134.709 i/s - 414.000 times in 3.073285s (7.42ms/i) Comparison: ArrowFormat::FileWriter: 134.7 i/s Arrow::Table#save: 63.2 i/s - 2.13x slower Arrow::RecordBatchFileWriter: 61.8 i/s - 2.18x slower ``` Streaming format: ```console $ ruby -v -S benchmark-driver ruby/red-arrow-format/benchmark/streaming-writer.yaml ruby 4.1.0dev (2026-03-26T07:27:31Z master c5ab2114df) +PRISM [x86_64-linux] Warming up -------------------------------------- Arrow::Table#save 63.252 i/s - 66.000 times in 1.043439s (15.81ms/i) Arrow::RecordBatchStreamWriter 61.272 i/s - 66.000 times in 1.077162s (16.32ms/i) ArrowFormat::StreamingWriter 152.598 i/s - 160.000 times in 1.048506s (6.55ms/i) Calculating ------------------------------------- Arrow::Table#save 61.016 i/s - 189.000 times in 3.097525s (16.39ms/i) Arrow::RecordBatchStreamWriter 63.024 i/s - 183.000 times in 2.903642s 
(15.87ms/i) ArrowFormat::StreamingWriter 160.416 i/s - 457.000 times in 2.848846s (6.23ms/i) Comparison: ArrowFormat::StreamingWriter: 160.4 i/s Arrow::RecordBatchStreamWriter: 63.0 i/s - 2.55x slower Arrow::Table#save: 61.0 i/s - 2.63x slower ``` --- .../benchmark/file-writer.yaml | 89 +++++++++++++++ .../benchmark/streaming-writer.yaml | 89 +++++++++++++++ .../lib/arrow-format/array.rb | 101 +++++++++++++++++- .../red-arrow-format/lib/arrow-format/type.rb | 39 ++++--- 4 files changed, 293 insertions(+), 25 deletions(-) create mode 100644 ruby/red-arrow-format/benchmark/file-writer.yaml create mode 100644 ruby/red-arrow-format/benchmark/streaming-writer.yaml diff --git a/ruby/red-arrow-format/benchmark/file-writer.yaml b/ruby/red-arrow-format/benchmark/file-writer.yaml new file mode 100644 index 000000000000..37b89f5bff7b --- /dev/null +++ b/ruby/red-arrow-format/benchmark/file-writer.yaml @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prelude: | + Warning[:experimental] = false + + require "arrow" + require "arrow-format" + + seed = 29 + random = Random.new(seed) + + n_columns = 100 + n_rows = 10000 + max_uint32 = 2 ** 32 - 1 + arrays = n_columns.times.collect do |i| + if i.even? 
+ Arrow::UInt32Array.new(n_rows.times.collect {random.rand(max_uint32)}) + else + Arrow::BinaryArray.new(n_rows.times.collect {random.bytes(random.rand(10))}) + end + end + columns = arrays.collect.with_index {|array, i| [i, array]} + red_arrow_table = Arrow::Table.new(columns) + + fields = arrays.collect.with_index do |array, i| + case array + when Arrow::UInt32Array + type = ArrowFormat::UInt32Type.singleton + when Arrow::BinaryArray + type = ArrowFormat::BinaryType.singleton + end + ArrowFormat::Field.new(i.to_s, type) + end + schema = ArrowFormat::Schema.new(fields) + def convert_buffer(buffer) + return nil if buffer.nil? + IO::Buffer.for(buffer.data.to_s.dup) + end + columns = fields.zip(arrays).collect do |field, array| + case array + when Arrow::UInt32Array + field.type.build_array(n_rows, + convert_buffer(array.null_bitmap), + convert_buffer(array.data_buffer)) + when Arrow::BinaryArray + field.type.build_array(n_rows, + convert_buffer(array.null_bitmap), + convert_buffer(array.offsets_buffer), + convert_buffer(array.data_buffer)) + end + end + red_arrow_format_record_batch = + ArrowFormat::RecordBatch.new(schema, n_rows, columns) + + GC.start + GC.disable +benchmark: + "Arrow::Table#save": | + buffer = Arrow::ResizableBuffer.new(4096) + red_arrow_table.save(buffer, format: :arrow_file) + "Arrow::RecordBatchFileWriter": | + buffer = Arrow::ResizableBuffer.new(4096) + Arrow::BufferOutputStream.open(buffer) do |output| + schema = red_arrow_table.schema + Arrow::RecordBatchFileWriter.open(output, schema) do |writer| + writer.write_table(red_arrow_table) + end + end + "ArrowFormat::FileWriter": | + output = +"".b + writer = ArrowFormat::FileWriter.new(output) + writer.start(red_arrow_format_record_batch.schema) + writer.write_record_batch(red_arrow_format_record_batch) + writer.finish diff --git a/ruby/red-arrow-format/benchmark/streaming-writer.yaml b/ruby/red-arrow-format/benchmark/streaming-writer.yaml new file mode 100644 index 000000000000..824e71dff677 
--- /dev/null +++ b/ruby/red-arrow-format/benchmark/streaming-writer.yaml @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +prelude: | + Warning[:experimental] = false + + require "arrow" + require "arrow-format" + + seed = 29 + random = Random.new(seed) + + n_columns = 100 + n_rows = 10000 + max_uint32 = 2 ** 32 - 1 + arrays = n_columns.times.collect do |i| + if i.even? + Arrow::UInt32Array.new(n_rows.times.collect {random.rand(max_uint32)}) + else + Arrow::BinaryArray.new(n_rows.times.collect {random.bytes(random.rand(10))}) + end + end + columns = arrays.collect.with_index {|array, i| [i, array]} + red_arrow_table = Arrow::Table.new(columns) + + fields = arrays.collect.with_index do |array, i| + case array + when Arrow::UInt32Array + type = ArrowFormat::UInt32Type.singleton + when Arrow::BinaryArray + type = ArrowFormat::BinaryType.singleton + end + ArrowFormat::Field.new(i.to_s, type) + end + schema = ArrowFormat::Schema.new(fields) + def convert_buffer(buffer) + return nil if buffer.nil? 
+ IO::Buffer.for(buffer.data.to_s.dup) + end + columns = fields.zip(arrays).collect do |field, array| + case array + when Arrow::UInt32Array + field.type.build_array(n_rows, + convert_buffer(array.null_bitmap), + convert_buffer(array.data_buffer)) + when Arrow::BinaryArray + field.type.build_array(n_rows, + convert_buffer(array.null_bitmap), + convert_buffer(array.offsets_buffer), + convert_buffer(array.data_buffer)) + end + end + red_arrow_format_record_batch = + ArrowFormat::RecordBatch.new(schema, n_rows, columns) + + GC.start + GC.disable +benchmark: + "Arrow::Table#save": | + buffer = Arrow::ResizableBuffer.new(4096) + red_arrow_table.save(buffer, format: :arrow_streaming) + "Arrow::RecordBatchStreamWriter": | + buffer = Arrow::ResizableBuffer.new(4096) + Arrow::BufferOutputStream.open(buffer) do |output| + schema = red_arrow_table.schema + Arrow::RecordBatchStreamWriter.open(output, schema) do |writer| + writer.write_table(red_arrow_table) + end + end + "ArrowFormat::StreamingWriter": | + output = +"".b + writer = ArrowFormat::StreamingWriter.new(output) + writer.start(red_arrow_format_record_batch.schema) + writer.write_record_batch(red_arrow_format_record_batch) + writer.finish diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb index cb71a4d25503..9a248d279fdf 100644 --- a/ruby/red-arrow-format/lib/arrow-format/array.rb +++ b/ruby/red-arrow-format/lib/arrow-format/array.rb @@ -140,8 +140,8 @@ def slice_offsets_buffer(id, buffer, buffer_type) end class NullArray < Array - def initialize(type, size) - super(type, size, nil) + def initialize(size) + super(NullType.singleton, size, nil) end def each_buffer @@ -186,6 +186,10 @@ def element_size end class BooleanArray < PrimitiveArray + def initialize(size, validity_buffer, values_buffer) + super(BooleanType.singleton, size, validity_buffer, values_buffer) + end + def to_a return [] if empty? 
@@ -209,51 +213,120 @@ def clear_cache end class IntArray < PrimitiveArray + def initialize(size, validity_buffer, values_buffer) + super(self.class.type, size, validity_buffer, values_buffer) + end end class Int8Array < IntArray + class << self + def type + Int8Type.singleton + end + end end class UInt8Array < IntArray + class << self + def type + UInt8Type.singleton + end + end end class Int16Array < IntArray + class << self + def type + Int16Type.singleton + end + end end class UInt16Array < IntArray + class << self + def type + UInt16Type.singleton + end + end end class Int32Array < IntArray + class << self + def type + Int32Type.singleton + end + end end class UInt32Array < IntArray + class << self + def type + UInt32Type.singleton + end + end end class Int64Array < IntArray + class << self + def type + Int64Type.singleton + end + end end class UInt64Array < IntArray + class << self + def type + UInt64Type.singleton + end + end end class FloatingPointArray < PrimitiveArray + def initialize(size, validity_buffer, values_buffer) + super(self.class.type, size, validity_buffer, values_buffer) + end end class Float32Array < FloatingPointArray + class << self + def type + Float32Type.singleton + end + end end class Float64Array < FloatingPointArray + class << self + def type + Float64Type.singleton + end + end end class TemporalArray < PrimitiveArray end class DateArray < TemporalArray + def initialize(size, validity_buffer, values_buffer) + super(self.class.type, size, validity_buffer, values_buffer) + end end class Date32Array < DateArray + class << self + def type + Date32Type.singleton + end + end end class Date64Array < DateArray + class << self + def type + Date64Type.singleton + end + end end class TimeArray < TemporalArray @@ -318,8 +391,8 @@ class DurationArray < TemporalArray end class VariableSizeBinaryArray < Array - def initialize(type, size, validity_buffer, offsets_buffer, values_buffer) - super(type, size, validity_buffer) + def initialize(size, 
validity_buffer, offsets_buffer, values_buffer) + super(self.class.type, size, validity_buffer) @offsets_buffer = offsets_buffer @values_buffer = values_buffer end @@ -364,18 +437,38 @@ def offset_size end class BinaryArray < VariableSizeBinaryArray + class << self + def type + BinaryType.singleton + end + end end class LargeBinaryArray < VariableSizeBinaryArray + class << self + def type + LargeBinaryType.singleton + end + end end class VariableSizeUTF8Array < VariableSizeBinaryArray end class UTF8Array < VariableSizeUTF8Array + class << self + def type + UTF8Type.singleton + end + end end class LargeUTF8Array < VariableSizeUTF8Array + class << self + def type + LargeUTF8Type.singleton + end + end end class FixedSizeBinaryArray < Array diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb index 17674af30c7e..38523cf00bf6 100644 --- a/ruby/red-arrow-format/lib/arrow-format/type.rb +++ b/ruby/red-arrow-format/lib/arrow-format/type.rb @@ -33,7 +33,7 @@ def name end def build_array(size) - NullArray.new(self, size) + NullArray.new(size) end def to_flatbuffers @@ -56,7 +56,7 @@ def name end def build_array(size, validity_buffer, values_buffer) - BooleanArray.new(self, size, validity_buffer, values_buffer) + BooleanArray.new(size, validity_buffer, values_buffer) end def to_flatbuffers @@ -107,7 +107,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Int8Array.new(self, size, validity_buffer, values_buffer) + Int8Array.new(size, validity_buffer, values_buffer) end end @@ -131,7 +131,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - UInt8Array.new(self, size, validity_buffer, values_buffer) + UInt8Array.new(size, validity_buffer, values_buffer) end end @@ -155,7 +155,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Int16Array.new(self, size, validity_buffer, values_buffer) + Int16Array.new(size, validity_buffer, values_buffer) 
end end @@ -179,7 +179,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - UInt16Array.new(self, size, validity_buffer, values_buffer) + UInt16Array.new(size, validity_buffer, values_buffer) end end @@ -203,7 +203,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Int32Array.new(self, size, validity_buffer, values_buffer) + Int32Array.new(size, validity_buffer, values_buffer) end end @@ -227,7 +227,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - UInt32Array.new(self, size, validity_buffer, values_buffer) + UInt32Array.new(size, validity_buffer, values_buffer) end end @@ -251,7 +251,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Int64Array.new(self, size, validity_buffer, values_buffer) + Int64Array.new(size, validity_buffer, values_buffer) end end @@ -275,7 +275,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - UInt64Array.new(self, size, validity_buffer, values_buffer) + UInt64Array.new(size, validity_buffer, values_buffer) end end @@ -313,7 +313,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Float32Array.new(self, size, validity_buffer, values_buffer) + Float32Array.new(size, validity_buffer, values_buffer) end end @@ -337,7 +337,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Float64Array.new(self, size, validity_buffer, values_buffer) + Float64Array.new(size, validity_buffer, values_buffer) end end @@ -378,7 +378,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Date32Array.new(self, size, validity_buffer, values_buffer) + Date32Array.new(size, validity_buffer, values_buffer) end end @@ -402,7 +402,7 @@ def buffer_type end def build_array(size, validity_buffer, values_buffer) - Date64Array.new(self, size, validity_buffer, values_buffer) + Date64Array.new(size, validity_buffer, values_buffer) end end @@ -628,8 
+628,7 @@ def encoding end def build_array(size, validity_buffer, offsets_buffer, values_buffer) - BinaryArray.new(self, - size, + BinaryArray.new(size, validity_buffer, offsets_buffer, values_buffer) @@ -660,8 +659,7 @@ def encoding end def build_array(size, validity_buffer, offsets_buffer, values_buffer) - LargeBinaryArray.new(self, - size, + LargeBinaryArray.new(size, validity_buffer, offsets_buffer, values_buffer) @@ -692,7 +690,7 @@ def encoding end def build_array(size, validity_buffer, offsets_buffer, values_buffer) - UTF8Array.new(self, size, validity_buffer, offsets_buffer, values_buffer) + UTF8Array.new(size, validity_buffer, offsets_buffer, values_buffer) end def to_flatbuffers @@ -720,8 +718,7 @@ def encoding end def build_array(size, validity_buffer, offsets_buffer, values_buffer) - LargeUTF8Array.new(self, - size, + LargeUTF8Array.new(size, validity_buffer, offsets_buffer, values_buffer)