From e702a23190f1b39d85ca261d780a8e0ed259c0f1 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Wed, 4 Mar 2026 20:16:50 -0500 Subject: [PATCH 1/3] [Json] Add benchmarks for list json reader --- arrow-json/Cargo.toml | 2 +- .../{json-reader.rs => json_reader.rs} | 87 ++++++++++++++++++- 2 files changed, 87 insertions(+), 2 deletions(-) rename arrow-json/benches/{json-reader.rs => json_reader.rs} (73%) diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 5fcde480eb6d..03e3553bc626 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -67,5 +67,5 @@ name = "serde" harness = false [[bench]] -name = "json-reader" +name = "json_reader" harness = false diff --git a/arrow-json/benches/json-reader.rs b/arrow-json/benches/json_reader.rs similarity index 73% rename from arrow-json/benches/json-reader.rs rename to arrow-json/benches/json_reader.rs index 504839f8ffe2..f87ba695eb62 100644 --- a/arrow-json/benches/json-reader.rs +++ b/arrow-json/benches/json_reader.rs @@ -32,6 +32,8 @@ const BATCH_SIZE: usize = 1 << 13; // 8K rows per batch const WIDE_FIELDS: usize = 64; const BINARY_BYTES: usize = 64; const WIDE_PROJECTION_TOTAL_FIELDS: usize = 100; // 100 fields total, select only 3 +const LIST_SHORT_ELEMENTS: usize = 5; +const LIST_LONG_ELEMENTS: usize = 100; fn decode_and_flush(decoder: &mut Decoder, data: &[u8]) { let mut offset = 0; @@ -240,11 +242,94 @@ fn bench_wide_projection(c: &mut Criterion) { ); } +fn build_list_json(rows: usize, elements: usize) -> Vec<u8> { + // Builds newline-delimited JSON objects with a single list field. 
+ // Example (rows=2, elements=3): + // {"list":[0,1,2]} + // {"list":[1,2,3]} + let mut out = String::with_capacity(rows * (elements * 6 + 16)); + for row in 0..rows { + out.push_str("{\"list\":["); + for i in 0..elements { + if i > 0 { + out.push(','); + } + write!(&mut out, "{}", (row + i) as i64).unwrap(); + } + out.push_str("]}\n"); + } + out.into_bytes() +} + +fn build_list_values(rows: usize, elements: usize) -> Vec<Value> { + // Mirrors build_list_json but returns structured serde_json::Value objects. + let mut out = Vec::with_capacity(rows); + for row in 0..rows { + let arr: Vec<Value> = (0..elements) + .map(|i| Value::Number(Number::from((row + i) as i64))) + .collect(); + let mut map = Map::with_capacity(1); + map.insert("list".to_string(), Value::Array(arr)); + out.push(Value::Object(map)); + } + out +} + +fn build_list_schema() -> Arc<Schema> { + Arc::new(Schema::new(vec![Field::new( + "list", + DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))), + false, + )])) +} + +fn bench_decode_list(c: &mut Criterion) { + let schema = build_list_schema(); + + // Short lists: tests list handling overhead (few elements per row) + let short_data = build_list_json(ROWS, LIST_SHORT_ELEMENTS); + bench_decode_schema(c, "decode_list_short_i64_json", &short_data, schema.clone()); + + // Long lists: tests child element decode throughput (many elements per row) + let long_data = build_list_json(ROWS, LIST_LONG_ELEMENTS); + bench_decode_schema(c, "decode_list_long_i64_json", &long_data, schema); +} + +fn bench_serialize_list(c: &mut Criterion) { + let schema = build_list_schema(); + + let short_values = build_list_values(ROWS, LIST_SHORT_ELEMENTS); + c.bench_function("decode_list_short_i64_serialize", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decoder.serialize(&short_values).unwrap(); + while let Some(_batch) = decoder.flush().unwrap() {} + }) + }); + + let long_values = 
build_list_values(ROWS, LIST_LONG_ELEMENTS); + c.bench_function("decode_list_long_i64_serialize", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decoder.serialize(&long_values).unwrap(); + while let Some(_batch) = decoder.flush().unwrap() {} + }) + }); +} + criterion_group!( benches, bench_decode_wide_object, bench_serialize_wide_object, bench_binary_hex, - bench_wide_projection + bench_wide_projection, + bench_decode_list, + bench_serialize_list ); criterion_main!(benches); From cfaea69765b6d845314211e5ed67c6176fbcc6fa Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Wed, 4 Mar 2026 20:59:36 -0500 Subject: [PATCH 2/3] Add bench for json-writer --- arrow-json/Cargo.toml | 4 + arrow-json/benches/json_writer.rs | 129 ++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 arrow-json/benches/json_writer.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 03e3553bc626..be1f8d0ccdca 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -69,3 +69,7 @@ harness = false [[bench]] name = "json_reader" harness = false + +[[bench]] +name = "json_writer" +harness = false diff --git a/arrow-json/benches/json_writer.rs b/arrow-json/benches/json_writer.rs new file mode 100644 index 000000000000..628d378b9a50 --- /dev/null +++ b/arrow-json/benches/json_writer.rs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::builder::{FixedSizeListBuilder, Int64Builder, ListBuilder}; +use arrow_array::{Array, RecordBatch}; +use arrow_json::LineDelimitedWriter; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{Criterion, Throughput, criterion_group, criterion_main}; +use std::sync::Arc; + +const ROWS: usize = 1 << 17; // 128K rows +const LIST_SHORT_ELEMENTS: usize = 5; +const LIST_LONG_ELEMENTS: usize = 100; + +fn build_list_batch(rows: usize, elements: usize) -> RecordBatch { + let mut list_builder = ListBuilder::new(Int64Builder::new()); + for row in 0..rows { + for i in 0..elements { + list_builder.values().append_value((row + i) as i64); + } + list_builder.append(true); + } + let list_array = list_builder.finish(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "list", + DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))), + false, + )])); + + RecordBatch::try_new(schema, vec![Arc::new(list_array)]).unwrap() +} + +fn bench_write_list(c: &mut Criterion) { + let short_batch = build_list_batch(ROWS, LIST_SHORT_ELEMENTS); + let long_batch = build_list_batch(ROWS, LIST_LONG_ELEMENTS); + + let mut group = c.benchmark_group("write_list_i64"); + // Short lists: tests per-list overhead (few elements per row) + group.throughput(Throughput::Elements(ROWS as u64)); + group.bench_function("short", |b| { + b.iter(|| { + let mut buf = Vec::with_capacity(ROWS * LIST_SHORT_ELEMENTS * 8); + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write(&short_batch).unwrap(); + writer.finish().unwrap(); + buf + }) + 
}); + + // Long lists: tests child element encode throughput (many elements per row) + group.bench_function("long", |b| { + b.iter(|| { + let mut buf = Vec::with_capacity(ROWS * LIST_LONG_ELEMENTS * 8); + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write(&long_batch).unwrap(); + writer.finish().unwrap(); + buf + }) + }); + + group.finish(); +} + +fn build_fixed_size_list_batch(rows: usize, elements: usize) -> RecordBatch { + let mut builder = FixedSizeListBuilder::new(Int64Builder::new(), elements as i32); + for row in 0..rows { + for i in 0..elements { + builder.values().append_value((row + i) as i64); + } + builder.append(true); + } + let fsl_array = builder.finish(); + + let schema = Arc::new(Schema::new(vec![Field::new( + "fixed_size_list", + fsl_array.data_type().clone(), + false, + )])); + + RecordBatch::try_new(schema, vec![Arc::new(fsl_array)]).unwrap() +} + +fn bench_write_fixed_size_list(c: &mut Criterion) { + let short_batch = build_fixed_size_list_batch(ROWS, LIST_SHORT_ELEMENTS); + let long_batch = build_fixed_size_list_batch(ROWS, LIST_LONG_ELEMENTS); + + let mut group = c.benchmark_group("write_fixed_size_list_i64"); + group.throughput(Throughput::Elements(ROWS as u64)); + + group.bench_function("short", |b| { + b.iter(|| { + let mut buf = Vec::with_capacity(ROWS * LIST_SHORT_ELEMENTS * 8); + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write(&short_batch).unwrap(); + writer.finish().unwrap(); + buf + }) + }); + + group.bench_function("long", |b| { + b.iter(|| { + let mut buf = Vec::with_capacity(ROWS * LIST_LONG_ELEMENTS * 8); + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write(&long_batch).unwrap(); + writer.finish().unwrap(); + buf + }) + }); + + group.finish(); +} + +criterion_group!(benches, bench_write_list, bench_write_fixed_size_list); +criterion_main!(benches); From 6789ab466d7c93b5e78cfbd2a1cd9a8ec0af8741 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Tue, 10 Mar 2026 19:12:07 -0400 
Subject: [PATCH 3/3] Refactor --- arrow-json/benches/json_writer.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arrow-json/benches/json_writer.rs b/arrow-json/benches/json_writer.rs index 628d378b9a50..055ad5be48b4 100644 --- a/arrow-json/benches/json_writer.rs +++ b/arrow-json/benches/json_writer.rs @@ -53,23 +53,23 @@ fn bench_write_list(c: &mut Criterion) { // Short lists: tests per-list overhead (few elements per row) group.throughput(Throughput::Elements(ROWS as u64)); group.bench_function("short", |b| { + let mut buf = Vec::with_capacity(ROWS * LIST_SHORT_ELEMENTS * 8); b.iter(|| { - let mut buf = Vec::with_capacity(ROWS * LIST_SHORT_ELEMENTS * 8); + buf.clear(); let mut writer = LineDelimitedWriter::new(&mut buf); writer.write(&short_batch).unwrap(); writer.finish().unwrap(); - buf }) }); // Long lists: tests child element encode throughput (many elements per row) group.bench_function("long", |b| { + let mut buf = Vec::with_capacity(ROWS * LIST_LONG_ELEMENTS * 8); b.iter(|| { - let mut buf = Vec::with_capacity(ROWS * LIST_LONG_ELEMENTS * 8); + buf.clear(); let mut writer = LineDelimitedWriter::new(&mut buf); writer.write(&long_batch).unwrap(); writer.finish().unwrap(); - buf }) }); @@ -103,22 +103,22 @@ fn bench_write_fixed_size_list(c: &mut Criterion) { group.throughput(Throughput::Elements(ROWS as u64)); group.bench_function("short", |b| { + let mut buf = Vec::with_capacity(ROWS * LIST_SHORT_ELEMENTS * 8); b.iter(|| { - let mut buf = Vec::with_capacity(ROWS * LIST_SHORT_ELEMENTS * 8); + buf.clear(); let mut writer = LineDelimitedWriter::new(&mut buf); writer.write(&short_batch).unwrap(); writer.finish().unwrap(); - buf }) }); group.bench_function("long", |b| { + let mut buf = Vec::with_capacity(ROWS * LIST_LONG_ELEMENTS * 8); b.iter(|| { - let mut buf = Vec::with_capacity(ROWS * LIST_LONG_ELEMENTS * 8); + buf.clear(); let mut writer = LineDelimitedWriter::new(&mut buf); writer.write(&long_batch).unwrap(); 
writer.finish().unwrap(); - buf }) });