From d825a1babb273b599c67d0878a1c230150bc5c2e Mon Sep 17 00:00:00 2001 From: Ryan Stewart Date: Tue, 12 May 2026 18:58:36 -0700 Subject: [PATCH 1/6] feat(parquet): fast-fill definition levels for majority-null leaf columns When writing a nullable leaf (primitive) Arrow array, `write_leaf` built the definition-level buffer one element at a time, mapping each null bit to a level. For columns that are mostly null this does ~num_rows of branchy work and allocates a num_rows level buffer even though almost every level is the same value. Add a length-gated bulk-fill path: when the column is majority-null and the sub-range is large enough to amortize the gate's per-call cost, build the definition levels by bulk-filling the null level (a vectorized memset) and overwriting only the non-null positions found via `NullBuffer::valid_indices()`. The per-row path is kept for non-majority-null arrays and for the small sub-ranges produced by list/struct write paths, so those shapes are not regressed. Contributes to #9731. Complements #9954's all-null fast path by covering the sparse (mostly-but-not-entirely-null) case it does not handle. Threshold sweep on Ryzen 9 9950X (parquet/arrow_writer benches, /default variant, vs main): T primitive list_primitive primitive_sparse list_primitive_sparse ---------------------------------------------------------------------- 0 -3.0% +2.6% -36.1% +7.8% 16 -1.4% +1.8% -34.8% +2.8% 32 -1.1% -0.1% -35.1% +1.7% 64 -1.1% +0.7% -34.5% +1.7% <- chosen 128 -1.0% +1.5% -35.1% +2.4% 256 -1.4% +1.4% -35.1% +2.7% T=0 reproduces the per-call slice/popcount regression on list_primitive_sparse_99pct_null (+7.8%, matches the criterion bot's original measurement on the unguarded version). The +1.7% floor at T>=32 is the structural cost of evaluating the gate itself across ~10K small write_leaf calls in the list path; reducing it further would require hoisting the decision into the caller. T=64 matches T=32 on every shape and gives ~12x margin over the avg list length of ~5. Final benchmarks vs main on Ryzen 9 9950X (T=64, /default variants): primitive/default -1.5% primitive_non_null/default -2.8% primitive_sparse_99pct_null/default -35.1% primitive_all_null/default -66.4% list_primitive/default +1.8% (within noise) list_primitive_non_null/default -0.7% (no change, p=0.45) list_primitive_sparse_99pct_null/default +3.0% (gate-check floor) struct_sparse_99pct_null/default -4.9% bool/default +2.2% --- parquet/src/arrow/arrow_writer/levels.rs | 65 ++++++++++++++++++++---- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 1f66de349df0..dcbee16b6465 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -645,17 +645,62 @@ impl LevelInfoBuilder { match &info.logical_nulls { Some(nulls) => { assert!(range.end <= nulls.len()); - let nulls = nulls.inner(); - info.def_levels.extend_from_iter(range.clone().map(|i| { - // Safety: range.end was asserted to be in bounds earlier - let valid = unsafe { nulls.value_unchecked(i) }; - max_def_level - (!valid as i16) - })); - info.non_null_indices.reserve(len); - info.non_null_indices.extend( - BitIndexIterator::new(nulls.inner(), nulls.offset() + range.start, len) + // Bulk-fill fast path. Gated on: + // - `len >= 64`: per-call slice/popcount/iterator overhead only + // amortizes on sizable sub-ranges. List/struct paths call + // write_leaf many times with tiny ranges (avg list length 1-5); + // paying any per-call popcount there would regress them. A + // threshold sweep at T={0,16,32,64,128,256} on Ryzen 9 9950X + // shows the regression floor settles by T=32 and the choice of + // 64 gives ~12x margin over avg list length without losing the + // flat-primitive wins. + // - `nulls.null_count() * 2 >= nulls.len()`: cached `null_count()` + // is O(1), so this check is free. We use the buffer-level density + // as a heuristic for the sub-range; for full-array writes (the + // primary target — flat primitive columns) it's exact. + // Note: even when this gate skips the fast path, evaluating the gate + // itself across high-frequency call sites (~10K calls in some list + // benchmarks) is a small structural cost (~+1-2% on list-sparse + // cases). It's the price of having any gate at all on this hot path; + // reducing it further would require hoisting the decision into the + // caller. The wins on the targeted shapes (-35% sparse-primitive, + // -66% all-null primitive) far outweigh it. + if len >= 64 && nulls.null_count() * 2 >= nulls.len() { + // Bulk-fill the null level (vectorized memset) and overwrite + // only the non-null positions. Correct for any null distribution + // in the range; the gate above only controls when it's profitable. + let range_nulls = nulls.slice(range.start, len); + let valid_in_range = len - range_nulls.null_count(); + let null_def_level = max_def_level - 1; + let buf = info + .def_levels + .materialize_mut() + .expect("definition levels present"); + let base = buf.len(); + buf.resize(base + len, null_def_level); + for i in range_nulls.valid_indices() { + buf[base + i] = max_def_level; + } + info.non_null_indices.reserve(valid_in_range); + info.non_null_indices + .extend(range_nulls.valid_indices().map(|i| i + range.start)); + } else { + let bits = nulls.inner(); + info.def_levels.extend_from_iter(range.clone().map(|i| { + // Safety: range.end was asserted to be in bounds earlier + let valid = unsafe { bits.value_unchecked(i) }; + max_def_level - (!valid as i16) + })); + info.non_null_indices.reserve(len); + info.non_null_indices.extend( + BitIndexIterator::new( + bits.inner(), + bits.offset() + range.start, + len, + ) .map(|i| i + range.start), - ); + ); + } } None => { info.append_def_level_run(max_def_level, len); From 4eaed80ae7f961a59c4828d93e677d66cb295bab Mon Sep 17 00:00:00 2001 From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com> Date: Wed, 13 May 2026 07:21:25 -0700 Subject: [PATCH 2/6] fix(parquet): incorporate review feedback for write_leaf bulk-fill - Extract the `64` threshold into `BULK_FILL_MIN_LEN` const with a doc comment pointing to this PR for the sweep rationale. - Trim the in-code comment block; threshold-sweep rationale moves to the PR description per CONTRIBUTING.md guidance on AI-generated submissions. - Refactor the slow-path else branch to mirror the fast-path slice idiom: `let range_nulls = nulls.slice(range.start, len);` then `range_nulls.iter()` for `def_levels` and `range_nulls.valid_indices().map(|i| i + range.start)` for `non_null_indices`. Drops the `unsafe value_unchecked` and the manual `BitIndexIterator::new` offset arithmetic. - Drop the now-unused `BitIndexIterator` import. Behavior unchanged. `cargo test -p parquet --features arrow --lib arrow_writer` green (136 tests); clippy and fmt clean. --- parquet/src/arrow/arrow_writer/levels.rs | 58 ++++++++---------------- 1 file changed, 19 insertions(+), 39 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index dcbee16b6465..bd025c4e554e 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -45,7 +45,6 @@ use crate::column::writer::LevelDataRef; use crate::errors::{ParquetError, Result}; use arrow_array::cast::AsArray; use arrow_array::{Array, ArrayRef, OffsetSizeTrait}; -use arrow_buffer::bit_iterator::BitIndexIterator; use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_schema::{DataType, Field}; use std::ops::Range; @@ -153,6 +152,13 @@ enum LevelInfoBuilder { Struct(Vec, LevelContext, Option), } +/// Minimum sub-range length before the bulk-fill fast path in `write_leaf` +/// becomes profitable for null-heavy leaf columns. Below this, per-call +/// slice + popcount overhead regresses list/struct paths that call +/// `write_leaf` many times with tiny ranges. Picked via threshold sweep; +/// see for the rationale. +const BULK_FILL_MIN_LEN: usize = 64; + impl LevelInfoBuilder { /// Create a new [`LevelInfoBuilder`] for the given [`Field`] and parent [`LevelContext`] fn try_new(field: &Field, parent_ctx: LevelContext, array: &ArrayRef) -> Result { @@ -645,30 +651,11 @@ impl LevelInfoBuilder { match &info.logical_nulls { Some(nulls) => { assert!(range.end <= nulls.len()); - // Bulk-fill fast path. Gated on: - // - `len >= 64`: per-call slice/popcount/iterator overhead only - // amortizes on sizable sub-ranges. List/struct paths call - // write_leaf many times with tiny ranges (avg list length 1-5); - // paying any per-call popcount there would regress them. A - // threshold sweep at T={0,16,32,64,128,256} on Ryzen 9 9950X - // shows the regression floor settles by T=32 and the choice of - // 64 gives ~12x margin over avg list length without losing the - // flat-primitive wins. - // - `nulls.null_count() * 2 >= nulls.len()`: cached `null_count()` - // is O(1), so this check is free. We use the buffer-level density - // as a heuristic for the sub-range; for full-array writes (the - // primary target — flat primitive columns) it's exact. - // Note: even when this gate skips the fast path, evaluating the gate - // itself across high-frequency call sites (~10K calls in some list - // benchmarks) is a small structural cost (~+1-2% on list-sparse - // cases). It's the price of having any gate at all on this hot path; - // reducing it further would require hoisting the decision into the - // caller. The wins on the targeted shapes (-35% sparse-primitive, - // -66% all-null primitive) far outweigh it. - if len >= 64 && nulls.null_count() * 2 >= nulls.len() { - // Bulk-fill the null level (vectorized memset) and overwrite - // only the non-null positions. Correct for any null distribution - // in the range; the gate above only controls when it's profitable. + // Bulk-fill is profitable only on null-heavy ranges long enough to + // amortize the slice/popcount cost; see `BULK_FILL_MIN_LEN` and the + // PR description for the threshold sweep. The gate uses the cached + // buffer-wide `null_count` (O(1)) to stay cheap on the cold path. + if len >= BULK_FILL_MIN_LEN && nulls.null_count() * 2 >= nulls.len() { let range_nulls = nulls.slice(range.start, len); let valid_in_range = len - range_nulls.null_count(); let null_def_level = max_def_level - 1; @@ -685,21 +672,14 @@ impl LevelInfoBuilder { info.non_null_indices .extend(range_nulls.valid_indices().map(|i| i + range.start)); } else { - let bits = nulls.inner(); - info.def_levels.extend_from_iter(range.clone().map(|i| { - // Safety: range.end was asserted to be in bounds earlier - let valid = unsafe { bits.value_unchecked(i) }; - max_def_level - (!valid as i16) - })); - info.non_null_indices.reserve(len); - info.non_null_indices.extend( - BitIndexIterator::new( - bits.inner(), - bits.offset() + range.start, - len, - ) - .map(|i| i + range.start), + let range_nulls = nulls.slice(range.start, len); + info.def_levels.extend_from_iter( + range_nulls + .iter() + .map(|valid| max_def_level - (!valid as i16)), ); + info.non_null_indices + .extend(range_nulls.valid_indices().map(|i| i + range.start)); } } None => { From 695f178a92e693cc5b8d7cc1e259644af53203be Mon Sep 17 00:00:00 2001 From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com> Date: Wed, 13 May 2026 19:32:07 -0700 Subject: [PATCH 3/6] refactor(parquet): partial dedup of write_leaf's null-density branches Hoist the shared work in `write_leaf`'s `Some(nulls)` arm out of the if/else where it can be done without changing the cost profile: - `let range_nulls = nulls.slice(range.start, len)`: O(1) metadata adjustment, safe to compute once and share between branches. - `info.non_null_indices.extend(range_nulls.valid_indices()...)`: a single iterator consumption after the branches, identical work in both paths. The `range_nulls.null_count()` popcount and `reserve(valid_in_range)` stay INSIDE the fast-path branch where bulk-fill needs the count anyway. An earlier revision (59ced5b97) hoisted the popcount out and caused 11-24% regressions on `list_primitive*` benches because `write_leaf` is called ~10K times with avg range ~5 from list paths; paying the popcount per call dominated the saved hash probes. Bench (`list_primitive_sparse_99pct_null/default`, local hardware, this revision): 9.83 ms, in line with main's 10.7 ms on GKE c4a-highmem-16, and well below the 12.1 ms the prior dedup attempt showed. Behavior unchanged. 136 arrow_writer tests green; clippy + fmt clean. --- parquet/src/arrow/arrow_writer/levels.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index bd025c4e554e..13e7ac4cc0f5 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -651,12 +651,14 @@ impl LevelInfoBuilder { match &info.logical_nulls { Some(nulls) => { assert!(range.end <= nulls.len()); + let range_nulls = nulls.slice(range.start, len); // Bulk-fill is profitable only on null-heavy ranges long enough to // amortize the slice/popcount cost; see `BULK_FILL_MIN_LEN` and the // PR description for the threshold sweep. The gate uses the cached - // buffer-wide `null_count` (O(1)) to stay cheap on the cold path. + // buffer-wide `null_count` (O(1)) to stay cheap on the cold path; + // `range_nulls.null_count()` is paid only inside the fast path, + // where bulk-fill needs it anyway. if len >= BULK_FILL_MIN_LEN && nulls.null_count() * 2 >= nulls.len() { - let range_nulls = nulls.slice(range.start, len); let valid_in_range = len - range_nulls.null_count(); let null_def_level = max_def_level - 1; let buf = info @@ -669,18 +671,15 @@ impl LevelInfoBuilder { buf[base + i] = max_def_level; } info.non_null_indices.reserve(valid_in_range); - info.non_null_indices - .extend(range_nulls.valid_indices().map(|i| i + range.start)); } else { - let range_nulls = nulls.slice(range.start, len); info.def_levels.extend_from_iter( range_nulls .iter() .map(|valid| max_def_level - (!valid as i16)), ); - info.non_null_indices - .extend(range_nulls.valid_indices().map(|i| i + range.start)); } + info.non_null_indices + .extend(range_nulls.valid_indices().map(|i| i + range.start)); } None => { info.append_def_level_run(max_def_level, len); From df1f323df7578ea93a1ed4254e1d23c0bd91196e Mon Sep 17 00:00:00 2001 From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com> Date: Wed, 13 May 2026 19:32:18 -0700 Subject: [PATCH 4/6] perf(parquet): reserve non_null_indices capacity in write_leaf's no-nulls arm The `logical_nulls = None` arm of `write_leaf` extends `non_null_indices` by `len` items but didn't pre-reserve capacity. Add the matching `reserve(len)` before the `extend(range.clone())` so the no-nulls path allocates once instead of relying on `Vec`'s amortized growth. Behavior unchanged. Caught by @etseidl's review round on PR #9967. --- parquet/src/arrow/arrow_writer/levels.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 13e7ac4cc0f5..48eb8157196d 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -683,6 +683,7 @@ impl LevelInfoBuilder { } None => { info.append_def_level_run(max_def_level, len); + info.non_null_indices.reserve(len); info.non_null_indices.extend(range.clone()); } } From cd166ccc7a2b52d221abf7ab9cd948fd0c25f031 Mon Sep 17 00:00:00 2001 From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com> Date: Wed, 13 May 2026 20:44:31 -0700 Subject: [PATCH 5/6] perf(parquet): revert Some-arm dedup in write_leaf, keep None-arm reserve The partial dedup in 695f178a (hoisting `let range_nulls = nulls.slice(...)` before the gate and `info.non_null_indices.extend(...)` after the if/else) regressed `list_primitive/default` 21% and `list_primitive_sparse_99pct_null/default` 13% on the GKE c4a-highmem-16 bench. Same operations as the v2 shape, but the compiler produced different code for the nullable-list path. Restore the v2 structure (slice declared inside each branch, both extends inline). Keep the `None`-arm `info.non_null_indices.reserve(len)` from df1f323d since that fixed `list_primitive_non_null/default` 1.12 -> 1.00 and addresses etseidl's reserve nit substantively. Bench shapes by revision on c4a-highmem-16: - v2 (4eaed80a): list_primitive/default 1.00, non_null 1.00, sparse_99pct 1.00 - v3 (695f178a, full dedup): 1.24, 1.12, 1.12 - v4 (df1f323d, partial dedup + reserve): 1.21, 1.00, 1.13 - v5 (this commit): expected back to v2's clean shape with reserve fix retained 136 arrow_writer tests green; fmt + clippy clean. --- parquet/src/arrow/arrow_writer/levels.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index 48eb8157196d..e85090ed6a0e 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -651,14 +651,12 @@ impl LevelInfoBuilder { match &info.logical_nulls { Some(nulls) => { assert!(range.end <= nulls.len()); - let range_nulls = nulls.slice(range.start, len); // Bulk-fill is profitable only on null-heavy ranges long enough to // amortize the slice/popcount cost; see `BULK_FILL_MIN_LEN` and the // PR description for the threshold sweep. The gate uses the cached - // buffer-wide `null_count` (O(1)) to stay cheap on the cold path; - // `range_nulls.null_count()` is paid only inside the fast path, - // where bulk-fill needs it anyway. + // buffer-wide `null_count` (O(1)) to stay cheap on the cold path. if len >= BULK_FILL_MIN_LEN && nulls.null_count() * 2 >= nulls.len() { + let range_nulls = nulls.slice(range.start, len); let valid_in_range = len - range_nulls.null_count(); let null_def_level = max_def_level - 1; let buf = info @@ -671,15 +669,18 @@ impl LevelInfoBuilder { buf[base + i] = max_def_level; } info.non_null_indices.reserve(valid_in_range); + info.non_null_indices + .extend(range_nulls.valid_indices().map(|i| i + range.start)); } else { + let range_nulls = nulls.slice(range.start, len); info.def_levels.extend_from_iter( range_nulls .iter() .map(|valid| max_def_level - (!valid as i16)), ); + info.non_null_indices + .extend(range_nulls.valid_indices().map(|i| i + range.start)); } - info.non_null_indices - .extend(range_nulls.valid_indices().map(|i| i + range.start)); } None => { info.append_def_level_run(max_def_level, len); From 552c773c64058a3a35537025d71cb9f202859dfe Mon Sep 17 00:00:00 2001 From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com> Date: Thu, 14 May 2026 06:12:56 -0700 Subject: [PATCH 6/6] perf(parquet): revert slow-arm to BitIndexIterator + value_unchecked Per etseidl's diagnosis on PR #9967: the `nulls.slice()` call in write_leaf's not-enough-nulls arm is the regression source on list_primitive paths, not the dedup structure. The list_primitive paths call write_leaf ~10K times with avg range ~5, where per-call NullBuffer slice + range_nulls.iter() / valid_indices overhead dominates. Restore the v1 (original) slow-arm pattern: - `let bits = nulls.inner()` once - `def_levels.extend_from_iter(range.clone().map(...))` with `bits.value_unchecked(i)` (safe: range.end asserted in bounds) - `non_null_indices` via `BitIndexIterator::new(bits.inner(), bits.offset() + range.start, len)` Fast path keeps `nulls.slice()` / `valid_indices()` since bulk-fill needs them and `len >= BULK_FILL_MIN_LEN` (64) amortizes the cost. 136 arrow_writer tests green; fmt + clippy clean. --- parquet/src/arrow/arrow_writer/levels.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index e85090ed6a0e..7222bffe1196 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -45,6 +45,7 @@ use crate::column::writer::LevelDataRef; use crate::errors::{ParquetError, Result}; use arrow_array::cast::AsArray; use arrow_array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow_buffer::bit_iterator::BitIndexIterator; use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_schema::{DataType, Field}; use std::ops::Range; @@ -672,14 +673,17 @@ impl LevelInfoBuilder { info.non_null_indices .extend(range_nulls.valid_indices().map(|i| i + range.start)); } else { - let range_nulls = nulls.slice(range.start, len); - info.def_levels.extend_from_iter( - range_nulls - .iter() - .map(|valid| max_def_level - (!valid as i16)), + let bits = nulls.inner(); + info.def_levels.extend_from_iter(range.clone().map(|i| { + // Safety: range.end was asserted to be in bounds earlier + let valid = unsafe { bits.value_unchecked(i) }; + max_def_level - (!valid as i16) + })); + info.non_null_indices.reserve(len); + info.non_null_indices.extend( + BitIndexIterator::new(bits.inner(), bits.offset() + range.start, len) + .map(|i| i + range.start), ); - info.non_null_indices - .extend(range_nulls.valid_indices().map(|i| i + range.start)); } } None => {