From d825a1babb273b599c67d0878a1c230150bc5c2e Mon Sep 17 00:00:00 2001
From: Ryan Stewart <noreply@example.com>
Date: Tue, 12 May 2026 18:58:36 -0700
Subject: [PATCH 1/6] feat(parquet): fast-fill definition levels for
 majority-null leaf columns

When writing a nullable leaf (primitive) Arrow array, `write_leaf` built the
definition-level buffer one element at a time, mapping each null bit to a
level. For columns that are mostly null this does ~num_rows of branchy work
and allocates a num_rows level buffer even though almost every level is the
same value.

Add a length-gated bulk-fill path: when the column is majority-null and the
sub-range is large enough to amortize the gate's per-call cost, build the
definition levels by bulk-filling the null level (a vectorized memset) and
overwriting only the non-null positions found via `NullBuffer::valid_indices()`.
The per-row path is kept for non-majority-null arrays and for the small
sub-ranges produced by list/struct write paths, so those shapes are not
regressed.

Contributes to #9731. Complements #9954's all-null fast path by covering the
sparse (mostly-but-not-entirely-null) case it does not handle.

Threshold sweep on Ryzen 9 9950X (parquet/arrow_writer benches, /default
variant, vs main):

  T   primitive  list_primitive  primitive_sparse  list_primitive_sparse
  ----------------------------------------------------------------------
  0    -3.0%      +2.6%           -36.1%            +7.8%
  16   -1.4%      +1.8%           -34.8%            +2.8%
  32   -1.1%      -0.1%           -35.1%            +1.7%
  64   -1.1%      +0.7%           -34.5%            +1.7%   <- chosen
  128  -1.0%      +1.5%           -35.1%            +2.4%
  256  -1.4%      +1.4%           -35.1%            +2.7%

T=0 reproduces the per-call slice/popcount regression on
list_primitive_sparse_99pct_null (+7.8%, matches the criterion bot's
original measurement on the unguarded version). The +1.7% floor at T>=32
is the structural cost of evaluating the gate itself across ~10K small
write_leaf calls in the list path; reducing it further would require
hoisting the decision into the caller. T=64 matches T=32 on every shape
and gives ~12x margin over the avg list length of ~5.

Final benchmarks vs main on Ryzen 9 9950X (T=64, /default variants):

  primitive/default                            -1.5%
  primitive_non_null/default                   -2.8%
  primitive_sparse_99pct_null/default         -35.1%
  primitive_all_null/default                  -66.4%
  list_primitive/default                       +1.8%  (within noise)
  list_primitive_non_null/default              -0.7%  (no change, p=0.45)
  list_primitive_sparse_99pct_null/default     +3.0%  (gate-check floor)
  struct_sparse_99pct_null/default             -4.9%
  bool/default                                 +2.2%
---
 parquet/src/arrow/arrow_writer/levels.rs | 65 ++++++++++++++++++++----
 1 file changed, 55 insertions(+), 10 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index 1f66de349df0..dcbee16b6465 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -645,17 +645,62 @@ impl LevelInfoBuilder {
             match &info.logical_nulls {
                 Some(nulls) => {
                     assert!(range.end <= nulls.len());
-                    let nulls = nulls.inner();
-                    info.def_levels.extend_from_iter(range.clone().map(|i| {
-                        // Safety: range.end was asserted to be in bounds earlier
-                        let valid = unsafe { nulls.value_unchecked(i) };
-                        max_def_level - (!valid as i16)
-                    }));
-                    info.non_null_indices.reserve(len);
-                    info.non_null_indices.extend(
-                        BitIndexIterator::new(nulls.inner(), nulls.offset() + range.start, len)
+                    // Bulk-fill fast path. Gated on:
+                    //   - `len >= 64`: per-call slice/popcount/iterator overhead only
+                    //     amortizes on sizable sub-ranges. List/struct paths call
+                    //     write_leaf many times with tiny ranges (avg list length 1-5);
+                    //     paying any per-call popcount there would regress them. A
+                    //     threshold sweep at T={0,16,32,64,128,256} on Ryzen 9 9950X
+                    //     shows the regression floor settles by T=32 and the choice of
+                    //     64 gives ~12x margin over avg list length without losing the
+                    //     flat-primitive wins.
+                    //   - `nulls.null_count() * 2 >= nulls.len()`: cached `null_count()`
+                    //     is O(1), so this check is free. We use the buffer-level density
+                    //     as a heuristic for the sub-range; for full-array writes (the
+                    //     primary target — flat primitive columns) it's exact.
+                    // Note: even when this gate skips the fast path, evaluating the gate
+                    // itself across high-frequency call sites (~10K calls in some list
+                    // benchmarks) is a small structural cost (~+1-2% on list-sparse
+                    // cases). It's the price of having any gate at all on this hot path;
+                    // reducing it further would require hoisting the decision into the
+                    // caller. The wins on the targeted shapes (-35% sparse-primitive,
+                    // -66% all-null primitive) far outweigh it.
+                    if len >= 64 && nulls.null_count() * 2 >= nulls.len() {
+                        // Bulk-fill the null level (vectorized memset) and overwrite
+                        // only the non-null positions. Correct for any null distribution
+                        // in the range; the gate above only controls when it's profitable.
+                        let range_nulls = nulls.slice(range.start, len);
+                        let valid_in_range = len - range_nulls.null_count();
+                        let null_def_level = max_def_level - 1;
+                        let buf = info
+                            .def_levels
+                            .materialize_mut()
+                            .expect("definition levels present");
+                        let base = buf.len();
+                        buf.resize(base + len, null_def_level);
+                        for i in range_nulls.valid_indices() {
+                            buf[base + i] = max_def_level;
+                        }
+                        info.non_null_indices.reserve(valid_in_range);
+                        info.non_null_indices
+                            .extend(range_nulls.valid_indices().map(|i| i + range.start));
+                    } else {
+                        let bits = nulls.inner();
+                        info.def_levels.extend_from_iter(range.clone().map(|i| {
+                            // Safety: range.end was asserted to be in bounds earlier
+                            let valid = unsafe { bits.value_unchecked(i) };
+                            max_def_level - (!valid as i16)
+                        }));
+                        info.non_null_indices.reserve(len);
+                        info.non_null_indices.extend(
+                            BitIndexIterator::new(
+                                bits.inner(),
+                                bits.offset() + range.start,
+                                len,
+                            )
                             .map(|i| i + range.start),
-                    );
+                        );
+                    }
                 }
                 None => {
                     info.append_def_level_run(max_def_level, len);

From 4eaed80ae7f961a59c4828d93e677d66cb295bab Mon Sep 17 00:00:00 2001
From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com>
Date: Wed, 13 May 2026 07:21:25 -0700
Subject: [PATCH 2/6] fix(parquet): incorporate review feedback for write_leaf
 bulk-fill

- Extract the `64` threshold into `BULK_FILL_MIN_LEN` const with a
  doc comment pointing to this PR for the sweep rationale.
- Trim the in-code comment block; threshold-sweep rationale moves to
  the PR description per CONTRIBUTING.md guidance on AI-generated
  submissions.
- Refactor the slow-path else branch to mirror the fast-path slice
  idiom: `let range_nulls = nulls.slice(range.start, len);` then
  `range_nulls.iter()` for `def_levels` and
  `range_nulls.valid_indices().map(|i| i + range.start)` for
  `non_null_indices`. Drops the `unsafe value_unchecked` and the
  manual `BitIndexIterator::new` offset arithmetic.
- Drop the now-unused `BitIndexIterator` import.

Behavior unchanged. `cargo test -p parquet --features arrow --lib
arrow_writer` green (136 tests); clippy and fmt clean.
---
 parquet/src/arrow/arrow_writer/levels.rs | 58 ++++++++----------------
 1 file changed, 19 insertions(+), 39 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index dcbee16b6465..bd025c4e554e 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -45,7 +45,6 @@ use crate::column::writer::LevelDataRef;
 use crate::errors::{ParquetError, Result};
 use arrow_array::cast::AsArray;
 use arrow_array::{Array, ArrayRef, OffsetSizeTrait};
-use arrow_buffer::bit_iterator::BitIndexIterator;
 use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
 use arrow_schema::{DataType, Field};
 use std::ops::Range;
@@ -153,6 +152,13 @@ enum LevelInfoBuilder {
     Struct(Vec<LevelInfoBuilder>, LevelContext, Option<NullBuffer>),
 }
 
+/// Minimum sub-range length before the bulk-fill fast path in `write_leaf`
+/// becomes profitable for null-heavy leaf columns. Below this, per-call
+/// slice + popcount overhead regresses list/struct paths that call
+/// `write_leaf` many times with tiny ranges. Picked via threshold sweep;
+/// see <https://github.com/apache/arrow-rs/pull/9967> for the rationale.
+const BULK_FILL_MIN_LEN: usize = 64;
+
 impl LevelInfoBuilder {
     /// Create a new [`LevelInfoBuilder`] for the given [`Field`] and parent [`LevelContext`]
     fn try_new(field: &Field, parent_ctx: LevelContext, array: &ArrayRef) -> Result<Self> {
@@ -645,30 +651,11 @@ impl LevelInfoBuilder {
             match &info.logical_nulls {
                 Some(nulls) => {
                     assert!(range.end <= nulls.len());
-                    // Bulk-fill fast path. Gated on:
-                    //   - `len >= 64`: per-call slice/popcount/iterator overhead only
-                    //     amortizes on sizable sub-ranges. List/struct paths call
-                    //     write_leaf many times with tiny ranges (avg list length 1-5);
-                    //     paying any per-call popcount there would regress them. A
-                    //     threshold sweep at T={0,16,32,64,128,256} on Ryzen 9 9950X
-                    //     shows the regression floor settles by T=32 and the choice of
-                    //     64 gives ~12x margin over avg list length without losing the
-                    //     flat-primitive wins.
-                    //   - `nulls.null_count() * 2 >= nulls.len()`: cached `null_count()`
-                    //     is O(1), so this check is free. We use the buffer-level density
-                    //     as a heuristic for the sub-range; for full-array writes (the
-                    //     primary target — flat primitive columns) it's exact.
-                    // Note: even when this gate skips the fast path, evaluating the gate
-                    // itself across high-frequency call sites (~10K calls in some list
-                    // benchmarks) is a small structural cost (~+1-2% on list-sparse
-                    // cases). It's the price of having any gate at all on this hot path;
-                    // reducing it further would require hoisting the decision into the
-                    // caller. The wins on the targeted shapes (-35% sparse-primitive,
-                    // -66% all-null primitive) far outweigh it.
-                    if len >= 64 && nulls.null_count() * 2 >= nulls.len() {
-                        // Bulk-fill the null level (vectorized memset) and overwrite
-                        // only the non-null positions. Correct for any null distribution
-                        // in the range; the gate above only controls when it's profitable.
+                    // Bulk-fill is profitable only on null-heavy ranges long enough to
+                    // amortize the slice/popcount cost; see `BULK_FILL_MIN_LEN` and the
+                    // PR description for the threshold sweep. The gate uses the cached
+                    // buffer-wide `null_count` (O(1)) to stay cheap on the cold path.
+                    if len >= BULK_FILL_MIN_LEN && nulls.null_count() * 2 >= nulls.len() {
                         let range_nulls = nulls.slice(range.start, len);
                         let valid_in_range = len - range_nulls.null_count();
                         let null_def_level = max_def_level - 1;
@@ -685,21 +672,14 @@ impl LevelInfoBuilder {
                         info.non_null_indices
                             .extend(range_nulls.valid_indices().map(|i| i + range.start));
                     } else {
-                        let bits = nulls.inner();
-                        info.def_levels.extend_from_iter(range.clone().map(|i| {
-                            // Safety: range.end was asserted to be in bounds earlier
-                            let valid = unsafe { bits.value_unchecked(i) };
-                            max_def_level - (!valid as i16)
-                        }));
-                        info.non_null_indices.reserve(len);
-                        info.non_null_indices.extend(
-                            BitIndexIterator::new(
-                                bits.inner(),
-                                bits.offset() + range.start,
-                                len,
-                            )
-                            .map(|i| i + range.start),
+                        let range_nulls = nulls.slice(range.start, len);
+                        info.def_levels.extend_from_iter(
+                            range_nulls
+                                .iter()
+                                .map(|valid| max_def_level - (!valid as i16)),
                         );
+                        info.non_null_indices
+                            .extend(range_nulls.valid_indices().map(|i| i + range.start));
                     }
                 }
                 None => {

From 695f178a92e693cc5b8d7cc1e259644af53203be Mon Sep 17 00:00:00 2001
From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com>
Date: Wed, 13 May 2026 19:32:07 -0700
Subject: [PATCH 3/6] refactor(parquet): partial dedup of write_leaf's
 null-density branches

Hoist the shared work in `write_leaf`'s `Some(nulls)` arm out of the
if/else where it can be done without changing the cost profile:

- `let range_nulls = nulls.slice(range.start, len)`: O(1) metadata
  adjustment, safe to compute once and share between branches.
- `info.non_null_indices.extend(range_nulls.valid_indices()...)`:
  a single iterator consumption after the branches, identical work
  in both paths.

The `range_nulls.null_count()` popcount and `reserve(valid_in_range)`
stay INSIDE the fast-path branch where bulk-fill needs the count
anyway. An earlier revision (59ced5b97) hoisted the popcount out and
caused 11-24% regressions on `list_primitive*` benches because
`write_leaf` is called ~10K times with avg range ~5 from list paths;
paying the popcount per call dominated the saved hash probes.

Bench (`list_primitive_sparse_99pct_null/default`, local hardware,
this revision): 9.83 ms, in line with main's 10.7 ms on GKE
c4a-highmem-16, and well below the 12.1 ms the prior dedup
attempt showed.

Behavior unchanged. 136 arrow_writer tests green; clippy + fmt clean.
---
 parquet/src/arrow/arrow_writer/levels.rs | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index bd025c4e554e..13e7ac4cc0f5 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -651,12 +651,14 @@ impl LevelInfoBuilder {
             match &info.logical_nulls {
                 Some(nulls) => {
                     assert!(range.end <= nulls.len());
+                    let range_nulls = nulls.slice(range.start, len);
                     // Bulk-fill is profitable only on null-heavy ranges long enough to
                     // amortize the slice/popcount cost; see `BULK_FILL_MIN_LEN` and the
                     // PR description for the threshold sweep. The gate uses the cached
-                    // buffer-wide `null_count` (O(1)) to stay cheap on the cold path.
+                    // buffer-wide `null_count` (O(1)) to stay cheap on the cold path;
+                    // `range_nulls.null_count()` is paid only inside the fast path,
+                    // where bulk-fill needs it anyway.
                     if len >= BULK_FILL_MIN_LEN && nulls.null_count() * 2 >= nulls.len() {
-                        let range_nulls = nulls.slice(range.start, len);
                         let valid_in_range = len - range_nulls.null_count();
                         let null_def_level = max_def_level - 1;
                         let buf = info
@@ -669,18 +671,15 @@ impl LevelInfoBuilder {
                             buf[base + i] = max_def_level;
                         }
                         info.non_null_indices.reserve(valid_in_range);
-                        info.non_null_indices
-                            .extend(range_nulls.valid_indices().map(|i| i + range.start));
                     } else {
-                        let range_nulls = nulls.slice(range.start, len);
                         info.def_levels.extend_from_iter(
                             range_nulls
                                 .iter()
                                 .map(|valid| max_def_level - (!valid as i16)),
                         );
-                        info.non_null_indices
-                            .extend(range_nulls.valid_indices().map(|i| i + range.start));
                     }
+                    info.non_null_indices
+                        .extend(range_nulls.valid_indices().map(|i| i + range.start));
                 }
                 None => {
                     info.append_def_level_run(max_def_level, len);

From df1f323df7578ea93a1ed4254e1d23c0bd91196e Mon Sep 17 00:00:00 2001
From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com>
Date: Wed, 13 May 2026 19:32:18 -0700
Subject: [PATCH 4/6] perf(parquet): reserve non_null_indices capacity in
 write_leaf's no-nulls arm

The `logical_nulls = None` arm of `write_leaf` extends `non_null_indices`
by `len` items but didn't pre-reserve capacity. Add the matching
`reserve(len)` before the `extend(range.clone())` so the no-nulls path
allocates once instead of relying on `Vec`'s amortized growth.

Behavior unchanged. Caught by @etseidl's review round on PR #9967.
---
 parquet/src/arrow/arrow_writer/levels.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index 13e7ac4cc0f5..48eb8157196d 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -683,6 +683,7 @@ impl LevelInfoBuilder {
                 }
                 None => {
                     info.append_def_level_run(max_def_level, len);
+                    info.non_null_indices.reserve(len);
                     info.non_null_indices.extend(range.clone());
                 }
             }

From cd166ccc7a2b52d221abf7ab9cd948fd0c25f031 Mon Sep 17 00:00:00 2001
From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com>
Date: Wed, 13 May 2026 20:44:31 -0700
Subject: [PATCH 5/6] perf(parquet): revert Some-arm dedup in write_leaf, keep
 None-arm reserve

The partial dedup in 695f178a (hoisting `let range_nulls = nulls.slice(...)`
before the gate and `info.non_null_indices.extend(...)` after the if/else)
regressed `list_primitive/default` 21% and `list_primitive_sparse_99pct_null/default`
13% on the GKE c4a-highmem-16 bench. Same operations as the v2 shape, but the
compiler produced different code for the nullable-list path.

Restore the v2 structure (slice declared inside each branch, both extends
inline). Keep the `None`-arm `info.non_null_indices.reserve(len)` from df1f323d
since that fixed `list_primitive_non_null/default` 1.12 -> 1.00 and addresses
etseidl's reserve nit substantively.

Bench shapes by revision on c4a-highmem-16:
- v2 (4eaed80a): list_primitive/default 1.00, non_null 1.00, sparse_99pct 1.00
- v3 (695f178a, full dedup): 1.24, 1.12, 1.12
- v4 (df1f323d, partial dedup + reserve): 1.21, 1.00, 1.13
- v5 (this commit): expected back to v2's clean shape with reserve fix retained

136 arrow_writer tests green; fmt + clippy clean.
---
 parquet/src/arrow/arrow_writer/levels.rs | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index 48eb8157196d..e85090ed6a0e 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -651,14 +651,12 @@ impl LevelInfoBuilder {
             match &info.logical_nulls {
                 Some(nulls) => {
                     assert!(range.end <= nulls.len());
-                    let range_nulls = nulls.slice(range.start, len);
                     // Bulk-fill is profitable only on null-heavy ranges long enough to
                     // amortize the slice/popcount cost; see `BULK_FILL_MIN_LEN` and the
                     // PR description for the threshold sweep. The gate uses the cached
-                    // buffer-wide `null_count` (O(1)) to stay cheap on the cold path;
-                    // `range_nulls.null_count()` is paid only inside the fast path,
-                    // where bulk-fill needs it anyway.
+                    // buffer-wide `null_count` (O(1)) to stay cheap on the cold path.
                     if len >= BULK_FILL_MIN_LEN && nulls.null_count() * 2 >= nulls.len() {
+                        let range_nulls = nulls.slice(range.start, len);
                         let valid_in_range = len - range_nulls.null_count();
                         let null_def_level = max_def_level - 1;
                         let buf = info
@@ -671,15 +669,18 @@ impl LevelInfoBuilder {
                             buf[base + i] = max_def_level;
                         }
                         info.non_null_indices.reserve(valid_in_range);
+                        info.non_null_indices
+                            .extend(range_nulls.valid_indices().map(|i| i + range.start));
                     } else {
+                        let range_nulls = nulls.slice(range.start, len);
                         info.def_levels.extend_from_iter(
                             range_nulls
                                 .iter()
                                 .map(|valid| max_def_level - (!valid as i16)),
                         );
+                        info.non_null_indices
+                            .extend(range_nulls.valid_indices().map(|i| i + range.start));
                     }
-                    info.non_null_indices
-                        .extend(range_nulls.valid_indices().map(|i| i + range.start));
                 }
                 None => {
                     info.append_def_level_run(max_def_level, len);

From 552c773c64058a3a35537025d71cb9f202859dfe Mon Sep 17 00:00:00 2001
From: Ryan Stewart <47729789+RyanJamesStewart@users.noreply.github.com>
Date: Thu, 14 May 2026 06:12:56 -0700
Subject: [PATCH 6/6] perf(parquet): revert slow-arm to BitIndexIterator +
 value_unchecked

Per etseidl's diagnosis on PR #9967: the `nulls.slice()` call in
write_leaf's not-enough-nulls arm is the regression source on
list_primitive paths, not the dedup structure. The list_primitive
paths call write_leaf ~10K times with avg range ~5, where
per-call NullBuffer slice + range_nulls.iter() / valid_indices
overhead dominates.

Restore the v1 (original) slow-arm pattern:
- `let bits = nulls.inner()` once
- `def_levels.extend_from_iter(range.clone().map(...))` with
  `bits.value_unchecked(i)` (safe: range.end asserted in bounds)
- `non_null_indices` via `BitIndexIterator::new(bits.inner(),
  bits.offset() + range.start, len)`

Fast path keeps `nulls.slice()` / `valid_indices()` since bulk-fill
needs them and `len >= BULK_FILL_MIN_LEN` (64) amortizes the cost.

136 arrow_writer tests green; fmt + clippy clean.
---
 parquet/src/arrow/arrow_writer/levels.rs | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index e85090ed6a0e..7222bffe1196 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -45,6 +45,7 @@ use crate::column::writer::LevelDataRef;
 use crate::errors::{ParquetError, Result};
 use arrow_array::cast::AsArray;
 use arrow_array::{Array, ArrayRef, OffsetSizeTrait};
+use arrow_buffer::bit_iterator::BitIndexIterator;
 use arrow_buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
 use arrow_schema::{DataType, Field};
 use std::ops::Range;
@@ -672,14 +673,17 @@ impl LevelInfoBuilder {
                         info.non_null_indices
                             .extend(range_nulls.valid_indices().map(|i| i + range.start));
                     } else {
-                        let range_nulls = nulls.slice(range.start, len);
-                        info.def_levels.extend_from_iter(
-                            range_nulls
-                                .iter()
-                                .map(|valid| max_def_level - (!valid as i16)),
+                        let bits = nulls.inner();
+                        info.def_levels.extend_from_iter(range.clone().map(|i| {
+                            // Safety: range.end was asserted to be in bounds earlier
+                            let valid = unsafe { bits.value_unchecked(i) };
+                            max_def_level - (!valid as i16)
+                        }));
+                        info.non_null_indices.reserve(len);
+                        info.non_null_indices.extend(
+                            BitIndexIterator::new(bits.inner(), bits.offset() + range.start, len)
+                                .map(|i| i + range.start),
                         );
-                        info.non_null_indices
-                            .extend(range_nulls.valid_indices().map(|i| i + range.start));
                     }
                 }
                 None => {