diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 2b5a09eebcb3..077e3c9fd6b0 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -52,20 +52,24 @@ //! - unsel_clustered: for Unselective Clustered – in each 10K-row block, rows with an offset >= 1000 are "unsel_clustered". //! -use arrow::array::{ArrayRef, BooleanArray, Float64Array, Int64Array, TimestampMillisecondArray}; -use arrow::compute::and; +use arrow::array::{ + ArrayRef, BooleanArray, Float64Array, Int64Array, StructArray, TimestampMillisecondArray, +}; use arrow::compute::kernels::cmp::{eq, gt, lt, neq}; +use arrow::compute::{and, or}; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use arrow::record_batch::RecordBatch; use arrow_array::StringViewArray; use arrow_array::builder::{ArrayBuilder, StringViewBuilder}; -use arrow_cast::pretty::pretty_format_batches; use bytes::Bytes; -use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use criterion::{ + BenchmarkGroup, BenchmarkId, Criterion, criterion_group, criterion_main, measurement::WallTime, +}; use futures::future::BoxFuture; use futures::{FutureExt, StreamExt}; use parquet::arrow::arrow_reader::{ ArrowPredicateFn, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, RowFilter, + RowSelectionPolicy, }; use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask}; @@ -76,6 +80,9 @@ use rand::{Rng, SeedableRng, rngs::StdRng}; use std::ops::Range; use std::sync::Arc; +const COLUMN_NAMES: [&str; 4] = ["int64", "float64", "utf8View", "ts"]; +const UTF8_VIEW_MISSING_VALUE: &str = "__arrow_rs_missing__"; + /// Generates a random string. Has a 50% chance to generate a short string (3–11 characters) /// or a long string (13–20 characters). 
fn random_string(rng: &mut StdRng) -> String { @@ -188,32 +195,77 @@ const ROW_GROUP_SIZE: usize = 100_000; /// Writes the RecordBatch to an in memory buffer, returning the buffer fn write_parquet_file() -> Vec { - let batch = create_record_batch(TOTAL_ROWS); - println!("Batch created with {TOTAL_ROWS} rows, row group size = {ROW_GROUP_SIZE}"); - println!( - "First 100 rows:\n{}", - pretty_format_batches(&[batch.clone().slice(0, 100)]).unwrap() - ); + write_parquet_file_with_rows(TOTAL_ROWS, ROW_GROUP_SIZE) +} + +/// Writes a RecordBatch with a configurable shape to an in memory buffer, +/// returning the buffer. +fn write_parquet_file_with_rows(total_rows: usize, row_group_size: usize) -> Vec { + let batch = create_record_batch(total_rows); + write_record_batch_to_parquet(&batch, row_group_size) +} + +fn write_record_batch_to_parquet(batch: &RecordBatch, row_group_size: usize) -> Vec { let schema = batch.schema(); let props = WriterProperties::builder() .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(ROW_GROUP_SIZE)) + .set_max_row_group_row_count(Some(row_group_size)) .build(); let mut buffer = vec![]; { let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap(); - writer.write(&batch).unwrap(); + writer.write(batch).unwrap(); writer.close().unwrap(); } buffer } +fn create_nested_record_batch(size: usize) -> RecordBatch { + let tag = Arc::new(StringViewArray::from_iter_values( + (0..size).map(|idx| format!("tag_{}", idx % 7)), + )) as ArrayRef; + let payload = StructArray::from(vec![ + ( + Arc::new(Field::new("id", DataType::Int64, false)), + Arc::new(Int64Array::from_iter_values( + (0..size).map(|idx| idx as i64 + 1_000), + )) as ArrayRef, + ), + ( + Arc::new(Field::new("label", DataType::Utf8View, false)), + Arc::new(StringViewArray::from_iter_values( + (0..size).map(|idx| format!("payload_{idx}")), + )) as ArrayRef, + ), + ]); + let payload = Arc::new(payload) as ArrayRef; + let value = 
Arc::new(Int64Array::from_iter_values( + (0..size).map(|idx| idx as i64 + 10_000), + )) as ArrayRef; + + RecordBatch::try_from_iter(vec![("tag", tag), ("payload", payload), ("value", value)]).unwrap() +} + +fn write_nested_parquet_file_with_rows(total_rows: usize, row_group_size: usize) -> Vec { + let batch = create_nested_record_batch(total_rows); + write_record_batch_to_parquet(&batch, row_group_size) +} + /// ProjectionCase defines the projection mode for the benchmark: /// either projecting all columns or excluding the column that is used for filtering. -#[derive(Clone)] +#[derive(Clone, Copy)] enum ProjectionCase { AllColumns, ExcludeFilterColumn, + FilterColumnsOnly, + CountOnly, + FixedColumns, + Float64AndTs, + Float64Only, + Int64AndFloat64, + Int64AndUtf8, + TsAndUtf8, + Utf8Only, } impl std::fmt::Display for ProjectionCase { @@ -221,6 +273,53 @@ impl std::fmt::Display for ProjectionCase { match self { ProjectionCase::AllColumns => write!(f, "all_columns"), ProjectionCase::ExcludeFilterColumn => write!(f, "exclude_filter_column"), + ProjectionCase::FilterColumnsOnly => write!(f, "filter_columns_only"), + ProjectionCase::CountOnly => write!(f, "count_only"), + ProjectionCase::FixedColumns => write!(f, "fixed_columns"), + ProjectionCase::Float64AndTs => write!(f, "float64_and_ts"), + ProjectionCase::Float64Only => write!(f, "float64_only"), + ProjectionCase::Int64AndFloat64 => write!(f, "int64_and_float64"), + ProjectionCase::Int64AndUtf8 => write!(f, "int64_and_utf8"), + ProjectionCase::TsAndUtf8 => write!(f, "ts_and_utf8"), + ProjectionCase::Utf8Only => write!(f, "utf8_only"), + } + } +} + +#[derive(Clone, Copy)] +enum SyncStrategy { + FullPostFilter, + PushdownAuto, + PushdownSelectors, + PushdownMask, +} + +impl std::fmt::Display for SyncStrategy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + SyncStrategy::FullPostFilter => write!(f, "full_post_filter"), + SyncStrategy::PushdownAuto => write!(f, 
"pushdown_auto"), + SyncStrategy::PushdownSelectors => write!(f, "pushdown_selectors"), + SyncStrategy::PushdownMask => write!(f, "pushdown_mask"), + } + } +} + +#[derive(Clone, Copy)] +enum AsyncStrategy { + FullPostFilter, + PushdownAutoCostModel, + PushdownSelectors, + PushdownMask, +} + +impl std::fmt::Display for AsyncStrategy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + AsyncStrategy::FullPostFilter => write!(f, "full_post_filter"), + AsyncStrategy::PushdownAutoCostModel => write!(f, "pushdown_auto_cost_model"), + AsyncStrategy::PushdownSelectors => write!(f, "pushdown_selectors"), + AsyncStrategy::PushdownMask => write!(f, "pushdown_mask"), } } } @@ -334,6 +433,63 @@ enum FilterType { /// [ClickBench]: https://github.com/ClickHouse/ClickBench /// [Q21-Q27]: https://github.com/apache/datafusion/blob/b7177234e65cbbb2dcc04c252f6acd80bb026362/benchmarks/queries/clickbench/queries.sql#L22-L28 Utf8ViewNonEmpty, + /// Sparse variable-width predicate shaped like TPC-DS Q83 dynamic + /// `i_item_id` filters, where the predicate column is also projected. + Utf8ViewMissing, + /// Scalar-only part of ClickBench Q37: + /// + /// ```sql + /// WHERE CounterID = 62 + /// AND EventDate BETWEEN ... + /// AND DontCountHits = 0 + /// AND IsRefresh = 0 + /// AND Title <> '' + /// ``` + /// + /// DataFusion `Auto` does not push down the `Title <> ''` string predicate, + /// but it can push down the scalar prefix to defer decoding `Title`. + /// This synthetic predicate keeps that reader-level shape: cheap scalar + /// filter columns protect an expensive `Utf8View` output column. + ClickBenchQ37ScalarPrefix, + /// Shape of ClickBench extended Q6 under DataFusion row-filter pushdown: + /// an early cheap fixed-width predicate can prune almost all rows before a + /// later unprojected variable-width predicate is decoded. 
+ ClickBenchQ6MixedPredicates, + /// Shape of ClickBench Q41-like fixed-width filters: sparse fragmented + /// scalar predicates with a cheap fixed-width output projection. + ClickBenchQ41SparseFixedOutput, + /// Shape of ClickBench Q40: multiple cheap scalar predicates, very small + /// output, and one projected predicate column used later by grouping. + ClickBenchQ40ScalarGroupBy, + /// Shape of TPC-DS Q41: a complex OR predicate over dictionary/string-like + /// and scalar columns where predicate evaluation dominates reader time. + TpcdsQ41ComplexOr, + /// Shape of TPC-DS Q20 catalog_sales after dynamic filters: multiple + /// fixed-width predicates where predicate columns are also projected. + TpcdsQ20ProjectedDynamicFilters, + /// Shape of TPC-DS Q21 after dynamic-filter pruning: sparse fragmented + /// fixed-width predicates where the final projection still includes the + /// predicate columns. This protects against choosing selectors for columns + /// that were already decoded/cached by predicate evaluation. + TpcdsQ21ProjectedFixedOutput, + /// Shape of TPC-DS Q2 fact scans: the dynamic filter applies to the date + /// key, the same date key is projected, and an additional fixed-width sales + /// value can still be deferred by predicate pushdown. + TpcdsQ2ProjectedPredicate10Pct, + TpcdsQ2ProjectedPredicate20Pct, + TpcdsQ2ProjectedPredicate30Pct, + TpcdsQ2ProjectedPredicate40Pct, + /// Scalar range predicate shaped like TPC-DS Q9 `ss_quantity BETWEEN ...` + /// subqueries. The selected rows are random and moderately selective, and + /// benchmark projections cover both count-only and numeric aggregate cases. + TpcdsQ9QuantityRange, + /// Exact shape for the projected-predicate moderate-selectivity gate: + /// a clustered 20% timestamp predicate where the predicate column is + /// projected and the deferred output is variable-width. 
+ ProjectedTs20PctClustered, + /// Very sparse projected fixed-width scan shaped like TPC-DS fact-table + /// filters where the predicate column is also needed in the output projection. + TpcdsSparseProjectedFactScan, } impl std::fmt::Display for FilterType { @@ -347,6 +503,39 @@ impl std::fmt::Display for FilterType { FilterType::UnselectiveClustered => "ts < 9000", FilterType::Composite => "float64 > 99.0 AND ts >= 9000", FilterType::Utf8ViewNonEmpty => "utf8View <> ''", + FilterType::Utf8ViewMissing => "utf8View == ''", + FilterType::ClickBenchQ37ScalarPrefix => "int64 == 62 AND ts < 9000", + FilterType::ClickBenchQ6MixedPredicates => "int64 == 9999 AND utf8View <> ''", + FilterType::ClickBenchQ41SparseFixedOutput => "int64 < 8 AND ts < 9000", + FilterType::ClickBenchQ40ScalarGroupBy => { + "int64 == 62 AND float64 > 10.0 AND ts < 9000" + } + FilterType::TpcdsQ41ComplexOr => { + "(utf8View <> '' AND int64 < 8) OR (ts < 100 AND float64 > 95.0)" + } + FilterType::TpcdsQ20ProjectedDynamicFilters => { + "int64 < 12 AND ts < 9000 projected dynamic filters" + } + FilterType::TpcdsQ21ProjectedFixedOutput => { + "int64 < 8 AND ts < 9000 projected predicates" + } + FilterType::TpcdsQ2ProjectedPredicate10Pct => { + "int64 < 10 projected predicate with fixed output" + } + FilterType::TpcdsQ2ProjectedPredicate20Pct => { + "int64 < 20 projected predicate with fixed output" + } + FilterType::TpcdsQ2ProjectedPredicate30Pct => { + "int64 < 30 projected predicate with fixed output" + } + FilterType::TpcdsQ2ProjectedPredicate40Pct => { + "int64 < 40 projected predicate with fixed output" + } + FilterType::TpcdsQ9QuantityRange => "int64 > 0 AND int64 < 21", + FilterType::ProjectedTs20PctClustered => { + "ts < 2000 projected predicate with utf8 output" + } + FilterType::TpcdsSparseProjectedFactScan => "ts % 1000 == 0", }; write!(f, "{s}") } @@ -401,6 +590,103 @@ impl FilterType { let scalar = StringViewArray::new_scalar(""); neq(array, &scalar) } + FilterType::Utf8ViewMissing => { 
+ let array = batch.column(batch.schema().index_of("utf8View")?); + let scalar = StringViewArray::new_scalar(UTF8_VIEW_MISSING_VALUE); + eq(array, &scalar) + } + // ClickBenchQ37ScalarPrefix: a cheap fragmented scalar predicate + // evaluated before decoding a variable-width output column. + FilterType::ClickBenchQ37ScalarPrefix => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let counter_match = eq(int64, &Int64Array::new_scalar(62))?; + let date_like_range = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; + and(&counter_match, &date_like_range) + } + FilterType::ClickBenchQ6MixedPredicates => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let utf8 = batch.column(batch.schema().index_of("utf8View")?); + let cheap_prefix = eq(int64, &Int64Array::new_scalar(9999))?; + let string_suffix = neq(utf8, &StringViewArray::new_scalar(""))?; + and(&cheap_prefix, &string_suffix) + } + FilterType::ClickBenchQ41SparseFixedOutput + | FilterType::TpcdsQ21ProjectedFixedOutput => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let counter_like = lt(int64, &Int64Array::new_scalar(8))?; + let date_like = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; + and(&counter_like, &date_like) + } + FilterType::ClickBenchQ40ScalarGroupBy => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let float64 = batch.column(batch.schema().index_of("float64")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let counter_match = eq(int64, &Int64Array::new_scalar(62))?; + let width_match = gt(float64, &Float64Array::new_scalar(10.0))?; + let date_like = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; + and(&and(&counter_match, &width_match)?, &date_like) + } + FilterType::TpcdsQ41ComplexOr => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let float64 = 
batch.column(batch.schema().index_of("float64")?); + let utf8 = batch.column(batch.schema().index_of("utf8View")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let string_branch = and( + &neq(utf8, &StringViewArray::new_scalar(""))?, + <(int64, &Int64Array::new_scalar(8))?, + )?; + let scalar_branch = and( + <(ts, &TimestampMillisecondArray::new_scalar(100))?, + >(float64, &Float64Array::new_scalar(95.0))?, + )?; + or(&string_branch, &scalar_branch) + } + FilterType::TpcdsQ20ProjectedDynamicFilters => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let ts = batch.column(batch.schema().index_of("ts")?); + let item_like = lt(int64, &Int64Array::new_scalar(12))?; + let date_like = lt(ts, &TimestampMillisecondArray::new_scalar(9000))?; + and(&item_like, &date_like) + } + FilterType::TpcdsQ2ProjectedPredicate10Pct + | FilterType::TpcdsQ2ProjectedPredicate20Pct + | FilterType::TpcdsQ2ProjectedPredicate30Pct + | FilterType::TpcdsQ2ProjectedPredicate40Pct => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let threshold = match self { + FilterType::TpcdsQ2ProjectedPredicate10Pct => 10, + FilterType::TpcdsQ2ProjectedPredicate20Pct => 20, + FilterType::TpcdsQ2ProjectedPredicate30Pct => 30, + FilterType::TpcdsQ2ProjectedPredicate40Pct => 40, + _ => unreachable!(), + }; + lt(int64, &Int64Array::new_scalar(threshold)) + } + FilterType::TpcdsQ9QuantityRange => { + let int64 = batch.column(batch.schema().index_of("int64")?); + let lower = gt(int64, &Int64Array::new_scalar(0))?; + let upper = lt(int64, &Int64Array::new_scalar(21))?; + and(&lower, &upper) + } + FilterType::ProjectedTs20PctClustered => { + let ts = batch.column(batch.schema().index_of("ts")?); + lt(ts, &TimestampMillisecondArray::new_scalar(2000)) + } + FilterType::TpcdsSparseProjectedFactScan => { + let ts = batch + .column(batch.schema().index_of("ts")?) 
+ .as_any() + .downcast_ref::() + .unwrap(); + Ok(BooleanArray::from( + ts.values() + .iter() + .map(|value| value % 1000 == 0) + .collect::>(), + )) + } } } @@ -414,7 +700,21 @@ impl FilterType { FilterType::UnselectiveUnclustered => &[1], FilterType::UnselectiveClustered => &[3], FilterType::Composite => &[1, 3], // Use float64 column and ts column as representative for composite - FilterType::Utf8ViewNonEmpty => &[2], + FilterType::Utf8ViewNonEmpty | FilterType::Utf8ViewMissing => &[2], + FilterType::ClickBenchQ37ScalarPrefix => &[0, 3], + FilterType::ClickBenchQ6MixedPredicates => &[0, 2], + FilterType::ClickBenchQ40ScalarGroupBy => &[0, 1, 3], + FilterType::ClickBenchQ41SparseFixedOutput + | FilterType::TpcdsQ20ProjectedDynamicFilters + | FilterType::TpcdsQ21ProjectedFixedOutput => &[0, 3], + FilterType::TpcdsQ41ComplexOr => &[0, 1, 2, 3], + FilterType::TpcdsQ2ProjectedPredicate10Pct + | FilterType::TpcdsQ2ProjectedPredicate20Pct + | FilterType::TpcdsQ2ProjectedPredicate30Pct + | FilterType::TpcdsQ2ProjectedPredicate40Pct => &[0], + FilterType::TpcdsQ9QuantityRange => &[0], + FilterType::ProjectedTs20PctClustered => &[3], + FilterType::TpcdsSparseProjectedFactScan => &[3], } } } @@ -449,17 +749,8 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { for filter_type in filter_types { for proj_case in &projection_cases { - // All indices corresponding to the 10 columns. - let all_indices = vec![0, 1, 2, 3]; let filter_col = filter_type.filter_projection().to_vec(); - // For the projection, either select all columns or exclude the filter column(s). 
- let output_projection: Vec = match proj_case { - ProjectionCase::AllColumns => all_indices.clone(), - ProjectionCase::ExcludeFilterColumn => all_indices - .into_iter() - .filter(|i| !filter_col.contains(i)) - .collect(), - }; + let output_projection = output_projection_for(filter_type, proj_case); let reader = InMemoryReader::try_new(&parquet_file).unwrap(); let metadata = Arc::clone(reader.metadata()); @@ -510,6 +801,756 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { } } +/// Compare full scan plus post-filtering against row-level pushdown strategies. +/// +/// This group is intentionally sync-only and smaller than +/// [`benchmark_filters_and_projections`]. It tracks the cases most likely to +/// inform a future default `Auto` policy: selective random filters, clustered +/// filters, ClickBench-like string filters, and the forced selector strategy +/// that originally motivated apache/arrow-rs#8565. +fn benchmark_sync_strategy_matrix(c: &mut Criterion) { + let parquet_file = Bytes::from(write_parquet_file()); + let filter_types = [ + FilterType::SelectiveUnclustered, + FilterType::ModeratelySelectiveClustered, + FilterType::ModeratelySelectiveUnclustered, + FilterType::Utf8ViewNonEmpty, + ]; + let strategies = [ + SyncStrategy::FullPostFilter, + SyncStrategy::PushdownAuto, + SyncStrategy::PushdownSelectors, + SyncStrategy::PushdownMask, + ]; + + let mut group = c.benchmark_group("arrow_reader_row_filter_strategy_matrix"); + + for filter_type in filter_types { + for projection_case in [ + ProjectionCase::AllColumns, + ProjectionCase::ExcludeFilterColumn, + ] { + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let output_projection = output_projection_for(filter_type, &projection_case); + let read_projection = full_post_filter_read_projection(filter_type, &output_projection); + let output_column_names = 
projection_names(&output_projection); + let projection_mask = ProjectionMask::roots(schema_descr, output_projection); + let read_projection_mask = ProjectionMask::roots(schema_descr, read_projection); + let pred_mask = ProjectionMask::roots( + schema_descr, + filter_type.filter_projection().iter().copied(), + ); + + for strategy in strategies { + let bench_id = BenchmarkId::new( + format!("{filter_type}/{projection_case}"), + strategy.to_string(), + ); + + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let projection_mask = projection_mask.clone(); + let read_projection_mask = read_projection_mask.clone(); + let output_column_names = output_column_names.clone(); + + match strategy { + SyncStrategy::FullPostFilter => benchmark_sync_reader_post_filter( + reader, + read_projection_mask, + output_column_names, + filter_type, + ), + SyncStrategy::PushdownAuto => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_sync_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::default(), + ) + } + SyncStrategy::PushdownSelectors => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_sync_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Selectors, + ) + } + SyncStrategy::PushdownMask => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_sync_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Mask, + ) + } + } + }); + }); + } + } + } +} + +/// Compare async full scan plus post-filtering against async row-level pushdown +/// strategies. This is the matrix that exercises reader `Auto` cost modeling because +/// the async stream is backed by the push decoder row-group pipeline. 
+fn benchmark_async_strategy_matrix(c: &mut Criterion) { + let parquet_file = Bytes::from(write_parquet_file()); + let filter_types = [ + FilterType::SelectiveUnclustered, + FilterType::ModeratelySelectiveClustered, + FilterType::ModeratelySelectiveUnclustered, + FilterType::Utf8ViewNonEmpty, + ]; + let strategies = [ + AsyncStrategy::FullPostFilter, + AsyncStrategy::PushdownAutoCostModel, + AsyncStrategy::PushdownSelectors, + AsyncStrategy::PushdownMask, + ]; + + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let mut group = c.benchmark_group("arrow_reader_row_filter_async_strategy_matrix"); + + for filter_type in filter_types { + for projection_case in [ + ProjectionCase::AllColumns, + ProjectionCase::ExcludeFilterColumn, + ] { + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let output_projection = output_projection_for(filter_type, &projection_case); + let read_projection = full_post_filter_read_projection(filter_type, &output_projection); + let output_column_names = projection_names(&output_projection); + let projection_mask = ProjectionMask::roots(schema_descr, output_projection); + let read_projection_mask = ProjectionMask::roots(schema_descr, read_projection); + let pred_mask = ProjectionMask::roots( + schema_descr, + filter_type.filter_projection().iter().copied(), + ); + + for strategy in strategies { + let bench_id = BenchmarkId::new( + format!("{filter_type}/{projection_case}"), + strategy.to_string(), + ); + let rt_captured = rt.handle().clone(); + + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let projection_mask = projection_mask.clone(); + let read_projection_mask = read_projection_mask.clone(); + let output_column_names = output_column_names.clone(); + + rt_captured.block_on(async { + match strategy 
{ + AsyncStrategy::FullPostFilter => { + benchmark_async_reader_post_filter( + reader, + read_projection_mask, + output_column_names, + filter_type, + ) + .await + } + AsyncStrategy::PushdownAutoCostModel => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::default(), + ) + .await + } + AsyncStrategy::PushdownSelectors => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Selectors, + ) + .await + } + AsyncStrategy::PushdownMask => { + let row_filter = row_filter_for(filter_type, pred_mask); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Mask, + ) + .await + } + } + }) + }); + }); + } + } + } +} + +/// A small async-only matrix that isolates the cases most relevant to the +/// row-filter cost model. This is intentionally narrower than +/// [`benchmark_async_strategy_matrix`]: it keeps the benchmark output focused +/// on cases where `Auto` should either switch to post-filter execution or +/// explicitly keep predicate pushdown. +/// +/// The `profile_*` cases are derived from DataFusion ClickBench and TPC-DS +/// comparisons. They keep the reader-level shapes worth tracking while +/// excluding query regressions that did not construct a Parquet `RowFilter`. 
+fn benchmark_async_cost_model_focus(c: &mut Criterion) { + const SMALL_TOTAL_ROWS: usize = 20_000; + const SMALL_ROW_GROUP_SIZE: usize = 5_000; + + let parquet_file = Bytes::from(write_parquet_file()); + let small_parquet_file = Bytes::from(write_parquet_file_with_rows( + SMALL_TOTAL_ROWS, + SMALL_ROW_GROUP_SIZE, + )); + let cases = [ + AsyncFocusCase::new( + "utf8_non_empty", + parquet_file.clone(), + FilterType::Utf8ViewNonEmpty, + ProjectionCase::ExcludeFilterColumn, + ), + AsyncFocusCase::new( + "utf8_non_empty", + parquet_file.clone(), + FilterType::Utf8ViewNonEmpty, + ProjectionCase::AllColumns, + ), + AsyncFocusCase::new( + "high_selectivity_float64", + parquet_file.clone(), + FilterType::UnselectiveUnclustered, + ProjectionCase::ExcludeFilterColumn, + ), + AsyncFocusCase::new( + "high_selectivity_ts_clustered", + parquet_file.clone(), + FilterType::UnselectiveClustered, + ProjectionCase::ExcludeFilterColumn, + ), + AsyncFocusCase::new( + "fragmented_int64_10pct", + parquet_file.clone(), + FilterType::ModeratelySelectiveUnclustered, + ProjectionCase::ExcludeFilterColumn, + ), + AsyncFocusCase::new( + "selective_float64_1pct", + parquet_file.clone(), + FilterType::SelectiveUnclustered, + ProjectionCase::ExcludeFilterColumn, + ), + AsyncFocusCase::new( + "profile_q37_scalar_utf8", + parquet_file.clone(), + FilterType::ClickBenchQ37ScalarPrefix, + ProjectionCase::Utf8Only, + ), + AsyncFocusCase::new( + "profile_q6_mixed_predicates", + parquet_file.clone(), + FilterType::ClickBenchQ6MixedPredicates, + ProjectionCase::Float64Only, + ), + AsyncFocusCase::new( + "profile_q40_scalar_group_by", + parquet_file.clone(), + FilterType::ClickBenchQ40ScalarGroupBy, + ProjectionCase::Float64AndTs, + ), + AsyncFocusCase::new( + "profile_q41_sparse_fixed_output", + parquet_file.clone(), + FilterType::ClickBenchQ41SparseFixedOutput, + ProjectionCase::Float64Only, + ), + AsyncFocusCase::new( + "profile_tpcds_q41_complex_or", + parquet_file.clone(), + 
FilterType::TpcdsQ41ComplexOr, + ProjectionCase::Float64Only, + ), + AsyncFocusCase::new( + "profile_tpcds_q20_projected_dynamic_filters", + parquet_file.clone(), + FilterType::TpcdsQ20ProjectedDynamicFilters, + ProjectionCase::FixedColumns, + ), + AsyncFocusCase::new( + "profile_q21_projected_predicate_fixed_output", + parquet_file.clone(), + FilterType::TpcdsQ21ProjectedFixedOutput, + ProjectionCase::FixedColumns, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_10pct", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate10Pct, + ProjectionCase::Int64AndFloat64, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_20pct", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate20Pct, + ProjectionCase::Int64AndFloat64, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_20pct_varwidth_output", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate20Pct, + ProjectionCase::Int64AndUtf8, + ), + AsyncFocusCase::new( + "profile_projected_ts_20pct_varwidth_output", + parquet_file.clone(), + FilterType::ProjectedTs20PctClustered, + ProjectionCase::TsAndUtf8, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_30pct", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate30Pct, + ProjectionCase::Int64AndFloat64, + ), + AsyncFocusCase::new( + "profile_q2_projected_predicate_40pct", + parquet_file.clone(), + FilterType::TpcdsQ2ProjectedPredicate40Pct, + ProjectionCase::Int64AndFloat64, + ), + AsyncFocusCase::new( + "profile_q1_count_only", + parquet_file.clone(), + FilterType::ClickBenchQ41SparseFixedOutput, + ProjectionCase::CountOnly, + ), + AsyncFocusCase::new( + "profile_q19_no_defer", + parquet_file.clone(), + FilterType::PointLookup, + ProjectionCase::FilterColumnsOnly, + ), + AsyncFocusCase::new( + "profile_sparse_fixed_deferred_output", + parquet_file.clone(), + FilterType::PointLookup, + ProjectionCase::Float64Only, + ), + AsyncFocusCase::new( + "profile_tpcds_sparse_projected_fact_scan", 
+ parquet_file.clone(), + FilterType::TpcdsSparseProjectedFactScan, + ProjectionCase::FixedColumns, + ), + AsyncFocusCase::new( + "profile_q83_sparse_utf8_projected", + parquet_file.clone(), + FilterType::Utf8ViewMissing, + ProjectionCase::AllColumns, + ), + AsyncFocusCase::new( + "profile_small_scalar_no_defer", + small_parquet_file.clone(), + FilterType::ModeratelySelectiveUnclustered, + ProjectionCase::FilterColumnsOnly, + ), + AsyncFocusCase::new( + "profile_small_q37_scalar_utf8", + small_parquet_file, + FilterType::ClickBenchQ37ScalarPrefix, + ProjectionCase::Utf8Only, + ), + AsyncFocusCase::new( + "profile_q9_quantity_count", + parquet_file.clone(), + FilterType::TpcdsQ9QuantityRange, + ProjectionCase::FilterColumnsOnly, + ), + AsyncFocusCase::new( + "profile_q9_quantity_avg", + parquet_file, + FilterType::TpcdsQ9QuantityRange, + ProjectionCase::Float64Only, + ), + ]; + let strategies = [ + AsyncStrategy::FullPostFilter, + AsyncStrategy::PushdownAutoCostModel, + AsyncStrategy::PushdownMask, + AsyncStrategy::PushdownSelectors, + ]; + + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let mut group = c.benchmark_group("arrow_reader_row_filter_async_cost_model_focus"); + + for case in cases { + benchmark_async_focus_case(&mut group, &rt, case, &strategies); + } +} + +/// Isolate projected scans that do not construct a [`RowFilter`]. +/// +/// This tracks the reader-level shape seen in TPC-DS Q83 return-table scans: +/// a narrow primitive projection where row-level pushdown metrics are zero. +/// It deliberately lives outside the cost-model matrix because there is no +/// filter strategy to choose. 
+fn benchmark_projection_scan_focus(c: &mut Criterion) { + let parquet_file = Bytes::from(write_parquet_file()); + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let mut group = c.benchmark_group("arrow_reader_projection_scan_focus"); + + let case_name = "profile_q83_return_scan_primitives"; + let projection = vec![0, 1, 3]; + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let projection_mask = ProjectionMask::roots(schema_descr, projection); + + let bench_id = BenchmarkId::new(case_name, "async"); + let rt_captured = rt.handle().clone(); + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let projection_mask = projection_mask.clone(); + rt_captured.block_on(benchmark_async_reader_projected(reader, projection_mask)); + }); + }); + + let bench_id = BenchmarkId::new(case_name, "sync"); + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let projection_mask = projection_mask.clone(); + benchmark_sync_reader_projected(reader, projection_mask); + }); + }); +} + +struct AsyncFocusCase { + case_name: &'static str, + parquet_file: Bytes, + filter_type: FilterType, + projection_case: ProjectionCase, +} + +impl AsyncFocusCase { + fn new( + case_name: &'static str, + parquet_file: Bytes, + filter_type: FilterType, + projection_case: ProjectionCase, + ) -> Self { + Self { + case_name, + parquet_file, + filter_type, + projection_case, + } + } +} + +fn benchmark_async_focus_case( + group: &mut BenchmarkGroup<'_, WallTime>, + rt: &tokio::runtime::Runtime, + case: AsyncFocusCase, + strategies: &[AsyncStrategy], +) { + let AsyncFocusCase { + case_name, + parquet_file, + filter_type, + projection_case, + } = case; + + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let 
schema_descr = metadata.file_metadata().schema_descr(); + let output_projection = output_projection_for(filter_type, &projection_case); + let read_projection = full_post_filter_read_projection(filter_type, &output_projection); + let output_column_names = projection_names(&output_projection); + let projection_mask = ProjectionMask::roots(schema_descr, output_projection); + let read_projection_mask = ProjectionMask::roots(schema_descr, read_projection); + let pred_mask = ProjectionMask::roots( + schema_descr, + filter_type.filter_projection().iter().copied(), + ); + let q6_int64_pred_mask = ProjectionMask::roots(schema_descr, [0]); + let q6_utf8_pred_mask = ProjectionMask::roots(schema_descr, [2]); + let q41_int64_pred_mask = ProjectionMask::roots(schema_descr, [0]); + let q41_ts_pred_mask = ProjectionMask::roots(schema_descr, [3]); + let q40_float64_pred_mask = ProjectionMask::roots(schema_descr, [1]); + + for strategy in strategies.iter().copied() { + let bench_id = BenchmarkId::new( + format!("{case_name}/{projection_case}"), + strategy.to_string(), + ); + let rt_captured = rt.handle().clone(); + + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let q6_int64_pred_mask = q6_int64_pred_mask.clone(); + let q6_utf8_pred_mask = q6_utf8_pred_mask.clone(); + let q41_int64_pred_mask = q41_int64_pred_mask.clone(); + let q41_ts_pred_mask = q41_ts_pred_mask.clone(); + let q40_float64_pred_mask = q40_float64_pred_mask.clone(); + let projection_mask = projection_mask.clone(); + let read_projection_mask = read_projection_mask.clone(); + let output_column_names = output_column_names.clone(); + + rt_captured.block_on(async { + match strategy { + AsyncStrategy::FullPostFilter => { + benchmark_async_reader_post_filter( + reader, + read_projection_mask, + output_column_names, + filter_type, + ) + .await + } + AsyncStrategy::PushdownAutoCostModel => { + let row_filter = row_filter_for_focus_case( + filter_type, 
+ pred_mask, + q6_int64_pred_mask, + q6_utf8_pred_mask, + q41_int64_pred_mask, + q41_ts_pred_mask, + q40_float64_pred_mask, + ); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::default(), + ) + .await + } + AsyncStrategy::PushdownSelectors => { + let row_filter = row_filter_for_focus_case( + filter_type, + pred_mask, + q6_int64_pred_mask, + q6_utf8_pred_mask, + q41_int64_pred_mask, + q41_ts_pred_mask, + q40_float64_pred_mask, + ); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Selectors, + ) + .await + } + AsyncStrategy::PushdownMask => { + let row_filter = row_filter_for_focus_case( + filter_type, + pred_mask, + q6_int64_pred_mask, + q6_utf8_pred_mask, + q41_int64_pred_mask, + q41_ts_pred_mask, + q40_float64_pred_mask, + ); + benchmark_async_reader_with_policy( + reader, + projection_mask, + row_filter, + RowSelectionPolicy::Mask, + ) + .await + } + } + }) + }); + }); + } +} + +fn output_projection_for(filter_type: FilterType, projection_case: &ProjectionCase) -> Vec { + let filter_columns = filter_type.filter_projection(); + match projection_case { + ProjectionCase::AllColumns | ProjectionCase::ExcludeFilterColumn => COLUMN_NAMES + .iter() + .enumerate() + .map(|(idx, _)| idx) + .filter(move |idx| { + matches!(projection_case, ProjectionCase::AllColumns) + || !filter_columns.contains(idx) + }) + .collect(), + ProjectionCase::FilterColumnsOnly => filter_columns.to_vec(), + ProjectionCase::CountOnly => vec![], + ProjectionCase::FixedColumns => vec![0, 1, 3], + ProjectionCase::Float64AndTs => vec![1, 3], + ProjectionCase::Float64Only => vec![1], + ProjectionCase::Int64AndFloat64 => vec![0, 1], + ProjectionCase::Int64AndUtf8 => vec![0, 2], + ProjectionCase::TsAndUtf8 => vec![2, 3], + ProjectionCase::Utf8Only => vec![2], + } +} + +fn full_post_filter_read_projection( + filter_type: FilterType, + output_projection: &[usize], +) -> Vec { + let mut 
read_projection = output_projection.to_vec(); + for filter_idx in filter_type.filter_projection() { + if !read_projection.contains(filter_idx) { + read_projection.push(*filter_idx); + } + } + read_projection.sort_unstable(); + read_projection +} + +fn projection_names(projection: &[usize]) -> Vec<&'static str> { + projection.iter().map(|idx| COLUMN_NAMES[*idx]).collect() +} + +fn row_filter_for(filter_type: FilterType, pred_mask: ProjectionMask) -> RowFilter { + let filter = ArrowPredicateFn::new(pred_mask, move |batch| filter_type.filter_batch(&batch)); + RowFilter::new(vec![Box::new(filter)]) +} + +fn row_filter_for_focus_case( + filter_type: FilterType, + pred_mask: ProjectionMask, + q6_int64_pred_mask: ProjectionMask, + q6_utf8_pred_mask: ProjectionMask, + q41_int64_pred_mask: ProjectionMask, + q41_ts_pred_mask: ProjectionMask, + q40_float64_pred_mask: ProjectionMask, +) -> RowFilter { + match filter_type { + FilterType::ClickBenchQ6MixedPredicates => { + let int64_filter = + ArrowPredicateFn::new(q6_int64_pred_mask, move |batch: RecordBatch| { + let int64 = batch.column(batch.schema().index_of("int64")?); + eq(int64, &Int64Array::new_scalar(9999)) + }); + let utf8_filter = + ArrowPredicateFn::new(q6_utf8_pred_mask, move |batch: RecordBatch| { + let utf8 = batch.column(batch.schema().index_of("utf8View")?); + neq(utf8, &StringViewArray::new_scalar("")) + }); + + RowFilter::new(vec![Box::new(int64_filter), Box::new(utf8_filter)]) + } + FilterType::ClickBenchQ40ScalarGroupBy => { + let int64_filter = + ArrowPredicateFn::new(q41_int64_pred_mask, move |batch: RecordBatch| { + let int64 = batch.column(batch.schema().index_of("int64")?); + eq(int64, &Int64Array::new_scalar(62)) + }); + let float64_filter = + ArrowPredicateFn::new(q40_float64_pred_mask, move |batch: RecordBatch| { + let float64 = batch.column(batch.schema().index_of("float64")?); + gt(float64, &Float64Array::new_scalar(10.0)) + }); + let ts_filter = ArrowPredicateFn::new(q41_ts_pred_mask, move |batch: 
RecordBatch| { + let ts = batch.column(batch.schema().index_of("ts")?); + lt(ts, &TimestampMillisecondArray::new_scalar(9000)) + }); + + RowFilter::new(vec![ + Box::new(int64_filter), + Box::new(float64_filter), + Box::new(ts_filter), + ]) + } + FilterType::ClickBenchQ41SparseFixedOutput + | FilterType::TpcdsQ20ProjectedDynamicFilters + | FilterType::TpcdsQ21ProjectedFixedOutput => { + let int64_filter = + ArrowPredicateFn::new(q41_int64_pred_mask, move |batch: RecordBatch| { + let int64 = batch.column(batch.schema().index_of("int64")?); + let scalar = match filter_type { + FilterType::TpcdsQ20ProjectedDynamicFilters => 12, + _ => 8, + }; + lt(int64, &Int64Array::new_scalar(scalar)) + }); + let ts_filter = ArrowPredicateFn::new(q41_ts_pred_mask, move |batch: RecordBatch| { + let ts = batch.column(batch.schema().index_of("ts")?); + lt(ts, &TimestampMillisecondArray::new_scalar(9000)) + }); + + RowFilter::new(vec![Box::new(int64_filter), Box::new(ts_filter)]) + } + _ => row_filter_for(filter_type, pred_mask), + } +} + +#[derive(Clone, Copy)] +enum NestedFilterType { + AlwaysTrueTag, + TagNotZero, +} + +impl std::fmt::Display for NestedFilterType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::AlwaysTrueTag => write!(f, "always_true_tag"), + Self::TagNotZero => write!(f, "tag_not_zero"), + } + } +} + +impl NestedFilterType { + fn filter_batch(self, batch: &RecordBatch) -> arrow::error::Result { + match self { + Self::AlwaysTrueTag => Ok(BooleanArray::from(vec![true; batch.num_rows()])), + Self::TagNotZero => { + let tag = batch.column(batch.schema().index_of("tag")?); + let scalar = StringViewArray::new_scalar("tag_0"); + neq(tag, &scalar) + } + } + } +} + +fn nested_row_filter_for(filter_type: NestedFilterType, pred_mask: ProjectionMask) -> RowFilter { + let filter = ArrowPredicateFn::new(pred_mask, move |batch| filter_type.filter_batch(&batch)); + RowFilter::new(vec![Box::new(filter)]) +} + /// Use async API async fn 
benchmark_async_reader( reader: InMemoryReader, @@ -529,6 +1570,94 @@ async fn benchmark_async_reader( } } +async fn benchmark_async_reader_with_policy( + reader: InMemoryReader, + projection_mask: ProjectionMask, + row_filter: RowFilter, + row_selection_policy: RowSelectionPolicy, +) { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(projection_mask) + .with_row_filter(row_filter) + .with_row_selection_policy(row_selection_policy) + .build() + .unwrap(); + while let Some(b) = stream.next().await { + b.unwrap(); // consume the batches, no buffering + } +} + +async fn benchmark_async_reader_post_filter( + reader: InMemoryReader, + read_projection: ProjectionMask, + output_column_names: Vec<&'static str>, + filter_type: FilterType, +) { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(read_projection) + .build() + .unwrap(); + + while let Some(b) = stream.next().await { + let batch = b.unwrap(); + let filter = filter_type.filter_batch(&batch).unwrap(); + let filtered = arrow_select::filter::filter_record_batch(&batch, &filter).unwrap(); + let output_projection = output_column_names + .iter() + .map(|name| filtered.schema().index_of(name).unwrap()) + .collect::>(); + let output = filtered.project(&output_projection).unwrap(); + std::hint::black_box(output.num_rows()); + } +} + +async fn benchmark_async_reader_post_filter_nested( + reader: InMemoryReader, + read_projection: ProjectionMask, + output_column_names: &[&str], + filter_type: NestedFilterType, +) { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(read_projection) + .build() + .unwrap(); + + while let Some(b) = stream.next().await { + let batch = b.unwrap(); + let filter = filter_type.filter_batch(&batch).unwrap(); + let filtered = 
arrow_select::filter::filter_record_batch(&batch, &filter).unwrap(); + let output_projection = output_column_names + .iter() + .map(|name| filtered.schema().index_of(name).unwrap()) + .collect::>(); + let output = filtered.project(&output_projection).unwrap(); + std::hint::black_box(output.num_rows()); + } +} + +async fn benchmark_async_reader_projected(reader: InMemoryReader, projection_mask: ProjectionMask) { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .with_batch_size(8192) + .with_projection(projection_mask) + .build() + .unwrap(); + while let Some(b) = stream.next().await { + let batch = b.unwrap(); + std::hint::black_box(batch.num_rows()); + } +} + /// Like [`benchmark_async_reader`] but also threads `with_limit(limit)` into /// the stream builder. Used by the `LIMIT` benchmark below. async fn benchmark_async_reader_with_limit( @@ -569,6 +1698,65 @@ fn benchmark_sync_reader( } } +fn benchmark_sync_reader_with_policy( + reader: InMemoryReader, + projection_mask: ProjectionMask, + row_filter: RowFilter, + row_selection_policy: RowSelectionPolicy, +) { + let stream = ParquetRecordBatchReaderBuilder::try_new(reader.into_inner()) + .unwrap() + .with_batch_size(8192) + .with_projection(projection_mask) + .with_row_filter(row_filter) + .with_row_selection_policy(row_selection_policy) + .build() + .unwrap(); + for b in stream { + b.unwrap(); // consume the batches, no buffering + } +} + +fn benchmark_sync_reader_post_filter( + reader: InMemoryReader, + read_projection: ProjectionMask, + output_column_names: Vec<&'static str>, + filter_type: FilterType, +) { + let stream = ParquetRecordBatchReaderBuilder::try_new(reader.into_inner()) + .unwrap() + .with_batch_size(8192) + .with_projection(read_projection) + .build() + .unwrap(); + + for b in stream { + let batch = b.unwrap(); + let filter = filter_type.filter_batch(&batch).unwrap(); + let filtered = arrow_select::filter::filter_record_batch(&batch, &filter).unwrap(); + let 
output_projection = output_column_names + .iter() + .map(|name| filtered.schema().index_of(name).unwrap()) + .collect::>(); + let output = filtered.project(&output_projection).unwrap(); + std::hint::black_box(output.num_rows()); + } +} + +fn benchmark_sync_reader_projected(reader: InMemoryReader, projection_mask: ProjectionMask) { + let stream = ParquetRecordBatchReaderBuilder::try_new(reader.into_inner()) + .unwrap() + .with_batch_size(8192) + .with_projection(projection_mask) + .build() + .unwrap(); + + for b in stream { + let batch = b.unwrap(); + std::hint::black_box(batch.num_rows()); + } +} + /// Adapter to read asynchronously from in memory bytes and always loads the /// metadata with page indexes. #[derive(Debug, Clone)] @@ -636,7 +1824,6 @@ fn benchmark_filters_with_limit(c: &mut Criterion) { ProjectionCase::AllColumns, ProjectionCase::ExcludeFilterColumn, ]; - let all_indices = vec![0, 1, 2, 3]; let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() @@ -648,14 +1835,7 @@ fn benchmark_filters_with_limit(c: &mut Criterion) { for filter_type in filter_types { for proj_case in &projection_cases { let filter_col = filter_type.filter_projection().to_vec(); - let output_projection: Vec = match proj_case { - ProjectionCase::AllColumns => all_indices.clone(), - ProjectionCase::ExcludeFilterColumn => all_indices - .iter() - .copied() - .filter(|i| !filter_col.contains(i)) - .collect(), - }; + let output_projection = output_projection_for(filter_type, proj_case); let reader = InMemoryReader::try_new(&parquet_file).unwrap(); let metadata = Arc::clone(reader.metadata()); @@ -693,9 +1873,108 @@ fn benchmark_filters_with_limit(c: &mut Criterion) { } } +/// Focused nested-output case for post-filter cost modeling. +/// +/// The predicate column is an unprojected variable-width scalar column, and the +/// output is a whole nested `Struct` root. 
This isolates the reader case enabled +/// by root-aware post-filter projection without requiring recursive nested-child +/// projection. +fn benchmark_async_nested_post_filter_focus(c: &mut Criterion) { + let parquet_file = Bytes::from(write_nested_parquet_file_with_rows( + TOTAL_ROWS, + ROW_GROUP_SIZE, + )); + let strategies = [ + AsyncStrategy::FullPostFilter, + AsyncStrategy::PushdownAutoCostModel, + AsyncStrategy::PushdownMask, + AsyncStrategy::PushdownSelectors, + ]; + + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + let mut group = c.benchmark_group("arrow_reader_row_filter_async_nested_post_filter_focus"); + let reader = InMemoryReader::try_new(&parquet_file).unwrap(); + let metadata = Arc::clone(reader.metadata()); + let schema_descr = metadata.file_metadata().schema_descr(); + let output_projection = ProjectionMask::columns(schema_descr, ["payload"]); + let read_projection = ProjectionMask::columns(schema_descr, ["tag", "payload"]); + let pred_mask = ProjectionMask::columns(schema_descr, ["tag"]); + let filter_cases = [ + NestedFilterType::AlwaysTrueTag, + NestedFilterType::TagNotZero, + ]; + + for filter_case in filter_cases { + for strategy in strategies { + let bench_id = BenchmarkId::new( + format!("whole_struct_output/{filter_case}"), + strategy.to_string(), + ); + let rt_captured = rt.handle().clone(); + group.bench_function(bench_id, |b| { + b.iter(|| { + let reader = reader.clone(); + let pred_mask = pred_mask.clone(); + let output_projection = output_projection.clone(); + let read_projection = read_projection.clone(); + rt_captured.block_on(async { + match strategy { + AsyncStrategy::FullPostFilter => { + benchmark_async_reader_post_filter_nested( + reader, + read_projection, + &["payload"], + filter_case, + ) + .await + } + AsyncStrategy::PushdownAutoCostModel => { + benchmark_async_reader_with_policy( + reader, + output_projection, + nested_row_filter_for(filter_case, pred_mask), + 
RowSelectionPolicy::default(), + ) + .await + } + AsyncStrategy::PushdownSelectors => { + benchmark_async_reader_with_policy( + reader, + output_projection, + nested_row_filter_for(filter_case, pred_mask), + RowSelectionPolicy::Selectors, + ) + .await + } + AsyncStrategy::PushdownMask => { + benchmark_async_reader_with_policy( + reader, + output_projection, + nested_row_filter_for(filter_case, pred_mask), + RowSelectionPolicy::Mask, + ) + .await + } + } + }) + }); + }); + } + } +} + criterion_group!( benches, benchmark_filters_and_projections, + benchmark_sync_strategy_matrix, + benchmark_async_strategy_matrix, + benchmark_async_cost_model_focus, + benchmark_projection_scan_focus, benchmark_filters_with_limit, + benchmark_async_nested_post_filter_focus, ); criterion_main!(benches); diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs index 3fd5e1d650be..7538fd0ef526 100644 --- a/parquet/src/arrow/arrow_reader/filter.rs +++ b/parquet/src/arrow/arrow_reader/filter.rs @@ -190,6 +190,14 @@ impl RowFilter { pub fn new(predicates: Vec>) -> Self { Self { predicates } } + /// Returns the union of all predicate projections, if there are any predicates + pub(crate) fn union_projection(&self) -> Option { + let mut projection = self.predicates.first()?.projection().clone(); + for predicate in self.predicates.iter().skip(1) { + projection.union(predicate.projection()); + } + Some(projection) + } /// Returns the inner predicates pub fn predicates(&self) -> &Vec> { &self.predicates diff --git a/parquet/src/arrow/arrow_reader/metrics.rs b/parquet/src/arrow/arrow_reader/metrics.rs index b36d79586bb3..cb17b9a77c14 100644 --- a/parquet/src/arrow/arrow_reader/metrics.rs +++ b/parquet/src/arrow/arrow_reader/metrics.rs @@ -17,8 +17,87 @@ //! 
[ArrowReaderMetrics] for collecting metrics about the Arrow reader +use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, RowGroupExecutionMode, RowSelectionStrategyDecision, + RowSelectionStrategyReason, +}; use std::sync::Arc; -use std::sync::atomic::AtomicUsize; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::time::{Duration, Instant}; + +#[derive(Clone, Copy, Debug)] +pub(crate) enum ArrowReaderPhase { + PredicateRangePlanning, + PredicateDecode, + PredicateEvaluate, + PredicateSelectionBuild, + PredicateSelectionMerge, + OutputRangePlanning, + OutputSelectionResolve, + OutputMaskFilter, + PostFilterPredicateProject, + PostFilterPredicateEvaluate, + PostFilterApplyFilter, + PostFilterOutputProject, + PostSelectionApplyFilter, +} + +impl ArrowReaderPhase { + const COUNT: usize = 13; + #[cfg(all(test, feature = "async"))] + const ALL: [Self; Self::COUNT] = [ + Self::PredicateRangePlanning, + Self::PredicateDecode, + Self::PredicateEvaluate, + Self::PredicateSelectionBuild, + Self::PredicateSelectionMerge, + Self::OutputRangePlanning, + Self::OutputSelectionResolve, + Self::OutputMaskFilter, + Self::PostFilterPredicateProject, + Self::PostFilterPredicateEvaluate, + Self::PostFilterApplyFilter, + Self::PostFilterOutputProject, + Self::PostSelectionApplyFilter, + ]; + + fn index(self) -> usize { + match self { + Self::PredicateRangePlanning => 0, + Self::PredicateDecode => 1, + Self::PredicateEvaluate => 2, + Self::PredicateSelectionBuild => 3, + Self::PredicateSelectionMerge => 4, + Self::OutputRangePlanning => 5, + Self::OutputSelectionResolve => 6, + Self::OutputMaskFilter => 7, + Self::PostFilterPredicateProject => 8, + Self::PostFilterPredicateEvaluate => 9, + Self::PostFilterApplyFilter => 10, + Self::PostFilterOutputProject => 11, + Self::PostSelectionApplyFilter => 12, + } + } + + #[cfg(all(test, feature = "async"))] + fn name(self) -> &'static str { + match self { + Self::PredicateRangePlanning => 
"predicate_range_planning", + Self::PredicateDecode => "predicate_decode", + Self::PredicateEvaluate => "predicate_evaluate", + Self::PredicateSelectionBuild => "predicate_selection_build", + Self::PredicateSelectionMerge => "predicate_selection_merge", + Self::OutputRangePlanning => "output_range_planning", + Self::OutputSelectionResolve => "output_selection_resolve", + Self::OutputMaskFilter => "output_mask_filter", + Self::PostFilterPredicateProject => "post_filter_predicate_project", + Self::PostFilterPredicateEvaluate => "post_filter_predicate_evaluate", + Self::PostFilterApplyFilter => "post_filter_apply_filter", + Self::PostFilterOutputProject => "post_filter_output_project", + Self::PostSelectionApplyFilter => "post_selection_apply_filter", + } + } +} /// This enum represents the state of Arrow reader metrics collection. /// @@ -45,7 +124,12 @@ impl ArrowReaderMetrics { /// Creates a new instance of [`ArrowReaderMetrics::Enabled`] pub fn enabled() -> Self { - Self::Enabled(Arc::new(ArrowReaderMetricsInner::new())) + Self::Enabled(Arc::new(ArrowReaderMetricsInner::new(false))) + } + + #[cfg(all(test, feature = "async"))] + pub(crate) fn enabled_with_phase_profile() -> Self { + Self::Enabled(Arc::new(ArrowReaderMetricsInner::new(true))) } /// Predicate Cache: number of records read directly from the inner reader @@ -82,14 +166,130 @@ impl ArrowReaderMetrics { pub fn records_read_from_cache(&self) -> Option { match self { Self::Disabled => None, - Self::Enabled(inner) => Some( - inner - .records_read_from_cache - .load(std::sync::atomic::Ordering::Relaxed), - ), + Self::Enabled(inner) => Some(inner.records_read_from_cache.load(Ordering::Relaxed)), } } + /// Row Selection: number of selected rows recorded in planned selections + pub fn row_selection_selected_rows(&self) -> Option { + self.load(|inner| &inner.row_selection_selected_rows) + } + + /// Row Selection: number of skipped rows recorded in planned selections + pub fn row_selection_skipped_rows(&self) -> 
Option { + self.load(|inner| &inner.row_selection_skipped_rows) + } + + /// Row Selection: number of non-empty selectors recorded in planned selections + pub fn row_selection_selector_count(&self) -> Option { + self.load(|inner| &inner.row_selection_selector_count) + } + + /// Row Selection: number of selected runs recorded in planned selections + pub fn row_selection_selected_run_count(&self) -> Option { + self.load(|inner| &inner.row_selection_selected_run_count) + } + + /// Row Selection: number of skipped runs recorded in planned selections + pub fn row_selection_skipped_run_count(&self) -> Option { + self.load(|inner| &inner.row_selection_skipped_run_count) + } + + /// Row Selection: number of plans using mask materialization + pub fn row_selection_mask_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_mask_plan_count) + } + + /// Row Selection: number of plans using selector materialization + pub fn row_selection_selector_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_selector_plan_count) + } + + /// Row Selection: number of plans forced to masks + pub fn row_selection_forced_mask_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_forced_mask_plan_count) + } + + /// Row Selection: number of plans forced to selectors + pub fn row_selection_forced_selector_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_forced_selector_plan_count) + } + + /// Row Selection: number of Auto plans choosing masks for empty selections + pub fn row_selection_auto_mask_empty_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_mask_empty_plan_count) + } + + /// Row Selection: number of Auto plans choosing masks for short runs + pub fn row_selection_auto_mask_short_run_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_mask_short_run_plan_count) + } + + /// Row Selection: number of Auto plans choosing masks for fragmented selected rows + pub fn 
row_selection_auto_mask_fragmented_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_mask_fragmented_plan_count) + } + + /// Row Selection: number of Auto plans choosing masks for high selected-row ratio + pub fn row_selection_auto_mask_high_ratio_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_mask_high_ratio_plan_count) + } + + /// Row Selection: number of Auto plans choosing selectors for clustered selected rows + pub fn row_selection_auto_selector_clustered_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_selector_clustered_plan_count) + } + + /// Row Selection: number of Auto plans choosing selectors for long runs + pub fn row_selection_auto_selector_long_run_plan_count(&self) -> Option { + self.load(|inner| &inner.row_selection_auto_selector_long_run_plan_count) + } + + /// Cost model: number of row groups included in the observation window + pub fn cost_model_observed_row_group_count(&self) -> Option { + self.load(|inner| &inner.cost_model_observed_row_group_count) + } + + /// Cost model: number of row groups executed with pushdown + pub fn cost_model_pushdown_row_group_count(&self) -> Option { + self.load(|inner| &inner.cost_model_pushdown_row_group_count) + } + + /// Cost model: number of row groups executed with post-filter + pub fn cost_model_post_filter_row_group_count(&self) -> Option { + self.load(|inner| &inner.cost_model_post_filter_row_group_count) + } + + /// Cost model: number of incomplete observation-window decisions + pub fn cost_model_observation_incomplete_count(&self) -> Option { + self.load(|inner| &inner.cost_model_observation_incomplete_count) + } + + /// Cost model: number of times pushdown remained preferred + pub fn cost_model_pushdown_still_preferred_count(&self) -> Option { + self.load(|inner| &inner.cost_model_pushdown_still_preferred_count) + } + + /// Cost model: number of high-selectivity no-pruning triggers + pub fn 
cost_model_high_selectivity_no_pruning_count(&self) -> Option { + self.load(|inner| &inner.cost_model_high_selectivity_no_pruning_count) + } + + /// Cost model: number of projected-predicate moderate-selectivity triggers + pub fn cost_model_projected_predicate_moderate_selectivity_count(&self) -> Option { + self.load(|inner| &inner.cost_model_projected_predicate_moderate_selectivity_count) + } + + /// Cost model: number of fragmented moderate-selectivity triggers + pub fn cost_model_fragmented_moderate_selectivity_count(&self) -> Option { + self.load(|inner| &inner.cost_model_fragmented_moderate_selectivity_count) + } + + /// Cost model: number of fragmented high-selectivity triggers + pub fn cost_model_fragmented_high_selectivity_count(&self) -> Option { + self.load(|inner| &inner.cost_model_fragmented_high_selectivity_count) + } + /// Increments the count of records read from the inner reader pub(crate) fn increment_inner_reads(&self, count: usize) { let Self::Enabled(inner) = self else { @@ -97,7 +297,7 @@ impl ArrowReaderMetrics { }; inner .records_read_from_inner - .fetch_add(count, std::sync::atomic::Ordering::Relaxed); + .fetch_add(count, Ordering::Relaxed); } /// Increments the count of records read from the cache @@ -108,7 +308,161 @@ impl ArrowReaderMetrics { inner .records_read_from_cache - .fetch_add(count, std::sync::atomic::Ordering::Relaxed); + .fetch_add(count, Ordering::Relaxed); + } + + pub(crate) fn record_row_selection(&self, decision: RowSelectionStrategyDecision) { + let Self::Enabled(inner) = self else { + return; + }; + + let shape = decision.shape; + inner + .row_selection_selected_rows + .fetch_add(shape.selected_rows, Ordering::Relaxed); + inner + .row_selection_skipped_rows + .fetch_add(shape.skipped_rows, Ordering::Relaxed); + inner + .row_selection_selector_count + .fetch_add(shape.selector_count, Ordering::Relaxed); + inner + .row_selection_selected_run_count + .fetch_add(shape.selected_run_count, Ordering::Relaxed); + inner + 
.row_selection_skipped_run_count + .fetch_add(shape.skipped_run_count, Ordering::Relaxed); + + let strategy_count = if decision.uses_mask() { + &inner.row_selection_mask_plan_count + } else { + &inner.row_selection_selector_plan_count + }; + strategy_count.fetch_add(1, Ordering::Relaxed); + + let decision_count = match decision.reason { + RowSelectionStrategyReason::ForcedMask => &inner.row_selection_forced_mask_plan_count, + RowSelectionStrategyReason::ForcedSelectors => { + &inner.row_selection_forced_selector_plan_count + } + RowSelectionStrategyReason::AutoMaskEmptySelection => { + &inner.row_selection_auto_mask_empty_plan_count + } + RowSelectionStrategyReason::AutoMaskShortRuns => { + &inner.row_selection_auto_mask_short_run_plan_count + } + RowSelectionStrategyReason::AutoMaskFragmentedSelection => { + &inner.row_selection_auto_mask_fragmented_plan_count + } + RowSelectionStrategyReason::AutoMaskHighSelectedRatio => { + &inner.row_selection_auto_mask_high_ratio_plan_count + } + RowSelectionStrategyReason::AutoSelectorClusteredSelection => { + &inner.row_selection_auto_selector_clustered_plan_count + } + RowSelectionStrategyReason::AutoSelectorLongRuns => { + &inner.row_selection_auto_selector_long_run_plan_count + } + }; + decision_count.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_cost_model_observed_row_group(&self) { + let Self::Enabled(inner) = self else { + return; + }; + inner + .cost_model_observed_row_group_count + .fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_cost_model_row_group(&self, mode: RowGroupExecutionMode) { + let Self::Enabled(inner) = self else { + return; + }; + + let counter = match mode { + RowGroupExecutionMode::Pushdown(_) => &inner.cost_model_pushdown_row_group_count, + RowGroupExecutionMode::PostFilter => &inner.cost_model_post_filter_row_group_count, + }; + counter.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn record_cost_model_trigger(&self, reason: CostModelDecisionReason) { + let 
Self::Enabled(inner) = self else { + return; + }; + + let counter = match reason { + CostModelDecisionReason::HighSelectivityNoPruning => { + &inner.cost_model_high_selectivity_no_pruning_count + } + CostModelDecisionReason::ProjectedPredicateModerateSelectivity => { + &inner.cost_model_projected_predicate_moderate_selectivity_count + } + CostModelDecisionReason::FragmentedModerateSelectivity => { + &inner.cost_model_fragmented_moderate_selectivity_count + } + CostModelDecisionReason::FragmentedHighSelectivity => { + &inner.cost_model_fragmented_high_selectivity_count + } + CostModelDecisionReason::ObservationIncomplete => { + &inner.cost_model_observation_incomplete_count + } + CostModelDecisionReason::PushdownStillPreferred => { + &inner.cost_model_pushdown_still_preferred_count + } + }; + counter.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn time_phase(&self, phase: ArrowReaderPhase, f: impl FnOnce() -> T) -> T { + let Self::Enabled(inner) = self else { + return f(); + }; + if !inner.phase_profile_enabled { + return f(); + } + + let start = Instant::now(); + let result = f(); + inner.record_phase(phase, start.elapsed()); + result + } + + #[cfg(all(test, feature = "async"))] + pub(crate) fn phase_profile_report(&self) -> Option { + let Self::Enabled(inner) = self else { + return None; + }; + if !inner.phase_profile_enabled { + return None; + } + + let mut lines = vec!["phase,total_ms,count,avg_us".to_string()]; + for phase in ArrowReaderPhase::ALL { + let idx = phase.index(); + let total_ns = inner.phase_ns[idx].load(Ordering::Relaxed); + let count = inner.phase_counts[idx].load(Ordering::Relaxed); + if count == 0 { + continue; + } + + let total_ms = total_ns as f64 / 1_000_000.0; + let avg_us = total_ns as f64 / count as f64 / 1_000.0; + lines.push(format!( + "{},{total_ms:.3},{count},{avg_us:.3}", + phase.name() + )); + } + Some(lines.join("\n")) + } + + fn load(&self, metric: fn(&ArrowReaderMetricsInner) -> &AtomicUsize) -> Option { + match self { + 
Self::Disabled => None, + Self::Enabled(inner) => Some(metric(inner).load(Ordering::Relaxed)), + } } } @@ -122,14 +476,98 @@ pub struct ArrowReaderMetricsInner { records_read_from_inner: AtomicUsize, /// Total number of records read from previously cached pages records_read_from_cache: AtomicUsize, + /// Total selected rows in planned row selections + row_selection_selected_rows: AtomicUsize, + /// Total skipped rows in planned row selections + row_selection_skipped_rows: AtomicUsize, + /// Total non-empty selectors in planned row selections + row_selection_selector_count: AtomicUsize, + /// Total selected runs in planned row selections + row_selection_selected_run_count: AtomicUsize, + /// Total skipped runs in planned row selections + row_selection_skipped_run_count: AtomicUsize, + /// Number of plans materialized with masks + row_selection_mask_plan_count: AtomicUsize, + /// Number of plans materialized with selectors + row_selection_selector_plan_count: AtomicUsize, + /// Number of plans forced to masks + row_selection_forced_mask_plan_count: AtomicUsize, + /// Number of plans forced to selectors + row_selection_forced_selector_plan_count: AtomicUsize, + /// Number of Auto plans choosing masks for empty selections + row_selection_auto_mask_empty_plan_count: AtomicUsize, + /// Number of Auto plans choosing masks for short runs + row_selection_auto_mask_short_run_plan_count: AtomicUsize, + /// Number of Auto plans using masks for fragmented selected rows + row_selection_auto_mask_fragmented_plan_count: AtomicUsize, + /// Number of Auto plans using masks for high selected-row ratio + row_selection_auto_mask_high_ratio_plan_count: AtomicUsize, + /// Number of Auto plans using selectors for clustered selected rows + row_selection_auto_selector_clustered_plan_count: AtomicUsize, + /// Number of Auto plans choosing selectors for long runs + row_selection_auto_selector_long_run_plan_count: AtomicUsize, + /// Number of row groups included in cost-model observation + 
cost_model_observed_row_group_count: AtomicUsize, + /// Number of cost-model eligible row groups executed with pushdown + cost_model_pushdown_row_group_count: AtomicUsize, + /// Number of row groups executed with post-filter + cost_model_post_filter_row_group_count: AtomicUsize, + /// Number of incomplete cost-model observations + cost_model_observation_incomplete_count: AtomicUsize, + /// Number of cost-model decisions that kept pushdown + cost_model_pushdown_still_preferred_count: AtomicUsize, + /// Number of high-selectivity no-pruning cost-model triggers + cost_model_high_selectivity_no_pruning_count: AtomicUsize, + /// Number of projected-predicate moderate-selectivity cost-model triggers + cost_model_projected_predicate_moderate_selectivity_count: AtomicUsize, + /// Number of fragmented moderate-selectivity cost-model triggers + cost_model_fragmented_moderate_selectivity_count: AtomicUsize, + /// Number of fragmented high-selectivity cost-model triggers + cost_model_fragmented_high_selectivity_count: AtomicUsize, + phase_profile_enabled: bool, + phase_ns: [AtomicU64; ArrowReaderPhase::COUNT], + phase_counts: [AtomicUsize; ArrowReaderPhase::COUNT], } impl ArrowReaderMetricsInner { /// Creates a new instance of `ArrowReaderMetricsInner` - pub(crate) fn new() -> Self { + pub(crate) fn new(phase_profile_enabled: bool) -> Self { Self { records_read_from_inner: AtomicUsize::new(0), records_read_from_cache: AtomicUsize::new(0), + row_selection_selected_rows: AtomicUsize::new(0), + row_selection_skipped_rows: AtomicUsize::new(0), + row_selection_selector_count: AtomicUsize::new(0), + row_selection_selected_run_count: AtomicUsize::new(0), + row_selection_skipped_run_count: AtomicUsize::new(0), + row_selection_mask_plan_count: AtomicUsize::new(0), + row_selection_selector_plan_count: AtomicUsize::new(0), + row_selection_forced_mask_plan_count: AtomicUsize::new(0), + row_selection_forced_selector_plan_count: AtomicUsize::new(0), + 
row_selection_auto_mask_empty_plan_count: AtomicUsize::new(0), + row_selection_auto_mask_short_run_plan_count: AtomicUsize::new(0), + row_selection_auto_mask_fragmented_plan_count: AtomicUsize::new(0), + row_selection_auto_mask_high_ratio_plan_count: AtomicUsize::new(0), + row_selection_auto_selector_clustered_plan_count: AtomicUsize::new(0), + row_selection_auto_selector_long_run_plan_count: AtomicUsize::new(0), + cost_model_observed_row_group_count: AtomicUsize::new(0), + cost_model_pushdown_row_group_count: AtomicUsize::new(0), + cost_model_post_filter_row_group_count: AtomicUsize::new(0), + cost_model_observation_incomplete_count: AtomicUsize::new(0), + cost_model_pushdown_still_preferred_count: AtomicUsize::new(0), + cost_model_high_selectivity_no_pruning_count: AtomicUsize::new(0), + cost_model_projected_predicate_moderate_selectivity_count: AtomicUsize::new(0), + cost_model_fragmented_moderate_selectivity_count: AtomicUsize::new(0), + cost_model_fragmented_high_selectivity_count: AtomicUsize::new(0), + phase_profile_enabled, + phase_ns: std::array::from_fn(|_| AtomicU64::new(0)), + phase_counts: std::array::from_fn(|_| AtomicUsize::new(0)), } } + + fn record_phase(&self, phase: ArrowReaderPhase, duration: Duration) { + let idx = phase.index(); + self.phase_ns[idx].fetch_add(duration.as_nanos() as u64, Ordering::Relaxed); + self.phase_counts[idx].fetch_add(1, Ordering::Relaxed); + } } diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 12c3e192cdfc..d7500c52b247 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -20,11 +20,14 @@ use arrow_array::cast::AsArray; use arrow_array::{Array, RecordBatch, RecordBatchReader}; use arrow_schema::{ArrowError, DataType as ArrowType, FieldRef, Schema, SchemaRef}; +use arrow_select::concat::concat_batches; use arrow_select::filter::filter_record_batch; pub use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; +use 
post_filter::{PostFilterState, PostSelectionFilterState}; pub use selection::{RowSelection, RowSelectionCursor, RowSelectionPolicy, RowSelector}; +use std::collections::VecDeque; use std::fmt::{Debug, Formatter}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; pub use crate::arrow::array_reader::RowGroups; use crate::arrow::array_reader::{ArrayReader, ArrayReaderBuilder}; @@ -47,12 +50,13 @@ use crate::file::metadata::{ use crate::file::reader::{ChunkReader, SerializedPageReader}; use crate::schema::types::SchemaDescriptor; -use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; +use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; // Exposed so integration tests and benchmarks can temporarily override the threshold. pub use read_plan::{PredicateOptions, ReadPlan, ReadPlanBuilder}; mod filter; pub mod metrics; +mod post_filter; mod read_plan; pub(crate) mod selection; pub mod statistics; @@ -1225,7 +1229,10 @@ impl ParquetRecordBatchReaderBuilder { .with_parquet_metadata(&reader.metadata) .build_array_reader(fields.as_deref(), predicate.projection())?; - plan_builder = plan_builder.with_predicate(array_reader, predicate.as_mut())?; + plan_builder = plan_builder.with_predicate_options( + PredicateOptions::new(array_reader, predicate.as_mut()) + .with_metrics(metrics.clone()), + )?; } } @@ -1239,9 +1246,13 @@ impl ParquetRecordBatchReaderBuilder { .with_offset(offset) .with_limit(limit) .build_limited() - .build(); + .build_with_metrics(&metrics); - Ok(ParquetRecordBatchReader::new(array_reader, read_plan)) + Ok(ParquetRecordBatchReader::new_with_metrics( + array_reader, + read_plan, + metrics, + )) } } @@ -1341,8 +1352,13 @@ impl PageIterator for ReaderPageIterator {} /// [`Bytes`]: bytes::Bytes pub struct ParquetRecordBatchReader { array_reader: Box, + array_reader_position: usize, schema: SchemaRef, read_plan: ReadPlan, + metrics: ArrowReaderMetrics, + post_filter: Option, + post_selection_filter: Option, + buffered_batches: 
Option>, } impl Debug for ParquetRecordBatchReader { @@ -1351,6 +1367,12 @@ impl Debug for ParquetRecordBatchReader { .field("array_reader", &"...") .field("schema", &self.schema) .field("read_plan", &self.read_plan) + .field("post_filter", &self.post_filter) + .field("post_selection_filter", &self.post_selection_filter) + .field( + "buffered_batches", + &self.buffered_batches.as_ref().map(|b| b.len()), + ) .finish() } } @@ -1372,6 +1394,37 @@ impl ParquetRecordBatchReader { /// Returns `Result>` rather than `Option>` to /// simplify error handling with `?` fn next_inner(&mut self) -> Result> { + if let Some(buffered_batches) = self.buffered_batches.as_mut() { + return Ok(buffered_batches.pop_front()); + } + + if self.post_filter.is_none() && self.post_selection_filter.is_none() { + return self.next_inner_decoded(); + } + + loop { + let Some(batch) = self.next_inner_decoded()? else { + return Ok(None); + }; + + let batch = match self.post_filter.as_mut() { + Some(post_filter) => post_filter.apply(batch)?, + None => batch, + }; + let batch = match self.post_selection_filter.as_mut() { + Some(post_selection_filter) => post_selection_filter.apply(batch)?, + None => batch, + }; + + if batch.num_rows() == 0 { + continue; + } + + return Ok(Some(batch)); + } + } + + fn next_inner_decoded(&mut self) -> Result> { let mut read_records = 0; let batch_size = self.batch_size(); if batch_size == 0 { @@ -1379,6 +1432,105 @@ impl ParquetRecordBatchReader { } match self.read_plan.row_selection_cursor_mut() { RowSelectionCursor::Mask(mask_cursor) => { + if mask_cursor.is_sparse() { + let sparse_cursor = mask_cursor.sparse_mut().unwrap(); + + while !sparse_cursor.is_empty() { + let Some(mask_chunk) = sparse_cursor.next_sparse_mask_chunk(batch_size)? 
+ else { + return Ok(None); + }; + let mut filtered_batches = Vec::new(); + + for segment in mask_chunk.segments { + if segment.row_range.start < self.array_reader_position { + return Err(general_err!( + "sparse mask segment starts before current reader position - segment start {}, current position {}", + segment.row_range.start, + self.array_reader_position + )); + } + + if segment.row_range.start > self.array_reader_position { + let to_skip = segment.row_range.start - self.array_reader_position; + let skipped = self.array_reader.skip_records(to_skip)?; + if skipped != to_skip { + return Err(general_err!( + "failed to skip rows, expected {}, got {}", + to_skip, + skipped + )); + } + self.array_reader_position += skipped; + } + + let to_read = segment.row_range.len(); + if to_read == 0 { + continue; + } + + let read = self.array_reader.read_records(to_read)?; + if read == 0 { + return Err(general_err!( + "reached end of column while expecting {} rows", + to_read + )); + } + if read != to_read { + return Err(general_err!( + "insufficient rows read from array reader - expected {}, got {}", + to_read, + read + )); + } + self.array_reader_position += read; + + let mask = sparse_cursor.mask_values_for(&segment)?; + let selected_rows = mask.true_count(); + + let array = self.array_reader.consume_batch()?; + // The column reader exposes the projection as a struct array; convert this + // into a record batch before applying the boolean filter mask. 
+ let struct_array = array.as_struct_opt().ok_or_else(|| { + ArrowError::ParquetError( + "Struct array reader should return struct array".to_string(), + ) + })?; + + let filtered_batch = self + .metrics + .time_phase(ArrowReaderPhase::OutputMaskFilter, || { + filter_record_batch(&RecordBatch::from(struct_array), &mask) + })?; + + if filtered_batch.num_rows() != selected_rows { + return Err(general_err!( + "filtered rows mismatch selection - expected {}, got {}", + selected_rows, + filtered_batch.num_rows() + )); + } + + if filtered_batch.num_rows() == 0 { + continue; + } + + filtered_batches.push(filtered_batch); + } + + match filtered_batches.len() { + 0 => continue, + 1 => return Ok(filtered_batches.pop()), + _ => { + let schema = filtered_batches[0].schema(); + return Ok(Some(concat_batches(&schema, &filtered_batches)?)); + } + } + } + + return Ok(None); + } + // Stream the record batch reader using contiguous segments of the selection // mask, avoiding the need to materialize intermediate `RowSelector` ranges. 
while !mask_cursor.is_empty() { @@ -1395,6 +1547,7 @@ impl ParquetRecordBatchReader { skipped )); } + self.array_reader_position += skipped; } if mask_chunk.chunk_rows == 0 { @@ -1420,6 +1573,7 @@ impl ParquetRecordBatchReader { read )); } + self.array_reader_position += read; let array = self.array_reader.consume_batch()?; // The column reader exposes the projection as a struct array; convert this @@ -1430,8 +1584,11 @@ impl ParquetRecordBatchReader { ) })?; - let filtered_batch = - filter_record_batch(&RecordBatch::from(struct_array), &mask)?; + let filtered_batch = self + .metrics + .time_phase(ArrowReaderPhase::OutputMaskFilter, || { + filter_record_batch(&RecordBatch::from(struct_array), &mask) + })?; if filtered_batch.num_rows() != mask_chunk.selected_rows { return Err(general_err!( @@ -1461,6 +1618,7 @@ impl ParquetRecordBatchReader { skipped )); } + self.array_reader_position += skipped; continue; } @@ -1483,12 +1641,16 @@ impl ParquetRecordBatchReader { }; match self.array_reader.read_records(to_read)? { 0 => break, - rec => read_records += rec, + rec => { + read_records += rec; + self.array_reader_position += rec; + } }; } } RowSelectionCursor::All => { - self.array_reader.read_records(batch_size)?; + let read = self.array_reader.read_records(batch_size)?; + self.array_reader_position += read; } }; @@ -1503,6 +1665,21 @@ impl ParquetRecordBatchReader { None }) } + + pub(crate) fn materialize_post_filter(&mut self) -> Result<()> { + if self.post_filter.is_none() || self.buffered_batches.is_some() { + return Ok(()); + } + + let mut buffered_batches = VecDeque::new(); + while let Some(batch) = self.next_inner()? 
{ + buffered_batches.push_back(batch); + } + self.post_filter = None; + self.buffered_batches = Some(buffered_batches); + + Ok(()) + } } impl RecordBatchReader for ParquetRecordBatchReader { @@ -1548,8 +1725,13 @@ impl ParquetRecordBatchReader { Ok(Self { array_reader, + array_reader_position: 0, schema: Arc::new(Schema::new(levels.fields.clone())), read_plan, + metrics, + post_filter: None, + post_selection_filter: None, + buffered_batches: None, }) } @@ -1557,6 +1739,37 @@ impl ParquetRecordBatchReader { /// a time from [`ArrayReader`] based on the configured `selection`. If `selection` is `None` /// all rows will be returned pub(crate) fn new(array_reader: Box, read_plan: ReadPlan) -> Self { + Self::new_with_metrics(array_reader, read_plan, ArrowReaderMetrics::disabled()) + } + + pub(crate) fn new_with_metrics( + array_reader: Box, + read_plan: ReadPlan, + metrics: ArrowReaderMetrics, + ) -> Self { + let schema = match array_reader.get_data_type() { + ArrowType::Struct(fields) => Schema::new(fields.clone()), + _ => unreachable!("Struct array reader's data type is not struct!"), + }; + + Self { + array_reader, + array_reader_position: 0, + schema: Arc::new(schema), + read_plan, + metrics, + post_filter: None, + post_selection_filter: None, + buffered_batches: None, + } + } + + pub(crate) fn new_post_selection_filter( + array_reader: Box, + read_plan: ReadPlan, + selection: RowSelection, + metrics: ArrowReaderMetrics, + ) -> Self { let schema = match array_reader.get_data_type() { ArrowType::Struct(fields) => Schema::new(fields.clone()), _ => unreachable!("Struct array reader's data type is not struct!"), @@ -1564,11 +1777,51 @@ impl ParquetRecordBatchReader { Self { array_reader, + array_reader_position: 0, schema: Arc::new(schema), read_plan, + metrics: metrics.clone(), + post_filter: None, + post_selection_filter: Some(PostSelectionFilterState::new(selection, metrics)), + buffered_batches: None, } } + pub(crate) fn new_post_filter( + array_reader: Box, + 
read_plan: ReadPlan, + filter: Arc>, + parquet_schema: &SchemaDescriptor, + read_projection: &ProjectionMask, + output_projection: &ProjectionMask, + metrics: ArrowReaderMetrics, + ) -> Result { + let read_schema = match array_reader.get_data_type() { + ArrowType::Struct(fields) => Schema::new(fields.clone()), + _ => unreachable!("Struct array reader's data type is not struct!"), + }; + let post_filter = PostFilterState::try_new( + filter, + metrics.clone(), + parquet_schema, + &read_schema, + read_projection, + output_projection, + )?; + let schema = Arc::clone(&post_filter.output_schema); + + Ok(Self { + array_reader, + array_reader_position: 0, + schema, + read_plan, + metrics, + post_filter: Some(post_filter), + post_selection_filter: None, + buffered_batches: None, + }) + } + #[inline(always)] pub(crate) fn batch_size(&self) -> usize { self.read_plan.batch_size() @@ -1589,6 +1842,7 @@ pub(crate) mod tests { use rand::{Rng, RngCore, SeedableRng, random, rng}; use tempfile::tempfile; + use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; use crate::arrow::arrow_reader::{ ArrowPredicateFn, ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder, RowFilter, RowSelection, RowSelector, @@ -1639,6 +1893,43 @@ pub(crate) mod tests { assert_eq!(original_schema.fields(), reader.schema().fields()); } + #[test] + fn sync_reader_records_row_selection_metrics_after_limit_offset() { + let schema = Arc::new(Schema::new(vec![Field::new( + "c0", + ArrowDataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Int32Array::from_iter_values(0..10)) as ArrayRef], + ) + .unwrap(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, schema, None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let metrics = ArrowReaderMetrics::enabled(); + let _reader = ParquetRecordBatchReaderBuilder::try_new(Bytes::from(buffer)) + 
.unwrap() + .with_metrics(metrics.clone()) + .with_offset(3) + .with_limit(4) + .build() + .unwrap(); + + assert_eq!(metrics.row_selection_selected_rows(), Some(4)); + assert_eq!(metrics.row_selection_skipped_rows(), Some(3)); + assert_eq!(metrics.row_selection_selector_count(), Some(2)); + assert_eq!(metrics.row_selection_mask_plan_count(), Some(1)); + assert_eq!( + metrics.row_selection_auto_mask_short_run_plan_count(), + Some(1) + ); + } + #[test] fn test_reuse_schema() { let file = get_test_file("parquet/alltypes-java.parquet"); diff --git a/parquet/src/arrow/arrow_reader/post_filter.rs b/parquet/src/arrow/arrow_reader/post_filter.rs new file mode 100644 index 000000000000..59c81dc56ffb --- /dev/null +++ b/parquet/src/arrow/arrow_reader/post_filter.rs @@ -0,0 +1,311 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Post-decode filtering support for parquet row filters. +//! +//! Normal predicate pushdown decodes predicate columns first, builds a +//! `RowSelection`, and then decodes output columns for selected rows. The +//! post-filter path in this module instead decodes the union of predicate and +//! output columns once and applies predicates after decode. +//! +//! ```text +//! 
read projection = output columns UNION predicate columns +//! | +//! v +//! decode RecordBatch +//! | +//! +-- predicate 1 --> filter batch +//! +-- predicate 2 --> filter batch +//! | +//! v +//! project original output columns +//! ``` +//! +//! This is profitable for shapes where row-level pushdown has high overhead +//! and little pruning, especially fragmented high-selectivity selections. + +use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; +use crate::arrow::arrow_reader::{RowFilter, RowSelection}; +use crate::arrow::{ProjectionMask, RootColumnSelection}; +use crate::errors::{ParquetError, Result}; +use crate::schema::types::SchemaDescriptor; +use arrow_array::{BooleanArray, RecordBatch}; +use arrow_buffer::BooleanBuffer; +use arrow_schema::{ArrowError, Schema, SchemaRef}; +use arrow_select::filter::filter_record_batch; +use std::sync::{Arc, Mutex}; + +#[derive(Debug)] +pub(super) struct PostFilterState { + filter: Arc>, + metrics: ArrowReaderMetrics, + predicate_projection_indices: Vec>, + predicate_projection_schemas: Vec, + output_projection_indices: Vec, + pub(super) output_schema: SchemaRef, +} + +impl PostFilterState { + pub(super) fn try_new( + filter: Arc>, + metrics: ArrowReaderMetrics, + parquet_schema: &SchemaDescriptor, + read_schema: &Schema, + read_projection: &ProjectionMask, + output_projection: &ProjectionMask, + ) -> Result { + // Projection indices are computed once when constructing the reader. + // Each predicate sees only the columns it requested, while the caller + // receives only the original output projection after all predicates run. 
+ let filter_guard = filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + + let predicate_projection_indices = filter_guard + .predicates + .iter() + .map(|predicate| { + projection_indices(parquet_schema, read_projection, predicate.projection()) + }) + .collect::>>()?; + drop(filter_guard); + + let predicate_projection_schemas = predicate_projection_indices + .iter() + .map(|indices| read_schema.project(indices).map(SchemaRef::new)) + .collect::, _>>()?; + + let output_projection_indices = + projection_indices(parquet_schema, read_projection, output_projection)?; + let output_schema = SchemaRef::new(read_schema.project(&output_projection_indices)?); + + Ok(Self { + filter, + metrics, + predicate_projection_indices, + predicate_projection_schemas, + output_projection_indices, + output_schema, + }) + } + + pub(super) fn apply(&mut self, mut batch: RecordBatch) -> Result { + let mut filter = self.filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + + // Apply predicates in the same order as RowFilter pushdown. Each + // predicate is evaluated against the currently surviving rows, so later + // predicates do not do work for rows already rejected by earlier ones. 
+ for (predicate_idx, (predicate, projection_indices)) in filter + .predicates + .iter_mut() + .zip(self.predicate_projection_indices.iter()) + .enumerate() + { + let input_rows = batch.num_rows(); + let predicate_batch = + self.metrics + .time_phase(ArrowReaderPhase::PostFilterPredicateProject, || { + project_record_batch( + &batch, + projection_indices, + Arc::clone(&self.predicate_projection_schemas[predicate_idx]), + ) + })?; + let predicate_filter = self + .metrics + .time_phase(ArrowReaderPhase::PostFilterPredicateEvaluate, || { + predicate.evaluate(predicate_batch) + })?; + + if predicate_filter.len() != input_rows { + return Err(general_err!( + "ArrowPredicate predicate returned {} rows, expected {input_rows}", + predicate_filter.len() + )); + } + + batch = self + .metrics + .time_phase(ArrowReaderPhase::PostFilterApplyFilter, || { + filter_record_batch(&batch, &predicate_filter) + })?; + if batch.num_rows() == 0 { + break; + } + } + + Ok(self + .metrics + .time_phase(ArrowReaderPhase::PostFilterOutputProject, || { + project_record_batch( + &batch, + &self.output_projection_indices, + Arc::clone(&self.output_schema), + ) + })?) + } +} + +#[derive(Debug)] +pub(super) struct PostSelectionFilterState { + mask: BooleanBuffer, + position: usize, + metrics: ArrowReaderMetrics, +} + +impl PostSelectionFilterState { + pub(super) fn new(selection: RowSelection, metrics: ArrowReaderMetrics) -> Self { + Self { + mask: selection.boolean_mask(), + position: 0, + metrics, + } + } + + pub(super) fn apply(&mut self, batch: RecordBatch) -> Result { + // This path is not predicate post-filtering. It is used after pushdown + // has already computed a final RowSelection for the current row group, + // but the post-filter path decodes the base selection and applies that + // already-computed selection after decode. 
+ let input_rows = batch.num_rows(); + let end = self.position.saturating_add(input_rows); + if end > self.mask.len() { + return Err(general_err!( + "post-selection filter exceeded selection length: end {end}, selection length {}", + self.mask.len() + )); + } + + let filter = BooleanArray::from(self.mask.slice(self.position, input_rows)); + self.position = end; + Ok(self + .metrics + .time_phase(ArrowReaderPhase::PostSelectionApplyFilter, || { + filter_record_batch(&batch, &filter) + })?) + } +} + +#[inline(always)] +fn project_record_batch( + batch: &RecordBatch, + indices: &[usize], + schema: SchemaRef, +) -> std::result::Result { + if indices.len() == batch.num_columns() && indices.iter().copied().eq(0..batch.num_columns()) { + debug_assert_eq!(batch.schema_ref().as_ref(), schema.as_ref()); + return Ok(batch.clone()); + } + + let columns = indices + .iter() + .map(|idx| { + batch.columns().get(*idx).cloned().ok_or_else(|| { + ArrowError::SchemaError(format!( + "project index {} out of bounds, max field {}", + idx, + batch.num_columns() + )) + }) + }) + .collect::, ArrowError>>()?; + + unsafe { + // The indices and schema are produced from the same valid read schema + // at construction time, and filtering preserves column lengths. + Ok(RecordBatch::new_unchecked( + schema, + columns, + batch.num_rows(), + )) + } +} + +fn projection_indices( + parquet_schema: &SchemaDescriptor, + read_projection: &ProjectionMask, + target_projection: &ProjectionMask, +) -> Result> { + let read_roots = read_projection.root_column_selection(parquet_schema); + let target_roots = target_projection.root_column_selection(parquet_schema); + validate_post_filter_projection( + parquet_schema, + read_projection, + target_projection, + &read_roots, + &target_roots, + )?; + + // Convert parquet projection masks to top-level RecordBatch column + // positions after the larger read projection has been decoded. 
For example: + // + // ```text + // parquet leaves: a b.aa b.bb c + // read projection: a b.aa b.bb => batch columns [a, b] + // target: b.aa b.bb => target index [1] + // ``` + let mut read_root_to_batch_idx = vec![None; parquet_schema.root_schema().get_fields().len()]; + for (batch_idx, root_idx) in read_roots.included_indices.iter().copied().enumerate() { + read_root_to_batch_idx[root_idx] = Some(batch_idx); + } + + target_roots + .included_indices + .into_iter() + .map(|target_root| { + read_root_to_batch_idx[target_root].ok_or_else(|| { + general_err!( + "post-filter target root column {target_root} not present in read projection" + ) + }) + }) + .collect() +} + +fn validate_post_filter_projection( + parquet_schema: &SchemaDescriptor, + read_projection: &ProjectionMask, + target_projection: &ProjectionMask, + read_roots: &RootColumnSelection, + target_roots: &RootColumnSelection, +) -> Result<()> { + // Post-filter only projects already-decoded batches by top-level Arrow + // field index. It can keep or drop a whole nested root, but it cannot + // recursively project nested children such as `b.aa` without `b.bb`. 
+ if !read_roots.selects_whole_roots { + return Err(general_err!( + "post-filter cost model does not support partial nested read projections" + )); + } + if !target_roots.selects_whole_roots { + return Err(general_err!( + "post-filter cost model does not support partial nested target projections" + )); + } + + for leaf_idx in 0..parquet_schema.num_columns() { + if target_projection.leaf_included(leaf_idx) && !read_projection.leaf_included(leaf_idx) { + return Err(general_err!( + "post-filter target projection includes leaf column {leaf_idx} not present in read projection" + )); + } + } + + Ok(()) +} diff --git a/parquet/src/arrow/arrow_reader/read_plan.rs b/parquet/src/arrow/arrow_reader/read_plan.rs index 674ae2b8d964..bc423d81d1ac 100644 --- a/parquet/src/arrow/arrow_reader/read_plan.rs +++ b/parquet/src/arrow/arrow_reader/read_plan.rs @@ -19,8 +19,11 @@ //! from a Parquet file use crate::arrow::array_reader::ArrayReader; -use crate::arrow::arrow_reader::selection::RowSelectionPolicy; -use crate::arrow::arrow_reader::selection::RowSelectionStrategy; +use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; +use crate::arrow::arrow_reader::selection::{ + LoadedRowRanges, RowSelectionPolicy, RowSelectionShape, RowSelectionStrategy, + RowSelectionStrategyDecision, RowSelectionStrategyReason, +}; use crate::arrow::arrow_reader::{ ArrowPredicate, ParquetRecordBatchReader, RowSelection, RowSelectionCursor, RowSelector, }; @@ -30,12 +33,19 @@ use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; use arrow_select::filter::prep_null_mask_filter; use std::collections::VecDeque; +const HIGH_SELECTED_RATIO_NUMERATOR: usize = 7; +const HIGH_SELECTED_RATIO_DENOMINATOR: usize = 8; +const FRAGMENTED_SELECTED_RUN_LIMIT: usize = 4; +const CLUSTERED_SELECTED_RUN_MULTIPLIER: usize = 4; +const CLUSTERED_SKIPPED_RUN_MULTIPLIER: usize = 4; + /// Options for [`ReadPlanBuilder::with_predicate_options`]. 
pub struct PredicateOptions<'a> { array_reader: Box, predicate: &'a mut dyn ArrowPredicate, limit: Option, total_rows: usize, + metrics: ArrowReaderMetrics, } impl<'a> PredicateOptions<'a> { @@ -51,6 +61,7 @@ impl<'a> PredicateOptions<'a> { predicate, limit: None, total_rows: 0, + metrics: ArrowReaderMetrics::disabled(), } } @@ -74,6 +85,11 @@ impl<'a> PredicateOptions<'a> { self.total_rows = total_rows; self } + + pub(crate) fn with_metrics(mut self, metrics: ArrowReaderMetrics) -> Self { + self.metrics = metrics; + self + } } /// A builder for [`ReadPlan`] @@ -84,6 +100,8 @@ pub struct ReadPlanBuilder { selection: Option, /// Policy to use when materializing the row selection row_selection_policy: RowSelectionPolicy, + /// Row ranges already loaded by page pruning + loaded_row_ranges: Option, } impl ReadPlanBuilder { @@ -93,6 +111,7 @@ impl ReadPlanBuilder { batch_size, selection: None, row_selection_policy: RowSelectionPolicy::default(), + loaded_row_ranges: None, } } @@ -110,6 +129,11 @@ impl ReadPlanBuilder { self } + pub(crate) fn with_loaded_row_ranges(mut self, loaded: Option) -> Self { + self.loaded_row_ranges = loaded; + self + } + /// Returns the current row selection policy pub fn row_selection_policy(&self) -> &RowSelectionPolicy { &self.row_selection_policy @@ -147,36 +171,35 @@ impl ReadPlanBuilder { /// Returns the [`RowSelectionStrategy`] for this plan. /// /// Guarantees to return either `Selectors` or `Mask`, never `Auto`. 
+ #[cfg(test)] pub(crate) fn resolve_selection_strategy(&self) -> RowSelectionStrategy { + self.resolve_selection_strategy_decision().strategy + } + + pub(crate) fn resolve_selection_strategy_decision(&self) -> RowSelectionStrategyDecision { + let shape = RowSelectionShape::from_selection(self.selection.as_ref()); + match self.row_selection_policy { - RowSelectionPolicy::Selectors => RowSelectionStrategy::Selectors, - RowSelectionPolicy::Mask => RowSelectionStrategy::Mask, + RowSelectionPolicy::Selectors => RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::ForcedSelectors, + shape, + ), + RowSelectionPolicy::Mask => RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::ForcedMask, + shape, + ), RowSelectionPolicy::Auto { threshold, .. } => { - let selection = match self.selection.as_ref() { - Some(selection) => selection, - None => return RowSelectionStrategy::Selectors, - }; - - // total_rows: total number of rows selected / skipped - // effective_count: number of non-empty selectors - let (total_rows, effective_count) = - selection.iter().fold((0usize, 0usize), |(rows, count), s| { - if s.row_count > 0 { - (rows + s.row_count, count + 1) - } else { - (rows, count) - } - }); - - if effective_count == 0 { - return RowSelectionStrategy::Mask; + if self.selection.is_none() { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorLongRuns, + shape, + ); } - if total_rows < effective_count.saturating_mul(threshold) { - RowSelectionStrategy::Mask - } else { - RowSelectionStrategy::Selectors - } + resolve_auto_selection_strategy(threshold, shape) } } } @@ -210,6 +233,7 @@ impl ReadPlanBuilder { predicate, limit, total_rows, + metrics, } = options; // Target length for the concatenated filter output: @@ -223,14 +247,21 @@ impl ReadPlanBuilder { None => limit.map(|_| total_rows), }; - let reader = 
ParquetRecordBatchReader::new(array_reader, self.clone().build()); + let mut reader = ParquetRecordBatchReader::new(array_reader, self.clone().build()); let mut filters = vec![]; let mut processed_rows: usize = 0; let mut matched_rows: usize = 0; - for maybe_batch in reader { + loop { + let maybe_batch = + metrics.time_phase(ArrowReaderPhase::PredicateDecode, || reader.next()); + let Some(maybe_batch) = maybe_batch else { + break; + }; let maybe_batch = maybe_batch?; let input_rows = maybe_batch.num_rows(); - let filter = predicate.evaluate(maybe_batch)?; + let filter = metrics.time_phase(ArrowReaderPhase::PredicateEvaluate, || { + predicate.evaluate(maybe_batch) + })?; // Since user supplied predicate, check error here to catch bugs quickly if filter.len() != input_rows { return Err(arrow_err!( @@ -278,9 +309,15 @@ impl ReadPlanBuilder { if all_selected && self.selection.is_none() { return Ok(self); } - let raw = RowSelection::from_filters(&filters); + let raw = metrics.time_phase(ArrowReaderPhase::PredicateSelectionBuild, || { + RowSelection::from_filters(&filters) + }); self.selection = match self.selection.take() { - Some(selection) => Some(selection.and_then(&raw)), + Some(selection) => Some( + metrics.time_phase(ArrowReaderPhase::PredicateSelectionMerge, || { + selection.and_then(&raw) + }), + ), None => Some(raw), }; Ok(self) @@ -293,25 +330,43 @@ impl ReadPlanBuilder { self.selection = Some(RowSelection::from(vec![])); } + self.build_with_metrics(&ArrowReaderMetrics::disabled()) + } + + /// Create a final `ReadPlan` and record row-selection planning metrics. 
+ pub(crate) fn build_with_metrics(mut self, metrics: &ArrowReaderMetrics) -> ReadPlan { + // If selection is empty, truncate + if !self.selects_any() { + self.selection = Some(RowSelection::from(vec![])); + } + // Preferred strategy must not be Auto - let selection_strategy = self.resolve_selection_strategy(); + let selection_strategy_decision = self.resolve_selection_strategy_decision(); + let selection_strategy = selection_strategy_decision.strategy; let Self { batch_size, selection, row_selection_policy: _, + loaded_row_ranges, } = self; let selection = selection.map(|s| s.trim()); + if matches!(metrics, ArrowReaderMetrics::Enabled(_)) && selection.is_some() { + let shape = RowSelectionShape::from_selection(selection.as_ref()); + metrics.record_row_selection(selection_strategy_decision.with_shape(shape)); + } let row_selection_cursor = selection .map(|s| { - let trimmed = s.trim(); - let selectors: Vec = trimmed.into(); + let selectors: Vec = s.into(); match selection_strategy { - RowSelectionStrategy::Mask => { - RowSelectionCursor::new_mask_from_selectors(selectors) - } + RowSelectionStrategy::Mask => match loaded_row_ranges { + Some(loaded) => { + RowSelectionCursor::new_sparse_mask_from_selectors(selectors, loaded) + } + None => RowSelectionCursor::new_mask_from_selectors(selectors), + }, RowSelectionStrategy::Selectors => RowSelectionCursor::new_selectors(selectors), } }) @@ -324,6 +379,111 @@ impl ReadPlanBuilder { } } +fn resolve_auto_selection_strategy( + threshold: usize, + shape: RowSelectionShape, +) -> RowSelectionStrategyDecision { + if shape.selector_count == 0 || shape.selected_rows == 0 { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskEmptySelection, + shape, + ); + } + + if clustered_selection_at_or_above_threshold(shape, threshold) { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorClusteredSelection, + 
shape, + ); + } + + if shape.skipped_rows > 0 + && selected_ratio_at_least( + shape, + HIGH_SELECTED_RATIO_NUMERATOR, + HIGH_SELECTED_RATIO_DENOMINATOR, + ) + { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskHighSelectedRatio, + shape, + ); + } + + if shape.selected_run_count > 1 + && shape.average_selected_run_length() <= FRAGMENTED_SELECTED_RUN_LIMIT as f64 + && selection_density_at_or_above_threshold(shape, threshold) + { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskFragmentedSelection, + shape, + ); + } + + if shape.selected_run_count > 0 + && shape.average_selected_run_length() + >= threshold.saturating_mul(CLUSTERED_SELECTED_RUN_MULTIPLIER) as f64 + && shape.average_skipped_run_length() > 0.0 + && shape.selected_ratio() <= 0.5 + { + return RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorClusteredSelection, + shape, + ); + } + + if shape.total_rows() < shape.selector_count.saturating_mul(threshold) { + RowSelectionStrategyDecision::new( + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskShortRuns, + shape, + ) + } else { + RowSelectionStrategyDecision::new( + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorLongRuns, + shape, + ) + } +} + +fn selected_ratio_at_least(shape: RowSelectionShape, numerator: usize, denominator: usize) -> bool { + (shape.selected_rows as u128) * (denominator as u128) + >= (shape.total_rows() as u128) * (numerator as u128) +} + +fn selection_density_at_or_above_threshold(shape: RowSelectionShape, threshold: usize) -> bool { + (shape.total_rows() as u128) <= (shape.selector_count as u128) * (threshold as u128) +} + +fn clustered_selection_at_or_above_threshold(shape: RowSelectionShape, threshold: usize) -> bool { + average_run_length_at_least( + shape.selected_rows, + shape.selected_run_count, + 
threshold, + CLUSTERED_SELECTED_RUN_MULTIPLIER, + ) && average_run_length_at_least( + shape.skipped_rows, + shape.skipped_run_count, + threshold, + CLUSTERED_SKIPPED_RUN_MULTIPLIER, + ) +} + +fn average_run_length_at_least( + rows: usize, + runs: usize, + threshold: usize, + multiplier: usize, +) -> bool { + runs > 0 && (rows as u128) >= (runs as u128) * (threshold as u128) * (multiplier as u128) +} + /// Builder for [`ReadPlan`] that applies a limit and offset to the read plan /// /// See [`ReadPlanBuilder::limited`] to create this builder. @@ -480,6 +640,299 @@ mod tests { ReadPlanBuilder::new(1024).with_selection(Some(selection)) } + fn assert_strategy_decision( + builder: ReadPlanBuilder, + strategy: RowSelectionStrategy, + reason: RowSelectionStrategyReason, + expected_shape: RowSelectionShape, + ) { + let decision = builder.resolve_selection_strategy_decision(); + assert_eq!(decision.strategy, strategy); + assert_eq!(decision.reason, reason); + assert_eq!(decision.shape, expected_shape); + } + + fn shape( + selected_rows: usize, + skipped_rows: usize, + selector_count: usize, + selected_run_count: usize, + skipped_run_count: usize, + ) -> RowSelectionShape { + RowSelectionShape { + selected_rows, + skipped_rows, + selector_count, + selected_run_count, + skipped_run_count, + } + } + + #[test] + fn row_group_execution_modes_cover_pushdown_and_post_filter() { + use crate::arrow::arrow_reader::selection::{RowGroupExecutionMode, RowSelectionStrategy}; + + assert_eq!( + RowGroupExecutionMode::Pushdown(RowSelectionStrategy::Mask).to_string(), + "Pushdown(Mask)" + ); + assert_eq!( + RowGroupExecutionMode::Pushdown(RowSelectionStrategy::Selectors).to_string(), + "Pushdown(Selectors)" + ); + assert_eq!(RowGroupExecutionMode::PostFilter.to_string(), "PostFilter"); + } + + #[test] + fn cost_model_classifier_triggers_for_fragmented_high_selectivity() { + use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, CostModelObservation, RowSelectionShape, + }; + 
+ let observation = CostModelObservation { + observed_row_groups: 2, + shape: RowSelectionShape { + selected_rows: 128, + skipped_rows: 64, + selector_count: 96, + selected_run_count: 64, + skipped_run_count: 32, + }, + }; + + assert_eq!( + observation.trigger_reason(), + CostModelDecisionReason::FragmentedHighSelectivity + ); + } + + #[test] + fn cost_model_classifier_waits_for_observation_window() { + use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, CostModelObservation, RowSelectionShape, + }; + + let observation = CostModelObservation { + observed_row_groups: 0, + shape: RowSelectionShape { + selected_rows: 64, + skipped_rows: 64, + selector_count: 64, + selected_run_count: 32, + skipped_run_count: 32, + }, + }; + + assert_eq!( + observation.trigger_reason(), + CostModelDecisionReason::ObservationIncomplete + ); + } + + #[test] + fn cost_model_classifier_triggers_for_high_selectivity_without_pruning() { + use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, CostModelObservation, RowSelectionShape, + }; + + let observation = CostModelObservation { + observed_row_groups: 2, + shape: RowSelectionShape { + selected_rows: 200, + skipped_rows: 0, + selector_count: 2, + selected_run_count: 2, + skipped_run_count: 0, + }, + }; + + assert_eq!( + observation.trigger_reason(), + CostModelDecisionReason::HighSelectivityNoPruning + ); + } + + #[test] + fn cost_model_classifier_triggers_for_fragmented_moderate_selectivity() { + use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, CostModelObservation, RowSelectionShape, + }; + + let observation = CostModelObservation { + observed_row_groups: 2, + shape: RowSelectionShape { + selected_rows: 30, + skipped_rows: 170, + selector_count: 60, + selected_run_count: 30, + skipped_run_count: 30, + }, + }; + + assert_eq!( + observation.trigger_reason(), + CostModelDecisionReason::FragmentedModerateSelectivity + ); + } + + #[test] + fn 
cost_model_classifier_triggers_for_fragmented_near_ten_percent_selectivity() { + use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, CostModelObservation, RowSelectionShape, + }; + + let observation = CostModelObservation { + observed_row_groups: 1, + shape: RowSelectionShape { + selected_rows: 9, + skipped_rows: 91, + selector_count: 18, + selected_run_count: 9, + skipped_run_count: 9, + }, + }; + + assert_eq!( + observation.trigger_reason(), + CostModelDecisionReason::FragmentedModerateSelectivity + ); + } + + #[test] + fn cost_model_classifier_keeps_q38_like_low_selectivity_fragmented_pushdown() { + use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, CostModelObservation, RowSelectionShape, + }; + + let observation = CostModelObservation { + observed_row_groups: 1, + shape: RowSelectionShape { + selected_rows: 4_870, + skipped_rows: 57_698, + selector_count: 6_168, + selected_run_count: 3_084, + skipped_run_count: 3_084, + }, + }; + + assert_eq!( + observation.trigger_reason(), + CostModelDecisionReason::PushdownStillPreferred + ); + } + + #[test] + fn cost_model_classifier_keeps_low_selectivity_fragmented_pushdown() { + use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, CostModelObservation, RowSelectionShape, + }; + + let observation = CostModelObservation { + observed_row_groups: 1, + shape: RowSelectionShape { + selected_rows: 4, + skipped_rows: 196, + selector_count: 8, + selected_run_count: 4, + skipped_run_count: 4, + }, + }; + + assert_eq!( + observation.trigger_reason(), + CostModelDecisionReason::PushdownStillPreferred + ); + } + + #[test] + fn selection_strategy_decision_records_forced_mask() { + let selection = RowSelection::from(vec![RowSelector::skip(2), RowSelector::select(8)]); + let builder = + builder_with_selection(selection).with_row_selection_policy(RowSelectionPolicy::Mask); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Mask, + 
RowSelectionStrategyReason::ForcedMask, + shape(8, 2, 2, 1, 1), + ); + } + + #[test] + fn selection_strategy_decision_records_forced_selectors() { + let selection = RowSelection::from(vec![RowSelector::skip(2), RowSelector::select(8)]); + let builder = builder_with_selection(selection) + .with_row_selection_policy(RowSelectionPolicy::Selectors); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::ForcedSelectors, + shape(8, 2, 2, 1, 1), + ); + } + + #[test] + fn selection_strategy_decision_records_auto_empty_selection() { + let selection = RowSelection::from(vec![]); + let builder = builder_with_selection(selection); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskEmptySelection, + shape(0, 0, 0, 0, 0), + ); + } + + #[test] + fn selection_strategy_decision_records_auto_short_runs() { + let selection = RowSelection::from(vec![RowSelector::select(8), RowSelector::skip(8)]); + let builder = builder_with_selection(selection); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Mask, + RowSelectionStrategyReason::AutoMaskShortRuns, + shape(8, 8, 2, 1, 1), + ); + } + + #[test] + fn selection_strategy_decision_records_auto_long_runs() { + let selection = RowSelection::from(vec![RowSelector::select(3), RowSelector::skip(3)]); + let builder = builder_with_selection(selection) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1 }); + + assert_strategy_decision( + builder, + RowSelectionStrategy::Selectors, + RowSelectionStrategyReason::AutoSelectorLongRuns, + shape(3, 3, 2, 1, 1), + ); + } + + #[test] + fn build_metrics_records_structured_strategy_decision_shape() { + let metrics = ArrowReaderMetrics::enabled(); + let selection = RowSelection::from(vec![RowSelector::select(8), RowSelector::skip(4)]); + let builder = builder_with_selection(selection); + + builder.build_with_metrics(&metrics); + + 
assert_eq!(metrics.row_selection_selected_rows(), Some(8)); + assert_eq!(metrics.row_selection_skipped_rows(), Some(0)); + assert_eq!(metrics.row_selection_selector_count(), Some(1)); + assert_eq!(metrics.row_selection_selected_run_count(), Some(1)); + assert_eq!(metrics.row_selection_skipped_run_count(), Some(0)); + assert_eq!(metrics.row_selection_mask_plan_count(), Some(1)); + assert_eq!(metrics.row_selection_selector_plan_count(), Some(0)); + assert_eq!( + metrics.row_selection_auto_mask_short_run_plan_count(), + Some(1) + ); + } + #[test] fn preferred_selection_strategy_prefers_mask_by_default() { let selection = RowSelection::from(vec![RowSelector::select(8)]); @@ -492,7 +945,7 @@ mod tests { #[test] fn preferred_selection_strategy_prefers_selectors_when_threshold_small() { - let selection = RowSelection::from(vec![RowSelector::select(8)]); + let selection = RowSelection::from(vec![RowSelector::select(3), RowSelector::skip(3)]); let builder = builder_with_selection(selection) .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1 }); assert_eq!( @@ -501,6 +954,160 @@ mod tests { ); } + #[test] + fn auto_strategy_prefers_mask_for_fragmented_selected_rows_at_threshold_boundary() { + let selectors: Vec = (0..64) + .flat_map(|_| [RowSelector::select(1), RowSelector::skip(63)]) + .collect(); + let builder = builder_with_selection(RowSelection::from(selectors)); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Mask); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoMaskFragmentedSelection + ); + assert_eq!(decision.shape.selected_run_count, 64); + assert_eq!(decision.shape.average_selected_run_length(), 1.0); + } + + #[test] + fn auto_strategy_prefers_mask_for_high_selected_ratio() { + let selection = RowSelection::from(vec![ + RowSelector::select(900), + RowSelector::skip(25), + RowSelector::select(75), + ]); + let builder = builder_with_selection(selection); + + 
let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Mask); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoMaskHighSelectedRatio + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_clustered_high_selected_ratio() { + let selectors: Vec = (0..10) + .flat_map(|_| [RowSelector::select(9000), RowSelector::skip(1000)]) + .collect(); + let selection = RowSelection::from(selectors); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorClusteredSelection + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_clustered_long_selected_runs() { + let selection = + RowSelection::from(vec![RowSelector::skip(9000), RowSelector::select(1000)]); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorClusteredSelection + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_long_single_selected_run_with_no_skips() { + let selection = RowSelection::from(vec![RowSelector::select(1024)]); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorLongRuns + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_tiny_runs_separated_by_huge_skip() { + let selection = RowSelection::from(vec![ + RowSelector::select(4), + RowSelector::skip(100_000), + RowSelector::select(4), + ]); + let builder = builder_with_selection(selection); + + let decision = 
builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorLongRuns + ); + } + + #[test] + fn auto_strategy_prefers_selectors_for_huge_half_selected_ratio_without_saturation() { + let selection = RowSelection::from(vec![ + RowSelector::select(usize::MAX / 2), + RowSelector::skip(usize::MAX / 2), + ]); + let builder = builder_with_selection(selection); + + let decision = builder.resolve_selection_strategy_decision(); + + assert_eq!(decision.strategy, RowSelectionStrategy::Selectors); + assert_eq!( + decision.reason, + RowSelectionStrategyReason::AutoSelectorClusteredSelection + ); + } + + #[test] + fn build_metrics_records_shape_aware_strategy_reasons() { + let metrics = ArrowReaderMetrics::enabled(); + let fragmented_selectors: Vec = (0..64) + .flat_map(|_| [RowSelector::select(1), RowSelector::skip(63)]) + .collect(); + + builder_with_selection(RowSelection::from(fragmented_selectors)) + .build_with_metrics(&metrics); + builder_with_selection(RowSelection::from(vec![ + RowSelector::select(900), + RowSelector::skip(25), + RowSelector::select(75), + ])) + .build_with_metrics(&metrics); + builder_with_selection(RowSelection::from(vec![ + RowSelector::skip(9000), + RowSelector::select(1000), + ])) + .build_with_metrics(&metrics); + + assert_eq!(metrics.row_selection_mask_plan_count(), Some(2)); + assert_eq!(metrics.row_selection_selector_plan_count(), Some(1)); + assert_eq!( + metrics.row_selection_auto_mask_fragmented_plan_count(), + Some(1) + ); + assert_eq!( + metrics.row_selection_auto_mask_high_ratio_plan_count(), + Some(1) + ); + assert_eq!( + metrics.row_selection_auto_selector_clustered_plan_count(), + Some(1) + ); + } + #[test] fn truncate_filter_after_n_trues_keeps_first_n_matches() { let f = BooleanArray::from(vec![true, false, true, true, false, true, true]); diff --git a/parquet/src/arrow/arrow_reader/selection.rs 
b/parquet/src/arrow/arrow_reader/selection.rs index 2ddf812f9c39..a7f45cba479b 100644 --- a/parquet/src/arrow/arrow_reader/selection.rs +++ b/parquet/src/arrow/arrow_reader/selection.rs @@ -15,9 +15,15 @@ // specific language governing permissions and limitations // under the License. -use crate::arrow::ProjectionMask; +mod strategy; + +pub(crate) use strategy::{ + CostModelDecisionReason, CostModelObservation, RowGroupExecutionMode, RowSelectionShape, + RowSelectionStrategy, RowSelectionStrategyDecision, RowSelectionStrategyReason, +}; + use crate::errors::ParquetError; -use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; +use crate::file::page_index::offset_index::PageLocation; use arrow_array::{Array, BooleanArray}; use arrow_buffer::{BooleanBuffer, BooleanBufferBuilder}; use arrow_select::filter::SlicesIterator; @@ -48,18 +54,6 @@ impl Default for RowSelectionPolicy { } } -/// Fully resolved strategy for materializing [`RowSelection`] during execution. -/// -/// This is determined from a combination of user preference (via [`RowSelectionPolicy`]) -/// and safety considerations (e.g. page skipping). 
-#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) enum RowSelectionStrategy { - /// Use a queue of [`RowSelector`] values - Selectors, - /// Use a boolean mask to materialise the selection - Mask, -} - /// [`RowSelection`] is a collection of [`RowSelector`] used to skip rows when /// scanning a parquet file #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -250,37 +244,52 @@ impl RowSelection { ranges } - /// Returns true if this selection would skip any data pages within the provided columns - fn selection_skips_any_page( + pub(crate) fn selected_page_row_ranges( &self, - projection: &ProjectionMask, - columns: &[OffsetIndexMetaData], - ) -> bool { - columns.iter().enumerate().any(|(leaf_idx, column)| { - if !projection.leaf_included(leaf_idx) { - return false; + page_locations: &[PageLocation], + total_rows: usize, + ) -> Vec> { + let mut ranges = Vec::new(); + let mut selector_idx = 0; + let mut selector_start = 0usize; + + for (page_idx, page) in page_locations.iter().enumerate() { + let page_start = page.first_row_index as usize; + let page_end = page_locations + .get(page_idx + 1) + .map(|next| next.first_row_index as usize) + .unwrap_or(total_rows); + + while selector_idx < self.selectors.len() { + let selector_end = selector_start + self.selectors[selector_idx].row_count; + if selector_end > page_start { + break; + } + selector_start = selector_end; + selector_idx += 1; } - let locations = column.page_locations(); - if locations.is_empty() { - return false; - } + let mut scan_idx = selector_idx; + let mut scan_start = selector_start; + let mut page_is_selected = false; - let ranges = self.scan_ranges(locations); - !ranges.is_empty() && ranges.len() < locations.len() - }) - } + while scan_idx < self.selectors.len() && scan_start < page_end { + let selector = self.selectors[scan_idx]; + let selector_end = scan_start + selector.row_count; + if !selector.skip && selector_end > page_start { + page_is_selected = true; + break; + } + scan_start = 
selector_end; + scan_idx += 1; + } - /// Returns true if selectors should be forced, preventing mask materialisation - pub(crate) fn should_force_selectors( - &self, - projection: &ProjectionMask, - offset_index: Option<&[OffsetIndexMetaData]>, - ) -> bool { - match offset_index { - Some(columns) => self.selection_skips_any_page(projection, columns), - None => false, + if page_is_selected { + ranges.push(page_start..page_end); + } } + + ranges } /// Splits off the first `row_count` from this [`RowSelection`] @@ -430,6 +439,10 @@ impl RowSelection { self.selectors.iter().any(|x| !x.skip) } + pub(crate) fn boolean_mask(&self) -> BooleanBuffer { + boolean_mask_from_selectors(&self.selectors) + } + /// Trims this [`RowSelection`] removing any trailing skips pub(crate) fn trim(mut self) -> Self { while self.selectors.last().map(|x| x.skip).unwrap_or(false) { @@ -767,27 +780,51 @@ fn union_row_selections(left: &[RowSelector], right: &[RowSelector]) -> RowSelec /// or selections. For example, selecting every other row. #[derive(Debug)] pub struct MaskCursor { - mask: BooleanBuffer, - /// Current absolute offset into the selection - position: usize, + inner: MaskCursorInner, +} + +#[derive(Debug)] +enum MaskCursorInner { + Dense { + mask: BooleanBuffer, + /// Current absolute offset into the selection + position: usize, + }, + Sparse(SparseMaskCursor), } impl MaskCursor { /// Returns `true` when no further rows remain pub fn is_empty(&self) -> bool { - self.position >= self.mask.len() + match &self.inner { + MaskCursorInner::Dense { mask, position } => *position >= mask.len(), + MaskCursorInner::Sparse(cursor) => cursor.is_empty(), + } + } + + pub(crate) fn is_sparse(&self) -> bool { + matches!(self.inner, MaskCursorInner::Sparse(_)) + } + + pub(crate) fn sparse_mut(&mut self) -> Option<&mut SparseMaskCursor> { + match &mut self.inner { + MaskCursorInner::Sparse(cursor) => Some(cursor), + MaskCursorInner::Dense { .. 
} => None, + } } /// Advance through the mask representation, producing the next chunk summary pub fn next_mask_chunk(&mut self, batch_size: usize) -> Option { - let (initial_skip, chunk_rows, selected_rows, mask_start, end_position) = { - let mask = &self.mask; + let MaskCursorInner::Dense { mask, position } = &mut self.inner else { + return None; + }; - if self.position >= mask.len() { + let (initial_skip, chunk_rows, selected_rows, mask_start, end_position) = { + if *position >= mask.len() { return None; } - let start_position = self.position; + let start_position = *position; let mut cursor = start_position; let mut initial_skip = 0; @@ -814,7 +851,7 @@ impl MaskCursor { (initial_skip, chunk_rows, selected_rows, mask_start, cursor) }; - self.position = end_position; + *position = end_position; Some(MaskChunk { initial_skip, @@ -826,13 +863,19 @@ impl MaskCursor { /// Materialise the boolean values for a mask-backed chunk pub fn mask_values_for(&self, chunk: &MaskChunk) -> Result { - if chunk.mask_start.saturating_add(chunk.chunk_rows) > self.mask.len() { + let MaskCursorInner::Dense { mask, .. } = &self.inner else { + return Err(ParquetError::General( + "Internal Error: dense mask chunk requested from sparse mask cursor".to_string(), + )); + }; + + if chunk.mask_start.saturating_add(chunk.chunk_rows) > mask.len() { return Err(ParquetError::General( "Internal Error: MaskChunk exceeds mask length".to_string(), )); } Ok(BooleanArray::from( - self.mask.slice(chunk.mask_start, chunk.chunk_rows), + mask.slice(chunk.mask_start, chunk.chunk_rows), )) } } @@ -885,11 +928,176 @@ pub struct MaskChunk { pub mask_start: usize, } +#[derive(Debug, Clone, Eq, PartialEq)] +pub(crate) struct LoadedRowRanges { + /// Absolute row-group ranges for which all projected columns have backing + /// page data loaded in memory. + ranges: Vec>, + /// Total row count of the row group the ranges are relative to. 
+ total_rows: usize, +} + +impl LoadedRowRanges { + pub(crate) fn new(ranges: Vec>, total_rows: usize) -> Self { + // Sparse-mask execution indexes masks by absolute row-group position. + // Keep loaded ranges sorted and non-overlapping so range containment is + // unambiguous and the reader can move forward without rewinding. + debug_assert!( + ranges + .windows(2) + .all(|window| window[0].end <= window[1].start), + "loaded row ranges must be sorted and non-overlapping" + ); + debug_assert!( + ranges + .iter() + .all(|range| range.start <= range.end && range.end <= total_rows), + "loaded row ranges must be valid within total_rows" + ); + Self { ranges, total_rows } + } + + pub(crate) fn is_sparse(&self) -> bool { + match self.ranges.as_slice() { + [] => self.total_rows != 0, + [range] => range.start != 0 || range.end != self.total_rows, + _ => true, + } + } + + fn range_containing(&self, row: usize) -> Option<&Range> { + self.ranges + .iter() + .find(|range| range.start <= row && row < range.end) + } +} + +#[derive(Debug, Clone, Eq, PartialEq)] +pub(crate) struct MaskSegment { + /// Absolute row-group range to decode from the array reader. + pub row_range: Range, + /// Starting bit in the absolute row-group mask for this segment. + pub mask_start: usize, + /// Number of mask bits to apply to `row_range`. + pub mask_len: usize, +} + +#[derive(Debug, Clone, Eq, PartialEq)] +pub(crate) struct SparseMaskChunk { + pub segments: Vec, + pub selected_rows: usize, +} + +#[derive(Debug)] +pub(crate) struct SparseMaskCursor { + /// Boolean mask indexed by absolute row-group position. + mask: BooleanBuffer, + /// Absolute row ranges whose data pages are present for the projection. + loaded: LoadedRowRanges, + /// Current absolute row-group position in `mask`. 
+ position: usize, +} + +impl SparseMaskCursor { + pub(crate) fn new(selectors: Vec, loaded: LoadedRowRanges) -> Self { + Self { + mask: boolean_mask_from_selectors(&selectors), + loaded, + position: 0, + } + } + + pub(crate) fn is_empty(&self) -> bool { + self.position >= self.mask.len() || self.position >= self.loaded.total_rows + } + + pub(crate) fn mask_values_for( + &self, + segment: &MaskSegment, + ) -> Result { + if segment.mask_start.saturating_add(segment.mask_len) > self.mask.len() { + return Err(ParquetError::General( + "Internal Error: sparse mask segment exceeds mask length".to_string(), + )); + } + Ok(BooleanArray::from( + self.mask.slice(segment.mask_start, segment.mask_len), + )) + } + + pub(crate) fn next_sparse_mask_chunk( + &mut self, + batch_size: usize, + ) -> Result, ParquetError> { + if self.is_empty() { + return Ok(None); + } + + let mut selected_rows = 0usize; + let mut segments = Vec::new(); + let mut cursor = self.position; + + while cursor < self.mask.len() + && cursor < self.loaded.total_rows + && selected_rows < batch_size + { + if !self.mask.value(cursor) { + cursor += 1; + continue; + } + + let Some(loaded) = self.loaded.range_containing(cursor) else { + // A selected row outside loaded ranges means the read plan asks + // Mask to materialize a row whose page data was pruned away. + // Returning an internal error is safer than silently producing + // incorrect rows. + return Err(ParquetError::General(format!( + "Internal Error: sparse mask selected row {cursor} outside loaded row ranges" + ))); + }; + + // Build the largest contiguous selected segment that stays within + // the current loaded range and does not exceed the output batch + // size. The record batch reader will skip to `row_range.start`, + // read exactly `row_range.len()` rows, and then apply this mask + // slice to the decoded batch. 
+ let segment_start = cursor; + let mut segment_end = cursor; + while segment_end < loaded.end + && segment_end < self.mask.len() + && selected_rows < batch_size + && self.mask.value(segment_end) + { + selected_rows += 1; + segment_end += 1; + } + + segments.push(MaskSegment { + row_range: segment_start..segment_end, + mask_start: segment_start, + mask_len: segment_end - segment_start, + }); + cursor = segment_end; + } + + self.position = cursor; + if segments.is_empty() { + self.position = self.mask.len().min(self.loaded.total_rows); + return Ok(None); + } + + Ok(Some(SparseMaskChunk { + segments, + selected_rows, + })) + } +} + /// Cursor for iterating a [`RowSelection`] during execution within a /// [`ReadPlan`](crate::arrow::arrow_reader::ReadPlan). /// -/// This keeps per-reader state such as the current position and delegates the -/// actual storage strategy to the internal `RowSelectionBacking`. +/// This keeps per-reader state such as the current position and delegates dense +/// or sparse mask state to the mask cursor. 
#[derive(Debug)] pub enum RowSelectionCursor { /// Reading all rows @@ -904,8 +1112,20 @@ impl RowSelectionCursor { /// Create a [`MaskCursor`] cursor backed by a bitmask, from an existing set of selectors pub(crate) fn new_mask_from_selectors(selectors: Vec) -> Self { Self::Mask(MaskCursor { - mask: boolean_mask_from_selectors(&selectors), - position: 0, + inner: MaskCursorInner::Dense { + mask: boolean_mask_from_selectors(&selectors), + position: 0, + }, + }) + } + + /// Create a [`SparseMaskCursor`] from the provided selectors and loaded row ranges + pub(crate) fn new_sparse_mask_from_selectors( + selectors: Vec, + loaded: LoadedRowRanges, + ) -> Self { + Self::Mask(MaskCursor { + inner: MaskCursorInner::Sparse(SparseMaskCursor::new(selectors, loaded)), }) } @@ -933,772 +1153,4 @@ fn boolean_mask_from_selectors(selectors: &[RowSelector]) -> BooleanBuffer { } #[cfg(test)] -mod tests { - use super::*; - use rand::{Rng, rng}; - - #[test] - fn test_from_filters() { - let filters = vec![ - BooleanArray::from(vec![false, false, false, true, true, true, true]), - BooleanArray::from(vec![true, true, false, false, true, true, true]), - BooleanArray::from(vec![false, false, false, false]), - BooleanArray::from(Vec::::new()), - ]; - - let selection = RowSelection::from_filters(&filters[..1]); - assert!(selection.selects_any()); - assert_eq!( - selection.selectors, - vec![RowSelector::skip(3), RowSelector::select(4)] - ); - - let selection = RowSelection::from_filters(&filters[..2]); - assert!(selection.selects_any()); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(3), - RowSelector::select(6), - RowSelector::skip(2), - RowSelector::select(3) - ] - ); - - let selection = RowSelection::from_filters(&filters); - assert!(selection.selects_any()); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(3), - RowSelector::select(6), - RowSelector::skip(2), - RowSelector::select(3), - RowSelector::skip(4) - ] - ); - - let selection = 
RowSelection::from_filters(&filters[2..3]); - assert!(!selection.selects_any()); - assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); - } - - #[test] - fn test_split_off() { - let mut selection = RowSelection::from(vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35), - ]); - - let split = selection.split_off(34); - assert_eq!(split.selectors, vec![RowSelector::skip(34)]); - assert_eq!( - selection.selectors, - vec![ - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35) - ] - ); - - let split = selection.split_off(5); - assert_eq!(split.selectors, vec![RowSelector::select(5)]); - assert_eq!( - selection.selectors, - vec![ - RowSelector::select(7), - RowSelector::skip(3), - RowSelector::select(35) - ] - ); - - let split = selection.split_off(8); - assert_eq!( - split.selectors, - vec![RowSelector::select(7), RowSelector::skip(1)] - ); - assert_eq!( - selection.selectors, - vec![RowSelector::skip(2), RowSelector::select(35)] - ); - - let split = selection.split_off(200); - assert_eq!( - split.selectors, - vec![RowSelector::skip(2), RowSelector::select(35)] - ); - assert!(selection.selectors.is_empty()); - } - - #[test] - fn test_offset() { - let selection = RowSelection::from(vec![ - RowSelector::select(5), - RowSelector::skip(23), - RowSelector::select(7), - RowSelector::skip(33), - RowSelector::select(6), - ]); - - let selection = selection.offset(2); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(2), - RowSelector::select(3), - RowSelector::skip(23), - RowSelector::select(7), - RowSelector::skip(33), - RowSelector::select(6), - ] - ); - - let selection = selection.offset(5); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(30), - RowSelector::select(5), - RowSelector::skip(33), - RowSelector::select(6), - ] - ); - - let selection = selection.offset(3); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(33), - 
RowSelector::select(2), - RowSelector::skip(33), - RowSelector::select(6), - ] - ); - - let selection = selection.offset(2); - assert_eq!( - selection.selectors, - vec![RowSelector::skip(68), RowSelector::select(6),] - ); - - let selection = selection.offset(3); - assert_eq!( - selection.selectors, - vec![RowSelector::skip(71), RowSelector::select(3),] - ); - } - - #[test] - fn test_and() { - let mut a = RowSelection::from(vec![ - RowSelector::skip(12), - RowSelector::select(23), - RowSelector::skip(3), - RowSelector::select(5), - ]); - - let b = RowSelection::from(vec![ - RowSelector::select(5), - RowSelector::skip(4), - RowSelector::select(15), - RowSelector::skip(4), - ]); - - let mut expected = RowSelection::from(vec![ - RowSelector::skip(12), - RowSelector::select(5), - RowSelector::skip(4), - RowSelector::select(14), - RowSelector::skip(3), - RowSelector::select(1), - RowSelector::skip(4), - ]); - - assert_eq!(a.and_then(&b), expected); - - a.split_off(7); - expected.split_off(7); - assert_eq!(a.and_then(&b), expected); - - let a = RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(3)]); - - let b = RowSelection::from(vec![ - RowSelector::select(2), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(1), - ]); - - assert_eq!( - a.and_then(&b).selectors, - vec![ - RowSelector::select(2), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(4) - ] - ); - } - - #[test] - fn test_combine() { - let a = vec![ - RowSelector::skip(3), - RowSelector::skip(3), - RowSelector::select(10), - RowSelector::skip(4), - ]; - - let b = vec![ - RowSelector::skip(3), - RowSelector::skip(3), - RowSelector::select(10), - RowSelector::skip(4), - RowSelector::skip(0), - ]; - - let c = vec![ - RowSelector::skip(2), - RowSelector::skip(4), - RowSelector::select(3), - RowSelector::select(3), - RowSelector::select(4), - RowSelector::skip(3), - RowSelector::skip(1), - RowSelector::skip(0), - ]; - - let expected = RowSelection::from(vec![ - 
RowSelector::skip(6), - RowSelector::select(10), - RowSelector::skip(4), - ]); - - assert_eq!(RowSelection::from_iter(a), expected); - assert_eq!(RowSelection::from_iter(b), expected); - assert_eq!(RowSelection::from_iter(c), expected); - } - - #[test] - fn test_combine_2elements() { - let a = vec![RowSelector::select(10), RowSelector::select(5)]; - let a_expect = vec![RowSelector::select(15)]; - assert_eq!(RowSelection::from_iter(a).selectors, a_expect); - - let b = vec![RowSelector::select(10), RowSelector::skip(5)]; - let b_expect = vec![RowSelector::select(10), RowSelector::skip(5)]; - assert_eq!(RowSelection::from_iter(b).selectors, b_expect); - - let c = vec![RowSelector::skip(10), RowSelector::select(5)]; - let c_expect = vec![RowSelector::skip(10), RowSelector::select(5)]; - assert_eq!(RowSelection::from_iter(c).selectors, c_expect); - - let d = vec![RowSelector::skip(10), RowSelector::skip(5)]; - let d_expect = vec![RowSelector::skip(15)]; - assert_eq!(RowSelection::from_iter(d).selectors, d_expect); - } - - #[test] - fn test_from_one_and_empty() { - let a = vec![RowSelector::select(10)]; - let selection1 = RowSelection::from(a.clone()); - assert_eq!(selection1.selectors, a); - - let b = vec![]; - let selection1 = RowSelection::from(b.clone()); - assert_eq!(selection1.selectors, b) - } - - #[test] - #[should_panic(expected = "selection exceeds the number of selected rows")] - fn test_and_longer() { - let a = RowSelection::from(vec![ - RowSelector::select(3), - RowSelector::skip(33), - RowSelector::select(3), - RowSelector::skip(33), - ]); - let b = RowSelection::from(vec![RowSelector::select(36)]); - a.and_then(&b); - } - - #[test] - #[should_panic(expected = "selection contains less than the number of selected rows")] - fn test_and_shorter() { - let a = RowSelection::from(vec![ - RowSelector::select(3), - RowSelector::skip(33), - RowSelector::select(3), - RowSelector::skip(33), - ]); - let b = RowSelection::from(vec![RowSelector::select(3)]); - 
a.and_then(&b); - } - - #[test] - fn test_intersect_row_selection_and_combine() { - // a size equal b size - let a = vec![ - RowSelector::select(5), - RowSelector::skip(4), - RowSelector::select(1), - ]; - let b = vec![ - RowSelector::select(8), - RowSelector::skip(1), - RowSelector::select(1), - ]; - - let res = intersect_row_selections(&a, &b); - assert_eq!( - res.selectors, - vec![ - RowSelector::select(5), - RowSelector::skip(4), - RowSelector::select(1), - ], - ); - - // a size larger than b size - let a = vec![ - RowSelector::select(3), - RowSelector::skip(33), - RowSelector::select(3), - RowSelector::skip(33), - ]; - let b = vec![RowSelector::select(36), RowSelector::skip(36)]; - let res = intersect_row_selections(&a, &b); - assert_eq!( - res.selectors, - vec![RowSelector::select(3), RowSelector::skip(69)] - ); - - // a size less than b size - let a = vec![RowSelector::select(3), RowSelector::skip(7)]; - let b = vec![ - RowSelector::select(2), - RowSelector::skip(2), - RowSelector::select(2), - RowSelector::skip(2), - RowSelector::select(2), - ]; - let res = intersect_row_selections(&a, &b); - assert_eq!( - res.selectors, - vec![RowSelector::select(2), RowSelector::skip(8)] - ); - - let a = vec![RowSelector::select(3), RowSelector::skip(7)]; - let b = vec![ - RowSelector::select(2), - RowSelector::skip(2), - RowSelector::select(2), - RowSelector::skip(2), - RowSelector::select(2), - ]; - let res = intersect_row_selections(&a, &b); - assert_eq!( - res.selectors, - vec![RowSelector::select(2), RowSelector::skip(8)] - ); - } - - #[test] - fn test_and_fuzz() { - let mut rand = rng(); - for _ in 0..100 { - let a_len = rand.random_range(10..100); - let a_bools: Vec<_> = (0..a_len).map(|_| rand.random_bool(0.2)).collect(); - let a = RowSelection::from_filters(&[BooleanArray::from(a_bools.clone())]); - - let b_len: usize = a_bools.iter().map(|x| *x as usize).sum(); - let b_bools: Vec<_> = (0..b_len).map(|_| rand.random_bool(0.8)).collect(); - let b = 
RowSelection::from_filters(&[BooleanArray::from(b_bools.clone())]); - - let mut expected_bools = vec![false; a_len]; - - let mut iter_b = b_bools.iter(); - for (idx, b) in a_bools.iter().enumerate() { - if *b && *iter_b.next().unwrap() { - expected_bools[idx] = true; - } - } - - let expected = RowSelection::from_filters(&[BooleanArray::from(expected_bools)]); - - let total_rows: usize = expected.selectors.iter().map(|s| s.row_count).sum(); - assert_eq!(a_len, total_rows); - - assert_eq!(a.and_then(&b), expected); - } - } - - #[test] - fn test_iter() { - // use the iter() API to show it does what is expected and - // avoid accidental deletion - let selectors = vec![ - RowSelector::select(3), - RowSelector::skip(33), - RowSelector::select(4), - ]; - - let round_tripped = RowSelection::from(selectors.clone()) - .iter() - .cloned() - .collect::>(); - assert_eq!(selectors, round_tripped); - } - - #[test] - fn test_limit() { - // Limit to existing limit should no-op - let selection = RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(90)]); - let limited = selection.limit(10); - assert_eq!(RowSelection::from(vec![RowSelector::select(10)]), limited); - - let selection = RowSelection::from(vec![ - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - ]); - - let limited = selection.clone().limit(5); - let expected = vec![RowSelector::select(5)]; - assert_eq!(limited.selectors, expected); - - let limited = selection.clone().limit(15); - let expected = vec![ - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(5), - ]; - assert_eq!(limited.selectors, expected); - - let limited = selection.clone().limit(0); - let expected = vec![]; - assert_eq!(limited.selectors, expected); - - let limited = selection.clone().limit(30); - let expected = vec![ - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - 
RowSelector::select(10), - ]; - assert_eq!(limited.selectors, expected); - - let limited = selection.limit(100); - let expected = vec![ - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(10), - ]; - assert_eq!(limited.selectors, expected); - } - - #[test] - fn test_scan_ranges() { - let index = vec![ - PageLocation { - offset: 0, - compressed_page_size: 10, - first_row_index: 0, - }, - PageLocation { - offset: 10, - compressed_page_size: 10, - first_row_index: 10, - }, - PageLocation { - offset: 20, - compressed_page_size: 10, - first_row_index: 20, - }, - PageLocation { - offset: 30, - compressed_page_size: 10, - first_row_index: 30, - }, - PageLocation { - offset: 40, - compressed_page_size: 10, - first_row_index: 40, - }, - PageLocation { - offset: 50, - compressed_page_size: 10, - first_row_index: 50, - }, - PageLocation { - offset: 60, - compressed_page_size: 10, - first_row_index: 60, - }, - ]; - - let selection = RowSelection::from(vec![ - // Skip first page - RowSelector::skip(10), - // Multiple selects in same page - RowSelector::select(3), - RowSelector::skip(3), - RowSelector::select(4), - // Select to page boundary - RowSelector::skip(5), - RowSelector::select(5), - // Skip full page past page boundary - RowSelector::skip(12), - // Select across page boundaries - RowSelector::select(12), - // Skip final page - RowSelector::skip(12), - ]); - - let ranges = selection.scan_ranges(&index); - - // assert_eq!(mask, vec![false, true, true, false, true, true, false]); - assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60]); - - let selection = RowSelection::from(vec![ - // Skip first page - RowSelector::skip(10), - // Multiple selects in same page - RowSelector::select(3), - RowSelector::skip(3), - RowSelector::select(4), - // Select to page boundary - RowSelector::skip(5), - RowSelector::select(5), - // Skip full page past page boundary - RowSelector::skip(12), - // Select across 
page boundaries - RowSelector::select(12), - RowSelector::skip(1), - // Select across page boundaries including final page - RowSelector::select(8), - ]); - - let ranges = selection.scan_ranges(&index); - - // assert_eq!(mask, vec![false, true, true, false, true, true, true]); - assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); - - let selection = RowSelection::from(vec![ - // Skip first page - RowSelector::skip(10), - // Multiple selects in same page - RowSelector::select(3), - RowSelector::skip(3), - RowSelector::select(4), - // Select to page boundary - RowSelector::skip(5), - RowSelector::select(5), - // Skip full page past page boundary - RowSelector::skip(12), - // Select to final page boundary - RowSelector::select(12), - RowSelector::skip(1), - // Skip across final page boundary - RowSelector::skip(8), - // Select from final page - RowSelector::select(4), - ]); - - let ranges = selection.scan_ranges(&index); - - // assert_eq!(mask, vec![false, true, true, false, true, true, true]); - assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); - - let selection = RowSelection::from(vec![ - // Skip first page - RowSelector::skip(10), - // Multiple selects in same page - RowSelector::select(3), - RowSelector::skip(3), - RowSelector::select(4), - // Select to remaining in page and first row of next page - RowSelector::skip(5), - RowSelector::select(6), - // Skip remaining - RowSelector::skip(50), - ]); - - let ranges = selection.scan_ranges(&index); - - // assert_eq!(mask, vec![false, true, true, false, true, true, true]); - assert_eq!(ranges, vec![10..20, 20..30, 30..40]); - } - - #[test] - fn test_from_ranges() { - let ranges = [1..3, 4..6, 6..6, 8..8, 9..10]; - let selection = RowSelection::from_consecutive_ranges(ranges.into_iter(), 10); - assert_eq!( - selection.selectors, - vec![ - RowSelector::skip(1), - RowSelector::select(2), - RowSelector::skip(1), - RowSelector::select(2), - RowSelector::skip(3), - RowSelector::select(1) - ] - 
); - - let out_of_order_ranges = [1..3, 8..10, 4..7]; - let result = std::panic::catch_unwind(|| { - RowSelection::from_consecutive_ranges(out_of_order_ranges.into_iter(), 10) - }); - assert!(result.is_err()); - } - - #[test] - fn test_empty_selector() { - let selection = RowSelection::from(vec![ - RowSelector::skip(0), - RowSelector::select(2), - RowSelector::skip(0), - RowSelector::select(2), - ]); - assert_eq!(selection.selectors, vec![RowSelector::select(4)]); - - let selection = RowSelection::from(vec![ - RowSelector::select(0), - RowSelector::skip(2), - RowSelector::select(0), - RowSelector::skip(2), - ]); - assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); - } - - #[test] - fn test_intersection() { - let selection = RowSelection::from(vec![RowSelector::select(1048576)]); - let result = selection.intersection(&selection); - assert_eq!(result, selection); - - let a = RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(20), - ]); - - let b = RowSelection::from(vec![ - RowSelector::skip(20), - RowSelector::select(20), - RowSelector::skip(10), - ]); - - let result = a.intersection(&b); - assert_eq!( - result.selectors, - vec![ - RowSelector::skip(30), - RowSelector::select(10), - RowSelector::skip(10) - ] - ); - } - - #[test] - fn test_union() { - let selection = RowSelection::from(vec![RowSelector::select(1048576)]); - let result = selection.union(&selection); - assert_eq!(result, selection); - - // NYNYY - let a = RowSelection::from(vec![ - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - RowSelector::select(20), - ]); - - // NNYYNYN - let b = RowSelection::from(vec![ - RowSelector::skip(20), - RowSelector::select(20), - RowSelector::skip(10), - RowSelector::select(10), - RowSelector::skip(10), - ]); - - let result = a.union(&b); - - // NYYYYYN - assert_eq!( - result.iter().collect::>(), - vec![ - &RowSelector::skip(10), - &RowSelector::select(50), - 
&RowSelector::skip(10), - ] - ); - } - - #[test] - fn test_row_count() { - let selection = RowSelection::from(vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35), - ]); - - assert_eq!(selection.row_count(), 12 + 35); - assert_eq!(selection.skipped_row_count(), 34 + 3); - - let selection = RowSelection::from(vec![RowSelector::select(12), RowSelector::select(35)]); - - assert_eq!(selection.row_count(), 12 + 35); - assert_eq!(selection.skipped_row_count(), 0); - - let selection = RowSelection::from(vec![RowSelector::skip(34), RowSelector::skip(3)]); - - assert_eq!(selection.row_count(), 0); - assert_eq!(selection.skipped_row_count(), 34 + 3); - - let selection = RowSelection::from(vec![]); - - assert_eq!(selection.row_count(), 0); - assert_eq!(selection.skipped_row_count(), 0); - } - - #[test] - fn test_trim() { - let selection = RowSelection::from(vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35), - ]); - - let expected = vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - RowSelector::select(35), - ]; - - assert_eq!(selection.trim().selectors, expected); - - let selection = RowSelection::from(vec![ - RowSelector::skip(34), - RowSelector::select(12), - RowSelector::skip(3), - ]); - - let expected = vec![RowSelector::skip(34), RowSelector::select(12)]; - - assert_eq!(selection.trim().selectors, expected); - } -} +mod tests; diff --git a/parquet/src/arrow/arrow_reader/selection/strategy.rs b/parquet/src/arrow/arrow_reader/selection/strategy.rs new file mode 100644 index 000000000000..08b49382c7bf --- /dev/null +++ b/parquet/src/arrow/arrow_reader/selection/strategy.rs @@ -0,0 +1,276 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Concepts used to choose how a [`RowSelection`] is executed. +//! +//! The row-filter reader makes two related but separate decisions: +//! +//! ```text +//! RowSelection materialization: +//! RowSelectionPolicy::Auto --> Mask or Selectors +//! +//! Row-group execution: +//! Predicate pushdown --> decode predicates, build RowSelection, decode output +//! Post-filter --> decode output + predicates once, then filter +//! ``` +//! +//! This module keeps the vocabulary for those decisions in one place. The +//! low-level cursors live in `selection.rs`; the push decoder cost model and +//! metrics use the summaries here to explain why a plan was chosen. + +use super::RowSelection; + +/// Fully resolved strategy for materializing [`RowSelection`] during execution. +/// +/// This is determined from a combination of user preference (via +/// [`super::RowSelectionPolicy`]) and safety considerations (for example, page +/// pruning can force a sparse mask representation). +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum RowSelectionStrategy { + /// Use a queue of [`super::RowSelector`] values. + Selectors, + /// Use a boolean mask to materialize the selection. 
+ Mask, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum RowGroupExecutionMode { + Pushdown(RowSelectionStrategy), + PostFilter, +} + +impl std::fmt::Display for RowGroupExecutionMode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Pushdown(RowSelectionStrategy::Mask) => f.write_str("Pushdown(Mask)"), + Self::Pushdown(RowSelectionStrategy::Selectors) => f.write_str("Pushdown(Selectors)"), + Self::PostFilter => f.write_str("PostFilter"), + } + } +} + +/// Why a final row-selection read plan used masks or selectors. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum RowSelectionStrategyReason { + /// The caller explicitly requested masks. + ForcedMask, + /// The caller explicitly requested selectors. + ForcedSelectors, + /// Auto chose masks because the selection has no non-empty selectors. + AutoMaskEmptySelection, + /// Auto chose masks because average selector length is below the threshold. + AutoMaskShortRuns, + /// Auto chose masks because selected rows are fragmented into many short runs. + AutoMaskFragmentedSelection, + /// Auto chose masks because most rows are selected and selector skipping is unlikely to pay off. + AutoMaskHighSelectedRatio, + /// Auto chose selectors because selected rows are clustered into long runs. + AutoSelectorClusteredSelection, + /// Auto chose selectors because average selector length reaches the threshold. + AutoSelectorLongRuns, +} + +/// Shape summary for a [`RowSelection`]. 
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct RowSelectionShape { + pub(crate) selected_rows: usize, + pub(crate) skipped_rows: usize, + pub(crate) selector_count: usize, + pub(crate) selected_run_count: usize, + pub(crate) skipped_run_count: usize, +} + +impl RowSelectionShape { + pub(crate) fn from_selection(selection: Option<&RowSelection>) -> Self { + let Some(selection) = selection else { + return Self::default(); + }; + + selection + .iter() + .fold(Self::default(), |mut shape, selector| { + if selector.row_count == 0 { + return shape; + } + + shape.selector_count += 1; + if selector.skip { + shape.skipped_rows += selector.row_count; + shape.skipped_run_count += 1; + } else { + shape.selected_rows += selector.row_count; + shape.selected_run_count += 1; + } + shape + }) + } + + pub(crate) fn total_rows(self) -> usize { + self.selected_rows + self.skipped_rows + } + + pub(crate) fn selected_ratio(self) -> f64 { + let total = self.total_rows(); + if total == 0 { + 0.0 + } else { + self.selected_rows as f64 / total as f64 + } + } + + pub(crate) fn run_density(self) -> f64 { + let total = self.total_rows(); + if total == 0 { + 0.0 + } else { + self.selector_count as f64 / total as f64 + } + } + + pub(crate) fn average_selected_run_length(self) -> f64 { + average_run_length(self.selected_rows, self.selected_run_count) + } + + pub(crate) fn average_skipped_run_length(self) -> f64 { + average_run_length(self.skipped_rows, self.skipped_run_count) + } + + pub(crate) fn add_assign(&mut self, other: Self) { + self.selected_rows += other.selected_rows; + self.skipped_rows += other.skipped_rows; + self.selector_count += other.selector_count; + self.selected_run_count += other.selected_run_count; + self.skipped_run_count += other.skipped_run_count; + } +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) enum CostModelDecisionReason { + /// Predicate pushdown kept almost everything and did not produce useful pruning. 
+ HighSelectivityNoPruning, + /// Predicate columns are already part of the output projection, and the + /// observed selected-row ratio is high enough that sequential post-filtering + /// is likely cheaper than many selected output reads. + ProjectedPredicateModerateSelectivity, + /// Fragmented runs with moderate selectivity often pay many small skip/read costs. + FragmentedModerateSelectivity, + /// Fragmented runs with high selectivity usually decode most rows plus pay pushdown overhead. + FragmentedHighSelectivity, + /// Not enough row groups have been observed to classify the scan. + ObservationIncomplete, + /// The observed shape still looks suitable for predicate pushdown. + PushdownStillPreferred, +} + +/// Aggregate row-selection shape observed while deciding whether Auto should +/// continue predicate pushdown or switch to post-filter execution. +/// +/// The classifier looks for shapes where row-level pushdown is unlikely to +/// recover its own overhead: +/// +/// ```text +/// no skipped rows -> predicate did not prune +/// tiny selected runs + many runs -> fragmented skip/read pattern +/// high selected ratio -> most output rows are decoded anyway +/// ``` +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct CostModelObservation { + pub(crate) observed_row_groups: usize, + pub(crate) shape: RowSelectionShape, +} + +impl CostModelObservation { + pub(crate) const OBSERVATION_ROW_GROUPS: usize = 1; + pub(crate) const MODERATE_SELECTIVITY_MIN_RATIO: f64 = 0.08; + pub(crate) const PROJECTED_PREDICATE_MIN_RATIO: f64 = 0.15; + pub(crate) const PROJECTED_PREDICATE_MAX_RATIO: f64 = 0.50; + + pub(crate) fn trigger_reason(self) -> CostModelDecisionReason { + if self.observed_row_groups < Self::OBSERVATION_ROW_GROUPS { + return CostModelDecisionReason::ObservationIncomplete; + } + + let shape = self.shape; + if shape.total_rows() > 0 && shape.skipped_rows == 0 && shape.selected_ratio() >= 0.95 { + return 
CostModelDecisionReason::HighSelectivityNoPruning; + } + + let fragmented = shape.average_selected_run_length() <= 4.0 && shape.run_density() >= 0.01; + + if !fragmented { + return CostModelDecisionReason::PushdownStillPreferred; + } + + let selected_ratio = shape.selected_ratio(); + if (Self::MODERATE_SELECTIVITY_MIN_RATIO..0.50).contains(&selected_ratio) { + return CostModelDecisionReason::FragmentedModerateSelectivity; + } + if selected_ratio < 0.50 { + return CostModelDecisionReason::PushdownStillPreferred; + } + + CostModelDecisionReason::FragmentedHighSelectivity + } + + pub(crate) fn prefers_post_filter(self) -> bool { + matches!( + self.trigger_reason(), + CostModelDecisionReason::HighSelectivityNoPruning + | CostModelDecisionReason::ProjectedPredicateModerateSelectivity + | CostModelDecisionReason::FragmentedModerateSelectivity + | CostModelDecisionReason::FragmentedHighSelectivity + ) + } +} + +/// Fully resolved decision for materializing a [`RowSelection`]. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) struct RowSelectionStrategyDecision { + pub(crate) strategy: RowSelectionStrategy, + pub(crate) reason: RowSelectionStrategyReason, + pub(crate) shape: RowSelectionShape, +} + +impl RowSelectionStrategyDecision { + pub(crate) fn new( + strategy: RowSelectionStrategy, + reason: RowSelectionStrategyReason, + shape: RowSelectionShape, + ) -> Self { + Self { + strategy, + reason, + shape, + } + } + + pub(crate) fn with_shape(self, shape: RowSelectionShape) -> Self { + Self { shape, ..self } + } + + pub(crate) fn uses_mask(self) -> bool { + matches!(self.strategy, RowSelectionStrategy::Mask) + } +} + +fn average_run_length(rows: usize, runs: usize) -> f64 { + if runs == 0 { + 0.0 + } else { + rows as f64 / runs as f64 + } +} diff --git a/parquet/src/arrow/arrow_reader/selection/tests.rs b/parquet/src/arrow/arrow_reader/selection/tests.rs new file mode 100644 index 000000000000..da1ca7ed1fa5 --- /dev/null +++ 
b/parquet/src/arrow/arrow_reader/selection/tests.rs @@ -0,0 +1,887 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::*; +use rand::{Rng, rng}; + +#[test] +fn test_loaded_row_ranges_detects_sparse_ranges() { + assert!(!LoadedRowRanges::new(std::iter::once(0..6).collect(), 6).is_sparse()); + assert!(!LoadedRowRanges::new(vec![], 0).is_sparse()); + assert!(LoadedRowRanges::new(vec![0..2, 4..6], 6).is_sparse()); + assert!(LoadedRowRanges::new(std::iter::once(1..6).collect(), 6).is_sparse()); +} + +#[test] +fn test_sparse_mask_cursor_skips_unloaded_ranges() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + + let loaded = LoadedRowRanges::new(vec![0..2, 4..6], 6); + let selectors: Vec = selection.into(); + let mut cursor = SparseMaskCursor::new(selectors, loaded); + + let chunk = cursor.next_sparse_mask_chunk(1024).unwrap().unwrap(); + assert_eq!(chunk.selected_rows, 2); + assert_eq!( + chunk.segments, + vec![ + MaskSegment { + row_range: 0..1, + mask_start: 0, + mask_len: 1, + }, + MaskSegment { + row_range: 5..6, + mask_start: 5, + mask_len: 1, + }, + ] + ); + assert!(cursor.is_empty()); +} + +#[test] +fn 
test_sparse_mask_cursor_errors_selected_rows_after_loaded_ranges() { + let selection = RowSelection::from(vec![RowSelector::skip(5), RowSelector::select(1)]); + + let loaded = LoadedRowRanges::new(std::iter::once(0..2).collect(), 6); + let selectors: Vec = selection.into(); + let mut cursor = SparseMaskCursor::new(selectors, loaded); + + let err = cursor.next_sparse_mask_chunk(1024).unwrap_err(); + assert!( + err.to_string() + .contains("sparse mask selected row 5 outside loaded row ranges"), + "{err}" + ); +} + +#[test] +fn test_sparse_mask_cursor_exhausts_empty_loaded_ranges() { + let selection = RowSelection::from(vec![RowSelector::select(6)]); + + let loaded = LoadedRowRanges::new(vec![], 6); + let selectors: Vec = selection.into(); + let mut cursor = SparseMaskCursor::new(selectors, loaded); + + let err = cursor.next_sparse_mask_chunk(1024).unwrap_err(); + assert!( + err.to_string() + .contains("sparse mask selected row 0 outside loaded row ranges"), + "{err}" + ); +} + +#[test] +fn test_from_filters() { + let filters = vec![ + BooleanArray::from(vec![false, false, false, true, true, true, true]), + BooleanArray::from(vec![true, true, false, false, true, true, true]), + BooleanArray::from(vec![false, false, false, false]), + BooleanArray::from(Vec::::new()), + ]; + + let selection = RowSelection::from_filters(&filters[..1]); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(3), RowSelector::select(4)] + ); + + let selection = RowSelection::from_filters(&filters[..2]); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(3), + RowSelector::select(6), + RowSelector::skip(2), + RowSelector::select(3) + ] + ); + + let selection = RowSelection::from_filters(&filters); + assert!(selection.selects_any()); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(3), + RowSelector::select(6), + RowSelector::skip(2), + RowSelector::select(3), + RowSelector::skip(4) + 
] + ); + + let selection = RowSelection::from_filters(&filters[2..3]); + assert!(!selection.selects_any()); + assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); +} + +#[test] +fn test_split_off() { + let mut selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]); + + let split = selection.split_off(34); + assert_eq!(split.selectors, vec![RowSelector::skip(34)]); + assert_eq!( + selection.selectors, + vec![ + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35) + ] + ); + + let split = selection.split_off(5); + assert_eq!(split.selectors, vec![RowSelector::select(5)]); + assert_eq!( + selection.selectors, + vec![ + RowSelector::select(7), + RowSelector::skip(3), + RowSelector::select(35) + ] + ); + + let split = selection.split_off(8); + assert_eq!( + split.selectors, + vec![RowSelector::select(7), RowSelector::skip(1)] + ); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(2), RowSelector::select(35)] + ); + + let split = selection.split_off(200); + assert_eq!( + split.selectors, + vec![RowSelector::skip(2), RowSelector::select(35)] + ); + assert!(selection.selectors.is_empty()); +} + +#[test] +fn test_offset() { + let selection = RowSelection::from(vec![ + RowSelector::select(5), + RowSelector::skip(23), + RowSelector::select(7), + RowSelector::skip(33), + RowSelector::select(6), + ]); + + let selection = selection.offset(2); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(2), + RowSelector::select(3), + RowSelector::skip(23), + RowSelector::select(7), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(5); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(30), + RowSelector::select(5), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(3); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(33), + 
RowSelector::select(2), + RowSelector::skip(33), + RowSelector::select(6), + ] + ); + + let selection = selection.offset(2); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(68), RowSelector::select(6),] + ); + + let selection = selection.offset(3); + assert_eq!( + selection.selectors, + vec![RowSelector::skip(71), RowSelector::select(3),] + ); +} + +#[test] +fn test_and() { + let mut a = RowSelection::from(vec![ + RowSelector::skip(12), + RowSelector::select(23), + RowSelector::skip(3), + RowSelector::select(5), + ]); + + let b = RowSelection::from(vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(15), + RowSelector::skip(4), + ]); + + let mut expected = RowSelection::from(vec![ + RowSelector::skip(12), + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(14), + RowSelector::skip(3), + RowSelector::select(1), + RowSelector::skip(4), + ]); + + assert_eq!(a.and_then(&b), expected); + + a.split_off(7); + expected.split_off(7); + assert_eq!(a.and_then(&b), expected); + + let a = RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(3)]); + + let b = RowSelection::from(vec![ + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + ]); + + assert_eq!( + a.and_then(&b).selectors, + vec![ + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(4) + ] + ); +} + +#[test] +fn test_combine() { + let a = vec![ + RowSelector::skip(3), + RowSelector::skip(3), + RowSelector::select(10), + RowSelector::skip(4), + ]; + + let b = vec![ + RowSelector::skip(3), + RowSelector::skip(3), + RowSelector::select(10), + RowSelector::skip(4), + RowSelector::skip(0), + ]; + + let c = vec![ + RowSelector::skip(2), + RowSelector::skip(4), + RowSelector::select(3), + RowSelector::select(3), + RowSelector::select(4), + RowSelector::skip(3), + RowSelector::skip(1), + RowSelector::skip(0), + ]; + + let expected = RowSelection::from(vec![ + 
RowSelector::skip(6), + RowSelector::select(10), + RowSelector::skip(4), + ]); + + assert_eq!(RowSelection::from_iter(a), expected); + assert_eq!(RowSelection::from_iter(b), expected); + assert_eq!(RowSelection::from_iter(c), expected); +} + +#[test] +fn test_combine_2elements() { + let a = vec![RowSelector::select(10), RowSelector::select(5)]; + let a_expect = vec![RowSelector::select(15)]; + assert_eq!(RowSelection::from_iter(a).selectors, a_expect); + + let b = vec![RowSelector::select(10), RowSelector::skip(5)]; + let b_expect = vec![RowSelector::select(10), RowSelector::skip(5)]; + assert_eq!(RowSelection::from_iter(b).selectors, b_expect); + + let c = vec![RowSelector::skip(10), RowSelector::select(5)]; + let c_expect = vec![RowSelector::skip(10), RowSelector::select(5)]; + assert_eq!(RowSelection::from_iter(c).selectors, c_expect); + + let d = vec![RowSelector::skip(10), RowSelector::skip(5)]; + let d_expect = vec![RowSelector::skip(15)]; + assert_eq!(RowSelection::from_iter(d).selectors, d_expect); +} + +#[test] +fn test_from_one_and_empty() { + let a = vec![RowSelector::select(10)]; + let selection1 = RowSelection::from(a.clone()); + assert_eq!(selection1.selectors, a); + + let b = vec![]; + let selection1 = RowSelection::from(b.clone()); + assert_eq!(selection1.selectors, b) +} + +#[test] +#[should_panic(expected = "selection exceeds the number of selected rows")] +fn test_and_longer() { + let a = RowSelection::from(vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(3), + RowSelector::skip(33), + ]); + let b = RowSelection::from(vec![RowSelector::select(36)]); + a.and_then(&b); +} + +#[test] +#[should_panic(expected = "selection contains less than the number of selected rows")] +fn test_and_shorter() { + let a = RowSelection::from(vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(3), + RowSelector::skip(33), + ]); + let b = RowSelection::from(vec![RowSelector::select(3)]); + a.and_then(&b); +} + 
+#[test] +fn test_intersect_row_selection_and_combine() { + // a size equal b size + let a = vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(1), + ]; + let b = vec![ + RowSelector::select(8), + RowSelector::skip(1), + RowSelector::select(1), + ]; + + let res = intersect_row_selections(&a, &b); + assert_eq!( + res.selectors, + vec![ + RowSelector::select(5), + RowSelector::skip(4), + RowSelector::select(1), + ], + ); + + // a size larger than b size + let a = vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(3), + RowSelector::skip(33), + ]; + let b = vec![RowSelector::select(36), RowSelector::skip(36)]; + let res = intersect_row_selections(&a, &b); + assert_eq!( + res.selectors, + vec![RowSelector::select(3), RowSelector::skip(69)] + ); + + // a size less than b size + let a = vec![RowSelector::select(3), RowSelector::skip(7)]; + let b = vec![ + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + ]; + let res = intersect_row_selections(&a, &b); + assert_eq!( + res.selectors, + vec![RowSelector::select(2), RowSelector::skip(8)] + ); + + let a = vec![RowSelector::select(3), RowSelector::skip(7)]; + let b = vec![ + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + RowSelector::skip(2), + RowSelector::select(2), + ]; + let res = intersect_row_selections(&a, &b); + assert_eq!( + res.selectors, + vec![RowSelector::select(2), RowSelector::skip(8)] + ); +} + +#[test] +fn test_and_fuzz() { + let mut rand = rng(); + for _ in 0..100 { + let a_len = rand.random_range(10..100); + let a_bools: Vec<_> = (0..a_len).map(|_| rand.random_bool(0.2)).collect(); + let a = RowSelection::from_filters(&[BooleanArray::from(a_bools.clone())]); + + let b_len: usize = a_bools.iter().map(|x| *x as usize).sum(); + let b_bools: Vec<_> = (0..b_len).map(|_| rand.random_bool(0.8)).collect(); + let b = 
RowSelection::from_filters(&[BooleanArray::from(b_bools.clone())]); + + let mut expected_bools = vec![false; a_len]; + + let mut iter_b = b_bools.iter(); + for (idx, b) in a_bools.iter().enumerate() { + if *b && *iter_b.next().unwrap() { + expected_bools[idx] = true; + } + } + + let expected = RowSelection::from_filters(&[BooleanArray::from(expected_bools)]); + + let total_rows: usize = expected.selectors.iter().map(|s| s.row_count).sum(); + assert_eq!(a_len, total_rows); + + assert_eq!(a.and_then(&b), expected); + } +} + +#[test] +fn test_iter() { + // use the iter() API to show it does what is expected and + // avoid accidental deletion + let selectors = vec![ + RowSelector::select(3), + RowSelector::skip(33), + RowSelector::select(4), + ]; + + let round_tripped = RowSelection::from(selectors.clone()) + .iter() + .cloned() + .collect::<Vec<_>>(); + assert_eq!(selectors, round_tripped); +} + +#[test] +fn test_limit() { + // Limit to existing limit should no-op + let selection = RowSelection::from(vec![RowSelector::select(10), RowSelector::skip(90)]); + let limited = selection.limit(10); + assert_eq!(RowSelection::from(vec![RowSelector::select(10)]), limited); + + let selection = RowSelection::from(vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + ]); + + let limited = selection.clone().limit(5); + let expected = vec![RowSelector::select(5)]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(15); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(5), + ]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(0); + let expected = vec![]; + assert_eq!(limited.selectors, expected); + + let limited = selection.clone().limit(30); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + 
RowSelector::select(10), + ]; + assert_eq!(limited.selectors, expected); + + let limited = selection.limit(100); + let expected = vec![ + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(10), + ]; + assert_eq!(limited.selectors, expected); +} + +#[test] +fn test_scan_ranges() { + let index = vec![ + PageLocation { + offset: 0, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 10, + compressed_page_size: 10, + first_row_index: 10, + }, + PageLocation { + offset: 20, + compressed_page_size: 10, + first_row_index: 20, + }, + PageLocation { + offset: 30, + compressed_page_size: 10, + first_row_index: 30, + }, + PageLocation { + offset: 40, + compressed_page_size: 10, + first_row_index: 40, + }, + PageLocation { + offset: 50, + compressed_page_size: 10, + first_row_index: 50, + }, + PageLocation { + offset: 60, + compressed_page_size: 10, + first_row_index: 60, + }, + ]; + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to page boundary + RowSelector::skip(5), + RowSelector::select(5), + // Skip full page past page boundary + RowSelector::skip(12), + // Select across page boundaries + RowSelector::select(12), + // Skip final page + RowSelector::skip(12), + ]); + + let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, false]); + assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60]); + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to page boundary + RowSelector::skip(5), + RowSelector::select(5), + // Skip full page past page boundary + RowSelector::skip(12), + // Select across page 
boundaries + RowSelector::select(12), + RowSelector::skip(1), + // Select across page boundaries including final page + RowSelector::select(8), + ]); + + let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, true]); + assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to page boundary + RowSelector::skip(5), + RowSelector::select(5), + // Skip full page past page boundary + RowSelector::skip(12), + // Select to final page boundary + RowSelector::select(12), + RowSelector::skip(1), + // Skip across final page boundary + RowSelector::skip(8), + // Select from final page + RowSelector::select(4), + ]); + + let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, true]); + assert_eq!(ranges, vec![10..20, 20..30, 40..50, 50..60, 60..70]); + + let selection = RowSelection::from(vec![ + // Skip first page + RowSelector::skip(10), + // Multiple selects in same page + RowSelector::select(3), + RowSelector::skip(3), + RowSelector::select(4), + // Select to remaining in page and first row of next page + RowSelector::skip(5), + RowSelector::select(6), + // Skip remaining + RowSelector::skip(50), + ]); + + let ranges = selection.scan_ranges(&index); + + // assert_eq!(mask, vec![false, true, true, false, true, true, true]); + assert_eq!(ranges, vec![10..20, 20..30, 30..40]); +} + +#[test] +fn test_selected_page_row_ranges() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + let pages = vec![ + PageLocation { + offset: 0, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 10, + compressed_page_size: 10, + first_row_index: 2, + }, + 
PageLocation { + offset: 20, + compressed_page_size: 10, + first_row_index: 4, + }, + ]; + + assert_eq!( + selection.selected_page_row_ranges(&pages, 6), + vec![0..2, 4..6] + ); +} + +#[test] +fn test_from_ranges() { + let ranges = [1..3, 4..6, 6..6, 8..8, 9..10]; + let selection = RowSelection::from_consecutive_ranges(ranges.into_iter(), 10); + assert_eq!( + selection.selectors, + vec![ + RowSelector::skip(1), + RowSelector::select(2), + RowSelector::skip(1), + RowSelector::select(2), + RowSelector::skip(3), + RowSelector::select(1) + ] + ); + + let out_of_order_ranges = [1..3, 8..10, 4..7]; + let result = std::panic::catch_unwind(|| { + RowSelection::from_consecutive_ranges(out_of_order_ranges.into_iter(), 10) + }); + assert!(result.is_err()); +} + +#[test] +fn test_empty_selector() { + let selection = RowSelection::from(vec![ + RowSelector::skip(0), + RowSelector::select(2), + RowSelector::skip(0), + RowSelector::select(2), + ]); + assert_eq!(selection.selectors, vec![RowSelector::select(4)]); + + let selection = RowSelection::from(vec![ + RowSelector::select(0), + RowSelector::skip(2), + RowSelector::select(0), + RowSelector::skip(2), + ]); + assert_eq!(selection.selectors, vec![RowSelector::skip(4)]); +} + +#[test] +fn test_intersection() { + let selection = RowSelection::from(vec![RowSelector::select(1048576)]); + let result = selection.intersection(&selection); + assert_eq!(result, selection); + + let a = RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(20), + ]); + + let b = RowSelection::from(vec![ + RowSelector::skip(20), + RowSelector::select(20), + RowSelector::skip(10), + ]); + + let result = a.intersection(&b); + assert_eq!( + result.selectors, + vec![ + RowSelector::skip(30), + RowSelector::select(10), + RowSelector::skip(10) + ] + ); +} + +#[test] +fn test_union() { + let selection = RowSelection::from(vec![RowSelector::select(1048576)]); + let result = 
selection.union(&selection); + assert_eq!(result, selection); + + // NYNYY + let a = RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + RowSelector::select(20), + ]); + + // NNYYNYN + let b = RowSelection::from(vec![ + RowSelector::skip(20), + RowSelector::select(20), + RowSelector::skip(10), + RowSelector::select(10), + RowSelector::skip(10), + ]); + + let result = a.union(&b); + + // NYYYYYN + assert_eq!( + result.iter().collect::<Vec<_>>(), + vec![ + &RowSelector::skip(10), + &RowSelector::select(50), + &RowSelector::skip(10), + ] + ); +} + +#[test] +fn test_row_count() { + let selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]); + + assert_eq!(selection.row_count(), 12 + 35); + assert_eq!(selection.skipped_row_count(), 34 + 3); + + let selection = RowSelection::from(vec![RowSelector::select(12), RowSelector::select(35)]); + + assert_eq!(selection.row_count(), 12 + 35); + assert_eq!(selection.skipped_row_count(), 0); + + let selection = RowSelection::from(vec![RowSelector::skip(34), RowSelector::skip(3)]); + + assert_eq!(selection.row_count(), 0); + assert_eq!(selection.skipped_row_count(), 34 + 3); + + let selection = RowSelection::from(vec![]); + + assert_eq!(selection.row_count(), 0); + assert_eq!(selection.skipped_row_count(), 0); +} + +#[test] +fn test_trim() { + let selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]); + + let expected = vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + RowSelector::select(35), + ]; + + assert_eq!(selection.trim().selectors, expected); + + let selection = RowSelection::from(vec![ + RowSelector::skip(34), + RowSelector::select(12), + RowSelector::skip(3), + ]); + + let expected = vec![RowSelector::skip(34), RowSelector::select(12)]; + + 
assert_eq!(selection.trim().selectors, expected); +} diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 14f7b9b6b41b..9cc5fe87d914 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -275,6 +275,14 @@ pub struct ProjectionMask { mask: Option<Vec<bool>>, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct RootColumnSelection { + /// Top-level root column indices with at least one selected leaf. + pub(crate) included_indices: Vec<usize>, + /// True when every top-level root is either entirely selected or skipped. + pub(crate) selects_whole_roots: bool, +} + impl ProjectionMask { /// Create a [`ProjectionMask`] which selects all columns pub fn all() -> Self { @@ -381,6 +389,52 @@ impl ProjectionMask { self.mask.as_ref().map(|m| m[leaf_idx]).unwrap_or(true) } + /// Returns true if each top-level root column is either fully selected or + /// fully skipped. + /// + /// This is useful for code paths that project decoded [`arrow_array::RecordBatch`] + /// values by top-level Arrow field index. A full `struct` root can be moved + /// as one batch column, but selecting only `struct.child` would require + /// recursively trimming the nested array. + pub(crate) fn selects_whole_root_columns(&self, schema: &SchemaDescriptor) -> bool { + self.root_column_selection(schema).selects_whole_roots + } + + /// Summarizes this leaf mask at top-level parquet root-column granularity. + /// + /// This intentionally combines the included-root list and whole-root check + /// in one leaf scan. Post-filter planning needs both values when converting + /// parquet projection masks to decoded [`arrow_array::RecordBatch`] column indices. 
+ pub(crate) fn root_column_selection(&self, schema: &SchemaDescriptor) -> RootColumnSelection { + let num_roots = schema.root_schema().get_fields().len(); + let mut root_leaf_counts = vec![0usize; num_roots]; + let mut included_leaf_counts = vec![0usize; num_roots]; + let mut included_root_seen = vec![false; num_roots]; + let mut included_indices = Vec::new(); + + for leaf_idx in 0..schema.num_columns() { + let root_idx = schema.get_column_root_idx(leaf_idx); + root_leaf_counts[root_idx] += 1; + if self.leaf_included(leaf_idx) { + included_leaf_counts[root_idx] += 1; + if !included_root_seen[root_idx] { + included_root_seen[root_idx] = true; + included_indices.push(root_idx); + } + } + } + + let selects_whole_roots = included_leaf_counts + .into_iter() + .zip(root_leaf_counts) + .all(|(included, total)| included == 0 || included == total); + + RootColumnSelection { + included_indices, + selects_whole_roots, + } + } + /// Union two projection masks /// /// Example: @@ -785,6 +839,40 @@ mod test { assert_eq!(mask1.mask, None); } + #[test] + fn test_projection_mask_root_column_selection() { + let schema = parse_schema( + " + message test_schema { + OPTIONAL BYTE_ARRAY tag (UTF8); + OPTIONAL group payload { + REQUIRED INT64 id; + REQUIRED BYTE_ARRAY label (UTF8); + } + REQUIRED INT64 value; + } + ", + ); + + let selection = ProjectionMask::all().root_column_selection(&schema); + assert_eq!(selection.included_indices, [0, 1, 2]); + assert!(selection.selects_whole_roots); + + let selection = ProjectionMask::none(schema.num_columns()).root_column_selection(&schema); + assert!(selection.included_indices.is_empty()); + assert!(selection.selects_whole_roots); + + let selection = + ProjectionMask::columns(&schema, ["payload"]).root_column_selection(&schema); + assert_eq!(selection.included_indices, [1]); + assert!(selection.selects_whole_roots); + + let selection = ProjectionMask::columns(&schema, ["tag", "payload.label"]) + .root_column_selection(&schema); + 
assert_eq!(selection.included_indices, [0, 1]); + assert!(!selection.selects_whole_roots); + } + #[test] fn test_projection_mask_intersect() { let mut mask1 = ProjectionMask { diff --git a/parquet/src/arrow/push_decoder/mod.rs b/parquet/src/arrow/push_decoder/mod.rs index 6dc5520bb975..8bdbe80744f8 100644 --- a/parquet/src/arrow/push_decoder/mod.rs +++ b/parquet/src/arrow/push_decoder/mod.rs @@ -618,6 +618,7 @@ impl ParquetDecoderState { ) -> Result<(Self, DecodeResult), ParquetError> { let mut current_state = self; loop { + current_state.disable_post_filter_cost_model(); let (next_state, decode_result) = current_state.transition()?; // if more data is needed to transition, can't proceed further without it match decode_result { @@ -632,9 +633,15 @@ impl ParquetDecoderState { Self::ReadingRowGroup { .. } => current_state = next_state, // have a reader ready, so return it and set ourself to ReadingRowGroup Self::DecodingRowGroup { - record_batch_reader, + mut record_batch_reader, remaining_row_groups, } => { + // The reader API can advance to future row groups before + // the returned reader is consumed. Disable post-filter + // cost modeling before building row groups for this API; this + // materialization remains only as a guard for mixed API use + // where a post-filter reader was already active. 
+ record_batch_reader.materialize_post_filter()?; let result = DecodeResult::Data(*record_batch_reader); let next_state = Self::ReadingRowGroup { remaining_row_groups, @@ -648,6 +655,15 @@ impl ParquetDecoderState { } } + fn disable_post_filter_cost_model(&mut self) { + if let Self::ReadingRowGroup { + remaining_row_groups, + } = self + { + remaining_row_groups.disable_post_filter_cost_model(); + } + } + /// Current state --> next state + output /// /// This function is called to get the next RecordBatch @@ -872,20 +888,49 @@ impl ParquetDecoderState { mod test { use super::*; use crate::DecodeResult; - use crate::arrow::arrow_reader::{ArrowPredicateFn, RowFilter, RowSelection, RowSelector}; + #[cfg(feature = "async")] + use crate::arrow::ParquetRecordBatchStreamBuilder; + #[cfg(feature = "async")] + use crate::arrow::arrow_reader::ArrowReaderOptions; + use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; + use crate::arrow::arrow_reader::{ + ArrowPredicateFn, ParquetRecordBatchReader, RowFilter, RowSelection, RowSelectionPolicy, + RowSelector, + }; + #[cfg(feature = "async")] + use crate::arrow::async_reader::AsyncFileReader; use crate::arrow::push_decoder::{ParquetPushDecoder, ParquetPushDecoderBuilder}; use crate::arrow::{ArrowWriter, ProjectionMask}; use crate::errors::ParquetError; use crate::file::metadata::ParquetMetaDataPushDecoder; + #[cfg(feature = "async")] + use crate::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; use crate::file::properties::WriterProperties; + #[cfg(feature = "async")] + use arrow::compute::kernels::cmp::neq; use arrow::compute::kernels::cmp::{gt, lt}; + #[cfg(feature = "async")] + use arrow_array::builder::{ArrayBuilder, StringViewBuilder}; use arrow_array::cast::AsArray; use arrow_array::types::Int64Type; - use arrow_array::{ArrayRef, Int64Array, RecordBatch, StringViewArray}; + use arrow_array::{ + ArrayRef, BooleanArray, Int64Array, RecordBatch, StringViewArray, StructArray, + }; + 
#[cfg(feature = "async")] + use arrow_schema::Schema; + use arrow_schema::{DataType, Field}; use arrow_select::concat::concat_batches; + use arrow_select::filter::filter_record_batch; use bytes::Bytes; + #[cfg(feature = "async")] + use futures::future::BoxFuture; + #[cfg(feature = "async")] + use futures::{FutureExt, StreamExt}; + #[cfg(feature = "async")] + use rand::{Rng, SeedableRng, rngs::StdRng}; use std::fmt::Debug; use std::ops::Range; + use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::{Arc, LazyLock}; /// Test decoder struct size (as they are copied around on each transition, they @@ -1192,136 +1237,1009 @@ mod test { expect_finished(decoder.try_decode()); } - /// Decode with multiple filters that require multiple requests + /// Decode with multiple filters that require multiple requests + #[test] + fn test_decoder_multi_filters() { + // Create a decoder for decoding parquet data (note it does not have any IO / readers) + let builder = + ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata()).unwrap(); + + // Values in column "a" range 0..399 + // Values in column "b" range 400..799 + // First filter: "a" > 175 (last data page in Row Group 0) + // Second filter: "b" < 625 (last data page in Row Group 0 and first DataPage in RowGroup 1) + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + + // a > 175 + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + |batch: RecordBatch| { + let scalar_175 = Int64Array::new_scalar(175); + let column = batch.column(0).as_primitive::<Int64Type>(); + gt(column, &scalar_175) + }, + ); + + // b < 625 + let row_filter_b = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["b"]), + |batch: RecordBatch| { + let scalar_625 = Int64Array::new_scalar(625); + let column = batch.column(0).as_primitive::<Int64Type>(); + lt(column, &scalar_625) + }, + ); + + let mut decoder = builder + .with_projection( + ProjectionMask::columns(&schema_descr, 
["c"]), // read "c" + ) + .with_row_filter(RowFilter::new(vec![ + Box::new(row_filter_a), + Box::new(row_filter_b), + ])) + .build() + .unwrap(); + + // First row group, first filter (a > 175) + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + // first row group, second filter (b < 625) + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + // first row group, data pages for "c" + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + // expect the first batch to be decoded: rows 176..199, column "c" + let batch1 = expect_data(decoder.try_decode()); + let expected1 = TEST_BATCH.slice(176, 24).project(&[2]).unwrap(); + assert_eq!(batch1, expected1); + + // Second row group, first filter (a > 175) + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + // Second row group, second filter (b < 625) + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + // Second row group, data pages for "c" + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + // expect the second batch to be decoded: rows 200..224, column "c" + let batch2 = expect_data(decoder.try_decode()); + let expected2 = TEST_BATCH.slice(200, 25).project(&[2]).unwrap(); + assert_eq!(batch2, expected2); + + expect_finished(decoder.try_decode()); + } + + /// Decode with a filter that uses a column that is also projected, and expect + /// that the filter pages are reused (don't refetch them) + #[test] + fn test_decoder_reuses_filter_pages() { + // Create a decoder for decoding parquet data (note it does not have any IO / readers) + let builder = + ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata()).unwrap(); + + // Values in column "a" range 0..399 + // First filter: "a" > 250 
(nothing in Row Group 0, last data page in Row Group 1) + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + + // a > 250 + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + |batch: RecordBatch| { + let scalar_250 = Int64Array::new_scalar(250); + let column = batch.column(0).as_primitive::<Int64Type>(); + gt(column, &scalar_250) + }, + ); + + let mut decoder = builder + .with_projection( + // read only column "a" to test that filter pages are reused + ProjectionMask::columns(&schema_descr, ["a"]), // read "a" + ) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .build() + .unwrap(); + + // First row group, first filter (a > 250) + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + // expect the first row group to be filtered out (no rows match) + + // Second row group, first filter (a > 250) + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + // expect that the second row group is decoded: rows 251..399, column "a" + // Note that the filter pages for "a" should be reused and no additional data + // should be requested + let batch = expect_data(decoder.try_decode()); + let expected = TEST_BATCH.slice(251, 149).project(&[0]).unwrap(); + assert_eq!(batch, expected); + + expect_finished(decoder.try_decode()); + } + + #[test] + fn test_decoder_auto_cost_model_uses_post_filter_after_observation() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: 
RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + let scalar_neg_one = Int64Array::new_scalar(-1); + let column = batch.column(0).as_primitive::<Int64Type>(); + gt(column, &scalar_neg_one) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 100); + assert_eq!(batch, TEST_BATCH.slice(0, 100).project(&[2]).unwrap()); + + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 200); + assert_eq!(batch, TEST_BATCH.slice(100, 100).project(&[2]).unwrap()); + + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + 300, + "cost model should evaluate predicates while producing the current row group" + ); + assert_eq!(batch, TEST_BATCH.slice(200, 100).project(&[2]).unwrap()); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 300); + + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); + assert_eq!(batch, TEST_BATCH.slice(300, 100).project(&[2]).unwrap()); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_high_selectivity_no_pruning_count(), + Some(1) + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_try_next_reader_skips_post_filter_cost_model() { + let 
data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + let scalar_neg_one = Int64Array::new_scalar(-1); + let column = batch.column(0).as_primitive::<Int64Type>(); + gt(column, &scalar_neg_one) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let mut reader = next_reader_with_data(&mut decoder, data).unwrap(); + assert_eq!( + reader.next().unwrap().unwrap(), + TEST_BATCH + .slice(row_group_idx * 100, 100) + .project(&[2]) + .unwrap() + ); + assert!(reader.next().is_none()); + } + + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert!(next_reader_with_data(&mut decoder, data).is_none()); + } + + #[test] + #[cfg(feature = "async")] + fn test_decoder_post_filter_without_base_selection_skips_output_selection_resolve() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled_with_phase_profile(); + + let row_filter_c = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["c"]), + move |batch: RecordBatch| Ok(BooleanArray::from(vec![true; batch.num_rows()])), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a"])) + 
.with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_c)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + TEST_BATCH + .slice(row_group_idx * 100, 100) + .project(&[0]) + .unwrap() + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + + let report = metrics.phase_profile_report().unwrap(); + assert_eq!(phase_profile_count(&report, "output_selection_resolve"), 0); + } + + #[test] + fn test_decoder_post_filter_supports_whole_nested_output_projection() { + let data = &NESTED_COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_tag = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["tag"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + Ok(BooleanArray::from(vec![true; batch.num_rows()])) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["payload"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_tag)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + NESTED_TEST_BATCH + .slice(row_group_idx * 100, 100) + 
.project(&[1]) + .unwrap() + ); + } + + assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_post_filter_keeps_partial_nested_predicate_on_pushdown() { + let data = &NESTED_COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_payload_label = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["payload.label"]), + move |batch: RecordBatch| { + let payload = batch + .column(0) + .as_any() + .downcast_ref::<StructArray>() + .unwrap(); + assert_eq!(payload.num_columns(), 1); + Ok(BooleanArray::from(vec![true; batch.num_rows()])) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["payload"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_payload_label)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + NESTED_TEST_BATCH + .slice(row_group_idx * 100, 100) + .project(&[1]) + .unwrap() + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_post_filter_applies_fragmented_filter() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + 
ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + Ok(not_multiple_of_three_filter(&batch)) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..2 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100 + ); + assert_eq!( + batch, + expected_c_not_multiple_of_three(row_group_idx * 100, 100) + ); + } + + for row_group_idx in 2..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100, + "cost model should evaluate predicates while producing the current row group" + ); + assert_eq!( + batch, + expected_c_not_multiple_of_three(row_group_idx * 100, 100) + ); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100 + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_fragmented_high_selectivity_count(), + Some(1) + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + 
#[test] + fn test_decoder_auto_cost_model_records_fragmented_moderate_selectivity() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + Ok(multiple_of_ten_filter(&batch)) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100 + ); + assert_eq!(batch, expected_c_multiple_of_ten(row_group_idx * 100, 100)); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + (row_group_idx + 1) * 100 + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_fragmented_moderate_selectivity_count(), + Some(1) + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_switches_for_moderate_fixed_width_deferred_output() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + 
ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(multiple_of_five_filter(&batch)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["b"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(batch, expected_b_multiple_of_five(row_group_idx * 100, 100)); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_fragmented_moderate_selectivity_count(), + Some(1) + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_switches_for_partially_projected_fixed_width_chain() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(multiple_of_ten_filter(&batch)), + ); + let projected_filter_b = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["b"]), + move |batch: RecordBatch| Ok(BooleanArray::from(vec![true; batch.num_rows()])), + ); + + let mut decoder = builder + 
.with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["b"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![ + Box::new(filter_a), + Box::new(projected_filter_b), + ])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(batch, expected_b_multiple_of_ten(row_group_idx * 100, 100)); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + metrics.cost_model_fragmented_moderate_selectivity_count(), + Some(1) + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_projected_predicate_with_deferred_variable_width_output() + { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + Ok(first_rows_per_hundred_filter(&batch, 20)) + }, + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let 
batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_c_first_rows_per_hundred(row_group_idx * 100, 100, 20) + ); + } + + assert_eq!(predicate_rows.load(Ordering::Relaxed), 400); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert_eq!( + metrics.cost_model_projected_predicate_moderate_selectivity_count(), + Some(0) + ); + assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "deferred variable-width output should keep projected predicate pushdown" + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_projected_predicate_with_expensive_deferred_fixed_output() + { + let data = &WIDE_FIXED_COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 20)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns( + &schema_descr, + ["a", "b", "c", "d", "e"], + )) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_wide_fixed_first_rows_per_hundred(row_group_idx * 100, 100, 20) + ); + } + + 
assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert_eq!( + metrics.cost_model_projected_predicate_moderate_selectivity_count(), + Some(0) + ); + assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "expensive deferred fixed-width output should keep projected predicate pushdown" + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_switches_for_projected_predicate_with_deferred_fixed_output() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 20)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_b_first_rows_per_hundred(row_group_idx * 100, 100, 20) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert_eq!( + 
metrics.cost_model_projected_predicate_moderate_selectivity_count(), + Some(1) + ); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "projected predicate should still reuse cached predicate data" + ); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_uses_post_filter_after_observing_fixed_width_read_projection() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(not_multiple_of_three_filter(&batch)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_not_multiple_of_three(row_group_idx * 100, 100) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_sparse_fixed_width_read_projection() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = 
ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 1)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_first_rows_per_hundred(row_group_idx * 100, 100, 1) + ); + } + + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "sparse projected pushdown should consume the predicate cache" + ); + } + + #[test] + fn test_decoder_auto_cost_model_observes_fixed_width_deferred_output() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(not_multiple_of_three_filter(&batch)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["b"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_b_not_multiple_of_three(row_group_idx * 100, 100) + ); + } + + 
assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(4)); + assert!(next_batch_with_data(&mut decoder, data).is_none()); + } + + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_sparse_projected_predicate() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 5)), + ); + + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_c_first_rows_per_hundred(row_group_idx * 100, 100, 5) + ); + } + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "sparse projected pushdown should consume the predicate cache" + ); + } + #[test] - fn test_decoder_multi_filters() { - // Create a decoder for decoding parquet data (note it does not have any IO / readers) + fn 
test_decoder_auto_cost_model_reuses_cache_for_very_sparse_projected_predicate_chain() { + let data = &COST_MODEL_TEST_FILE_DATA; let builder = - ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata()).unwrap(); - - // Values in column "a" range 0..399 - // Values in column "b" range 400..799 - // First filter: "a" > 175 (last data page in Row Group 0) - // Second filter: "b" < 625 (last data page in Row Group 0 and first DataPage in RowGroup 1) + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); - // a > 175 - let row_filter_a = ArrowPredicateFn::new( + let sparse_filter_a = ArrowPredicateFn::new( ProjectionMask::columns(&schema_descr, ["a"]), - |batch: RecordBatch| { - let scalar_175 = Int64Array::new_scalar(175); - let column = batch.column(0).as_primitive::(); - gt(column, &scalar_175) - }, + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 1)), ); - - // b < 625 - let row_filter_b = ArrowPredicateFn::new( - ProjectionMask::columns(&schema_descr, ["b"]), - |batch: RecordBatch| { - let scalar_625 = Int64Array::new_scalar(625); - let column = batch.column(0).as_primitive::(); - lt(column, &scalar_625) - }, + let cache_reusing_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(BooleanArray::from(vec![true; batch.num_rows()])), ); let mut decoder = builder - .with_projection( - ProjectionMask::columns(&schema_descr, ["c"]), // read "c" - ) + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) .with_row_filter(RowFilter::new(vec![ - Box::new(row_filter_a), - Box::new(row_filter_b), + Box::new(sparse_filter_a), + Box::new(cache_reusing_filter_a), ])) + .with_metrics(metrics.clone()) .build() .unwrap(); - 
// First row group, first filter (a > 175) - let ranges = expect_needs_data(decoder.try_decode()); - push_ranges_to_decoder(&mut decoder, ranges); - - // first row group, second filter (b < 625) - let ranges = expect_needs_data(decoder.try_decode()); - push_ranges_to_decoder(&mut decoder, ranges); - - // first row group, data pages for "c" - let ranges = expect_needs_data(decoder.try_decode()); - push_ranges_to_decoder(&mut decoder, ranges); + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_c_first_rows_per_hundred(row_group_idx * 100, 100, 1) + ); + } - // expect the first batch to be decoded: rows 176..199, column "c" - let batch1 = expect_data(decoder.try_decode()); - let expected1 = TEST_BATCH.slice(176, 24).project(&[2]).unwrap(); - assert_eq!(batch1, expected1); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert!( + metrics + .records_read_from_cache() + .is_some_and(|records| records > 0), + "projected predicate chains should reuse cached predicate data" + ); + } - // Second row group, first filter (a > 175) - let ranges = expect_needs_data(decoder.try_decode()); - push_ranges_to_decoder(&mut decoder, ranges); + #[test] + fn test_decoder_auto_cost_model_keeps_pushdown_for_high_selectivity_projected_predicate() { + let data = &COST_MODEL_TEST_FILE_DATA; + let builder = + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); - // Second row group, second filter (b < 625) - let ranges = expect_needs_data(decoder.try_decode()); - push_ranges_to_decoder(&mut decoder, ranges); + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + move |batch: RecordBatch| Ok(first_rows_per_hundred_filter(&batch, 90)), + ); - // Second row group, data pages for "c" - let ranges = 
expect_needs_data(decoder.try_decode()); - push_ranges_to_decoder(&mut decoder, ranges); + let mut decoder = builder + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["a", "c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .build() + .unwrap(); - // expect the second batch to be decoded: rows 200..224, column "c" - let batch2 = expect_data(decoder.try_decode()); - let expected2 = TEST_BATCH.slice(200, 25).project(&[2]).unwrap(); - assert_eq!(batch2, expected2); + for row_group_idx in 0..4 { + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + batch, + expected_a_c_first_rows_per_hundred(row_group_idx * 100, 100, 90) + ); + } - expect_finished(decoder.try_decode()); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_still_preferred_count(), Some(1)); } - /// Decode with a filter that uses a column that is also projected, and expect - /// that the filter pages are reused (don't refetch them) #[test] - fn test_decoder_reuses_filter_pages() { - // Create a decoder for decoding parquet data (note it does not have any IO / readers) + fn test_decoder_auto_cost_model_with_row_selection_does_not_evaluate_current_row_group_twice() { + let data = &COST_MODEL_TEST_FILE_DATA; let builder = - ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata()).unwrap(); - - // Values in column "a" range 0..399 - // First filter: "a" > 250 (nothing in Row Group 0, last data page in Row Group 1) + ParquetPushDecoderBuilder::try_new_decoder(parquet_metadata_for_data(data)).unwrap(); let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = 
ArrowReaderMetrics::enabled(); - // a > 250 + let predicate_rows = Arc::new(AtomicUsize::new(0)); + let predicate_rows_for_filter = Arc::clone(&predicate_rows); let row_filter_a = ArrowPredicateFn::new( ProjectionMask::columns(&schema_descr, ["a"]), - |batch: RecordBatch| { - let scalar_250 = Int64Array::new_scalar(250); + move |batch: RecordBatch| { + predicate_rows_for_filter.fetch_add(batch.num_rows(), Ordering::Relaxed); + let scalar_neg_one = Int64Array::new_scalar(-1); let column = batch.column(0).as_primitive::(); - gt(column, &scalar_250) + gt(column, &scalar_neg_one) }, ); + let mut row_selection = Vec::with_capacity(101); + for _ in 0..50 { + row_selection.push(RowSelector::select(1)); + row_selection.push(RowSelector::skip(1)); + } + row_selection.push(RowSelector::select(100)); + let mut decoder = builder - .with_projection( - // read only column "a" to test that filter pages are reused - ProjectionMask::columns(&schema_descr, ["a"]), // read "a" - ) + .with_batch_size(100) + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection(RowSelection::from(row_selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) .build() .unwrap(); - // First row group, first filter (a > 175) - let ranges = expect_needs_data(decoder.try_decode()); - push_ranges_to_decoder(&mut decoder, ranges); - - // expect the first row group to be filtered out (no rows match) - - // Second row group, first filter (a > 250) - let ranges = expect_needs_data(decoder.try_decode()); - push_ranges_to_decoder(&mut decoder, ranges); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!( + predicate_rows.load(Ordering::Relaxed), + 50, + "cost-model observation must not re-run the predicate for the same row group" + ); + assert_eq!(batch, expected_c_every_other(0, 100)); - // expect that the second row group is decoded: 
rows 251..399, column "a" - // Note that the filter pages for "a" should be reused and no additional data - // should be requested - let batch = expect_data(decoder.try_decode()); - let expected = TEST_BATCH.slice(251, 149).project(&[0]).unwrap(); - assert_eq!(batch, expected); + let batch = next_batch_with_data(&mut decoder, data).unwrap(); + assert_eq!(predicate_rows.load(Ordering::Relaxed), 150); + assert_eq!(batch, TEST_BATCH.slice(100, 100).project(&[2]).unwrap()); - expect_finished(decoder.try_decode()); + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(1)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(1)); } #[test] @@ -1429,6 +2347,50 @@ mod test { ); } + /// Auto post-filter cost modeling is disabled for `LIMIT` because the limit is + /// applied during row-group planning. Limit scans should therefore avoid + /// cost-model observation bookkeeping entirely. + #[test] + fn test_decoder_filter_with_limit_skips_auto_cost_model_observation() { + let builder = + ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata()).unwrap(); + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + let metrics = ArrowReaderMetrics::enabled(); + + let row_filter_a = ArrowPredicateFn::new( + ProjectionMask::columns(&schema_descr, ["a"]), + |batch: RecordBatch| { + let scalar_neg_one = Int64Array::new_scalar(-1); + let column = batch.column(0).as_primitive::(); + gt(column, &scalar_neg_one) + }, + ); + + let mut decoder = builder + .with_projection(ProjectionMask::columns(&schema_descr, ["c"])) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 32 }) + .with_row_filter(RowFilter::new(vec![Box::new(row_filter_a)])) + .with_metrics(metrics.clone()) + .with_limit(10) + .build() + .unwrap(); + + let ranges = expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + let ranges = 
expect_needs_data(decoder.try_decode()); + push_ranges_to_decoder(&mut decoder, ranges); + + let batch = expect_data(decoder.try_decode()); + let expected = TEST_BATCH.slice(0, 10).project(&[2]).unwrap(); + assert_eq!(batch, expected); + expect_finished(decoder.try_decode()); + + assert_eq!(metrics.cost_model_observed_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_pushdown_row_group_count(), Some(0)); + assert_eq!(metrics.cost_model_post_filter_row_group_count(), Some(0)); + } + /// Once the limit has been satisfied by a prior row group, subsequent /// row groups should be skipped entirely — no data request for their /// filter columns. @@ -1743,6 +2705,61 @@ mod test { expect_finished(decoder.try_decode()); } + #[cfg(feature = "async")] + #[test] + #[ignore = "local profiling aid for row-filter phase breakdowns"] + fn profile_utf8_view_row_filter_phases() { + const TOTAL_ROWS: usize = 500_000; + const ROW_GROUP_SIZE: usize = 100_000; + + let parquet_file = Bytes::from(write_utf8_profile_parquet_file(TOTAL_ROWS, ROW_GROUP_SIZE)); + let runtime = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + + for (name, policy) in [ + ("auto", RowSelectionPolicy::default()), + ("mask", RowSelectionPolicy::Mask), + ("selectors", RowSelectionPolicy::Selectors), + ] { + let reader = ProfileInMemoryReader::try_new(&parquet_file).unwrap(); + let schema_descr = reader.metadata().file_metadata().schema_descr(); + let projection = ProjectionMask::roots(schema_descr, [0, 1, 2, 3]); + let predicate_projection = ProjectionMask::roots(schema_descr, [2]); + let row_filter = RowFilter::new(vec![Box::new(ArrowPredicateFn::new( + predicate_projection, + |batch| { + let array = batch.column(batch.schema().index_of("utf8View")?); + neq(array, &StringViewArray::new_scalar("")) + }, + ))]); + let metrics = ArrowReaderMetrics::enabled_with_phase_profile(); + + runtime.block_on(async { + let mut stream = ParquetRecordBatchStreamBuilder::new(reader) + 
.await + .unwrap() + .with_batch_size(8192) + .with_projection(projection) + .with_row_filter(row_filter) + .with_row_selection_policy(policy) + .with_metrics(metrics.clone()) + .build() + .unwrap(); + + let mut rows = 0; + while let Some(batch) = stream.next().await { + rows += batch.unwrap().num_rows(); + } + assert!(rows > 0 && rows < TOTAL_ROWS); + }); + + println!("phase profile: {name}"); + println!("{}", metrics.phase_profile_report().unwrap()); + } + } + /// `into_builder` between row groups recovers a builder for the /// not-yet-decoded row groups; rebuilding it with a new row filter /// applies that filter to the subsequent row groups while leaving the @@ -2049,13 +3066,67 @@ mod test { /// c | "string_100".."string_199" | 2 | 0 /// c | "string_200".."string_299" | 1 | 1 /// c | "string_300".."string_399" | 2 | 1 - static TEST_FILE_DATA: LazyLock = LazyLock::new(|| { - let input_batch = &TEST_BATCH; + static TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(200, 100)); + + static COST_MODEL_TEST_FILE_DATA: LazyLock = LazyLock::new(|| write_test_file(100, 50)); + + static WIDE_FIXED_TEST_BATCH: LazyLock = LazyLock::new(|| { + let a: ArrayRef = Arc::new(Int64Array::from_iter_values(0..400)); + let b: ArrayRef = Arc::new(Int64Array::from_iter_values(400..800)); + let c: ArrayRef = Arc::new(Int64Array::from_iter_values(800..1200)); + let d: ArrayRef = Arc::new(Int64Array::from_iter_values(1200..1600)); + let e: ArrayRef = Arc::new(Int64Array::from_iter_values(1600..2000)); + + RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c), ("d", d), ("e", e)]).unwrap() + }); + + static WIDE_FIXED_COST_MODEL_TEST_FILE_DATA: LazyLock = + LazyLock::new(|| write_batch_test_file(&WIDE_FIXED_TEST_BATCH, 100, 50)); + + static NESTED_TEST_BATCH: LazyLock = LazyLock::new(|| { + let tag: ArrayRef = Arc::new(StringViewArray::from_iter_values( + (0..400).map(|idx| format!("tag_{}", idx % 7)), + )); + let payload = StructArray::from(vec![ + ( + 
Arc::new(Field::new("id", DataType::Int64, false)), + Arc::new(Int64Array::from_iter_values(1_000..1_400)) as ArrayRef, + ), + ( + Arc::new(Field::new("label", DataType::Utf8View, false)), + Arc::new(StringViewArray::from_iter_values( + (0..400).map(|idx| format!("payload_{idx}")), + )) as ArrayRef, + ), + ]); + let payload: ArrayRef = Arc::new(payload); + let value: ArrayRef = Arc::new(Int64Array::from_iter_values(10_000..10_400)); + + RecordBatch::try_from_iter(vec![("tag", tag), ("payload", payload), ("value", value)]) + .unwrap() + }); + + static NESTED_COST_MODEL_TEST_FILE_DATA: LazyLock = + LazyLock::new(|| write_batch_test_file(&NESTED_TEST_BATCH, 100, 50)); + + fn write_test_file(max_row_group_row_count: usize, data_page_row_count_limit: usize) -> Bytes { + write_batch_test_file( + &TEST_BATCH, + max_row_group_row_count, + data_page_row_count_limit, + ) + } + + fn write_batch_test_file( + input_batch: &RecordBatch, + max_row_group_row_count: usize, + data_page_row_count_limit: usize, + ) -> Bytes { let mut output = Vec::new(); let writer_options = WriterProperties::builder() - .set_max_row_group_row_count(Some(200)) - .set_data_page_row_count_limit(100) + .set_max_row_group_row_count(Some(max_row_group_row_count)) + .set_data_page_row_count_limit(data_page_row_count_limit) .build(); let mut writer = ArrowWriter::try_new(&mut output, input_batch.schema(), Some(writer_options)).unwrap(); @@ -2071,7 +3142,123 @@ mod test { } writer.close().unwrap(); Bytes::from(output) - }); + } + + #[cfg(feature = "async")] + fn write_utf8_profile_parquet_file(total_rows: usize, row_group_size: usize) -> Vec { + let batch = create_utf8_profile_batch(total_rows); + let props = WriterProperties::builder() + .set_compression(crate::basic::Compression::SNAPPY) + .set_max_row_group_row_count(Some(row_group_size)) + .build(); + let mut buffer = vec![]; + let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + 
writer.close().unwrap(); + buffer + } + + #[cfg(feature = "async")] + fn create_utf8_profile_batch(size: usize) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("int64", DataType::Int64, false), + Field::new("float64", DataType::Float64, false), + Field::new("utf8View", DataType::Utf8View, true), + Field::new("ts", DataType::Int64, false), + ])); + + let int64 = Arc::new(Int64Array::from_iter_values(0..size as i64)) as ArrayRef; + let float64 = Arc::new(arrow_array::Float64Array::from_iter_values( + (0..size).map(|i| (i % 100) as f64), + )) as ArrayRef; + let utf8 = create_profile_utf8_view_array(size); + let ts = Arc::new(Int64Array::from_iter_values( + (0..size).map(|i| (i % 10_000) as i64), + )) as ArrayRef; + + RecordBatch::try_new(schema, vec![int64, float64, utf8, ts]).unwrap() + } + + #[cfg(feature = "async")] + fn create_profile_utf8_view_array(size: usize) -> ArrayRef { + const AVG_RUN_LENGTH: usize = 4; + const EMPTY_DENSITY: u32 = 85; + + let mut builder = StringViewBuilder::with_capacity(size); + let mut rng = StdRng::seed_from_u64(44); + while builder.len() < size { + let mut run_length = rng.random_range(1..AVG_RUN_LENGTH); + if builder.len() + run_length > size { + run_length = size - builder.len(); + } + + if rng.random_range(0..100) < EMPTY_DENSITY { + for _ in 0..run_length { + builder.append_value(""); + } + } else { + for _ in 0..run_length { + builder.append_value(random_profile_string(&mut rng)); + } + } + } + Arc::new(builder.finish()) as ArrayRef + } + + #[cfg(feature = "async")] + fn random_profile_string(rng: &mut StdRng) -> String { + let charset = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + let len = if rng.random_bool(0.5) { + rng.random_range(13..21) + } else { + rng.random_range(3..12) + }; + (0..len) + .map(|_| charset[rng.random_range(0..charset.len())] as char) + .collect() + } + + #[derive(Debug, Clone)] + #[cfg(feature = "async")] + struct ProfileInMemoryReader { + inner: Bytes, + 
metadata: Arc, + } + + #[cfg(feature = "async")] + impl ProfileInMemoryReader { + fn try_new(inner: &Bytes) -> crate::errors::Result { + let mut metadata_reader = + ParquetMetaDataReader::new().with_page_index_policy(PageIndexPolicy::Required); + metadata_reader.try_parse(inner)?; + let metadata = metadata_reader.finish().map(Arc::new)?; + + Ok(Self { + inner: inner.clone(), + metadata, + }) + } + + fn metadata(&self) -> &Arc { + &self.metadata + } + } + + #[cfg(feature = "async")] + impl AsyncFileReader for ProfileInMemoryReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, crate::errors::Result> { + let data = self.inner.slice(range.start as usize..range.end as usize); + async move { Ok(data) }.boxed() + } + + fn get_metadata<'a>( + &'a mut self, + _options: Option<&'a ArrowReaderOptions>, + ) -> BoxFuture<'a, crate::errors::Result>> { + let metadata = Arc::clone(&self.metadata); + async move { Ok(metadata) }.boxed() + } + } /// Return the length of [`TEST_FILE_DATA`], in bytes fn test_file_len() -> u64 { @@ -2083,17 +3270,18 @@ mod test { 0..test_file_len() } - /// Return a slice of the test file data from the given range - pub fn test_file_slice(range: Range) -> Bytes { - let start: usize = range.start.try_into().unwrap(); - let end: usize = range.end.try_into().unwrap(); - TEST_FILE_DATA.slice(start..end) - } - /// return the metadata for the test file pub fn test_file_parquet_metadata() -> Arc { - let mut metadata_decoder = ParquetMetaDataPushDecoder::try_new(test_file_len()).unwrap(); - push_ranges_to_metadata_decoder(&mut metadata_decoder, vec![test_file_range()]); + parquet_metadata_for_data(&TEST_FILE_DATA) + } + + fn parquet_metadata_for_data(data: &Bytes) -> Arc { + let mut metadata_decoder = ParquetMetaDataPushDecoder::try_new(data.len() as u64).unwrap(); + push_ranges_to_metadata_decoder_with_data( + &mut metadata_decoder, + std::iter::once(0..data.len() as u64).collect(), + data, + ); let metadata = 
metadata_decoder.try_decode().unwrap(); let DecodeResult::Data(metadata) = metadata else { panic!("Expected metadata to be decoded successfully"); @@ -2101,14 +3289,14 @@ mod test { Arc::new(metadata) } - /// Push the given ranges to the metadata decoder, simulating reading from a file - fn push_ranges_to_metadata_decoder( + fn push_ranges_to_metadata_decoder_with_data( metadata_decoder: &mut ParquetMetaDataPushDecoder, ranges: Vec>, + data: &Bytes, ) { let data = ranges .iter() - .map(|range| test_file_slice(range.clone())) + .map(|range| data.slice(range.start as usize..range.end as usize)) .collect::>(); metadata_decoder.push_ranges(ranges, data).unwrap(); } @@ -2133,13 +3321,197 @@ mod test { } fn push_ranges_to_decoder(decoder: &mut ParquetPushDecoder, ranges: Vec>) { + push_ranges_to_decoder_with_data(decoder, ranges, &TEST_FILE_DATA); + } + + fn push_ranges_to_decoder_with_data( + decoder: &mut ParquetPushDecoder, + ranges: Vec>, + data: &Bytes, + ) { let data = ranges .iter() - .map(|range| test_file_slice(range.clone())) + .map(|range| data.slice(range.start as usize..range.end as usize)) .collect::>(); decoder.push_ranges(ranges, data).unwrap(); } + #[cfg(feature = "async")] + fn phase_profile_count(report: &str, phase: &str) -> usize { + report + .lines() + .skip(1) + .find_map(|line| { + let mut fields = line.split(','); + let name = fields.next()?; + let _total_ms = fields.next()?; + let count = fields.next()?; + (name == phase).then(|| count.parse().unwrap()) + }) + .unwrap_or(0) + } + + fn not_multiple_of_three_filter(batch: &RecordBatch) -> BooleanArray { + let column = batch.column(0).as_primitive::(); + BooleanArray::from( + (0..batch.num_rows()) + .map(|idx| column.value(idx) % 3 != 0) + .collect::>(), + ) + } + + fn expected_c_not_multiple_of_three(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = not_multiple_of_three_filter(&batch); + let projected = batch.project(&[2]).unwrap(); + 
filter_record_batch(&projected, &filter).unwrap() + } + + fn expected_a_not_multiple_of_three(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = not_multiple_of_three_filter(&batch); + let projected = batch.project(&[0]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn expected_a_first_rows_per_hundred( + offset: usize, + len: usize, + rows_per_hundred: i64, + ) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = first_rows_per_hundred_filter(&batch, rows_per_hundred); + let projected = batch.project(&[0]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn expected_b_not_multiple_of_three(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = not_multiple_of_three_filter(&batch); + let projected = batch.project(&[1]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn expected_b_multiple_of_ten(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = multiple_of_ten_filter(&batch); + let projected = batch.project(&[1]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn multiple_of_ten_filter(batch: &RecordBatch) -> BooleanArray { + let column = batch.column(0).as_primitive::(); + BooleanArray::from( + (0..batch.num_rows()) + .map(|idx| column.value(idx) % 10 == 0) + .collect::>(), + ) + } + + fn multiple_of_five_filter(batch: &RecordBatch) -> BooleanArray { + let column = batch.column(0).as_primitive::(); + BooleanArray::from( + (0..batch.num_rows()) + .map(|idx| column.value(idx) % 5 == 0) + .collect::>(), + ) + } + + fn expected_b_multiple_of_five(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = multiple_of_five_filter(&batch); + let projected = batch.project(&[1]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn 
expected_c_multiple_of_ten(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = multiple_of_ten_filter(&batch); + let projected = batch.project(&[2]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn first_rows_per_hundred_filter(batch: &RecordBatch, rows_per_hundred: i64) -> BooleanArray { + let column = batch.column(0).as_primitive::(); + BooleanArray::from( + (0..batch.num_rows()) + .map(|idx| column.value(idx) % 100 < rows_per_hundred) + .collect::>(), + ) + } + + fn expected_a_c_first_rows_per_hundred( + offset: usize, + len: usize, + rows_per_hundred: i64, + ) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = first_rows_per_hundred_filter(&batch, rows_per_hundred); + let projected = batch.project(&[0, 2]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn expected_a_b_first_rows_per_hundred( + offset: usize, + len: usize, + rows_per_hundred: i64, + ) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = first_rows_per_hundred_filter(&batch, rows_per_hundred); + let projected = batch.project(&[0, 1]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn expected_wide_fixed_first_rows_per_hundred( + offset: usize, + len: usize, + rows_per_hundred: i64, + ) -> RecordBatch { + let batch = WIDE_FIXED_TEST_BATCH.slice(offset, len); + let filter = first_rows_per_hundred_filter(&batch, rows_per_hundred); + filter_record_batch(&batch, &filter).unwrap() + } + + fn expected_c_every_other(offset: usize, len: usize) -> RecordBatch { + let batch = TEST_BATCH.slice(offset, len); + let filter = BooleanArray::from((0..len).map(|idx| idx % 2 == 0).collect::>()); + let projected = batch.project(&[2]).unwrap(); + filter_record_batch(&projected, &filter).unwrap() + } + + fn next_reader_with_data( + decoder: &mut ParquetPushDecoder, + data: &Bytes, + ) -> Option { + loop { + match decoder + .try_next_reader() + 
.expect("decoder should produce a reader or request data") + { + DecodeResult::NeedsData(ranges) => { + push_ranges_to_decoder_with_data(decoder, ranges, data); + } + DecodeResult::Data(reader) => return Some(reader), + DecodeResult::Finished => return None, + } + } + } + + fn next_batch_with_data(decoder: &mut ParquetPushDecoder, data: &Bytes) -> Option { + loop { + match decoder + .try_decode() + .expect("decoder should produce a batch or request data") + { + DecodeResult::NeedsData(ranges) => { + push_ranges_to_decoder_with_data(decoder, ranges, data); + } + DecodeResult::Data(batch) => return Some(batch), + DecodeResult::Finished => return None, + } + } + } + /// Expect that the [`DecodeResult`] is a [`DecodeResult::Data`] and return the corresponding element fn expect_data(result: Result, ParquetError>) -> T { match result.expect("Expected Ok(DecodeResult::Data(T))") { diff --git a/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs new file mode 100644 index 000000000000..ba5c80ddabac --- /dev/null +++ b/parquet/src/arrow/push_decoder/reader_builder/cost_model.rs @@ -0,0 +1,452 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Runtime post-filter cost decisions for push decoder row groups. +//! +//! The cost model is intentionally adaptive rather than purely static. There +//! are two ways to enter post-filter execution: +//! +//! * a narrow static rule starts there for variable-width predicate columns +//! that are not already part of the output projection, where building +//! fragmented pushdown selections is commonly expensive +//! * the first eligible row group runs predicate pushdown, records the actual +//! `RowSelection` shape, and lets later row groups use post-filter if the +//! shape suggests pushdown is doing extra work without pruning enough rows. +//! When predicate columns are already part of the output projection, the +//! observed selected-row ratio can also choose post-filter without requiring +//! fragmented selected runs. +//! +//! ```text +//! Start +//! | +//! v +//! Observing -- incomplete observation --> Observing +//! | +//! +-- pushdown still preferred ------> UsePushdown +//! | +//! +-- post-filter preferred + supported --> UsePostFilter +//! ``` +//! +//! The cost model only applies to `Auto`. Explicit `Mask` and `Selectors` are treated +//! as user intent and are not overridden here. + +use super::{RowBudget, RowGroupReaderBuilder}; +use crate::arrow::ProjectionMask; +use crate::arrow::arrow_reader::RowFilter; +use crate::arrow::arrow_reader::RowSelectionPolicy; +use crate::arrow::arrow_reader::selection::{ + CostModelDecisionReason, CostModelObservation, RowSelectionShape, RowSelectionStrategyDecision, +}; +use crate::arrow::schema::{ParquetField, ParquetFieldType}; +use crate::basic::Type as PhysicalType; + +#[derive(Debug)] +pub(super) enum RowGroupCostModelState { + /// Collect row-selection shape from early row groups before choosing a mode. + Observing { observation: CostModelObservation }, + /// Predicate pushdown remains the execution mode for this reader. + UsePushdown, + /// Later row groups should decode once and evaluate predicates after decode. 
+ UsePostFilter, +} + +impl Default for RowGroupCostModelState { + fn default() -> Self { + Self::Observing { + observation: CostModelObservation::default(), + } + } +} + +#[derive(Debug)] +struct PostFilterProjectionRoles { + /// Columns required to evaluate all predicates. + predicate_projection: ProjectionMask, + /// Columns decoded by post-filter execution. + read_projection: ProjectionMask, + /// True when predicate columns are already part of the caller output. + predicate_already_projected: bool, +} + +impl RowGroupReaderBuilder { + const CHEAP_FIXED_WIDTH_READ_BYTES_PER_ROW: f64 = 24.0; + + pub(super) fn should_use_post_filter_by_cost(&self, budget: RowBudget) -> bool { + matches!(self.cost_model_state, RowGroupCostModelState::UsePostFilter) + && self.post_filter_context_supported(budget) + } + + fn post_filter_context_supported(&self, budget: RowBudget) -> bool { + // Keep the runtime switch narrow: + // + // * `Auto` means the caller allowed the reader to choose. + // * `limit` and `offset` are applied during row-group planning; moving + // predicates after decode changes where short-circuiting can happen. + // * virtual columns are not read from Parquet pages and need their + // existing projection path. + self.post_filter_cost_model_enabled + && matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. 
}) + && budget.is_unbounded() + && !self.has_virtual_columns() + } + + pub(super) fn post_filter_read_projection( + &self, + filter: &RowFilter, + budget: RowBudget, + ) -> Option { + if !self.should_use_post_filter_by_cost(budget) { + return None; + } + + Some(self.post_filter_projection_roles(filter)?.read_projection) + } + + pub(super) fn post_filter_read_projection_for_filter( + &self, + filter: &RowFilter, + budget: RowBudget, + ) -> Option { + if !self.post_filter_context_supported(budget) { + return None; + } + + Some(self.post_filter_projection_roles(filter)?.read_projection) + } + + pub(super) fn should_start_with_post_filter( + &self, + filter: &RowFilter, + row_group_idx: usize, + budget: RowBudget, + ) -> bool { + if !self.post_filter_context_supported(budget) { + return false; + } + + let Some(projections) = self.post_filter_projection_roles(filter) else { + return false; + }; + + self.should_start_with_post_filter_for_unprojected_variable_width_predicate( + &projections, + row_group_idx, + ) || self.should_start_with_post_filter_for_cheap_fixed_width_read( + filter, + &projections, + row_group_idx, + ) + } + + fn should_start_with_post_filter_for_unprojected_variable_width_predicate( + &self, + projections: &PostFilterProjectionRoles, + row_group_idx: usize, + ) -> bool { + !projections.predicate_already_projected + && self.projection_has_variable_width_leaf( + row_group_idx, + &projections.predicate_projection, + ) + } + + fn should_start_with_post_filter_for_cheap_fixed_width_read( + &self, + filter: &RowFilter, + projections: &PostFilterProjectionRoles, + row_group_idx: usize, + ) -> bool { + // If predicate columns are already in the output projection, pushdown + // cannot save a deferred output read for those columns. For cheap + // fixed-width reads, starting directly with post-filter avoids building + // a row selection just to decode the same values again. 
+ // + // Do not apply this to deferred output columns: sparse predicates can + // still win by reading only a handful of output values. + if !projections.predicate_already_projected { + return false; + } + + // Cacheable predicate columns need one pushdown row group to reveal + // whether selection is sparse. Starting post-filter here bypasses the + // predicate cache before the adaptive model can observe that shape. + if self.has_cacheable_projected_predicate(filter) { + return false; + } + + let row_group = self.metadata.row_group(row_group_idx); + if row_group.num_rows() == 0 { + return false; + } + + let mut projected_uncompressed_bytes = 0u64; + for leaf_idx in 0..row_group.num_columns() { + if !projections.read_projection.leaf_included(leaf_idx) { + continue; + } + + let column = row_group.column(leaf_idx); + if column.column_type() == PhysicalType::BYTE_ARRAY { + return false; + } + projected_uncompressed_bytes += column.uncompressed_size().max(0) as u64; + } + + projected_uncompressed_bytes as f64 / row_group.num_rows() as f64 + <= Self::CHEAP_FIXED_WIDTH_READ_BYTES_PER_ROW + } + + fn has_cacheable_projected_predicate(&self, filter: &RowFilter) -> bool { + let Some(cache_projection) = self.compute_cache_projection_inner(filter) else { + return false; + }; + + let schema = self.metadata.file_metadata().schema_descr(); + (0..schema.num_columns()).any(|leaf_idx| cache_projection.leaf_included(leaf_idx)) + } + + fn post_filter_projection_roles( + &self, + filter: &RowFilter, + ) -> Option { + // Post-filter execution decodes each row once, so it needs both: + // + // * output columns, which will be returned to the caller + // * predicate columns, which are needed to evaluate the RowFilter + // + // The final reader projects back to the original output projection + // after predicate evaluation. 
+ let predicate_projection = filter.union_projection()?; + let mut read_projection = self.projection.clone(); + read_projection.union(&predicate_projection); + + if !self.post_filter_supports_batch_projection(&self.projection) { + return None; + } + + // The combined read projection may be whole-root even when an individual + // predicate asks for one nested child that is completed by the output + // projection. Check every batch projection that `PostFilterState` will + // materialize, not only their union. + if !filter + .predicates() + .iter() + .all(|predicate| self.post_filter_supports_batch_projection(predicate.projection())) + { + return None; + } + + if !self.post_filter_supports_batch_projection(&read_projection) { + return None; + } + + let predicate_already_projected = + self.projection_includes_all(&self.projection, &predicate_projection); + + Some(PostFilterProjectionRoles { + predicate_projection, + read_projection, + predicate_already_projected, + }) + } + + fn post_filter_supports_batch_projection(&self, projection: &ProjectionMask) -> bool { + // Post-filter projects decoded record batches by top-level Arrow field + // index. A nested root is safe when it is selected as a whole root: + // the decoded batch then contains exactly one top-level field for that + // root and can be projected without recursively trimming children. + // + // Partial nested projections, such as `struct.a` without `struct.b`, + // still need recursive array projection and remain on the pushdown path. 
+ let schema = self.metadata.file_metadata().schema_descr(); + projection.selects_whole_root_columns(schema) + } + + fn projection_has_variable_width_leaf( + &self, + row_group_idx: usize, + projection: &ProjectionMask, + ) -> bool { + let row_group = self.metadata.row_group(row_group_idx); + (0..row_group.num_columns()).any(|leaf_idx| { + projection.leaf_included(leaf_idx) + && row_group.column(leaf_idx).column_type() == PhysicalType::BYTE_ARRAY + }) + } + + fn projection_includes_all(&self, projection: &ProjectionMask, other: &ProjectionMask) -> bool { + let schema = self.metadata.file_metadata().schema_descr(); + (0..schema.num_columns()) + .all(|leaf_idx| !other.leaf_included(leaf_idx) || projection.leaf_included(leaf_idx)) + } + + pub(super) fn observe_cost_model_candidate( + &mut self, + decision: RowSelectionStrategyDecision, + row_group_idx: usize, + row_count: usize, + budget: RowBudget, + ) { + if !matches!(self.row_selection_policy, RowSelectionPolicy::Auto { .. }) { + return; + } + + let observation = { + let RowGroupCostModelState::Observing { observation } = &mut self.cost_model_state + else { + return; + }; + + let mut shape = decision.shape; + if shape.total_rows() == 0 { + // `None` selection means the predicate kept the whole row group. + // Represent it as one selected run so the cost model can + // treat "no pruning" as an observed high-selectivity case. 
+ shape = RowSelectionShape { + selected_rows: row_count, + skipped_rows: 0, + selector_count: 1, + selected_run_count: 1, + skipped_run_count: 0, + }; + } + + observation.observed_row_groups += 1; + observation.shape.add_assign(shape); + *observation + }; + self.metrics.record_cost_model_observed_row_group(); + + let reason = self.cost_model_reason_with_projection_context(observation, row_group_idx); + if matches!(reason, CostModelDecisionReason::ObservationIncomplete) { + self.metrics.record_cost_model_trigger(reason); + return; + } + + let prefers_post_filter = observation.prefers_post_filter() + || matches!( + reason, + CostModelDecisionReason::ProjectedPredicateModerateSelectivity + ); + self.metrics.record_cost_model_trigger(reason); + + if prefers_post_filter && self.post_filter_cost_model_supported(budget) { + self.cost_model_state = RowGroupCostModelState::UsePostFilter; + } else { + self.cost_model_state = RowGroupCostModelState::UsePushdown; + } + } + + fn cost_model_reason_with_projection_context( + &self, + observation: CostModelObservation, + row_group_idx: usize, + ) -> CostModelDecisionReason { + let reason = observation.trigger_reason(); + if !matches!(reason, CostModelDecisionReason::PushdownStillPreferred) { + return reason; + } + + let Some(filter) = self.filter.as_ref() else { + return reason; + }; + let Some(predicate_projection) = filter.union_projection() else { + return reason; + }; + + let selected_ratio = observation.shape.selected_ratio(); + // Projected predicates can reuse decoded predicate values, but sparse + // or clustered filters can still win with page pruning. Keep this + // shortcut to moderate selectivity before switching to post-filter. + // + // A TPC-DS Q2-shaped projected predicate plus one deferred fixed-width + // output column still favors post-filter once selectivity is moderate: + // the saved output decode is smaller than the row-selection and cache + // overhead. Sparse projected predicates stay below this range. 
+ if self.projection_includes_all(&self.projection, &predicate_projection) + && self + .projected_predicate_deferred_output_is_cheap(row_group_idx, &predicate_projection) + && (CostModelObservation::PROJECTED_PREDICATE_MIN_RATIO + ..CostModelObservation::PROJECTED_PREDICATE_MAX_RATIO) + .contains(&selected_ratio) + { + CostModelDecisionReason::ProjectedPredicateModerateSelectivity + } else { + reason + } + } + + fn projected_predicate_deferred_output_is_cheap( + &self, + row_group_idx: usize, + predicate_projection: &ProjectionMask, + ) -> bool { + let row_group = self.metadata.row_group(row_group_idx); + if row_group.num_rows() == 0 { + return true; + } + + let mut deferred_uncompressed_bytes = 0u64; + let mut has_deferred_output = false; + for leaf_idx in 0..row_group.num_columns() { + if !self.projection.leaf_included(leaf_idx) + || predicate_projection.leaf_included(leaf_idx) + { + continue; + } + + has_deferred_output = true; + let column = row_group.column(leaf_idx); + if column.column_type() == PhysicalType::BYTE_ARRAY { + return false; + } + deferred_uncompressed_bytes += column.uncompressed_size().max(0) as u64; + } + + !has_deferred_output + || deferred_uncompressed_bytes as f64 / row_group.num_rows() as f64 + <= Self::CHEAP_FIXED_WIDTH_READ_BYTES_PER_ROW + } + + pub(super) fn post_filter_cost_model_supported(&self, budget: RowBudget) -> bool { + let Some(filter) = self.filter.as_ref() else { + return false; + }; + self.post_filter_supports_filter(filter, budget) + } + + fn post_filter_supports_filter(&self, filter: &RowFilter, budget: RowBudget) -> bool { + self.post_filter_context_supported(budget) + && self.post_filter_projection_roles(filter).is_some() + } + + fn has_virtual_columns(&self) -> bool { + self.fields + .as_deref() + .is_some_and(parquet_field_has_virtual_columns) + } +} + +fn parquet_field_has_virtual_columns(field: &ParquetField) -> bool { + match &field.field_type { + ParquetFieldType::Primitive { .. 
} => false, + ParquetFieldType::Group { children } => { + children.iter().any(parquet_field_has_virtual_columns) + } + ParquetFieldType::Virtual(_) => true, + } +} diff --git a/parquet/src/arrow/push_decoder/reader_builder/data.rs b/parquet/src/arrow/push_decoder/reader_builder/data.rs index 6fbc2090b06e..04b048ac5763 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/data.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/data.rs @@ -110,6 +110,18 @@ impl DataRequest { Ok(in_memory_row_group) } + + /// Return previously loaded column chunks if they are all dense. + /// + /// Sparse chunks may only contain pages for the predicate selection and are + /// unsafe to reuse for a post-filter read over the base selection. + pub fn into_dense_column_chunks(self) -> Option>>> { + self.column_chunks + .iter() + .flatten() + .all(|chunk| matches!(chunk.as_ref(), ColumnChunkData::Dense { .. })) + .then_some(self.column_chunks) + } } /// Builder for [`DataRequest`] diff --git a/parquet/src/arrow/push_decoder/reader_builder/mod.rs b/parquet/src/arrow/push_decoder/reader_builder/mod.rs index dacf1a2caad9..2d3c313ce4f9 100644 --- a/parquet/src/arrow/push_decoder/reader_builder/mod.rs +++ b/parquet/src/arrow/push_decoder/reader_builder/mod.rs @@ -15,20 +15,26 @@ // specific language governing permissions and limitations // under the License. 
+mod cost_model; mod data; mod filter; +mod selection_policy; use crate::arrow::ProjectionMask; -use crate::arrow::array_reader::{ArrayReaderBuilder, CacheOptions, RowGroupCache}; -use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; -use crate::arrow::arrow_reader::selection::RowSelectionStrategy; +use crate::arrow::array_reader::{ArrayReader, ArrayReaderBuilder, CacheOptions, RowGroupCache}; +use crate::arrow::arrow_reader::metrics::{ArrowReaderMetrics, ArrowReaderPhase}; +use crate::arrow::arrow_reader::selection::RowGroupExecutionMode; use crate::arrow::arrow_reader::{ ParquetRecordBatchReader, PredicateOptions, ReadPlanBuilder, RowFilter, RowSelection, - RowSelectionPolicy, + RowSelectionPolicy, RowSelector, }; -use crate::arrow::in_memory_row_group::ColumnChunkData; +use crate::arrow::in_memory_row_group::{ColumnChunkData, InMemoryRowGroup}; +use crate::arrow::push_decoder::reader_builder::cost_model::RowGroupCostModelState; use crate::arrow::push_decoder::reader_builder::data::DataRequestBuilder; use crate::arrow::push_decoder::reader_builder::filter::CacheInfo; +use crate::arrow::push_decoder::reader_builder::selection_policy::{ + ExpensiveOutputProfile, resolve_selection_policy_for_expensive_output, +}; use crate::arrow::schema::ParquetField; use crate::errors::ParquetError; use crate::file::metadata::ParquetMetaData; @@ -39,7 +45,7 @@ use data::DataRequest; use filter::AdvanceResult; use filter::FilterInfo; use std::ops::Range; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Mutex, RwLock}; /// The current row group being read, its read plan, and its offset/limit budget. #[derive(Debug)] @@ -47,10 +53,54 @@ struct RowGroupInfo { row_group_idx: usize, row_count: usize, plan_builder: ReadPlanBuilder, + base_selection: Option, budget: RowBudget, } +enum CostModelTransition { + ContinuePushdown, + /// The current row group already evaluated predicates and produced a + /// selection, but Auto now prefers post-filter for this scan shape. 
Decode + /// the current row group's output once and apply the existing selection + /// after decode instead of evaluating predicates a second time. + StartPostSelection { + selection: RowSelection, + }, +} + +enum FilterExecutionPlan { + /// No predicate work remains for this row group; proceed to output planning. + ReadOutput, + /// Decode the union of output and predicate columns once, then evaluate + /// predicates on decoded batches. + PostFilter { filter: Arc> }, + /// Decode predicate columns first, build a RowSelection, then read output. + Pushdown { filter_info: FilterInfo }, +} + /// This is the inner state machine for reading a single row group. +/// +/// The top-level flow is: +/// +/// ```text +/// Start +/// +-- no filter / no predicates ----------------------> StartData +/// +-- Auto chooses post-filter ------------------------> WaitingOnPostFilterData +/// +-- predicate pushdown ------------------------------> Filters +/// +/// Filters -> WaitingOnFilterData -> Filters | StartData +/// +/// StartData +/// +-- no rows after selection/limit -------------------> Finished +/// +-- output data needed ------------------------------> WaitingOnData +/// +/// WaitingOnData +/// +-- Auto switches current row group to post-selection > WaitingOnPostSelectionData +/// +-- output reader ready -----------------------------> Finished +/// ``` +/// +/// Each state arm delegates to a `transition_*` method so the dispatch table +/// remains readable before diving into the details for each phase. #[derive(Debug)] enum RowGroupDecoderState { Start { @@ -77,6 +127,21 @@ enum RowGroupDecoderState { /// Any cached filter results cache_info: Option, }, + /// Needs data to read the row group once and apply the filter after decode. + WaitingOnPostFilterData { + row_group_info: RowGroupInfo, + data_request: DataRequest, + read_projection: ProjectionMask, + filter: Arc>, + }, + /// Needs data to read the row group once and apply an already-computed + /// selection after decode. 
+ WaitingOnPostSelectionData { + row_group_info: RowGroupInfo, + data_request: DataRequest, + selection: RowSelection, + cache_info: Option, + }, /// Needs data to proceed with reading the output WaitingOnData { row_group_info: RowGroupInfo, @@ -104,6 +169,10 @@ impl RowBudget { matches!(self.limit, Some(0)) } + pub(crate) fn is_unbounded(self) -> bool { + self.offset.is_none() && self.limit.is_none() + } + /// The offset still to be skipped before the next readable row group. pub(crate) fn offset(self) -> Option { self.offset @@ -193,7 +262,7 @@ pub(crate) enum RowGroupBuildResult { NeedsData(Vec>), /// The active row group produced a reader. Data { - batch_reader: ParquetRecordBatchReader, + batch_reader: Box, /// Budget remaining after applying this row group's selection. remaining_budget: RowBudget, }, @@ -252,6 +321,9 @@ pub(crate) struct RowGroupReaderBuilder { /// Optional filter filter: Option, + /// Predicate state reused by later row groups once Auto chooses post-filter. + post_filter: Option>>, + /// The size in bytes of the predicate cache to use /// /// See [`RowGroupCache`] for details. @@ -263,6 +335,12 @@ pub(crate) struct RowGroupReaderBuilder { /// Strategy for materialising row selections row_selection_policy: RowSelectionPolicy, + /// Cost-model state used by Auto policy; observed on early row groups and applied to later ones. + cost_model_state: RowGroupCostModelState, + + /// Whether this builder may switch Auto policy to post-filter by cost. + post_filter_cost_model_enabled: bool, + /// Current state of the decoder. 
/// /// It is taken when processing, and must be put back before returning @@ -312,9 +390,12 @@ impl RowGroupReaderBuilder { metadata, fields, filter, + post_filter: None, metrics, max_predicate_cache_size, row_selection_policy, + cost_model_state: RowGroupCostModelState::default(), + post_filter_cost_model_enabled: true, state: Some(RowGroupDecoderState::Finished), buffers, } @@ -332,9 +413,12 @@ impl RowGroupReaderBuilder { metadata: _, fields, filter, + post_filter: _, max_predicate_cache_size, metrics, row_selection_policy, + cost_model_state: _, + post_filter_cost_model_enabled: _, state: _, buffers, } = self; @@ -373,6 +457,12 @@ impl RowGroupReaderBuilder { self.buffers.clear_all_ranges(); } + /// Disable post-filter cost modeling for APIs that hand row-group readers back to + /// callers before they are consumed. + pub(crate) fn disable_post_filter_cost_model(&mut self) { + self.post_filter_cost_model_enabled = false; + } + /// take the current state, leaving None in its place. /// /// Returns an error if there the state wasn't put back after the previous @@ -408,13 +498,14 @@ impl RowGroupReaderBuilder { ))); } let plan_builder = ReadPlanBuilder::new(self.batch_size) - .with_selection(selection) + .with_selection(selection.clone()) .with_row_selection_policy(self.row_selection_policy); let row_group_info = RowGroupInfo { row_group_idx, row_count, plan_builder, + base_selection: selection, budget, }; @@ -467,354 +558,779 @@ impl RowGroupReaderBuilder { &mut self, current_state: RowGroupDecoderState, ) -> Result { - let result = match current_state { - RowGroupDecoderState::Start { row_group_info } => { - debug_assert!( - !row_group_info.budget.is_exhausted(), - "RowGroupFrontier should not hand off row groups after the output limit is exhausted" - ); + match current_state { + RowGroupDecoderState::Start { row_group_info } => self.transition_start(row_group_info), + RowGroupDecoderState::Filters { + row_group_info, + column_chunks, + filter_info, + } => 
self.transition_filters(row_group_info, column_chunks, filter_info), + RowGroupDecoderState::WaitingOnFilterData { + row_group_info, + data_request, + filter_info, + } => self.transition_waiting_on_filter_data(row_group_info, data_request, filter_info), + RowGroupDecoderState::StartData { + row_group_info, + column_chunks, + cache_info, + } => self.transition_start_data(row_group_info, column_chunks, cache_info), + RowGroupDecoderState::WaitingOnPostFilterData { + row_group_info, + data_request, + read_projection, + filter, + } => self.transition_waiting_on_post_filter_data( + row_group_info, + data_request, + read_projection, + filter, + ), + RowGroupDecoderState::WaitingOnPostSelectionData { + row_group_info, + data_request, + selection, + cache_info, + } => self.transition_waiting_on_post_selection_data( + row_group_info, + data_request, + selection, + cache_info, + ), + RowGroupDecoderState::WaitingOnData { + row_group_info, + data_request, + cache_info, + } => self.transition_waiting_on_data(row_group_info, data_request, cache_info), + RowGroupDecoderState::Finished => Err(ParquetError::General(String::from( + "Internal Error: try_build called without an active row group", + ))), + } + } - let column_chunks = None; // no prior column chunks + fn transition_start( + &mut self, + row_group_info: RowGroupInfo, + ) -> Result { + debug_assert!( + !row_group_info.budget.is_exhausted(), + "RowGroupFrontier should not hand off row groups after the output limit is exhausted" + ); - let Some(filter) = self.filter.take() else { - // no filter, start trying to read data immediately - return Ok(NextState::again(RowGroupDecoderState::StartData { - row_group_info, - column_chunks, - cache_info: None, - })); - }; - // no predicates in filter, so start reading immediately - if filter.predicates.is_empty() { - return Ok(NextState::again(RowGroupDecoderState::StartData { - row_group_info, - column_chunks, - cache_info: None, - })); + let column_chunks = None; + + if let 
Some(filter) = self.post_filter.as_ref().cloned() { + return self.start_post_filter(row_group_info, filter); + } + + let Some(filter) = self.filter.take() else { + return Ok(NextState::again(RowGroupDecoderState::StartData { + row_group_info, + column_chunks, + cache_info: None, + })); + }; + + match self.plan_filter_execution(&row_group_info, filter) { + FilterExecutionPlan::ReadOutput => { + Ok(NextState::again(RowGroupDecoderState::StartData { + row_group_info, + column_chunks, + cache_info: None, + })) + } + FilterExecutionPlan::PostFilter { filter } => { + self.start_post_filter(row_group_info, filter) + } + FilterExecutionPlan::Pushdown { filter_info } => { + Ok(NextState::again(RowGroupDecoderState::Filters { + row_group_info, + filter_info, + column_chunks, + })) + } + } + } + + fn plan_filter_execution( + &mut self, + row_group_info: &RowGroupInfo, + filter: RowFilter, + ) -> FilterExecutionPlan { + if filter.predicates.is_empty() { + return FilterExecutionPlan::ReadOutput; + } + + if self.should_start_with_post_filter( + &filter, + row_group_info.row_group_idx, + row_group_info.budget, + ) { + return FilterExecutionPlan::PostFilter { + filter: self.install_post_filter(filter), + }; + } + + if self.should_use_post_filter_by_cost(row_group_info.budget) { + if self + .post_filter_read_projection(&filter, row_group_info.budget) + .is_some() + { + return FilterExecutionPlan::PostFilter { + filter: self.install_post_filter(filter), }; + } + + self.cost_model_state = RowGroupCostModelState::UsePushdown; + } - // we have predicates to evaluate - let cache_projection = - self.compute_cache_projection(row_group_info.row_group_idx, &filter); + let cache_projection = self.compute_cache_projection(row_group_info.row_group_idx, &filter); + let cache_info = CacheInfo::new( + cache_projection, + Arc::new(RwLock::new(RowGroupCache::new( + self.batch_size, + self.max_predicate_cache_size, + ))), + ); + let filter_info = FilterInfo::new(filter, cache_info); + 
FilterExecutionPlan::Pushdown { filter_info } + } + + fn install_post_filter(&mut self, filter: RowFilter) -> Arc> { + let filter = Arc::new(Mutex::new(filter)); + self.post_filter = Some(Arc::clone(&filter)); + filter + } + + fn transition_filters( + &mut self, + row_group_info: RowGroupInfo, + column_chunks: Option>>>, + filter_info: FilterInfo, + ) -> Result { + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget, + } = row_group_info; + + if !plan_builder.selects_any() { + self.filter = Some(filter_info.into_filter()); + return Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Finished { + remaining_budget: budget, + }, + )); + } - let cache_info = CacheInfo::new( - cache_projection, - Arc::new(RwLock::new(RowGroupCache::new( + let predicate = filter_info.current(); + let data_request = + self.metrics + .time_phase(ArrowReaderPhase::PredicateRangePlanning, || { + DataRequestBuilder::new( + row_group_idx, + row_count, self.batch_size, - self.max_predicate_cache_size, - ))), - ); + &self.metadata, + predicate.projection(), + ) + .with_selection(plan_builder.selection()) + .with_cache_projection(Some(filter_info.cache_projection())) + .with_column_chunks(column_chunks) + .build() + }); + + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget, + }; + + Ok(NextState::again( + RowGroupDecoderState::WaitingOnFilterData { + row_group_info, + filter_info, + data_request, + }, + )) + } - let filter_info = FilterInfo::new(filter, cache_info); + fn transition_waiting_on_filter_data( + &mut self, + row_group_info: RowGroupInfo, + data_request: DataRequest, + mut filter_info: FilterInfo, + ) -> Result { + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnFilterData { + row_group_info, + filter_info, + data_request, + }, + 
RowGroupBuildResult::NeedsData(needed_ranges), + )); + } + + let RowGroupInfo { + row_group_idx, + row_count, + mut plan_builder, + base_selection, + budget, + } = row_group_info; + + let predicate = filter_info.current(); + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + predicate.projection(), + &mut self.buffers, + )?; + + let cache_options = filter_info.cache_builder().producer(); + let array_reader = ArrayReaderBuilder::new(&row_group, &self.metrics) + .with_batch_size(self.batch_size) + .with_cache_options(Some(&cache_options)) + .with_parquet_metadata(&self.metadata) + .build_array_reader(self.fields.as_deref(), predicate.projection())?; + + plan_builder = self.resolve_output_selection_policy( + plan_builder, + predicate.projection(), + row_group_idx, + row_count, + ); + + let predicate_limit = filter_info + .is_last() + .then(|| budget.selected_row_limit()) + .flatten(); + let mut predicate_options = PredicateOptions::new(array_reader, filter_info.current_mut()) + .with_metrics(self.metrics.clone()); + if let Some(limit) = predicate_limit { + predicate_options = predicate_options.with_limit(limit, row_count); + } + plan_builder = plan_builder.with_predicate_options(predicate_options)?; + + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget, + }; + let column_chunks = Some(row_group.column_chunks); + + Ok(match filter_info.advance() { + AdvanceResult::Continue(filter_info) => { NextState::again(RowGroupDecoderState::Filters { row_group_info, + column_chunks, filter_info, + }) + } + AdvanceResult::Done(filter, cache_info) => { + assert!(self.filter.is_none()); + self.filter = Some(filter); + NextState::again(RowGroupDecoderState::StartData { + row_group_info, column_chunks, + cache_info: Some(cache_info), }) } - // need to evaluate filters - RowGroupDecoderState::Filters { - row_group_info, - column_chunks, - filter_info, - } => { - let 
RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - budget, - } = row_group_info; - - // If nothing is selected, we are done with this row group - if !plan_builder.selects_any() { - // ruled out entire row group - self.filter = Some(filter_info.into_filter()); - return Ok(NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Finished { - remaining_budget: budget, - }, - )); - } + }) + } + + fn transition_start_data( + &mut self, + row_group_info: RowGroupInfo, + column_chunks: Option>>>, + cache_info: Option, + ) -> Result { + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget, + } = row_group_info; - // Make a request for the data needed to evaluate the current predicate - let predicate = filter_info.current(); + let BudgetedReadPlan { + mut plan_builder, + rows_before_budget, + rows_after_budget, + remaining_budget, + } = budget.apply_to_plan(plan_builder, row_count); + + if rows_before_budget == 0 || rows_after_budget == 0 { + return Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Finished { remaining_budget }, + )); + } - // need to fetch pages the column needs for decoding, figure - // that out based on the current selection and projection - let data_request = DataRequestBuilder::new( + let data_request = self + .metrics + .time_phase(ArrowReaderPhase::OutputRangePlanning, || { + DataRequestBuilder::new( row_group_idx, row_count, self.batch_size, &self.metadata, - predicate.projection(), // use the predicate's projection + &self.projection, ) .with_selection(plan_builder.selection()) - // Fetch predicate columns; expand selection only for cached predicate columns - .with_cache_projection(Some(filter_info.cache_projection())) .with_column_chunks(column_chunks) - .build(); - - let row_group_info = RowGroupInfo { + // Final projection fetch shouldn't expand selection for cache + // so don't call with_cache_projection here. 
+ .build() + }); + + plan_builder = self + .metrics + .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { + self.resolve_output_selection_policy( + plan_builder, + &self.projection, row_group_idx, row_count, - plan_builder, - budget, - }; + ) + }); - NextState::again(RowGroupDecoderState::WaitingOnFilterData { + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection, + budget: remaining_budget, + }; + + Ok(NextState::again(RowGroupDecoderState::WaitingOnData { + row_group_info, + data_request, + cache_info, + })) + } + + fn transition_waiting_on_post_filter_data( + &mut self, + row_group_info: RowGroupInfo, + data_request: DataRequest, + read_projection: ProjectionMask, + filter: Arc>, + ) -> Result { + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnPostFilterData { row_group_info, - filter_info, data_request, - }) - } - RowGroupDecoderState::WaitingOnFilterData { - row_group_info, - data_request, - mut filter_info, - } => { - // figure out what ranges we still need - let needed_ranges = data_request.needed_ranges(&self.buffers); - if !needed_ranges.is_empty() { - // still need data - return Ok(NextState::result( - RowGroupDecoderState::WaitingOnFilterData { - row_group_info, - filter_info, - data_request, - }, - RowGroupBuildResult::NeedsData(needed_ranges), - )); - } + read_projection, + filter, + }, + RowGroupBuildResult::NeedsData(needed_ranges), + )); + } - // otherwise we have all the data we need to evaluate the predicate - let RowGroupInfo { - row_group_idx, - row_count, - mut plan_builder, - budget, - } = row_group_info; + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: _, + budget, + } = row_group_info; - let predicate = filter_info.current(); + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + 
&read_projection, + &mut self.buffers, + )?; + let plan = plan_builder.build_with_metrics(&self.metrics); + let array_reader = ArrayReaderBuilder::new(&row_group, &self.metrics) + .with_batch_size(self.batch_size) + .with_parquet_metadata(&self.metadata) + .build_array_reader(self.fields.as_deref(), &read_projection)?; + let reader = ParquetRecordBatchReader::new_post_filter( + array_reader, + plan, + filter, + self.metadata.file_metadata().schema_descr(), + &read_projection, + &self.projection, + self.metrics.clone(), + )?; + + self.metrics + .record_cost_model_row_group(RowGroupExecutionMode::PostFilter); + Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Data { + batch_reader: Box::new(reader), + remaining_budget: budget, + }, + )) + } - let row_group = data_request.try_into_in_memory_row_group( - row_group_idx, - row_count, - &self.metadata, - predicate.projection(), - &mut self.buffers, - )?; - - let cache_options = filter_info.cache_builder().producer(); - - let array_reader = ArrayReaderBuilder::new(&row_group, &self.metrics) - .with_batch_size(self.batch_size) - .with_cache_options(Some(&cache_options)) - .with_parquet_metadata(&self.metadata) - .build_array_reader(self.fields.as_deref(), predicate.projection())?; - - // Reset to original policy before each predicate so the override - // can detect page skipping for THIS predicate's columns. - // Without this reset, a prior predicate's override (e.g. Mask) - // carries forward and the check returns early, missing unfetched - // pages for subsequent predicates. - plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - - // Prepare to evaluate the filter. 
- // Note: first update the selection strategy to properly handle any pages - // pruned during fetch - plan_builder = override_selector_strategy_if_needed( - plan_builder, - predicate.projection(), - self.row_group_offset_index(row_group_idx), + fn transition_waiting_on_post_selection_data( + &mut self, + row_group_info: RowGroupInfo, + data_request: DataRequest, + selection: RowSelection, + cache_info: Option, + ) -> Result { + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnPostSelectionData { + row_group_info, + data_request, + selection, + cache_info, + }, + RowGroupBuildResult::NeedsData(needed_ranges), + )); + } + + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: _, + budget, + } = row_group_info; + + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + &self.projection, + &mut self.buffers, + )?; + let plan = plan_builder.build_with_metrics(&self.metrics); + let array_reader = self.build_projection_reader(&row_group, cache_info.as_ref())?; + let reader = ParquetRecordBatchReader::new_post_selection_filter( + array_reader, + plan, + selection, + self.metrics.clone(), + ); + + self.metrics + .record_cost_model_row_group(RowGroupExecutionMode::PostFilter); + Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Data { + batch_reader: Box::new(reader), + remaining_budget: budget, + }, + )) + } + + fn transition_waiting_on_data( + &mut self, + row_group_info: RowGroupInfo, + data_request: DataRequest, + cache_info: Option, + ) -> Result { + match self.resolve_cost_model_transition(&row_group_info, cache_info.as_ref())? 
{ + CostModelTransition::ContinuePushdown => {} + CostModelTransition::StartPostSelection { selection } => { + let column_chunks = data_request.into_dense_column_chunks(); + return self.start_post_selection_filter( + row_group_info, + selection, + cache_info, + column_chunks, ); + } + } - // When this is the final predicate in the chain and an output - // limit is set, tell the filter evaluation to stop once enough - // matching rows have been accumulated. - let predicate_limit = filter_info - .is_last() - .then(|| budget.selected_row_limit()) - .flatten(); - - // Evaluate the filter via `with_predicate_options`, opting into - // early termination when this is the final predicate and an - // output limit was set. - let mut predicate_options = - PredicateOptions::new(array_reader, filter_info.current_mut()); - if let Some(limit) = predicate_limit { - predicate_options = predicate_options.with_limit(limit, row_count); - } - plan_builder = plan_builder.with_predicate_options(predicate_options)?; + let needed_ranges = data_request.needed_ranges(&self.buffers); + if !needed_ranges.is_empty() { + return Ok(NextState::result( + RowGroupDecoderState::WaitingOnData { + row_group_info, + data_request, + cache_info, + }, + RowGroupBuildResult::NeedsData(needed_ranges), + )); + } - let row_group_info = RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - budget, - }; + let RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: _, + budget, + } = row_group_info; - // Take back the column chunks that were read - let column_chunks = Some(row_group.column_chunks); - - // advance to the next predicate, if any - match filter_info.advance() { - AdvanceResult::Continue(filter_info) => { - NextState::again(RowGroupDecoderState::Filters { - row_group_info, - column_chunks, - filter_info, - }) - } - // done with predicates, proceed to reading data - AdvanceResult::Done(filter, cache_info) => { - // remember we need to put back the filter - 
assert!(self.filter.is_none()); - self.filter = Some(filter); - NextState::again(RowGroupDecoderState::StartData { - row_group_info, - column_chunks, - cache_info: Some(cache_info), - }) - } - } + let row_group = data_request.try_into_in_memory_row_group( + row_group_idx, + row_count, + &self.metadata, + &self.projection, + &mut self.buffers, + )?; + let plan = plan_builder.build_with_metrics(&self.metrics); + let array_reader = self.build_projection_reader(&row_group, cache_info.as_ref())?; + let reader = + ParquetRecordBatchReader::new_with_metrics(array_reader, plan, self.metrics.clone()); + + Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Data { + batch_reader: Box::new(reader), + remaining_budget: budget, + }, + )) + } + + fn resolve_cost_model_transition( + &mut self, + row_group_info: &RowGroupInfo, + cache_info: Option<&CacheInfo>, + ) -> Result { + if cache_info.is_none() + || !matches!( + self.cost_model_state, + RowGroupCostModelState::Observing { .. 
} + ) + || !self.post_filter_cost_model_supported(row_group_info.budget) + { + return Ok(CostModelTransition::ContinuePushdown); + } + + let decision = row_group_info + .plan_builder + .resolve_selection_strategy_decision(); + let observed_selection = row_group_info.plan_builder.selection().cloned(); + + self.observe_cost_model_candidate( + decision, + row_group_info.row_group_idx, + row_group_info.row_count, + row_group_info.budget, + ); + + if matches!(self.cost_model_state, RowGroupCostModelState::UsePostFilter) { + if row_group_info.base_selection.is_none() { + let selection = observed_selection.unwrap_or_else(|| { + RowSelection::from(vec![RowSelector::select(row_group_info.row_count)]) + }); + return Ok(CostModelTransition::StartPostSelection { selection }); } - RowGroupDecoderState::StartData { - row_group_info, - column_chunks, - cache_info, - } => { - let RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - budget, - } = row_group_info; - - let BudgetedReadPlan { - mut plan_builder, - rows_before_budget, - rows_after_budget, - remaining_budget, - } = budget.apply_to_plan(plan_builder, row_count); - - if rows_before_budget == 0 { - // ruled out entire row group - return Ok(NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Finished { remaining_budget }, - )); - } - if rows_after_budget == 0 { - // no rows left after applying limit/offset - return Ok(NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Finished { remaining_budget }, - )); - } + self.ensure_post_filter_state()?; + self.metrics + .record_cost_model_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + // This row group was already planned with a base selection, so keep + // its current pushdown path. The state above enables post-filter + // execution for later row groups. 
+ return Ok(CostModelTransition::ContinuePushdown); + } - let data_request = DataRequestBuilder::new( + self.metrics + .record_cost_model_row_group(RowGroupExecutionMode::Pushdown(decision.strategy)); + Ok(CostModelTransition::ContinuePushdown) + } + + fn ensure_post_filter_state(&mut self) -> Result<(), ParquetError> { + if self.post_filter.is_some() { + return Ok(()); + } + + let filter = self.filter.take().ok_or_else(|| { + ParquetError::General( + "post-filter cost model selected without a row filter".to_string(), + ) + })?; + self.post_filter = Some(Arc::new(Mutex::new(filter))); + Ok(()) + } + + fn resolve_output_selection_policy( + &self, + plan_builder: ReadPlanBuilder, + projection: &ProjectionMask, + row_group_idx: usize, + row_count: usize, + ) -> ReadPlanBuilder { + resolve_selection_policy_for_expensive_output( + plan_builder.with_row_selection_policy(self.row_selection_policy), + projection, + self.row_group_offset_index(row_group_idx), + row_count, + ExpensiveOutputProfile::from_row_group( + self.metadata.row_group(row_group_idx), + projection, + row_count, + ), + ) + } + + fn build_projection_reader( + &self, + row_group: &InMemoryRowGroup<'_>, + cache_info: Option<&CacheInfo>, + ) -> Result, ParquetError> { + let array_reader_builder = ArrayReaderBuilder::new(row_group, &self.metrics) + .with_batch_size(self.batch_size) + .with_parquet_metadata(&self.metadata); + + if let Some(cache_info) = cache_info { + let cache_options: CacheOptions = cache_info.builder().consumer(); + array_reader_builder + .with_cache_options(Some(&cache_options)) + .build_array_reader(self.fields.as_deref(), &self.projection) + } else { + array_reader_builder.build_array_reader(self.fields.as_deref(), &self.projection) + } + } + + fn start_post_filter( + &mut self, + row_group_info: RowGroupInfo, + filter: Arc>, + ) -> Result { + let RowGroupInfo { + row_group_idx, + row_count, + base_selection, + budget, + .. 
+ } = row_group_info; + + let mut plan_builder = ReadPlanBuilder::new(self.batch_size) + .with_selection(base_selection) + .with_row_selection_policy(self.row_selection_policy); + + if !plan_builder.selects_any() { + return Ok(NextState::result( + RowGroupDecoderState::Finished, + RowGroupBuildResult::Finished { + remaining_budget: budget, + }, + )); + } + + let read_projection = { + let filter = filter.lock().map_err(|_| { + ParquetError::General("post-filter predicate state was poisoned".to_string()) + })?; + self.post_filter_read_projection_for_filter(&filter, budget) + .ok_or_else(|| { + ParquetError::General( + "post-filter cost model selected an unsupported projection".to_string(), + ) + })? + }; + + let data_request = self + .metrics + .time_phase(ArrowReaderPhase::OutputRangePlanning, || { + DataRequestBuilder::new( row_group_idx, row_count, self.batch_size, &self.metadata, - &self.projection, + &read_projection, ) .with_selection(plan_builder.selection()) - .with_column_chunks(column_chunks) - // Final projection fetch shouldn't expand selection for cache - // so don't call with_cache_projection here - .build(); - - plan_builder = plan_builder.with_row_selection_policy(self.row_selection_policy); - - plan_builder = override_selector_strategy_if_needed( - plan_builder, - &self.projection, - self.row_group_offset_index(row_group_idx), - ); + .build() + }); + + if plan_builder.selection().is_some() { + plan_builder = + self.metrics + .time_phase(ArrowReaderPhase::OutputSelectionResolve, || { + self.resolve_output_selection_policy( + plan_builder, + &read_projection, + row_group_idx, + row_count, + ) + }); + } - let row_group_info = RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - budget: remaining_budget, - }; + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: None, + budget, + }; - NextState::again(RowGroupDecoderState::WaitingOnData { - row_group_info, - data_request, - cache_info, - }) - } - 
// Waiting on data to proceed with reading the output - RowGroupDecoderState::WaitingOnData { + Ok(NextState::again( + RowGroupDecoderState::WaitingOnPostFilterData { row_group_info, data_request, - cache_info, - } => { - let needed_ranges = data_request.needed_ranges(&self.buffers); - if !needed_ranges.is_empty() { - // still need data - return Ok(NextState::result( - RowGroupDecoderState::WaitingOnData { - row_group_info, - data_request, - cache_info, - }, - RowGroupBuildResult::NeedsData(needed_ranges), - )); - } + read_projection, + filter, + }, + )) + } - // otherwise we have all the data we need to proceed - let RowGroupInfo { - row_group_idx, - row_count, - plan_builder, - budget, - } = row_group_info; + fn start_post_selection_filter( + &mut self, + row_group_info: RowGroupInfo, + selection: RowSelection, + cache_info: Option, + column_chunks: Option>>>, + ) -> Result { + let RowGroupInfo { + row_group_idx, + row_count, + base_selection, + budget, + .. + } = row_group_info; + + let plan_builder = ReadPlanBuilder::new(self.batch_size) + .with_selection(base_selection) + .with_row_selection_policy(self.row_selection_policy); - let row_group = data_request.try_into_in_memory_row_group( + let data_request = self + .metrics + .time_phase(ArrowReaderPhase::OutputRangePlanning, || { + DataRequestBuilder::new( row_group_idx, row_count, + self.batch_size, &self.metadata, &self.projection, - &mut self.buffers, - )?; - - let plan = plan_builder.build(); - - // if we have any cached results, connect them up - let array_reader_builder = ArrayReaderBuilder::new(&row_group, &self.metrics) - .with_batch_size(self.batch_size) - .with_parquet_metadata(&self.metadata); - let array_reader = if let Some(cache_info) = cache_info.as_ref() { - let cache_options: CacheOptions = cache_info.builder().consumer(); - array_reader_builder - .with_cache_options(Some(&cache_options)) - .build_array_reader(self.fields.as_deref(), &self.projection) - } else { - array_reader_builder - 
.build_array_reader(self.fields.as_deref(), &self.projection) - }?; - - let reader = ParquetRecordBatchReader::new(array_reader, plan); - NextState::result( - RowGroupDecoderState::Finished, - RowGroupBuildResult::Data { - batch_reader: reader, - remaining_budget: budget, - }, ) - } - RowGroupDecoderState::Finished => { - return Err(ParquetError::General(String::from( - "Internal Error: try_build called without an active row group", - ))); - } + .with_selection(plan_builder.selection()) + .with_column_chunks(column_chunks) + .build() + }); + + let row_group_info = RowGroupInfo { + row_group_idx, + row_count, + plan_builder, + base_selection: None, + budget, }; - Ok(result) + + Ok(NextState::again( + RowGroupDecoderState::WaitingOnPostSelectionData { + row_group_info, + data_request, + selection, + cache_info, + }, + )) } /// Which columns should be cached? @@ -857,66 +1373,14 @@ impl RowGroupReaderBuilder { } } -/// Override the selection strategy if needed. -/// -/// Some pages can be skipped during row-group construction if they are not read -/// by the selections. This means that the data pages for those rows are never -/// loaded and definition/repetition levels are never read. When using -/// `RowSelections` selection works because `skip_records()` handles this -/// case and skips the page accordingly. -/// -/// However, with the current mask design, all values must be read and decoded -/// and then a mask filter is applied. Thus if any pages are skipped during -/// row-group construction, the data pages are missing and cannot be decoded. -/// -/// A simple example: -/// * the page size is 2, the mask is 100001, row selection should be read(1) skip(4) read(1) -/// * the `ColumnChunkData` would be page1(10), page2(skipped), page3(01) -/// -/// Using the row selection to skip(4), page2 won't be read at all, so in this -/// case we can't decode all the rows and apply a mask. 
To correctly apply the -/// bit mask, we need all 6 values be read, but page2 is not in memory. -fn override_selector_strategy_if_needed( - plan_builder: ReadPlanBuilder, - projection_mask: &ProjectionMask, - offset_index: Option<&[OffsetIndexMetaData]>, -) -> ReadPlanBuilder { - // override only applies to Auto policy, If the policy is already Mask or Selectors, respect that - let RowSelectionPolicy::Auto { .. } = plan_builder.row_selection_policy() else { - return plan_builder; - }; - - let preferred_strategy = plan_builder.resolve_selection_strategy(); - - let force_selectors = matches!(preferred_strategy, RowSelectionStrategy::Mask) - && plan_builder.selection().is_some_and(|selection| { - selection.should_force_selectors(projection_mask, offset_index) - }); - - let resolved_strategy = if force_selectors { - RowSelectionStrategy::Selectors - } else { - preferred_strategy - }; - - // override the plan builder strategy with the resolved one - let new_policy = match resolved_strategy { - RowSelectionStrategy::Mask => RowSelectionPolicy::Mask, - RowSelectionStrategy::Selectors => RowSelectionPolicy::Selectors, - }; - - plan_builder.with_row_selection_policy(new_policy) -} - #[cfg(test)] mod tests { use super::*; - use crate::arrow::arrow_reader::{RowSelection, RowSelector}; #[test] // Verify that the size of RowGroupDecoderState does not grow too large fn test_structure_size() { - assert_eq!(std::mem::size_of::(), 232); + assert_eq!(std::mem::size_of::(), 288); } #[test] diff --git a/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs b/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs new file mode 100644 index 000000000000..f3092d3e9fd1 --- /dev/null +++ b/parquet/src/arrow/push_decoder/reader_builder/selection_policy.rs @@ -0,0 +1,599 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Row-selection policy resolution for push decoder read plans. +//! +//! This module is the final safety gate between the high-level +//! `RowSelectionPolicy` requested by the caller and the concrete cursor used by +//! the record batch reader. It handles two independent concerns: +//! +//! ```text +//! Caller policy Selection/page shape Resolved plan +//! ------------------------------------------------------------------------------- +//! Auto dense, short/fragmented runs Mask +//! Auto sparse page-loaded ranges Selectors +//! Auto expensive variable-width sparse output Selectors +//! Mask dense page-loaded ranges dense Mask +//! Mask sparse page-loaded ranges SparseMaskCursor +//! Selectors any shape Selectors +//! ``` +//! +//! The distinction between `Auto` and explicit `Mask` matters. `Auto` may +//! choose selectors to avoid a bad strategy. Explicit `Mask` must be honored, +//! so sparse page-loaded data is represented explicitly instead of being +//! silently converted to selectors. 
+ +use crate::arrow::ProjectionMask; +use crate::arrow::arrow_reader::selection::{ + LoadedRowRanges, RowSelectionShape, RowSelectionStrategy, +}; +use crate::arrow::arrow_reader::{ReadPlanBuilder, RowSelection, RowSelectionPolicy}; +use crate::basic::Type as PhysicalType; +use crate::file::metadata::RowGroupMetaData; +use crate::file::page_index::offset_index::OffsetIndexMetaData; +use std::ops::Range; + +#[cfg(test)] +pub(super) fn resolve_selection_policy_for_projection( + plan_builder: ReadPlanBuilder, + projection_mask: &ProjectionMask, + offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, +) -> ReadPlanBuilder { + resolve_selection_policy_for_expensive_output( + plan_builder, + projection_mask, + offset_index, + total_rows, + ExpensiveOutputProfile::default(), + ) +} + +pub(super) fn resolve_selection_policy_for_expensive_output( + plan_builder: ReadPlanBuilder, + projection_mask: &ProjectionMask, + offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, + output_profile: ExpensiveOutputProfile, +) -> ReadPlanBuilder { + // Page pruning can load only the pages that intersect selected rows. If the + // projected columns have sparse loaded ranges, a dense mask would try to + // decode rows for pages that are not present. Auto avoids that by choosing + // selectors; explicit Mask carries the sparse ranges to the reader. + let loaded = loaded_ranges_for_projection( + plan_builder.selection(), + projection_mask, + offset_index, + total_rows, + ); + let loaded_is_sparse = loaded.as_ref().is_some_and(LoadedRowRanges::is_sparse); + let sparse_loaded = loaded.filter(LoadedRowRanges::is_sparse); + + match plan_builder.row_selection_policy() { + RowSelectionPolicy::Auto { .. 
} => { + let decision = plan_builder.resolve_selection_strategy_decision(); + match decision.strategy { + RowSelectionStrategy::Mask + if loaded_is_sparse + || should_prefer_selectors_for_expensive_output( + decision.shape, + output_profile, + ) => + { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Selectors) + } + RowSelectionStrategy::Mask => { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Mask) + } + RowSelectionStrategy::Selectors => { + plan_builder.with_row_selection_policy(RowSelectionPolicy::Selectors) + } + } + } + RowSelectionPolicy::Mask => plan_builder.with_loaded_row_ranges(sparse_loaded), + RowSelectionPolicy::Selectors => plan_builder, + } +} + +#[derive(Clone, Copy, Debug, Default)] +pub(super) struct ExpensiveOutputProfile { + pub(super) variable_width_columns: usize, + pub(super) uncompressed_bytes_per_row: f64, +} + +impl ExpensiveOutputProfile { + pub(super) fn from_row_group( + row_group: &RowGroupMetaData, + projection_mask: &ProjectionMask, + total_rows: usize, + ) -> Self { + if total_rows == 0 { + return Self::default(); + } + + let mut variable_width_columns = 0; + let mut uncompressed_bytes = 0u64; + for leaf_idx in 0..row_group.num_columns() { + if !projection_mask.leaf_included(leaf_idx) { + continue; + } + + let column = row_group.column(leaf_idx); + if column.column_type() == PhysicalType::BYTE_ARRAY { + variable_width_columns += 1; + } + uncompressed_bytes += column.uncompressed_size().max(0) as u64; + } + + Self { + variable_width_columns, + uncompressed_bytes_per_row: uncompressed_bytes as f64 / total_rows as f64, + } + } +} + +fn should_prefer_selectors_for_expensive_output( + shape: RowSelectionShape, + output_profile: ExpensiveOutputProfile, +) -> bool { + // Sparse, low-selectivity output over variable-width columns can be worse + // with masks because masks decode and then filter many values that selectors + // can skip. 
This is intentionally narrow; most fragmented selections remain + // good candidates for masks. + let selected_ratio = shape.selected_ratio(); + output_profile.variable_width_columns > 0 + && output_profile.uncompressed_bytes_per_row >= 16.0 + && selected_ratio > 0.0 + && selected_ratio < 0.10 + && shape.average_selected_run_length() <= 4.0 +} + +#[cfg_attr(test, allow(dead_code))] +pub(super) fn loaded_ranges_for_projection( + selection: Option<&RowSelection>, + projection_mask: &ProjectionMask, + offset_index: Option<&[OffsetIndexMetaData]>, + total_rows: usize, +) -> Option { + // Loaded ranges are row ranges backed by page data for all projected + // columns. When projections include multiple columns, a row is safe for + // sparse-mask decoding only if every projected column loaded the page that + // contains that row. Therefore projected-column ranges are intersected. + // + // ```text + // column A pages loaded: [0..50) [80..100) + // column B pages loaded: [20..70) [80..100) + // usable loaded ranges: [20..50) [80..100) + // ``` + let selection = selection?; + let columns = offset_index?; + let mut ranges: Option>> = None; + + for (leaf_idx, column) in columns.iter().enumerate() { + if !projection_mask.leaf_included(leaf_idx) { + continue; + } + let column_ranges = selection.selected_page_row_ranges(column.page_locations(), total_rows); + ranges = Some(match ranges { + Some(existing) => intersect_ranges(existing, column_ranges), + None => column_ranges, + }); + } + + ranges.map(|ranges| LoadedRowRanges::new(coalesce_adjacent_ranges(ranges), total_rows)) +} + +fn intersect_ranges(left: Vec>, right: Vec>) -> Vec> { + let mut out = Vec::new(); + let mut left_idx = 0; + let mut right_idx = 0; + + while left_idx < left.len() && right_idx < right.len() { + let l = &left[left_idx]; + let r = &right[right_idx]; + let start = l.start.max(r.start); + let end = l.end.min(r.end); + + if start < end { + out.push(start..end); + } + + if l.end <= r.end { + left_idx += 1; + 
} else { + right_idx += 1; + } + } + + out +} + +fn coalesce_adjacent_ranges(ranges: Vec>) -> Vec> { + let mut out: Vec> = Vec::with_capacity(ranges.len()); + for range in ranges { + if range.is_empty() { + continue; + } + if let Some(last) = out.last_mut() { + if last.end == range.start { + last.end = range.end; + continue; + } + } + out.push(range); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::ProjectionMask; + use crate::arrow::arrow_reader::selection::LoadedRowRanges; + use crate::arrow::arrow_reader::{ + ReadPlanBuilder, RowSelection, RowSelectionCursor, RowSelectionPolicy, RowSelector, + }; + use crate::file::page_index::offset_index::{OffsetIndexMetaData, PageLocation}; + + #[test] + fn test_resolve_selection_policy_preserves_mask_choice() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(99), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + + assert_eq!( + resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + None, + 101 + ) + .row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn test_resolve_selection_policy_preserves_selector_choice() { + let selection = RowSelection::from(vec![RowSelector::select(128)]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1 }); + + assert_eq!( + resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + None, + 128 + ) + .row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_resolve_selection_policy_respects_explicit_policy() { + let selection = RowSelection::from(vec![RowSelector::select(1), RowSelector::skip(1)]); + let mask_builder = ReadPlanBuilder::new(1024) + 
.with_selection(Some(selection.clone())) + .with_row_selection_policy(RowSelectionPolicy::Mask); + let selector_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Selectors); + + assert_eq!( + resolve_selection_policy_for_projection(mask_builder, &ProjectionMask::all(), None, 2) + .row_selection_policy(), + &RowSelectionPolicy::Mask + ); + assert_eq!( + resolve_selection_policy_for_projection( + selector_builder, + &ProjectionMask::all(), + None, + 2 + ) + .row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_sparse_loaded_ranges_force_selectors() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_dense_loaded_ranges_preserve_mask() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + RowSelector::select(1), + RowSelector::skip(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn 
test_explicit_mask_keeps_sparse_loaded_ranges() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(4), + RowSelector::select(1), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Mask); + let offset_index = sparse_test_offset_index(); + + let plan_builder = resolve_selection_policy_for_projection( + plan_builder, + &ProjectionMask::all(), + Some(&offset_index), + 6, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + + let mut plan = plan_builder.build(); + let RowSelectionCursor::Mask(cursor) = plan.row_selection_cursor_mut() else { + panic!("expected mask cursor"); + }; + assert!(cursor.is_sparse()); + } + + #[test] + fn test_loaded_ranges_intersects_many_ranges_across_projected_columns() { + let selection = RowSelection::from(vec![ + RowSelector::skip(10), + RowSelector::select(1), + RowSelector::skip(39), + RowSelector::select(1), + RowSelector::skip(39), + RowSelector::select(1), + RowSelector::skip(9), + ]); + let offset_index = vec![ + offset_index_column(&[0, 20, 40, 60, 80]), + offset_index_column(&[0, 15, 35, 55, 75]), + offset_index_column(&[0, 10, 30, 50, 70, 90]), + ]; + + let loaded = loaded_ranges_for_projection( + Some(&selection), + &ProjectionMask::all(), + Some(&offset_index), + 100, + ); + + assert_eq!( + loaded, + Some(LoadedRowRanges::new(vec![10..15, 50..55, 90..100], 100)) + ); + } + + #[test] + fn test_auto_expensive_fragmented_output_prefers_selectors() { + let selection = q38_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + 
&ProjectionMask::all(), + None, + 7_800, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_expensive_fragmented_output_prefers_selectors_without_selector_count_gate() { + let selection = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + RowSelector::select(1), + RowSelector::skip(12), + ]); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 52, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Selectors + ); + } + + #[test] + fn test_auto_cheap_fragmented_output_keeps_mask() { + let selection = q38_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + variable_width_columns: 1, + uncompressed_bytes_per_row: 8.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 7_800, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + #[test] + fn test_auto_moderate_selectivity_expensive_output_keeps_mask() { + let selection = q26_like_fragmented_selection(); + let plan_builder = ReadPlanBuilder::new(1024) + .with_selection(Some(selection)) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }); + let profile = ExpensiveOutputProfile { + 
variable_width_columns: 1, + uncompressed_bytes_per_row: 64.0, + }; + + let plan_builder = resolve_selection_policy_for_expensive_output( + plan_builder, + &ProjectionMask::all(), + None, + 7_200, + profile, + ); + + assert_eq!( + plan_builder.row_selection_policy(), + &RowSelectionPolicy::Mask + ); + } + + fn q38_like_fragmented_selection() -> RowSelection { + let mut selectors = Vec::new(); + for _ in 0..600 { + selectors.push(RowSelector::select(1)); + selectors.push(RowSelector::skip(12)); + } + RowSelection::from(selectors) + } + + fn q26_like_fragmented_selection() -> RowSelection { + let mut selectors = Vec::new(); + for _ in 0..600 { + selectors.push(RowSelector::select(2)); + selectors.push(RowSelector::skip(10)); + } + RowSelection::from(selectors) + } + + fn sparse_test_offset_index() -> Vec { + vec![OffsetIndexMetaData { + page_locations: vec![ + PageLocation { + offset: 0, + compressed_page_size: 10, + first_row_index: 0, + }, + PageLocation { + offset: 10, + compressed_page_size: 10, + first_row_index: 2, + }, + PageLocation { + offset: 20, + compressed_page_size: 10, + first_row_index: 4, + }, + ], + unencoded_byte_array_data_bytes: None, + }] + } + + fn offset_index_column(first_rows: &[i64]) -> OffsetIndexMetaData { + OffsetIndexMetaData { + page_locations: first_rows + .iter() + .enumerate() + .map(|(idx, first_row_index)| PageLocation { + offset: (idx * 10) as i64, + compressed_page_size: 10, + first_row_index: *first_row_index, + }) + .collect(), + unencoded_byte_array_data_bytes: None, + } + } +} diff --git a/parquet/src/arrow/push_decoder/remaining.rs b/parquet/src/arrow/push_decoder/remaining.rs index d1070d2aa69f..49ba8be0ac0d 100644 --- a/parquet/src/arrow/push_decoder/remaining.rs +++ b/parquet/src/arrow/push_decoder/remaining.rs @@ -287,6 +287,13 @@ impl RemainingRowGroups { self.row_group_reader_builder.clear_all_ranges(); } + /// Prevent Auto selection from switching to post-filter by cost for reader + /// handoff APIs. 
+ pub(crate) fn disable_post_filter_cost_model(&mut self) { + self.row_group_reader_builder + .disable_post_filter_cost_model(); + } + /// True iff the inner row-group reader is between row groups (state /// `Finished`). Forward to [`RowGroupReaderBuilder::is_finished`]. pub fn is_at_row_group_boundary(&self) -> bool { @@ -345,7 +352,7 @@ impl RemainingRowGroups { self.frontier .update_budget_after_row_group(remaining_budget); // ready to read the row group - return Ok(DecodeResult::Data(batch_reader)); + return Ok(DecodeResult::Data(*batch_reader)); } } } diff --git a/parquet/tests/arrow_reader/io/async_reader.rs b/parquet/tests/arrow_reader/io/async_reader.rs index db06dda8ee89..8fa993fd50bd 100644 --- a/parquet/tests/arrow_reader/io/async_reader.rs +++ b/parquet/tests/arrow_reader/io/async_reader.rs @@ -178,8 +178,9 @@ async fn test_read_single_row_filter() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_575_625(&schema_descr)); - // Expect to see I/O for column b in both row groups to evaluate filter, - // then a single pages for the "a" column in each row group + // Auto keeps pushdown for projected predicates so the filtered "b" column + // can be reused from cache. The remaining projected "a" column is read + // after filtering, trimmed to the matching pages by the page index. insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" @@ -215,10 +216,8 @@ async fn test_read_single_row_filter_no_page_index() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_575_625(&schema_descr)); - // Since we don't have the page index, expect to see: - // 1. I/O for all pages of column b to evaluate the filter - // 2. IO for all pages of column a as the reader doesn't know where the page - // boundaries are so needs to scan them. 
+ // Without page indexes, auto still evaluates and caches the projected + // predicate first, then reads the remaining projected column separately. insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" @@ -295,8 +294,8 @@ async fn test_read_single_row_filter_all() { .with_projection(ProjectionMask::columns(&schema_descr, ["a", "b"])) .with_row_filter(filter_b_false(&schema_descr)); - // Expect to see reads for column "b" to evaluate the filter, but no reads - // for column "a" as no rows pass the filter + // Auto keeps pushdown for projected predicates, so the non-predicate + // column is not read when the predicate rejects every row. insta::assert_debug_snapshot!(run( &test_file, builder).await, @r#" diff --git a/parquet/tests/arrow_reader/predicate_cache.rs b/parquet/tests/arrow_reader/predicate_cache.rs index 4029b4e19e20..1ed7160172b7 100644 --- a/parquet/tests/arrow_reader/predicate_cache.rs +++ b/parquet/tests/arrow_reader/predicate_cache.rs @@ -30,7 +30,9 @@ use arrow_schema::{DataType, Field}; use bytes::Bytes; use futures::StreamExt; use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics; -use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter}; +use parquet::arrow::arrow_reader::{ + ArrowPredicateFn, ArrowReaderOptions, RowFilter, RowSelectionPolicy, +}; use parquet::arrow::arrow_reader::{ArrowReaderBuilder, ParquetRecordBatchReaderBuilder}; use parquet::arrow::{ArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask}; use parquet::file::properties::WriterProperties; @@ -50,7 +52,15 @@ async fn test_default_read() { #[tokio::test] async fn test_async_cache_with_filters() { let test = ParquetPredicateCacheTest::new().with_expected_records_read_from_cache(49); - let async_builder = test.async_builder().await.add_project_ab_and_filter_b(); + let async_builder = test + .async_builder() + .await + .add_project_ab_and_filter_b() + // The default Auto policy may choose post-filter execution for this + // 
cheap projected predicate, which avoids the predicate cache entirely. + // Use an explicit pushdown policy so this test continues to exercise + // predicate cache reads. + .with_row_selection_policy(RowSelectionPolicy::Selectors); test.run_async(async_builder).await; } diff --git a/parquet/tests/arrow_reader/row_filter/async.rs b/parquet/tests/arrow_reader/row_filter/async.rs index 66840bb8147b..2f50bcc7ef45 100644 --- a/parquet/tests/arrow_reader/row_filter/async.rs +++ b/parquet/tests/arrow_reader/row_filter/async.rs @@ -24,8 +24,8 @@ use arrow::{ datatypes::{Int32Type, TimestampNanosecondType}, }; use arrow_array::{ - ArrayRef, BooleanArray, Int8Array, Int32Array, Int64Array, RecordBatch, Scalar, StringArray, - StructArray, + ArrayRef, BooleanArray, Int8Array, Int32Array, Int64Array, ListArray, RecordBatch, Scalar, + StringArray, StructArray, }; use arrow_schema::{DataType, Field, Schema}; use bytes::Bytes; @@ -526,19 +526,133 @@ async fn test_predicate_pushdown_with_skipped_pages() { } } -/// Regression test: when multiple predicates are used, the first predicate's -/// override of the selection strategy (to Mask) must NOT carry forward to -/// subsequent predicates. Each predicate must get a fresh Auto policy so the -/// override can detect page skipping for that predicate's specific columns. -/// -/// Scenario: -/// - Dense initial RowSelection (alternating select/skip) covers all pages → Auto resolves to Mask -/// - Predicate 1 evaluates on column A, narrows selection to skip middle pages -/// - Predicate 2's column B is fetched sparsely with the narrowed selection (missing middle pages) -/// - Without the fix, the override for predicate 2 returns early (policy=Mask, not Auto), -/// so Mask is used and tries to read missing pages → "Invalid offset" error +/// Regression test for explicit mask predicate pushdown attempting to read skipped pages. 
+/// Related issue: https://github.com/apache/arrow-rs/issues/9239 #[tokio::test] -async fn test_multi_predicate_mask_policy_carryover() { +async fn test_explicit_mask_predicate_pushdown_with_skipped_pages() { + use arrow_array::TimestampNanosecondArray; + use arrow_schema::TimeUnit; + + const TIME_IN_RANGE_START: i64 = 1_704_092_400_000_000_000; + const TIME_IN_RANGE_END: i64 = 1_704_110_400_000_000_000; + const TIME_BEFORE_RANGE: i64 = 1_704_078_000_000_000_000; + + let schema = Arc::new(Schema::new(vec![ + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + ), + Field::new("tag", DataType::Utf8, false), + ])); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(300)) + .set_data_page_row_count_limit(33) + .build(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap(); + + for _ in 0..2 { + for (tag_idx, tag) in ["a", "b", "c"].iter().enumerate() { + let times: Vec = (0..100) + .map(|j| { + let row_idx = tag_idx * 100 + j; + if row_idx % 2 == 0 { + TIME_IN_RANGE_START + (j as i64 * 1_000_000) + } else { + TIME_BEFORE_RANGE + (j as i64 * 1_000_000) + } + }) + .collect(); + let tags: Vec<&str> = (0..100).map(|_| *tag).collect(); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(TimestampNanosecondArray::from(times)) as ArrayRef, + Arc::new(StringArray::from(tags)) as ArrayRef, + ], + ) + .unwrap(); + writer.write(&batch).unwrap(); + } + writer.flush().unwrap(); + } + writer.close().unwrap(); + let buffer = Bytes::from(buffer); + + for policy in [ + PageIndexPolicy::Skip, + PageIndexPolicy::Optional, + PageIndexPolicy::Required, + ] { + let reader = TestReader::new(buffer.clone()); + let options = ArrowReaderOptions::default().with_page_index_policy(policy); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(reader, options) + .await + .unwrap(); + + let schema_descr = 
builder.metadata().file_metadata().schema_descr_ptr(); + let num_row_groups = builder.metadata().num_row_groups(); + + let mut selectors = Vec::new(); + for _ in 0..num_row_groups { + selectors.push(RowSelector::select(100)); + selectors.push(RowSelector::skip(100)); + selectors.push(RowSelector::select(100)); + } + let selection = RowSelection::from(selectors); + + let time_gte_predicate = + ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [0]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|t| t.map(|v| v >= TIME_IN_RANGE_START)), + )) + }); + + let time_lt_predicate = + ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [0]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|t| t.map(|v| v < TIME_IN_RANGE_END)), + )) + }); + + let row_filter = RowFilter::new(vec![ + Box::new(time_gte_predicate), + Box::new(time_lt_predicate), + ]); + let projection = ProjectionMask::roots(&schema_descr, [1]); + + let stream = builder + .with_row_filter(row_filter) + .with_row_selection(selection) + .with_projection(projection) + .with_row_selection_policy(RowSelectionPolicy::Mask) + .build() + .unwrap(); + + let batches: Vec = stream.try_collect().await.unwrap(); + let batch = concat_batches(&batches[0].schema(), &batches).unwrap(); + assert_eq!(batch.num_columns(), 1); + let expected = StringArray::from_iter_values( + std::iter::repeat_n("a", 50) + .chain(std::iter::repeat_n("c", 50)) + .chain(std::iter::repeat_n("a", 50)) + .chain(std::iter::repeat_n("c", 50)), + ); + assert_eq!(batch.column(0).as_string(), &expected); + } +} + +/// Regression test: Auto falls back to selectors when an earlier predicate +/// prunes away whole pages. Explicit Mask still exercises sparse loaded ranges +/// in the tests below. 
+#[tokio::test] +async fn test_auto_sparse_pages_fall_back_to_selectors_across_predicates() { // 300 rows, 1 row group, 100 rows per page (3 pages) let num_rows = 300usize; let rows_per_page = 100; @@ -620,12 +734,13 @@ async fn test_multi_predicate_mask_policy_carryover() { .with_row_filter(row_filter) .with_row_selection(selection) .with_projection(projection) + .with_row_selection_policy(RowSelectionPolicy::Auto { threshold: 1024 }) .with_max_predicate_cache_size(0) .build() .unwrap(); - // Without the fix, this panics with: - // "Invalid offset in sparse column chunk data: ..., no matching page found." + // This exercises Auto after page pruning. Without the Auto sparse-page gate, + // the second predicate would use a sparse mask and can regress heavily. let batches: Vec = stream.try_collect().await.unwrap(); let batch = concat_batches(&batches[0].schema(), &batches).unwrap(); @@ -633,4 +748,203 @@ async fn test_multi_predicate_mask_policy_carryover() { // That's even-indexed rows in [0,100) with value<250 → rows 0,2,4,...,98 (50 rows) // Plus even-indexed rows in [200,250) with value<250 → rows 200,202,...,248 (25 rows) assert_eq!(batch.num_rows(), 75); + assert_eq!(batch.num_columns(), 2); + + let expected_filter_col = Int32Array::from(vec![0; 75]); + assert_eq!( + batch.column(0).as_primitive::(), + &expected_filter_col + ); + + let expected_values = + Int32Array::from_iter_values((0..100).step_by(2).chain((200..250).step_by(2))); + assert_eq!( + batch.column(1).as_primitive::(), + &expected_values + ); +} + +#[tokio::test] +async fn test_explicit_mask_final_projection_with_sparse_pages() { + let num_rows = 300usize; + let rows_per_page = 100; + + let schema = Arc::new(Schema::new(vec![ + Field::new("filter_col", DataType::Int32, false), + Field::new("value_col", DataType::Int32, false), + ])); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(num_rows)) + .set_data_page_row_count_limit(rows_per_page) + 
.set_write_batch_size(rows_per_page) + .set_dictionary_enabled(false) + .build(); + + let filter_values: Vec = (0..num_rows as i32) + .map(|i| if (100..200).contains(&i) { 1 } else { 0 }) + .collect(); + let value_values: Vec = (0..num_rows as i32).collect(); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(filter_values)) as ArrayRef, + Arc::new(Int32Array::from(value_values)) as ArrayRef, + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + let buffer = Bytes::from(buffer); + + let reader = TestReader::new(buffer); + let options = ArrowReaderOptions::default().with_page_index_policy(PageIndexPolicy::Required); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(reader, options) + .await + .unwrap(); + + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + + let selectors: Vec = (0..num_rows / 2) + .flat_map(|_| vec![RowSelector::select(1), RowSelector::skip(1)]) + .collect(); + let selection = RowSelection::from(selectors); + + let pred1 = ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [0]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|v| v.map(|val| val == 0)), + )) + }); + + let pred2 = ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [1]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|v| v.map(|val| val < 250)), + )) + }); + + let row_filter = RowFilter::new(vec![Box::new(pred1), Box::new(pred2)]); + let projection = ProjectionMask::roots(&schema_descr, [0, 1]); + + let stream = builder + .with_row_filter(row_filter) + .with_row_selection(selection) + .with_projection(projection) + .with_row_selection_policy(RowSelectionPolicy::Mask) + .with_max_predicate_cache_size(0) + .build() + .unwrap(); 
+ + let batches: Vec = stream.try_collect().await.unwrap(); + let batch = concat_batches(&batches[0].schema(), &batches).unwrap(); + + assert_eq!(batch.num_rows(), 75); + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.schema().field(0).name(), "filter_col"); + assert_eq!(batch.schema().field(1).name(), "value_col"); + + let expected_filter_col = Int32Array::from(vec![0; 75]); + assert_eq!( + batch.column(0).as_primitive::(), + &expected_filter_col + ); + + let expected_values = + Int32Array::from_iter_values((0..100).step_by(2).chain((200..250).step_by(2))); + assert_eq!( + batch.column(1).as_primitive::(), + &expected_values + ); +} + +#[tokio::test] +async fn test_explicit_mask_list_projection_with_sparse_pages() { + let num_rows = 300usize; + let rows_per_page = 100; + + let schema = Arc::new(Schema::new(vec![ + Field::new("filter_col", DataType::Int32, false), + Field::new( + "list_col", + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + true, + ), + ])); + + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(num_rows)) + .set_data_page_row_count_limit(rows_per_page) + .set_write_batch_size(rows_per_page) + .set_dictionary_enabled(false) + .build(); + + let filter_values: Vec = (0..num_rows as i32) + .map(|i| if (100..200).contains(&i) { 1 } else { 0 }) + .collect(); + let list_values = ListArray::from_iter_primitive::( + (0..num_rows as i32).map(|i| Some(vec![Some(i), Some(i + 1000)])), + ); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(filter_values)) as ArrayRef, + Arc::new(list_values) as ArrayRef, + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, schema.clone(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + let buffer = Bytes::from(buffer); + + let reader = TestReader::new(buffer); + let options = 
ArrowReaderOptions::default().with_page_index_policy(PageIndexPolicy::Required); + let builder = ParquetRecordBatchStreamBuilder::new_with_options(reader, options) + .await + .unwrap(); + + let schema_descr = builder.metadata().file_metadata().schema_descr_ptr(); + + let pred = ArrowPredicateFn::new(ProjectionMask::roots(&schema_descr, [0]), |batch| { + let col = batch.column(0).as_primitive::(); + Ok(BooleanArray::from_iter( + col.iter().map(|v| v.map(|val| val == 0)), + )) + }); + let row_filter = RowFilter::new(vec![Box::new(pred)]); + let projection = ProjectionMask::roots(&schema_descr, [1]); + + let stream = builder + .with_row_filter(row_filter) + .with_projection(projection) + .with_row_selection_policy(RowSelectionPolicy::Mask) + .build() + .unwrap(); + + let batches: Vec = stream.try_collect().await.unwrap(); + let batch = concat_batches(&batches[0].schema(), &batches).unwrap(); + + assert_eq!(batch.num_rows(), 200); + assert_eq!(batch.num_columns(), 1); + + let expected_indices = (0..100).chain(200..300); + let expected = ListArray::from_iter_primitive::( + expected_indices.map(|i| Some(vec![Some(i), Some(i + 1000)])), + ); + assert_eq!( + batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(), + &expected + ); }