Skip to content

Commit d5593b7

Browse files
authored
feat: add parquet page filter (#664)
## Rationale Part of #589 ## Detailed Changes - Introduce `PagePruningPredicate` when building the `ParquetRecordBatchStream` ## Test Plan
1 parent 96fc8ab commit d5593b7

File tree

5 files changed

+95
-31
lines changed

5 files changed

+95
-31
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
target
22
.DS_Store
33
.idea/
4-
.vscode
4+
.vscode
5+
.dir-locals.el

Cargo.lock

Lines changed: 19 additions & 19 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ cluster = { path = "cluster" }
7676
criterion = "0.3"
7777
common_types = { path = "common_types" }
7878
common_util = { path = "common_util" }
79-
datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "06e9f53637f20dd91bef43b74942ec36c38c22d5" }
80-
datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev = "06e9f53637f20dd91bef43b74942ec36c38c22d5" }
79+
datafusion = { git = "https://github.com/jiacai2050/arrow-datafusion.git", rev = "13314c37020b90246db9b80f8294370c06e61018" }
80+
datafusion-proto = { git = "https://github.com/jiacai2050/arrow-datafusion.git", rev = "13314c37020b90246db9b80f8294370c06e61018" }
8181
df_operator = { path = "df_operator" }
8282
etcd-client = "0.10.3"
8383
env_logger = "0.6"
@@ -89,10 +89,10 @@ lazy_static = "1.4.0"
8989
log = "0.4"
9090
logger = { path = "components/logger" }
9191
lru = "0.7.6"
92-
influxql-logical-planner = { git = "https://github.com/CeresDB/influxql", package = "iox_query_influxql" }
93-
influxql-parser = { git = "https://github.com/CeresDB/influxql", package = "influxdb_influxql_parser" }
94-
influxql-query = { git = "https://github.com/CeresDB/influxql", package = "iox_query" }
95-
influxql-schema = { git = "https://github.com/CeresDB/influxql", package = "schema" }
92+
influxql-logical-planner = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "iox_query_influxql" }
93+
influxql-parser = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "influxdb_influxql_parser" }
94+
influxql-query = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "iox_query" }
95+
influxql-schema = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "schema" }
9696
interpreters = { path = "interpreters" }
9797
itertools = "0.10.5"
9898
meta_client = { path = "meta_client" }

analytic_engine/src/sst/parquet/async_reader.rs

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,22 @@ use common_util::{
2222
runtime::{AbortOnDropMany, JoinHandle, Runtime},
2323
time::InstantExt,
2424
};
25+
use datafusion::{
26+
common::ToDFSchema,
27+
physical_expr::{create_physical_expr, execution_props::ExecutionProps},
28+
physical_plan::{
29+
file_format::{parquet::page_filter::PagePruningPredicate, ParquetFileMetrics},
30+
metrics::ExecutionPlanMetricsSet,
31+
},
32+
};
2533
use futures::{future::BoxFuture, FutureExt, Stream, StreamExt, TryFutureExt};
2634
use log::{debug, error};
2735
use object_store::{ObjectStoreRef, Path};
2836
use parquet::{
29-
arrow::{async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder, ProjectionMask},
37+
arrow::{
38+
arrow_reader::RowSelection, async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder,
39+
ProjectionMask,
40+
},
3041
file::metadata::RowGroupMetaData,
3142
};
3243
use parquet_ext::meta_data::ChunkReader;
@@ -71,6 +82,7 @@ pub struct Reader<'a> {
7182

7283
/// Options for `read_parallelly`
7384
metrics: Metrics,
85+
df_plan_metrics: ExecutionPlanMetricsSet,
7486
}
7587

7688
#[derive(Default, Debug, Clone, TraceMetricWhenDrop)]
@@ -94,7 +106,7 @@ impl<'a> Reader<'a> {
94106
metrics_collector: Option<MetricsCollector>,
95107
) -> Self {
96108
let store = store_picker.pick_by_freq(options.frequency);
97-
109+
let df_plan_metrics = ExecutionPlanMetricsSet::new();
98110
let metrics = Metrics {
99111
metrics_collector,
100112
..Default::default()
@@ -112,6 +124,7 @@ impl<'a> Reader<'a> {
112124
meta_data: None,
113125
row_projector: None,
114126
metrics,
127+
df_plan_metrics,
115128
}
116129
}
117130

@@ -182,6 +195,36 @@ impl<'a> Reader<'a> {
182195
suggested.min(num_row_groups).max(1)
183196
}
184197

198+
fn build_row_selection(
199+
&self,
200+
arrow_schema: SchemaRef,
201+
row_groups: &[usize],
202+
file_metadata: &parquet_ext::ParquetMetaData,
203+
) -> Result<Option<RowSelection>> {
204+
// TODO: remove fixed partition
205+
let partition = 0;
206+
let exprs = datafusion::optimizer::utils::conjunction(self.predicate.exprs().to_vec());
207+
let exprs = match exprs {
208+
Some(exprs) => exprs,
209+
None => return Ok(None),
210+
};
211+
212+
let df_schema = arrow_schema
213+
.clone()
214+
.to_dfschema()
215+
.context(DataFusionError)?;
216+
let physical_expr =
217+
create_physical_expr(&exprs, &df_schema, &arrow_schema, &ExecutionProps::new())
218+
.context(DataFusionError)?;
219+
let page_predicate = PagePruningPredicate::try_new(&physical_expr, arrow_schema.clone())
220+
.context(DataFusionError)?;
221+
222+
let metrics = ParquetFileMetrics::new(partition, self.path.as_ref(), &self.df_plan_metrics);
223+
page_predicate
224+
.prune(row_groups, file_metadata, &metrics)
225+
.context(DataFusionError)
226+
}
227+
185228
async fn fetch_record_batch_streams(
186229
&mut self,
187230
suggested_parallelism: usize,
@@ -190,10 +233,10 @@ impl<'a> Reader<'a> {
190233

191234
let meta_data = self.meta_data.as_ref().unwrap();
192235
let row_projector = self.row_projector.as_ref().unwrap();
193-
236+
let arrow_schema = meta_data.custom().schema.to_arrow_schema_ref();
194237
// Get target row groups.
195238
let target_row_groups = self.prune_row_groups(
196-
meta_data.custom().schema.to_arrow_schema_ref(),
239+
arrow_schema.clone(),
197240
meta_data.parquet().row_groups(),
198241
meta_data.custom().parquet_filter.as_ref(),
199242
)?;
@@ -226,6 +269,7 @@ impl<'a> Reader<'a> {
226269
target_row_group_chunks[chunk_idx].push(row_group);
227270
}
228271

272+
let parquet_metadata = meta_data.parquet();
229273
let proj_mask = ProjectionMask::leaves(
230274
meta_data.parquet().file_metadata().schema_descr(),
231275
row_projector.existed_source_projection().iter().copied(),
@@ -239,9 +283,15 @@ impl<'a> Reader<'a> {
239283
for chunk in target_row_group_chunks {
240284
let object_store_reader =
241285
ObjectStoreReader::new(self.store.clone(), self.path.clone(), meta_data.clone());
242-
let builder = ParquetRecordBatchStreamBuilder::new(object_store_reader)
286+
let mut builder = ParquetRecordBatchStreamBuilder::new(object_store_reader)
243287
.await
244288
.with_context(|| ParquetError)?;
289+
let row_selection =
290+
self.build_row_selection(arrow_schema.clone(), &chunk, parquet_metadata)?;
291+
if let Some(selection) = row_selection {
292+
builder = builder.with_row_selection(selection);
293+
};
294+
245295
let stream = builder
246296
.with_batch_size(self.num_rows_per_row_group)
247297
.with_row_groups(chunk)
@@ -353,6 +403,16 @@ impl<'a> Reader<'a> {
353403
}
354404
}
355405

406+
impl<'a> Drop for Reader<'a> {
407+
fn drop(&mut self) {
408+
debug!(
409+
"Parquet reader dropped, path:{:?}, df_plan_metrics:{}",
410+
self.path,
411+
self.df_plan_metrics.clone_inner().to_string()
412+
);
413+
}
414+
}
415+
356416
#[derive(Clone)]
357417
struct ObjectStoreReader {
358418
storage: ObjectStoreRef,

analytic_engine/src/sst/parquet/row_group_pruner.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ pub struct RowGroupPruner<'a> {
5151
}
5252

5353
impl<'a> RowGroupPruner<'a> {
54+
// TODO: DataFusion already change predicates to PhyscialExpr, we should keep up
55+
// with upstream.
56+
// https://github.com/apache/arrow-datafusion/issues/4695
5457
pub fn try_new(
5558
schema: &'a SchemaRef,
5659
row_groups: &'a [RowGroupMetaData],

0 commit comments

Comments
 (0)