Skip to content

Commit a9f2191

Browse files
authored
feat(bloom-filter): integrate indexer with mito2 (#5236)
* feat(bloom-filter): integrate indexer with mito2

  Signed-off-by: Zhenchi <[email protected]>

* rename skippingindextype

  Signed-off-by: Zhenchi <[email protected]>

* address comments

  Signed-off-by: Zhenchi <[email protected]>

---------

Signed-off-by: Zhenchi <[email protected]>
1 parent 039989f commit a9f2191

File tree

22 files changed: +1032 lines added, -254 lines removed

src/datatypes/src/schema.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use crate::error::{self, DuplicateColumnSnafu, Error, ProjectArrowSchemaSnafu, R
2929
use crate::prelude::ConcreteDataType;
3030
pub use crate::schema::column_schema::{
3131
ColumnSchema, FulltextAnalyzer, FulltextOptions, Metadata, SkippingIndexOptions,
32-
COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE, COLUMN_FULLTEXT_OPT_KEY_ANALYZER,
32+
SkippingIndexType, COLUMN_FULLTEXT_CHANGE_OPT_KEY_ENABLE, COLUMN_FULLTEXT_OPT_KEY_ANALYZER,
3333
COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
3434
COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY, FULLTEXT_KEY, INVERTED_INDEX_KEY,
3535
SKIPPING_INDEX_KEY, TIME_INDEX_KEY,

src/datatypes/src/schema/column_schema.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ pub struct SkippingIndexOptions {
543543
pub granularity: u32,
544544
/// The type of the skip index.
545545
#[serde(default)]
546-
pub index_type: SkipIndexType,
546+
pub index_type: SkippingIndexType,
547547
}
548548

549549
impl fmt::Display for SkippingIndexOptions {
@@ -556,15 +556,15 @@ impl fmt::Display for SkippingIndexOptions {
556556

557557
/// Skip index types.
558558
#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, Visit, VisitMut)]
559-
pub enum SkipIndexType {
559+
pub enum SkippingIndexType {
560560
#[default]
561561
BloomFilter,
562562
}
563563

564-
impl fmt::Display for SkipIndexType {
564+
impl fmt::Display for SkippingIndexType {
565565
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
566566
match self {
567-
SkipIndexType::BloomFilter => write!(f, "BLOOM"),
567+
SkippingIndexType::BloomFilter => write!(f, "BLOOM"),
568568
}
569569
}
570570
}
@@ -587,15 +587,15 @@ impl TryFrom<HashMap<String, String>> for SkippingIndexOptions {
587587
// Parse index type with default value BloomFilter
588588
let index_type = match options.get(COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE) {
589589
Some(typ) => match typ.to_ascii_uppercase().as_str() {
590-
"BLOOM" => SkipIndexType::BloomFilter,
590+
"BLOOM" => SkippingIndexType::BloomFilter,
591591
_ => {
592592
return error::InvalidSkippingIndexOptionSnafu {
593593
msg: format!("Invalid index type: {typ}, expected: 'BLOOM'"),
594594
}
595595
.fail();
596596
}
597597
},
598-
None => SkipIndexType::default(),
598+
None => SkippingIndexType::default(),
599599
};
600600

601601
Ok(SkippingIndexOptions {

src/index/src/bloom_filter/creator.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ impl BloomFilterCreator {
7373
/// `rows_per_segment` <= 0
7474
pub fn new(
7575
rows_per_segment: usize,
76-
intermediate_provider: Box<dyn ExternalTempFileProvider>,
76+
intermediate_provider: Arc<dyn ExternalTempFileProvider>,
7777
global_memory_usage: Arc<AtomicUsize>,
7878
global_memory_usage_threshold: Option<usize>,
7979
) -> Self {
@@ -252,7 +252,7 @@ mod tests {
252252
let mut writer = Cursor::new(Vec::new());
253253
let mut creator = BloomFilterCreator::new(
254254
2,
255-
Box::new(MockExternalTempFileProvider::new()),
255+
Arc::new(MockExternalTempFileProvider::new()),
256256
Arc::new(AtomicUsize::new(0)),
257257
None,
258258
);
@@ -322,7 +322,7 @@ mod tests {
322322
let mut writer = Cursor::new(Vec::new());
323323
let mut creator = BloomFilterCreator::new(
324324
2,
325-
Box::new(MockExternalTempFileProvider::new()),
325+
Arc::new(MockExternalTempFileProvider::new()),
326326
Arc::new(AtomicUsize::new(0)),
327327
None,
328328
);

src/index/src/bloom_filter/creator/finalize_segment.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ pub struct FinalizedBloomFilterStorage {
4343
intermediate_prefix: String,
4444

4545
/// The provider for intermediate Bloom filter files.
46-
intermediate_provider: Box<dyn ExternalTempFileProvider>,
46+
intermediate_provider: Arc<dyn ExternalTempFileProvider>,
4747

4848
/// The memory usage of the in-memory Bloom filters.
4949
memory_usage: usize,
@@ -59,7 +59,7 @@ pub struct FinalizedBloomFilterStorage {
5959
impl FinalizedBloomFilterStorage {
6060
/// Creates a new `FinalizedBloomFilterStorage`.
6161
pub fn new(
62-
intermediate_provider: Box<dyn ExternalTempFileProvider>,
62+
intermediate_provider: Arc<dyn ExternalTempFileProvider>,
6363
global_memory_usage: Arc<AtomicUsize>,
6464
global_memory_usage_threshold: Option<usize>,
6565
) -> Self {
@@ -132,7 +132,7 @@ impl FinalizedBloomFilterStorage {
132132
/// Drains the storage and returns a stream of finalized Bloom filter segments.
133133
pub async fn drain(
134134
&mut self,
135-
) -> Result<Pin<Box<dyn Stream<Item = Result<FinalizedBloomFilterSegment>> + '_>>> {
135+
) -> Result<Pin<Box<dyn Stream<Item = Result<FinalizedBloomFilterSegment>> + Send + '_>>> {
136136
// FAST PATH: memory only
137137
if self.intermediate_file_id_counter == 0 {
138138
return Ok(Box::pin(stream::iter(self.in_memory.drain(..).map(Ok))));
@@ -257,7 +257,7 @@ mod tests {
257257

258258
let global_memory_usage = Arc::new(AtomicUsize::new(0));
259259
let global_memory_usage_threshold = Some(1024 * 1024); // 1MB
260-
let provider = Box::new(mock_provider);
260+
let provider = Arc::new(mock_provider);
261261
let mut storage = FinalizedBloomFilterStorage::new(
262262
provider,
263263
global_memory_usage.clone(),

src/index/src/bloom_filter/reader.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ mod tests {
190190
let mut writer = Cursor::new(vec![]);
191191
let mut creator = BloomFilterCreator::new(
192192
2,
193-
Box::new(MockExternalTempFileProvider::new()),
193+
Arc::new(MockExternalTempFileProvider::new()),
194194
Arc::new(AtomicUsize::new(0)),
195195
None,
196196
);

src/mito2/src/compaction/compactor.rs

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ use common_telemetry::{info, warn};
2121
use common_time::TimeToLive;
2222
use object_store::manager::ObjectStoreManagerRef;
2323
use serde::{Deserialize, Serialize};
24-
use smallvec::SmallVec;
2524
use snafu::{OptionExt, ResultExt};
2625
use store_api::metadata::RegionMetadataRef;
2726
use store_api::storage::RegionId;
@@ -41,7 +40,7 @@ use crate::region::options::RegionOptions;
4140
use crate::region::version::VersionRef;
4241
use crate::region::{ManifestContext, RegionLeaderState, RegionRoleState};
4342
use crate::schedule::scheduler::LocalScheduler;
44-
use crate::sst::file::{FileMeta, IndexType};
43+
use crate::sst::file::FileMeta;
4544
use crate::sst::file_purger::LocalFilePurger;
4645
use crate::sst::index::intermediate::IntermediateManager;
4746
use crate::sst::index::puffin_manager::PuffinManagerFactory;
@@ -336,16 +335,7 @@ impl Compactor for DefaultCompactor {
336335
time_range: sst_info.time_range,
337336
level: output.output_level,
338337
file_size: sst_info.file_size,
339-
available_indexes: {
340-
let mut indexes = SmallVec::new();
341-
if sst_info.index_metadata.inverted_index.is_available() {
342-
indexes.push(IndexType::InvertedIndex);
343-
}
344-
if sst_info.index_metadata.fulltext_index.is_available() {
345-
indexes.push(IndexType::FulltextIndex);
346-
}
347-
indexes
348-
},
338+
available_indexes: sst_info.index_metadata.build_available_indexes(),
349339
index_file_size: sst_info.index_metadata.file_size,
350340
num_rows: sst_info.num_rows as u64,
351341
num_row_groups: sst_info.num_row_groups,

src/mito2/src/error.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -816,8 +816,8 @@ pub enum Error {
816816
location: Location,
817817
},
818818

819-
#[snafu(display("Failed to retrieve fulltext options from column metadata"))]
820-
FulltextOptions {
819+
#[snafu(display("Failed to retrieve index options from column metadata"))]
820+
IndexOptions {
821821
#[snafu(implicit)]
822822
location: Location,
823823
source: datatypes::error::Error,
@@ -904,6 +904,20 @@ pub enum Error {
904904
#[snafu(implicit)]
905905
location: Location,
906906
},
907+
908+
#[snafu(display("Failed to push value to bloom filter"))]
909+
PushBloomFilterValue {
910+
source: index::bloom_filter::error::Error,
911+
#[snafu(implicit)]
912+
location: Location,
913+
},
914+
915+
#[snafu(display("Failed to finish bloom filter"))]
916+
BloomFilterFinish {
917+
source: index::bloom_filter::error::Error,
918+
#[snafu(implicit)]
919+
location: Location,
920+
},
907921
}
908922

909923
pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -1029,7 +1043,7 @@ impl ErrorExt for Error {
10291043
UnsupportedOperation { .. } => StatusCode::Unsupported,
10301044
RemoteCompaction { .. } => StatusCode::Unexpected,
10311045

1032-
FulltextOptions { source, .. } => source.status_code(),
1046+
IndexOptions { source, .. } => source.status_code(),
10331047
CreateFulltextCreator { source, .. } => source.status_code(),
10341048
CastVector { source, .. } => source.status_code(),
10351049
FulltextPushText { source, .. }
@@ -1039,7 +1053,12 @@ impl ErrorExt for Error {
10391053
RegionBusy { .. } => StatusCode::RegionBusy,
10401054
GetSchemaMetadata { source, .. } => source.status_code(),
10411055
Timeout { .. } => StatusCode::Cancelled,
1056+
10421057
DecodeArrowRowGroup { .. } => StatusCode::Internal,
1058+
1059+
PushBloomFilterValue { source, .. } | BloomFilterFinish { source, .. } => {
1060+
source.status_code()
1061+
}
10431062
}
10441063
}
10451064

src/mito2/src/flush.rs

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ use std::sync::atomic::{AtomicUsize, Ordering};
1919
use std::sync::Arc;
2020

2121
use common_telemetry::{debug, error, info, trace};
22-
use smallvec::SmallVec;
2322
use snafu::ResultExt;
2423
use store_api::storage::RegionId;
2524
use strum::IntoStaticStr;
@@ -45,7 +44,7 @@ use crate::request::{
4544
SenderWriteRequest, WorkerRequest,
4645
};
4746
use crate::schedule::scheduler::{Job, SchedulerRef};
48-
use crate::sst::file::{FileId, FileMeta, IndexType};
47+
use crate::sst::file::{FileId, FileMeta};
4948
use crate::sst::parquet::WriteOptions;
5049
use crate::worker::WorkerListener;
5150

@@ -378,16 +377,7 @@ impl RegionFlushTask {
378377
time_range: sst_info.time_range,
379378
level: 0,
380379
file_size: sst_info.file_size,
381-
available_indexes: {
382-
let mut indexes = SmallVec::new();
383-
if sst_info.index_metadata.inverted_index.is_available() {
384-
indexes.push(IndexType::InvertedIndex);
385-
}
386-
if sst_info.index_metadata.fulltext_index.is_available() {
387-
indexes.push(IndexType::FulltextIndex);
388-
}
389-
indexes
390-
},
380+
available_indexes: sst_info.index_metadata.build_available_indexes(),
391381
index_file_size: sst_info.index_metadata.file_size,
392382
num_rows: sst_info.num_rows as u64,
393383
num_row_groups: sst_info.num_row_groups,

src/mito2/src/sst/file.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ pub enum IndexType {
143143
InvertedIndex,
144144
/// Full-text index.
145145
FulltextIndex,
146+
/// Bloom filter.
147+
BloomFilter,
146148
}
147149

148150
impl FileMeta {
@@ -156,6 +158,11 @@ impl FileMeta {
156158
self.available_indexes.contains(&IndexType::FulltextIndex)
157159
}
158160

161+
/// Returns true if the file has a bloom filter
162+
pub fn bloom_filter_available(&self) -> bool {
163+
self.available_indexes.contains(&IndexType::BloomFilter)
164+
}
165+
159166
/// Returns the size of the inverted index file
160167
pub fn inverted_index_size(&self) -> Option<u64> {
161168
if self.available_indexes.len() == 1 && self.inverted_index_available() {

0 commit comments

Comments
 (0)