From 47ea4529b1bc575bbafd1852731a0c774cbd7e37 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 6 Jul 2022 13:37:11 +0200 Subject: [PATCH 01/25] introduce the roaring multiop in the grenad merger --- benchmarks/Cargo.toml | 2 +- infos/Cargo.toml | 3 ++- milli/Cargo.toml | 2 +- .../src/update/index_documents/helpers/merge_functions.rs | 8 +++----- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 3a832defb..36d3a46b2 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -15,7 +15,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] } criterion = { version = "0.3.5", features = ["html_reports"] } rand = "0.8.5" rand_chacha = "0.3.1" -roaring = "0.9.0" +roaring = { git = "https://github.com/irevoire/roaring-rs", branch = "tamo-treemap" } [build-dependencies] anyhow = "1.0.56" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 6b6b1f300..91cb521e5 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -11,7 +11,8 @@ byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } csv = "1.1.6" milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } -roaring = "0.9.0" +# roaring = "0.9.0" +roaring = { git = "https://github.com/irevoire/roaring-rs", branch = "tamo-treemap" } serde_json = "1.0.79" stderrlog = "0.5.1" structopt = { version = "0.3.26", default-features = false } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 56a1da5cd..99289ec10 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -26,7 +26,7 @@ obkv = "0.2.0" once_cell = "1.10.0" ordered-float = "2.10.0" rayon = "1.5.1" -roaring = "0.9.0" +roaring = { git = "https://github.com/irevoire/roaring-rs", branch = "tamo-treemap" } rstar = { version = "0.9.2", features = ["serde"] } serde = { version = "1.0.136", features = ["derive"] } serde_json = { version = "1.0.79", features = ["preserve_order"] } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index c5385e347..941e8351d 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::io; use std::result::Result as StdResult; -use roaring::RoaringBitmap; +use roaring::{IterExt, RoaringBitmap}; use super::read_u32_ne_bytes; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; @@ -41,8 +41,7 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul .map(AsRef::as_ref) .map(RoaringBitmap::deserialize_from) .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); + .or(); let mut buffer = Vec::new(); serialize_roaring_bitmap(&merged, &mut buffer)?; Ok(Cow::Owned(buffer)) @@ -65,8 +64,7 @@ pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( .map(|(_, bitmap_bytes)| bitmap_bytes) .map(RoaringBitmap::deserialize_from) .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); + .or(); let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); let mut buffer = Vec::with_capacity(cap); From c0714030a5c5bfeef95e483af23527e92eb79aa6 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 6 Jul 2022 14:48:41 +0200 Subject: [PATCH 02/25] improve the word pair proximity --- milli/src/search/criteria/mod.rs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index f48865ba5..54d0e30c4 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,8 @@ use std::borrow::Cow; use std::collections::HashMap; +use std::result::Result as StdResult; -use roaring::RoaringBitmap; +use roaring::{IterExt, RoaringBitmap}; use self::asc_desc::AscDesc; use self::attribute::Attribute; @@ -386,16 +387,17 @@ fn all_word_pair_proximity_docids, U: AsRef>( right_words: &[(U, u8)], proximity: u8, ) -> Result { - let mut docids = RoaringBitmap::new(); - for (left, _l_typo) in left_words { - for (right, _r_typo) in right_words { - let current_docids = ctx - .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? - .unwrap_or_default(); - docids |= current_docids; - } - } - Ok(docids) + let docids = left_words + .into_iter() + .flat_map(|(left, _l_typo)| { + right_words.into_iter().map(move |(right, _r_typo)| { + ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity) + .map(|res| res.unwrap_or_default()) + }) + }) + .collect::, _>>()?; + + Ok(docids.or()) } fn query_docids( From ce362b346662dc346121ba1b96bf87d0e4447062 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 6 Jul 2022 14:57:14 +0200 Subject: [PATCH 03/25] improve the query tree OR --- milli/src/search/criteria/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 54d0e30c4..397fabe03 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -329,12 +329,12 @@ pub fn resolve_query_tree( } Phrase(words) => resolve_phrase(ctx, &words), Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, wdcache)?; - candidates |= docids; - } - Ok(candidates) + let candidates = ops + .into_iter() + .map(|op| resolve_operation(ctx, op, wdcache)) + .collect::, _>>()?; + + Ok(candidates.or()) } Query(q) => Ok(query_docids(ctx, q, wdcache)?), } From 23d099775afda5d1252feb990ae30a5d8eb7c0b1 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 6 Jul 2022 15:43:01 +0200 Subject: [PATCH 04/25] improve the query tree Phrase --- milli/src/search/criteria/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 397fabe03..0490d8fa4 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -327,7 +327,6 @@ pub fn resolve_query_tree( } Ok(candidates) } - Phrase(words) => resolve_phrase(ctx, &words), Or(_, ops) => { let candidates = ops .into_iter() @@ -336,6 +335,7 @@ pub fn resolve_query_tree( Ok(candidates.or()) } + Phrase(words) => resolve_phrase(ctx, &words), Query(q) => Ok(query_docids(ctx, q, wdcache)?), } } From 026934c7c727b170b88c5e6a0e8fd7f30bbe594d Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 6 Jul 2022 15:45:29 +0200 Subject: [PATCH 05/25] improve the query tree AND --- milli/src/search/criteria/mod.rs | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 0490d8fa4..1beae435c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -308,24 +308,12 @@ pub fn resolve_query_tree( match query_tree { And(ops) => { - let mut ops = ops + let candidates = ops .iter() .map(|op| resolve_operation(ctx, op, wdcache)) .collect::>>()?; - ops.sort_unstable_by_key(|cds| cds.len()); - - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for docids in ops { - if first_loop { - candidates = docids; - first_loop = false; - } else { - candidates &= &docids; - } - } - Ok(candidates) + Ok(candidates.and()) } Or(_, ops) => { let candidates = ops From ce6857264e3642884a2ed3ade5bed036b5fa5ecc Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 6 Jul 2022 16:36:00 +0200 Subject: [PATCH 06/25] improve proximity slightly --- milli/src/search/criteria/proximity.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index e942a7bef..b29187d28 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -3,7 +3,7 @@ use std::collections::hash_map::HashMap; use std::mem::take; use log::debug; -use roaring::RoaringBitmap; +use roaring::{IterExt, RoaringBitmap}; use super::{ query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, @@ -257,17 +257,14 @@ fn resolve_candidates<'t>( for (ll, lr, lcandidates) in lefts { for (rl, rr, rcandidates) in rights { - let mut candidates = + let candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; - if lcandidates.len() < rcandidates.len() { - candidates &= lcandidates; - candidates &= rcandidates; - } else { - candidates &= rcandidates; - candidates &= lcandidates; - } if !candidates.is_empty() { - output.push((ll.clone(), rr.clone(), candidates)); + output.push(( + ll.clone(), + rr.clone(), + [&candidates, lcandidates, rcandidates].and(), + )); } } } From 03d63a8a59120359cac12804a943eaf9ca765048 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 12 Jul 2022 10:46:16 +0200 Subject: [PATCH 07/25] simplify the cbo roaring bitmap merge_into function --- .../cbo_roaring_bitmap_codec.rs | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 1bd132974..8cde7f92e 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -3,7 +3,7 @@ use std::io; use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; -use roaring::RoaringBitmap; +use roaring::{IterExt, RoaringBitmap}; /// This is the limit where using a byteorder became less size efficient /// than using a direct roaring encoding, it is also the point where we are able @@ -59,37 +59,29 @@ impl CboRoaringBitmapCodec { /// serialized in the buffer else a RoaringBitmap is created from the /// values and is serialized in the buffer. pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { - let mut roaring = RoaringBitmap::new(); let mut vec = Vec::new(); - for bytes in slices { - if bytes.len() <= THRESHOLD * size_of::() { - let mut reader = bytes.as_ref(); - while let Ok(integer) = reader.read_u32::() { - vec.push(integer); + let roaring = slices + .iter() + .map(|slice| { + if slice.len() <= THRESHOLD * size_of::() { + let mut reader = slice.as_ref(); + while let Ok(integer) = reader.read_u32::() { + vec.push(integer); + } + vec.sort_unstable(); + // we can unwrap safely because the vector is sorted + let res = RoaringBitmap::from_sorted_iter(vec.iter().copied()).unwrap(); + vec.clear(); + Ok(res) + } else { + RoaringBitmap::deserialize_from(slice.as_ref()).into() } - } else { - roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?; - } - } + }) + .collect::>>()?; + let roaring = roaring.or(); - if roaring.is_empty() { - vec.sort_unstable(); - vec.dedup(); - - if vec.len() <= THRESHOLD { - for integer in vec { - buffer.extend_from_slice(&integer.to_ne_bytes()); - } - } else { - // We can unwrap safely because the vector is sorted upper. - let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); - roaring.serialize_into(buffer)?; - } - } else { - roaring.extend(vec); - roaring.serialize_into(buffer)?; - } + roaring.serialize_into(buffer)?; Ok(()) } From 6e924c6485b5df4eab2c56dec8fd0313f086dc16 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 12 Jul 2022 16:21:10 +0200 Subject: [PATCH 08/25] fix the code --- .../heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 8cde7f92e..518048f31 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -81,7 +81,13 @@ impl CboRoaringBitmapCodec { .collect::>>()?; let roaring = roaring.or(); - roaring.serialize_into(buffer)?; + if roaring.len() as usize <= THRESHOLD { + for elem in roaring { + buffer.extend_from_slice(&elem.to_ne_bytes()); + } + } else { + roaring.serialize_into(buffer)?; + } Ok(()) } From be499c99bdec10df63c8b9e417320cf3ddb4a407 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 13 Jul 2022 12:21:36 +0200 Subject: [PATCH 09/25] =?UTF-8?q?HORRIBLE=E2=80=AFCODE=20+=20HORRIBLE?= =?UTF-8?q?=E2=80=AFBENCHMARKS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cbo_roaring_bitmap_codec.rs | 138 +++++++++++++++--- milli/src/lib.rs | 3 + 2 files changed, 124 insertions(+), 17 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 518048f31..95068bd96 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -54,45 +54,102 @@ impl CboRoaringBitmapCodec { } /// Merge serialized CboRoaringBitmaps in a buffer. + /// The buffer MUST BE empty. /// /// if the merged values length is under the threshold, values are directly /// serialized in the buffer else a RoaringBitmap is created from the /// values and is serialized in the buffer. pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { - let mut vec = Vec::new(); + debug_assert!(buffer.len() == 0); - let roaring = slices + let bitmaps = slices .iter() - .map(|slice| { + .filter_map(|slice| { if slice.len() <= THRESHOLD * size_of::() { - let mut reader = slice.as_ref(); - while let Ok(integer) = reader.read_u32::() { - vec.push(integer); - } - vec.sort_unstable(); - // we can unwrap safely because the vector is sorted - let res = RoaringBitmap::from_sorted_iter(vec.iter().copied()).unwrap(); - vec.clear(); - Ok(res) + buffer.extend(slice.as_ref()); + None } else { RoaringBitmap::deserialize_from(slice.as_ref()).into() } }) .collect::>>()?; - let roaring = roaring.or(); - if roaring.len() as usize <= THRESHOLD { - for elem in roaring { - buffer.extend_from_slice(&elem.to_ne_bytes()); + let u32_buffer: &mut Vec = unsafe { convert_vec(buffer) }; + u32_buffer.sort_unstable(); + u32_buffer.dedup(); + + if bitmaps.is_empty() { + if u32_buffer.len() > THRESHOLD { + // We can unwrap safely because the vector is sorted above. + let roaring = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); + + let buffer: &mut Vec = unsafe { convert_vec(u32_buffer) }; + buffer.clear(); + roaring.serialize_into(buffer)?; + } else { + // we still need to fix the size of the buffer + let _buffer: &mut Vec = unsafe { convert_vec(u32_buffer) }; } } else { - roaring.serialize_into(buffer)?; + let bitmap = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); + let buffer: &mut Vec = unsafe { convert_vec(u32_buffer) }; + let bitmap = bitmaps.into_iter().chain(std::iter::once(bitmap)).or(); + buffer.clear(); + bitmap.serialize_into(buffer)?; + } + + /* + match (bitmaps.is_empty(), buffer.len() >= THRESHOLD * size_of::()) { + (true, true) => { + // we need to sort the buffer as if it was containing u32 + let u32_buffer: &mut Vec = unsafe { convert_vec(buffer) }; + u32_buffer.sort_unstable(); + u32_buffer.dedup(); + + let bitmap = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); + buffer.clear(); + bitmap.serialize_into(buffer)?; + } + (true, false) => (), + (false, true) => { + // we need to sort the buffer as if it was containing u32 + let u32_buffer: &mut Vec = unsafe { convert_vec(buffer) }; + u32_buffer.sort_unstable(); + u32_buffer.dedup(); + + let bitmap = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); + let bitmap = bitmaps.into_iter().chain(std::iter::once(bitmap)).or(); + buffer.clear(); + bitmap.serialize_into(buffer)?; + } + (false, false) => { + let bitmap = bitmaps.into_iter().or(); + bitmap.serialize_into(buffer)?; + } } + */ Ok(()) } } +/// Convert a `Vec` of `T` into a `Vec` of `U` by keeping the same allocation and +/// only updating the size of the `Vec`. +/// To make this works `size_of::() * input.len() % size_of::()` must be equal to zero. +unsafe fn convert_vec(input: &mut Vec) -> &mut Vec { + debug_assert!( + size_of::() * input.len() % size_of::() == 0, + "called with incompatible types" + ); + + let new_len = size_of::() * input.len() / size_of::(); + + let ret: &mut Vec = std::mem::transmute(input); + ret.set_len(new_len); + + ret +} + impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { type DItem = RoaringBitmap; @@ -181,4 +238,51 @@ mod tests { let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap(); assert_eq!(bitmap, expected); } + + extern crate test; + use test::Bencher; + + #[bench] + fn bench_small_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { + std::thread::sleep_ms(30000); + #[rustfmt::skip] + let inputs = [ + vec![Cow::Owned(vec![255, 56, 14, 0]), Cow::Owned(vec![196, 43, 14, 0])], + vec![Cow::Owned(vec![63, 101, 3, 0]), Cow::Owned(vec![71, 136, 3, 0])], + vec![Cow::Owned(vec![68, 108, 0, 0]), Cow::Owned(vec![85, 104, 0, 0]), Cow::Owned(vec![204, 103, 0, 0])], + vec![Cow::Owned(vec![199, 101, 7, 0]), Cow::Owned(vec![94, 42, 7, 0])], + vec![Cow::Owned(vec![173, 219, 12, 0]), Cow::Owned(vec![146, 3, 13, 0])], + vec![Cow::Owned(vec![13, 152, 3, 0]), Cow::Owned(vec![64, 120, 3, 0])], + vec![Cow::Owned(vec![109, 253, 13, 0]), Cow::Owned(vec![108, 232, 13, 0])], + vec![Cow::Owned(vec![73, 176, 3, 0]), Cow::Owned(vec![126, 167, 3, 0])], + ]; + + let mut vec = Vec::new(); + for input in inputs { + bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec)); + vec.clear(); + } + } + + #[bench] + fn bench_medium_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { + std::thread::sleep_ms(1000); + #[rustfmt::skip] + let inputs = [ + vec![Cow::Owned(vec![232, 35, 9, 0]), Cow::Owned(vec![192, 10, 9, 0]), Cow::Owned(vec![91, 33, 9, 0]), Cow::Owned(vec![204, 29, 9, 0])], + vec![Cow::Owned(vec![144, 39, 9, 0]), Cow::Owned(vec![162, 66, 9, 0]), Cow::Owned(vec![146, 11, 9, 0]), Cow::Owned(vec![174, 61, 9, 0])], + vec![Cow::Owned(vec![83, 70, 7, 0]), Cow::Owned(vec![115, 72, 7, 0]), Cow::Owned(vec![219, 54, 7, 0]), Cow::Owned(vec![1, 93, 7, 0]), Cow::Owned(vec![195, 77, 7, 0]), Cow::Owned(vec![21, 86, 7, 0])], + vec![Cow::Owned(vec![244, 112, 0, 0]), Cow::Owned(vec![48, 126, 0, 0]), Cow::Owned(vec![72, 142, 0, 0]), Cow::Owned(vec![255, 113, 0, 0]), Cow::Owned(vec![101, 114, 0, 0]), Cow::Owned(vec![66, 88, 0, 0]), Cow::Owned(vec![84, 92, 0, 0]), Cow::Owned(vec![194, 137, 0, 0]), Cow::Owned(vec![208, 132, 0, 0])], + vec![Cow::Owned(vec![8, 57, 7, 0]), Cow::Owned(vec![133, 115, 7, 0]), Cow::Owned(vec![219, 94, 7, 0]), Cow::Owned(vec![46, 95, 7, 0]), Cow::Owned(vec![156, 111, 7, 0]), Cow::Owned(vec![63, 107, 7, 0]), Cow::Owned(vec![31, 47, 7, 0])], + vec![Cow::Owned(vec![165, 78, 0, 0]), Cow::Owned(vec![197, 95, 0, 0]), Cow::Owned(vec![194, 82, 0, 0]), Cow::Owned(vec![142, 91, 0, 0]), Cow::Owned(vec![120, 94, 0, 0])], + vec![Cow::Owned(vec![185, 187, 13, 0]), Cow::Owned(vec![41, 187, 13, 0]), Cow::Owned(vec![245, 223, 13, 0]), Cow::Owned(vec![211, 251, 13, 0]), Cow::Owned(vec![192, 193, 13, 0]), Cow::Owned(vec![215, 230, 13, 0]), Cow::Owned(vec![252, 207, 13, 0]), Cow::Owned(vec![131, 213, 13, 0]), Cow::Owned(vec![219, 187, 13, 0]), Cow::Owned(vec![105, 236, 13, 0]), Cow::Owned(vec![30, 239, 13, 0]), Cow::Owned(vec![13, 200, 13, 0]), Cow::Owned(vec![111, 197, 13, 0]), Cow::Owned(vec![87, 222, 13, 0]), Cow::Owned(vec![7, 205, 13, 0]), Cow::Owned(vec![90, 211, 13, 0])], + vec![Cow::Owned(vec![215, 253, 13, 0]), Cow::Owned(vec![225, 194, 13, 0]), Cow::Owned(vec![37, 189, 13, 0]), Cow::Owned(vec![242, 212, 13, 0])], + ]; + + let mut vec = Vec::new(); + for input in inputs { + bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec)); + vec.clear(); + } + } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 517d28ccc..fead8eb75 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,3 +1,6 @@ +#![allow(soft_unstable)] +#![feature(test)] + #[macro_use] pub mod documents; From d274042197da3f9386d5ef9f6c6c84f13dffcd6f Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 14 Jul 2022 11:28:39 +0200 Subject: [PATCH 10/25] make the crate compile on stable again --- milli/Cargo.toml | 1 + .../cbo_roaring_bitmap_codec.rs | 43 ++++++++++--------- milli/src/lib.rs | 3 +- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 99289ec10..7a6eab737 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -55,6 +55,7 @@ insta = "1.18.1" maplit = "1.0.2" md5 = "0.7.0" rand = "0.8.5" +select-rustc = "0.1" [features] default = [] diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 95068bd96..47b8591c9 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -239,13 +239,15 @@ mod tests { assert_eq!(bitmap, expected); } - extern crate test; - use test::Bencher; - - #[bench] - fn bench_small_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { - std::thread::sleep_ms(30000); - #[rustfmt::skip] + #[cfg(feature = "nightly")] + mod bench { + extern crate test; + use test::Bencher; + + #[bench] + fn bench_small_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { + std::thread::sleep_ms(30000); + #[rustfmt::skip] let inputs = [ vec![Cow::Owned(vec![255, 56, 14, 0]), Cow::Owned(vec![196, 43, 14, 0])], vec![Cow::Owned(vec![63, 101, 3, 0]), Cow::Owned(vec![71, 136, 3, 0])], @@ -257,17 +259,17 @@ mod tests { vec![Cow::Owned(vec![73, 176, 3, 0]), Cow::Owned(vec![126, 167, 3, 0])], ]; - let mut vec = Vec::new(); - for input in inputs { - bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec)); - vec.clear(); + let mut vec = Vec::new(); + for input in inputs { + bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec)); + vec.clear(); + } } - } - #[bench] - fn bench_medium_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { - std::thread::sleep_ms(1000); - #[rustfmt::skip] + #[bench] + fn bench_medium_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { + std::thread::sleep_ms(1000); + #[rustfmt::skip] let inputs = [ vec![Cow::Owned(vec![232, 35, 9, 0]), Cow::Owned(vec![192, 10, 9, 0]), Cow::Owned(vec![91, 33, 9, 0]), Cow::Owned(vec![204, 29, 9, 0])], vec![Cow::Owned(vec![144, 39, 9, 0]), Cow::Owned(vec![162, 66, 9, 0]), Cow::Owned(vec![146, 11, 9, 0]), Cow::Owned(vec![174, 61, 9, 0])], @@ -279,10 +281,11 @@ mod tests { vec![Cow::Owned(vec![215, 253, 13, 0]), Cow::Owned(vec![225, 194, 13, 0]), Cow::Owned(vec![37, 189, 13, 0]), Cow::Owned(vec![242, 212, 13, 0])], ]; - let mut vec = Vec::new(); - for input in inputs { - bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec)); - vec.clear(); + let mut vec = Vec::new(); + for input in inputs { + bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec)); + vec.clear(); + } } } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index fead8eb75..e08144f4f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,5 +1,4 @@ -#![allow(soft_unstable)] -#![feature(test)] +#![cfg_attr(feature = "nightly", feature(test))] #[macro_use] pub mod documents; From 5aa594df41525694aafe4d31f5c1c41a6540f1f8 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 14 Jul 2022 11:30:23 +0200 Subject: [PATCH 11/25] remove useless comment --- .../cbo_roaring_bitmap_codec.rs | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 47b8591c9..9cb1bae42 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -98,37 +98,6 @@ impl CboRoaringBitmapCodec { bitmap.serialize_into(buffer)?; } - /* - match (bitmaps.is_empty(), buffer.len() >= THRESHOLD * size_of::()) { - (true, true) => { - // we need to sort the buffer as if it was containing u32 - let u32_buffer: &mut Vec = unsafe { convert_vec(buffer) }; - u32_buffer.sort_unstable(); - u32_buffer.dedup(); - - let bitmap = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); - buffer.clear(); - bitmap.serialize_into(buffer)?; - } - (true, false) => (), - (false, true) => { - // we need to sort the buffer as if it was containing u32 - let u32_buffer: &mut Vec = unsafe { convert_vec(buffer) }; - u32_buffer.sort_unstable(); - u32_buffer.dedup(); - - let bitmap = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); - let bitmap = bitmaps.into_iter().chain(std::iter::once(bitmap)).or(); - buffer.clear(); - bitmap.serialize_into(buffer)?; - } - (false, false) => { - let bitmap = bitmaps.into_iter().or(); - bitmap.serialize_into(buffer)?; - } - } - */ - Ok(()) } } From c18cff101c0e3267054872f5316ef73fcfae1e23 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 14 Jul 2022 11:34:22 +0200 Subject: [PATCH 12/25] remove the sleep in the benchmarks --- milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 9cb1bae42..90925e56f 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -215,7 +215,6 @@ mod tests { #[bench] fn bench_small_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { - std::thread::sleep_ms(30000); #[rustfmt::skip] let inputs = [ vec![Cow::Owned(vec![255, 56, 14, 0]), Cow::Owned(vec![196, 43, 14, 0])], @@ -237,7 +236,6 @@ mod tests { #[bench] fn bench_medium_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { - std::thread::sleep_ms(1000); #[rustfmt::skip] let inputs = [ vec![Cow::Owned(vec![232, 35, 9, 0]), Cow::Owned(vec![192, 10, 9, 0]), Cow::Owned(vec![91, 33, 9, 0]), Cow::Owned(vec![204, 29, 9, 0])], From b618c2ca8e1ff2a3d38adbed2d09110b210e357e Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 18 Aug 2022 19:53:04 +0200 Subject: [PATCH 13/25] simplify some unions --- milli/src/search/criteria/mod.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1beae435c..e03ed62d1 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -315,14 +315,7 @@ pub fn resolve_query_tree( Ok(candidates.and()) } - Or(_, ops) => { - let candidates = ops - .into_iter() - .map(|op| resolve_operation(ctx, op, wdcache)) - .collect::, _>>()?; - - Ok(candidates.or()) - } + Or(_, ops) => ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).or(), Phrase(words) => resolve_phrase(ctx, &words), Query(q) => Ok(query_docids(ctx, q, wdcache)?), } @@ -383,9 +376,9 @@ fn all_word_pair_proximity_docids, U: AsRef>( .map(|res| res.unwrap_or_default()) }) }) - .collect::, _>>()?; + .or()?; - Ok(docids.or()) + Ok(docids) } fn query_docids( From 14bcd787e338e9aba1b3a4eb392fcc5ca03567fd Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 08:14:13 +0200 Subject: [PATCH 14/25] resolve phrase --- milli/src/search/criteria/mod.rs | 58 +++++++++----------------------- 1 file changed, 16 insertions(+), 42 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index e03ed62d1..caa0b2007 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -15,7 +15,7 @@ use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; -use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; +use crate::{AscDesc as AscDescName, DocumentId, Error, FieldId, Index, Member, Result}; mod asc_desc; mod attribute; @@ -307,14 +307,7 @@ pub fn resolve_query_tree( use Operation::{And, Or, Phrase, Query}; match query_tree { - And(ops) => { - let candidates = ops - .iter() - .map(|op| resolve_operation(ctx, op, wdcache)) - .collect::>>()?; - - Ok(candidates.and()) - } + And(ops) => ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).and(), Or(_, ops) => ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).or(), Phrase(words) => resolve_phrase(ctx, &words), Query(q) => Ok(query_docids(ctx, q, wdcache)?), @@ -325,41 +318,22 @@ pub fn resolve_query_tree( } pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; let winsize = phrase.len().min(7); - for win in phrase.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { - match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. - bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } - Ok(candidates) + phrase + .windows(winsize) + .flat_map(|win| { + win.iter().enumerate().flat_map(move |(offset, s1)| { + win.iter().skip(offset + 1).enumerate().map(move |(dist, s2)| { + ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1) + // If there are no document for this distance, there will be no + // results for the phrase query. + .map(|m| m.unwrap_or_default()) + }) + }) + }) + .and() + .map_err(Error::from) } fn all_word_pair_proximity_docids, U: AsRef>( From dc507421cc9c623d4e28118e03e1449423cf73b6 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 08:56:55 +0200 Subject: [PATCH 15/25] tolerant --- milli/src/search/criteria/mod.rs | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index caa0b2007..68608dc8c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -391,15 +391,19 @@ fn query_docids( } QueryKind::Tolerant { typo, word } => { let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; - let mut docids = RoaringBitmap::new(); - for (word, typo) in words { - let mut current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - if *typo == 0 { - current_docids |= ctx.exact_word_docids(&word)?.unwrap_or_default() - } - docids |= current_docids; - } - Ok(docids) + + words + .into_iter() + .flat_map(|(word, typo)| { + let current_docids = + ctx.word_docids(&word).map(|word| word.unwrap_or_default()); + let typo = (*typo == 0) + .then(|| ctx.exact_word_docids(&word).map(|word| word.unwrap_or_default())); + + std::iter::once(current_docids).chain(typo) + }) + .or() + .map_err(Error::from) } } } From b3a87aeeeafc7d67dcac51786399beccd60c062b Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 09:06:00 +0200 Subject: [PATCH 16/25] exact --- milli/src/search/criteria/mod.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 68608dc8c..4417e4049 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -371,15 +371,20 @@ fn query_docids( Ok(docids) } else if query.prefix { let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - docids |= ctx.word_docids(&word)?.unwrap_or_default(); - // only add the exact docids if the word hasn't been derived - if *original_typo == 0 { - docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); - } - } - Ok(docids) + + words + .into_iter() + .flat_map(|(word, _typo)| { + let current_docids = + ctx.word_docids(&word).map(|word| word.unwrap_or_default()); + let typo = (*original_typo == 0).then(|| { + ctx.exact_word_docids(&word).map(|word| word.unwrap_or_default()) + }); + + std::iter::once(current_docids).chain(typo) + }) + .or() + .map_err(Error::from) } else { let mut docids = ctx.word_docids(&word)?.unwrap_or_default(); // only add the exact docids if the word hasn't been derived From 94602e65cf458887008c2edb817b932bc14192a9 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 09:15:35 +0200 Subject: [PATCH 17/25] tolerant --- milli/src/search/criteria/mod.rs | 37 ++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 4417e4049..2e6436105 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -452,23 +452,28 @@ fn query_pair_proximity_docids( let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - let current_docids = match ctx.word_prefix_pair_proximity_docids( - left.as_str(), - right.as_str(), - proximity, - )? { - Some(docids) => Ok(docids), - None => { - let r_words = - word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + l_words + .into_iter() + .map(|(left, _)| -> Result<_> { + match ctx.word_prefix_pair_proximity_docids( + left.as_str(), + right.as_str(), + proximity, + )? { + Some(docids) => Ok(docids), + None => { + let r_words = + word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; + all_word_pair_proximity_docids( + ctx, + &[(left, 0)], + &r_words, + proximity, + ) + } } - }?; - docids |= current_docids; - } - Ok(docids) + }) + .or() } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } From c175b75c59a5e45fa824c7b07c277e407043676c Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 09:49:11 +0200 Subject: [PATCH 18/25] get rids of the intersection_of function --- milli/src/search/criteria/exactness.rs | 50 ++++++++++++-------------- milli/src/search/criteria/mod.rs | 1 - 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index e7775423c..c0281d724 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -1,10 +1,9 @@ use std::convert::TryFrom; use std::mem::take; -use std::ops::BitOr; use itertools::Itertools; use log::debug; -use roaring::RoaringBitmap; +use roaring::{IterExt, RoaringBitmap}; use crate::search::criteria::{ resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, @@ -173,34 +172,40 @@ fn resolve_state( use State::*; match state { ExactAttribute(mut allowed_candidates) => { - let mut candidates = RoaringBitmap::new(); if let Ok(query_len) = u8::try_from(query.len()) { let attributes_ids = ctx.searchable_fields_ids()?; - for id in attributes_ids { - if let Some(attribute_allowed_docids) = - ctx.field_id_word_count_docids(id, query_len)? - { + + let mut candidates = attributes_ids + .into_iter() + .filter_map(|id| { + ctx.field_id_word_count_docids(id, query_len) + .transpose() + .map(|res| (id, res)) + }) + .map(|(id, attribute_allowed_docids)| -> Result<_> { let mut attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; - attribute_candidates_array.push(attribute_allowed_docids); - candidates |= intersection_of(attribute_candidates_array.iter().collect()); - } - } + attribute_candidates_array.push(attribute_allowed_docids?); + Ok(attribute_candidates_array.into_iter().and()) + }) + .or()?; // only keep allowed candidates candidates &= &allowed_candidates; // remove current candidates from allowed candidates allowed_candidates -= &candidates; - } - Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) + Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) + } else { + Ok((RoaringBitmap::new(), Some(AttributeStartsWith(allowed_candidates)))) + } } AttributeStartsWith(mut allowed_candidates) => { let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; - candidates |= intersection_of(attribute_candidates_array.iter().collect()); + candidates |= attribute_candidates_array.and(); } // only keep allowed candidates @@ -238,7 +243,7 @@ fn resolve_state( } } } - candidates |= intersection_of(bitmaps.iter().collect()); + candidates |= bitmaps.and(); } } parts_candidates_array.push(candidates); @@ -247,7 +252,7 @@ fn resolve_state( let mut candidates_array = Vec::new(); // compute documents that contain all exact words. - let mut all_exact_candidates = intersection_of(parts_candidates_array.iter().collect()); + let mut all_exact_candidates = parts_candidates_array.iter().and(); all_exact_candidates &= &allowed_candidates; allowed_candidates -= &all_exact_candidates; @@ -258,9 +263,9 @@ fn resolve_state( // create all `c_count` combinations of exact words .combinations(c_count) // intersect each word candidates in combinations - .map(intersection_of) + .map(IterExt::and) // union combinations of `c_count` exact words - .fold(RoaringBitmap::new(), RoaringBitmap::bitor); + .or(); // only keep allowed candidates combinations_candidates &= &allowed_candidates; // remove current candidates from allowed candidates @@ -324,15 +329,6 @@ fn attribute_start_with_docids( Ok(attribute_candidates_array) } -fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap { - rbs.sort_unstable_by_key(|rb| rb.len()); - let mut iter = rbs.into_iter(); - match iter.next() { - Some(first) => iter.fold(first.clone(), |acc, rb| acc & rb), - None => RoaringBitmap::new(), - } -} - #[derive(Debug, Clone)] pub enum ExactQueryPart { Phrase(Vec), diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 2e6436105..3cb9390e2 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,6 +1,5 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::result::Result as StdResult; use roaring::{IterExt, RoaringBitmap}; From 3a5eb1c256c2292472d703f250b7a97f85369f34 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 12:21:30 +0200 Subject: [PATCH 19/25] resolve_state --- milli/src/search/criteria/exactness.rs | 43 ++++++++++++-------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index c0281d724..27ed84e6d 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -201,12 +201,12 @@ fn resolve_state( } } AttributeStartsWith(mut allowed_candidates) => { - let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; - for id in attributes_ids { - let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; - candidates |= attribute_candidates_array.and(); - } + + let mut candidates = attributes_ids + .into_iter() + .map(|id| attribute_start_with_docids(ctx, id, query).map(IterExt::and)) + .or()?; // only keep allowed candidates candidates &= &allowed_candidates; @@ -223,27 +223,24 @@ fn resolve_state( use ExactQueryPart::*; match part { Synonyms(synonyms) => { - for synonym in synonyms { - if let Some(synonym_candidates) = ctx.word_docids(synonym)? { - candidates |= synonym_candidates; - } - } + let tmp = synonyms + .into_iter() + .filter_map(|synonym| ctx.word_docids(synonym).transpose()) + .or()?; + + candidates |= tmp; } // compute intersection on pair of words with a proximity of 0. Phrase(phrase) => { - let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); - for words in phrase.windows(2) { - if let [left, right] = words { - match ctx.word_pair_proximity_docids(left, right, 0)? { - Some(docids) => bitmaps.push(docids), - None => { - bitmaps.clear(); - break; - } - } - } - } - candidates |= bitmaps.and(); + let bitmaps = phrase + .windows(2) + .map(|words| { + ctx.word_pair_proximity_docids(&words[0], &words[1], 0) + .map(|o| o.unwrap_or_default()) + }) + .and()?; + + candidates |= bitmaps; } } parts_candidates_array.push(candidates); From 57a47910783a414dd9046d3de5a9424cf3c0abbd Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 12:33:07 +0200 Subject: [PATCH 20/25] attribute_start_with_docids --- milli/src/search/criteria/exactness.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 27ed84e6d..859eea1ec 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -301,13 +301,10 @@ fn attribute_start_with_docids( use ExactQueryPart::*; match part { Synonyms(synonyms) => { - let mut synonyms_candidates = RoaringBitmap::new(); - for word in synonyms { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - synonyms_candidates |= word_candidates; - } - } + let synonyms_candidates = synonyms + .into_iter() + .filter_map(|word| ctx.word_position_docids(word, pos).transpose()) + .or()?; attribute_candidates_array.push(synonyms_candidates); pos += 1; } From d568e37dcc0e878616fa9ecd73424d7880460e86 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 12:38:53 +0200 Subject: [PATCH 21/25] resolve_candidates --- milli/src/search/criteria/proximity.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b29187d28..d7b9bc7aa 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -315,10 +315,11 @@ fn resolve_candidates<'t>( } } - let mut candidates = RoaringBitmap::new(); - for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? { - candidates |= cds; - } + let candidates = resolve_operation(ctx, query_tree, proximity, cache, wdcache)? + .into_iter() + .map(|(_, _, cds)| cds) + .or(); + Ok(candidates) } From 52e8ac17b2daccc47d5dd5f7f87d20531dd40d80 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 12:55:21 +0200 Subject: [PATCH 22/25] filter parser --- milli/src/search/facet/filter.rs | 60 +++++++++++--------------------- 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7241dab2b..204102edb 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -6,7 +6,7 @@ use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; use log::debug; -use roaring::RoaringBitmap; +use roaring::{IterExt, RoaringBitmap}; use super::FacetNumberRange; use crate::error::{Error, UserError}; @@ -365,13 +365,18 @@ impl<'a> Filter<'a> { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - let mut bitmap = RoaringBitmap::new(); + let bitmap = els + .into_iter() + .map(|el| { + Self::evaluate_operator( + rtxn, + index, + fid, + &Condition::Equal(el.clone()), + ) + }) + .or()?; - for el in els { - let op = Condition::Equal(el.clone()); - let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; - bitmap |= el_bitmap; - } Ok(bitmap) } else { Ok(RoaringBitmap::new()) @@ -413,39 +418,14 @@ impl<'a> Filter<'a> { } } } - FilterCondition::Or(subfilters) => { - let mut bitmap = RoaringBitmap::new(); - for f in subfilters { - bitmap |= - Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; - } - Ok(bitmap) - } - FilterCondition::And(subfilters) => { - let mut subfilters_iter = subfilters.iter(); - if let Some(first_subfilter) = subfilters_iter.next() { - let mut bitmap = Self::inner_evaluate( - &(first_subfilter.clone()).into(), - rtxn, - index, - filterable_fields, - )?; - for f in subfilters_iter { - if bitmap.is_empty() { - return Ok(bitmap); - } - bitmap &= Self::inner_evaluate( - &(f.clone()).into(), - rtxn, - index, - filterable_fields, - )?; - } - Ok(bitmap) - } else { - Ok(RoaringBitmap::new()) - } - } + FilterCondition::Or(subfilters) => subfilters + .into_iter() + .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .or(), + FilterCondition::And(subfilters) => subfilters + .into_iter() + .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .and(), FilterCondition::GeoLowerThan { point, radius } => { if filterable_fields.contains("_geo") { let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?]; From 9ce9a4b15d4eb85d3b7a6d49fb975e91ca88afe7 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Fri, 19 Aug 2022 13:06:03 +0200 Subject: [PATCH 23/25] facet number & strings --- milli/src/update/facets.rs | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 108acae4f..fd7c9b5ff 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -132,7 +132,7 @@ use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, BytesEncode, Error}; use log::debug; -use roaring::RoaringBitmap; +use roaring::{IterExt, RoaringBitmap}; use time::OffsetDateTime; use crate::error::InternalError; @@ -301,9 +301,7 @@ fn compute_facet_number_levels<'t>( first_level_size, level_group_size, &mut |bitmaps, _, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } + number_document_ids |= bitmaps.or(); Ok(()) }, &|_i, (_field_id, _level, left, _right)| *left, @@ -316,11 +314,11 @@ fn compute_facet_number_levels<'t>( Ok((subwriters, number_document_ids)) } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, docids) = result?; - documents_ids |= docids; - } + let documents_ids = db + .range(rtxn, &(level_0_start..))? + .take(first_level_size) + .map(|result| result.map(|(_key, docids)| docids)) + .or()?; Ok((vec![], documents_ids)) } @@ -389,11 +387,11 @@ fn compute_facet_strings_levels<'t>( Ok((subwriters, strings_document_ids)) } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, (_original_value, docids)) = result?; - documents_ids |= docids; - } + let documents_ids = db + .range(rtxn, &(level_0_start..))? + .take(first_level_size) + .map(|result| result.map(|(_key, (_original_value, docids))| docids)) + .or()?; Ok((vec![], documents_ids)) } From 36e27e2106366755c2efcf0aa26509e833f519b4 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 22 Aug 2022 18:19:14 +0200 Subject: [PATCH 24/25] import bytemuck --- milli/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7a6eab737..338924046 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -7,6 +7,7 @@ edition = "2018" [dependencies] bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" +bytemuck = { version = "1.12.1", features = ["extern_crate_alloc"] } bstr = "0.2.17" byteorder = "1.4.3" charabia = "0.6.0" From 5115a46686b93d02844c9a10166c3b3848cfa8b9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 30 Aug 2022 15:11:53 +0200 Subject: [PATCH 25/25] bump roaring --- benchmarks/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- .../cbo_roaring_bitmap_codec.rs | 4 ++-- milli/src/search/criteria/exactness.rs | 22 +++++++++---------- milli/src/search/criteria/mod.rs | 18 ++++++++------- milli/src/search/criteria/proximity.rs | 6 ++--- milli/src/search/facet/filter.rs | 8 +++---- milli/src/update/facets.rs | 8 +++---- .../helpers/merge_functions.rs | 6 ++--- 10 files changed, 40 insertions(+), 38 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 36d3a46b2..8b44bbaf0 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -15,7 +15,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] } criterion = { version = "0.3.5", features = ["html_reports"] } rand = "0.8.5" rand_chacha = "0.3.1" -roaring = { git = "https://github.com/irevoire/roaring-rs", branch = "tamo-treemap" } +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" } [build-dependencies] anyhow = "1.0.56" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 91cb521e5..f56edd231 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -12,7 +12,7 @@ csv = "1.1.6" milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } # roaring = "0.9.0" -roaring = { git = "https://github.com/irevoire/roaring-rs", branch = "tamo-treemap" } +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" } serde_json = "1.0.79" stderrlog = "0.5.1" structopt = { version = "0.3.26", default-features = false } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 338924046..4bd0e40e2 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,7 +27,7 @@ obkv = "0.2.0" once_cell = "1.10.0" ordered-float = "2.10.0" rayon = "1.5.1" -roaring = { git = "https://github.com/irevoire/roaring-rs", branch = "tamo-treemap" } +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" } rstar = { version = "0.9.2", features = ["serde"] } serde = { version = "1.0.136", features = ["derive"] } serde_json = { version = "1.0.79", features = ["preserve_order"] } diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 90925e56f..a835f872c 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -3,7 +3,7 @@ use std::io; use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; -use roaring::{IterExt, RoaringBitmap}; +use roaring::{MultiOps, RoaringBitmap}; /// This is the limit where using a byteorder became less size efficient /// than using a direct roaring encoding, it is also the point where we are able @@ -93,7 +93,7 @@ impl CboRoaringBitmapCodec { } else { let bitmap = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); let buffer: &mut Vec = unsafe { convert_vec(u32_buffer) }; - let bitmap = bitmaps.into_iter().chain(std::iter::once(bitmap)).or(); + let bitmap = bitmaps.into_iter().chain(std::iter::once(bitmap)).union(); buffer.clear(); bitmap.serialize_into(buffer)?; } diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 859eea1ec..108850353 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -3,7 +3,7 @@ use std::mem::take; use itertools::Itertools; use log::debug; -use roaring::{IterExt, RoaringBitmap}; +use roaring::{MultiOps, RoaringBitmap}; use crate::search::criteria::{ resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, @@ -186,9 +186,9 @@ fn resolve_state( let mut attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; attribute_candidates_array.push(attribute_allowed_docids?); - Ok(attribute_candidates_array.into_iter().and()) + Ok(attribute_candidates_array.into_iter().intersection()) }) - .or()?; + .union()?; // only keep allowed candidates candidates &= &allowed_candidates; @@ -205,8 +205,8 @@ fn resolve_state( let mut candidates = attributes_ids .into_iter() - .map(|id| attribute_start_with_docids(ctx, id, query).map(IterExt::and)) - .or()?; + .map(|id| attribute_start_with_docids(ctx, id, query).map(MultiOps::intersection)) + .union()?; // only keep allowed candidates candidates &= &allowed_candidates; @@ -226,7 +226,7 @@ fn resolve_state( let tmp = synonyms .into_iter() .filter_map(|synonym| ctx.word_docids(synonym).transpose()) - .or()?; + .union()?; candidates |= tmp; } @@ -238,7 +238,7 @@ fn resolve_state( ctx.word_pair_proximity_docids(&words[0], &words[1], 0) .map(|o| o.unwrap_or_default()) }) - .and()?; + .intersection()?; candidates |= bitmaps; } @@ -249,7 +249,7 @@ fn resolve_state( let mut candidates_array = Vec::new(); // compute documents that contain all exact words. - let mut all_exact_candidates = parts_candidates_array.iter().and(); + let mut all_exact_candidates = parts_candidates_array.iter().intersection(); all_exact_candidates &= &allowed_candidates; allowed_candidates -= &all_exact_candidates; @@ -260,9 +260,9 @@ fn resolve_state( // create all `c_count` combinations of exact words .combinations(c_count) // intersect each word candidates in combinations - .map(IterExt::and) + .map(MultiOps::intersection) // union combinations of `c_count` exact words - .or(); + .union(); // only keep allowed candidates combinations_candidates &= &allowed_candidates; // remove current candidates from allowed candidates @@ -304,7 +304,7 @@ fn attribute_start_with_docids( let synonyms_candidates = synonyms .into_iter() .filter_map(|word| ctx.word_position_docids(word, pos).transpose()) - .or()?; + .union()?; attribute_candidates_array.push(synonyms_candidates); pos += 1; } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3cb9390e2..75073fb97 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::HashMap; -use roaring::{IterExt, RoaringBitmap}; +use roaring::{MultiOps, RoaringBitmap}; use self::asc_desc::AscDesc; use self::attribute::Attribute; @@ -306,8 +306,10 @@ pub fn resolve_query_tree( use Operation::{And, Or, Phrase, Query}; match query_tree { - And(ops) => ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).and(), - Or(_, ops) => ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).or(), + And(ops) => { + ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).intersection() + } + Or(_, ops) => ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).union(), Phrase(words) => resolve_phrase(ctx, &words), Query(q) => Ok(query_docids(ctx, q, wdcache)?), } @@ -331,7 +333,7 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result, U: AsRef>( .map(|res| res.unwrap_or_default()) }) }) - .or()?; + .union()?; Ok(docids) } @@ -382,7 +384,7 @@ fn query_docids( std::iter::once(current_docids).chain(typo) }) - .or() + .union() .map_err(Error::from) } else { let mut docids = ctx.word_docids(&word)?.unwrap_or_default(); @@ -406,7 +408,7 @@ fn query_docids( std::iter::once(current_docids).chain(typo) }) - .or() + .union() .map_err(Error::from) } } @@ -472,7 +474,7 @@ fn query_pair_proximity_docids( } } }) - .or() + .union() } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index d7b9bc7aa..ef8718686 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -3,7 +3,7 @@ use std::collections::hash_map::HashMap; use std::mem::take; use log::debug; -use roaring::{IterExt, RoaringBitmap}; +use roaring::{MultiOps, RoaringBitmap}; use super::{ query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, @@ -263,7 +263,7 @@ fn resolve_candidates<'t>( output.push(( ll.clone(), rr.clone(), - [&candidates, lcandidates, rcandidates].and(), + [&candidates, lcandidates, rcandidates].intersection(), )); } } @@ -318,7 +318,7 @@ fn resolve_candidates<'t>( let candidates = resolve_operation(ctx, query_tree, proximity, cache, wdcache)? .into_iter() .map(|(_, _, cds)| cds) - .or(); + .union(); Ok(candidates) } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 204102edb..220e98891 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -6,7 +6,7 @@ use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; use log::debug; -use roaring::{IterExt, RoaringBitmap}; +use roaring::{MultiOps, RoaringBitmap}; use super::FacetNumberRange; use crate::error::{Error, UserError}; @@ -375,7 +375,7 @@ impl<'a> Filter<'a> { &Condition::Equal(el.clone()), ) }) - .or()?; + .union()?; Ok(bitmap) } else { @@ -421,11 +421,11 @@ impl<'a> Filter<'a> { FilterCondition::Or(subfilters) => subfilters .into_iter() .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) - .or(), + .union(), FilterCondition::And(subfilters) => subfilters .into_iter() .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) - .and(), + .intersection(), FilterCondition::GeoLowerThan { point, radius } => { if filterable_fields.contains("_geo") { let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?]; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index fd7c9b5ff..048c2e3f4 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -132,7 +132,7 @@ use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, BytesEncode, Error}; use log::debug; -use roaring::{IterExt, RoaringBitmap}; +use roaring::{MultiOps, RoaringBitmap}; use time::OffsetDateTime; use crate::error::InternalError; @@ -301,7 +301,7 @@ fn compute_facet_number_levels<'t>( first_level_size, level_group_size, &mut |bitmaps, _, _| { - number_document_ids |= bitmaps.or(); + number_document_ids |= bitmaps.union(); Ok(()) }, &|_i, (_field_id, _level, left, _right)| *left, @@ -318,7 +318,7 @@ fn compute_facet_number_levels<'t>( .range(rtxn, &(level_0_start..))? .take(first_level_size) .map(|result| result.map(|(_key, docids)| docids)) - .or()?; + .union()?; Ok((vec![], documents_ids)) } @@ -391,7 +391,7 @@ fn compute_facet_strings_levels<'t>( .range(rtxn, &(level_0_start..))? .take(first_level_size) .map(|result| result.map(|(_key, (_original_value, docids))| docids)) - .or()?; + .union()?; Ok((vec![], documents_ids)) } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 941e8351d..8c1c15054 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::io; use std::result::Result as StdResult; -use roaring::{IterExt, RoaringBitmap}; +use roaring::{MultiOps, RoaringBitmap}; use super::read_u32_ne_bytes; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; @@ -41,7 +41,7 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul .map(AsRef::as_ref) .map(RoaringBitmap::deserialize_from) .map(StdResult::unwrap) - .or(); + .union(); let mut buffer = Vec::new(); serialize_roaring_bitmap(&merged, &mut buffer)?; Ok(Cow::Owned(buffer)) @@ -64,7 +64,7 @@ pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( .map(|(_, bitmap_bytes)| bitmap_bytes) .map(RoaringBitmap::deserialize_from) .map(StdResult::unwrap) - .or(); + .union(); let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); let mut buffer = Vec::with_capacity(cap);