diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 3a832defb..8b44bbaf0 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -15,7 +15,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] } criterion = { version = "0.3.5", features = ["html_reports"] } rand = "0.8.5" rand_chacha = "0.3.1" -roaring = "0.9.0" +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" } [build-dependencies] anyhow = "1.0.56" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 6b6b1f300..f56edd231 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -11,7 +11,8 @@ byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } csv = "1.1.6" milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } -roaring = "0.9.0" +# roaring = "0.9.0" +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" } serde_json = "1.0.79" stderrlog = "0.5.1" structopt = { version = "0.3.26", default-features = false } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 56a1da5cd..4bd0e40e2 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -7,6 +7,7 @@ edition = "2018" [dependencies] bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" +bytemuck = { version = "1.12.1", features = ["extern_crate_alloc"] } bstr = "0.2.17" byteorder = "1.4.3" charabia = "0.6.0" @@ -26,7 +27,7 @@ obkv = "0.2.0" once_cell = "1.10.0" ordered-float = "2.10.0" rayon = "1.5.1" -roaring = "0.9.0" +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "main" } rstar = { version = "0.9.2", features = ["serde"] } serde = { version = "1.0.136", features = ["derive"] } serde_json = { version = "1.0.79", features = ["preserve_order"] } @@ -55,6 +56,7 @@ insta = "1.18.1" maplit = "1.0.2" md5 = "0.7.0" rand = "0.8.5" +select-rustc = "0.1" [features] default = [] diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 1bd132974..a835f872c 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -3,7 +3,7 @@ use std::io; use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; /// This is the limit where using a byteorder became less size efficient /// than using a direct roaring encoding, it is also the point where we are able @@ -54,47 +54,71 @@ impl CboRoaringBitmapCodec { } /// Merge serialized CboRoaringBitmaps in a buffer. + /// The buffer MUST BE empty. /// /// if the merged values length is under the threshold, values are directly /// serialized in the buffer else a RoaringBitmap is created from the /// values and is serialized in the buffer. pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { - let mut roaring = RoaringBitmap::new(); - let mut vec = Vec::new(); - - for bytes in slices { - if bytes.len() <= THRESHOLD * size_of::() { - let mut reader = bytes.as_ref(); - while let Ok(integer) = reader.read_u32::() { - vec.push(integer); + debug_assert!(buffer.len() == 0); + + let bitmaps = slices + .iter() + .filter_map(|slice| { + if slice.len() <= THRESHOLD * size_of::() { + buffer.extend(slice.as_ref()); + None + } else { + RoaringBitmap::deserialize_from(slice.as_ref()).into() } - } else { - roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?; - } - } + }) + .collect::>>()?; - if roaring.is_empty() { - vec.sort_unstable(); - vec.dedup(); + let u32_buffer: &mut Vec = unsafe { convert_vec(buffer) }; + u32_buffer.sort_unstable(); + u32_buffer.dedup(); - if vec.len() <= THRESHOLD { - for integer in vec { - buffer.extend_from_slice(&integer.to_ne_bytes()); - } - } else { - // We can unwrap safely because the vector is sorted upper. - let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); + if bitmaps.is_empty() { + if u32_buffer.len() > THRESHOLD { + // We can unwrap safely because the vector is sorted above. + let roaring = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); + + let buffer: &mut Vec = unsafe { convert_vec(u32_buffer) }; + buffer.clear(); roaring.serialize_into(buffer)?; + } else { + // we still need to fix the size of the buffer + let _buffer: &mut Vec = unsafe { convert_vec(u32_buffer) }; } } else { - roaring.extend(vec); - roaring.serialize_into(buffer)?; + let bitmap = RoaringBitmap::from_sorted_iter(u32_buffer.iter().copied()).unwrap(); + let buffer: &mut Vec = unsafe { convert_vec(u32_buffer) }; + let bitmap = bitmaps.into_iter().chain(std::iter::once(bitmap)).union(); + buffer.clear(); + bitmap.serialize_into(buffer)?; } Ok(()) } } +/// Convert a `Vec` of `T` into a `Vec` of `U` by keeping the same allocation and +/// only updating the size of the `Vec`. +/// To make this works `size_of::() * input.len() % size_of::()` must be equal to zero. +unsafe fn convert_vec(input: &mut Vec) -> &mut Vec { + debug_assert!( + size_of::() * input.len() % size_of::() == 0, + "called with incompatible types" + ); + + let new_len = size_of::() * input.len() / size_of::(); + + let ret: &mut Vec = std::mem::transmute(input); + ret.set_len(new_len); + + ret +} + impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { type DItem = RoaringBitmap; @@ -183,4 +207,52 @@ mod tests { let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap(); assert_eq!(bitmap, expected); } + + #[cfg(feature = "nightly")] + mod bench { + extern crate test; + use test::Bencher; + + #[bench] + fn bench_small_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { + #[rustfmt::skip] + let inputs = [ + vec![Cow::Owned(vec![255, 56, 14, 0]), Cow::Owned(vec![196, 43, 14, 0])], + vec![Cow::Owned(vec![63, 101, 3, 0]), Cow::Owned(vec![71, 136, 3, 0])], + vec![Cow::Owned(vec![68, 108, 0, 0]), Cow::Owned(vec![85, 104, 0, 0]), Cow::Owned(vec![204, 103, 0, 0])], + vec![Cow::Owned(vec![199, 101, 7, 0]), Cow::Owned(vec![94, 42, 7, 0])], + vec![Cow::Owned(vec![173, 219, 12, 0]), Cow::Owned(vec![146, 3, 13, 0])], + vec![Cow::Owned(vec![13, 152, 3, 0]), Cow::Owned(vec![64, 120, 3, 0])], + vec![Cow::Owned(vec![109, 253, 13, 0]), Cow::Owned(vec![108, 232, 13, 0])], + vec![Cow::Owned(vec![73, 176, 3, 0]), Cow::Owned(vec![126, 167, 3, 0])], + ]; + + let mut vec = Vec::new(); + for input in inputs { + bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec)); + vec.clear(); + } + } + + #[bench] + fn bench_medium_merge_cbo_roaring_bitmaps(bencher: &mut Bencher) { + #[rustfmt::skip] + let inputs = [ + vec![Cow::Owned(vec![232, 35, 9, 0]), Cow::Owned(vec![192, 10, 9, 0]), Cow::Owned(vec![91, 33, 9, 0]), Cow::Owned(vec![204, 29, 9, 0])], + vec![Cow::Owned(vec![144, 39, 9, 0]), Cow::Owned(vec![162, 66, 9, 0]), Cow::Owned(vec![146, 11, 9, 0]), Cow::Owned(vec![174, 61, 9, 0])], + vec![Cow::Owned(vec![83, 70, 7, 0]), Cow::Owned(vec![115, 72, 7, 0]), Cow::Owned(vec![219, 54, 7, 0]), Cow::Owned(vec![1, 93, 7, 0]), Cow::Owned(vec![195, 77, 7, 0]), Cow::Owned(vec![21, 86, 7, 0])], + vec![Cow::Owned(vec![244, 112, 0, 0]), Cow::Owned(vec![48, 126, 0, 0]), Cow::Owned(vec![72, 142, 0, 0]), Cow::Owned(vec![255, 113, 0, 0]), Cow::Owned(vec![101, 114, 0, 0]), Cow::Owned(vec![66, 88, 0, 0]), Cow::Owned(vec![84, 92, 0, 0]), Cow::Owned(vec![194, 137, 0, 0]), Cow::Owned(vec![208, 132, 0, 0])], + vec![Cow::Owned(vec![8, 57, 7, 0]), Cow::Owned(vec![133, 115, 7, 0]), Cow::Owned(vec![219, 94, 7, 0]), Cow::Owned(vec![46, 95, 7, 0]), Cow::Owned(vec![156, 111, 7, 0]), Cow::Owned(vec![63, 107, 7, 0]), Cow::Owned(vec![31, 47, 7, 0])], + vec![Cow::Owned(vec![165, 78, 0, 0]), Cow::Owned(vec![197, 95, 0, 0]), Cow::Owned(vec![194, 82, 0, 0]), Cow::Owned(vec![142, 91, 0, 0]), Cow::Owned(vec![120, 94, 0, 0])], + vec![Cow::Owned(vec![185, 187, 13, 0]), Cow::Owned(vec![41, 187, 13, 0]), Cow::Owned(vec![245, 223, 13, 0]), Cow::Owned(vec![211, 251, 13, 0]), Cow::Owned(vec![192, 193, 13, 0]), Cow::Owned(vec![215, 230, 13, 0]), Cow::Owned(vec![252, 207, 13, 0]), Cow::Owned(vec![131, 213, 13, 0]), Cow::Owned(vec![219, 187, 13, 0]), Cow::Owned(vec![105, 236, 13, 0]), Cow::Owned(vec![30, 239, 13, 0]), Cow::Owned(vec![13, 200, 13, 0]), Cow::Owned(vec![111, 197, 13, 0]), Cow::Owned(vec![87, 222, 13, 0]), Cow::Owned(vec![7, 205, 13, 0]), Cow::Owned(vec![90, 211, 13, 0])], + vec![Cow::Owned(vec![215, 253, 13, 0]), Cow::Owned(vec![225, 194, 13, 0]), Cow::Owned(vec![37, 189, 13, 0]), Cow::Owned(vec![242, 212, 13, 0])], + ]; + + let mut vec = Vec::new(); + for input in inputs { + bencher.iter(|| CboRoaringBitmapCodec::merge_into(&input, &mut vec)); + vec.clear(); + } + } + } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 517d28ccc..e08144f4f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,3 +1,5 @@ +#![cfg_attr(feature = "nightly", feature(test))] + #[macro_use] pub mod documents; diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index e7775423c..108850353 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -1,10 +1,9 @@ use std::convert::TryFrom; use std::mem::take; -use std::ops::BitOr; use itertools::Itertools; use log::debug; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use crate::search::criteria::{ resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, @@ -173,35 +172,41 @@ fn resolve_state( use State::*; match state { ExactAttribute(mut allowed_candidates) => { - let mut candidates = RoaringBitmap::new(); if let Ok(query_len) = u8::try_from(query.len()) { let attributes_ids = ctx.searchable_fields_ids()?; - for id in attributes_ids { - if let Some(attribute_allowed_docids) = - ctx.field_id_word_count_docids(id, query_len)? - { + + let mut candidates = attributes_ids + .into_iter() + .filter_map(|id| { + ctx.field_id_word_count_docids(id, query_len) + .transpose() + .map(|res| (id, res)) + }) + .map(|(id, attribute_allowed_docids)| -> Result<_> { let mut attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; - attribute_candidates_array.push(attribute_allowed_docids); - candidates |= intersection_of(attribute_candidates_array.iter().collect()); - } - } + attribute_candidates_array.push(attribute_allowed_docids?); + Ok(attribute_candidates_array.into_iter().intersection()) + }) + .union()?; // only keep allowed candidates candidates &= &allowed_candidates; // remove current candidates from allowed candidates allowed_candidates -= &candidates; - } - Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) + Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) + } else { + Ok((RoaringBitmap::new(), Some(AttributeStartsWith(allowed_candidates)))) + } } AttributeStartsWith(mut allowed_candidates) => { - let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; - for id in attributes_ids { - let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; - candidates |= intersection_of(attribute_candidates_array.iter().collect()); - } + + let mut candidates = attributes_ids + .into_iter() + .map(|id| attribute_start_with_docids(ctx, id, query).map(MultiOps::intersection)) + .union()?; // only keep allowed candidates candidates &= &allowed_candidates; @@ -218,27 +223,24 @@ fn resolve_state( use ExactQueryPart::*; match part { Synonyms(synonyms) => { - for synonym in synonyms { - if let Some(synonym_candidates) = ctx.word_docids(synonym)? { - candidates |= synonym_candidates; - } - } + let tmp = synonyms + .into_iter() + .filter_map(|synonym| ctx.word_docids(synonym).transpose()) + .union()?; + + candidates |= tmp; } // compute intersection on pair of words with a proximity of 0. Phrase(phrase) => { - let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); - for words in phrase.windows(2) { - if let [left, right] = words { - match ctx.word_pair_proximity_docids(left, right, 0)? { - Some(docids) => bitmaps.push(docids), - None => { - bitmaps.clear(); - break; - } - } - } - } - candidates |= intersection_of(bitmaps.iter().collect()); + let bitmaps = phrase + .windows(2) + .map(|words| { + ctx.word_pair_proximity_docids(&words[0], &words[1], 0) + .map(|o| o.unwrap_or_default()) + }) + .intersection()?; + + candidates |= bitmaps; } } parts_candidates_array.push(candidates); @@ -247,7 +249,7 @@ fn resolve_state( let mut candidates_array = Vec::new(); // compute documents that contain all exact words. - let mut all_exact_candidates = intersection_of(parts_candidates_array.iter().collect()); + let mut all_exact_candidates = parts_candidates_array.iter().intersection(); all_exact_candidates &= &allowed_candidates; allowed_candidates -= &all_exact_candidates; @@ -258,9 +260,9 @@ fn resolve_state( // create all `c_count` combinations of exact words .combinations(c_count) // intersect each word candidates in combinations - .map(intersection_of) + .map(MultiOps::intersection) // union combinations of `c_count` exact words - .fold(RoaringBitmap::new(), RoaringBitmap::bitor); + .union(); // only keep allowed candidates combinations_candidates &= &allowed_candidates; // remove current candidates from allowed candidates @@ -299,13 +301,10 @@ fn attribute_start_with_docids( use ExactQueryPart::*; match part { Synonyms(synonyms) => { - let mut synonyms_candidates = RoaringBitmap::new(); - for word in synonyms { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - synonyms_candidates |= word_candidates; - } - } + let synonyms_candidates = synonyms + .into_iter() + .filter_map(|word| ctx.word_position_docids(word, pos).transpose()) + .union()?; attribute_candidates_array.push(synonyms_candidates); pos += 1; } @@ -324,15 +323,6 @@ fn attribute_start_with_docids( Ok(attribute_candidates_array) } -fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap { - rbs.sort_unstable_by_key(|rb| rb.len()); - let mut iter = rbs.into_iter(); - match iter.next() { - Some(first) => iter.fold(first.clone(), |acc, rb| acc & rb), - None => RoaringBitmap::new(), - } -} - #[derive(Debug, Clone)] pub enum ExactQueryPart { Phrase(Vec), diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index f48865ba5..75073fb97 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::HashMap; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use self::asc_desc::AscDesc; use self::attribute::Attribute; @@ -14,7 +14,7 @@ use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; -use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; +use crate::{AscDesc as AscDescName, DocumentId, Error, FieldId, Index, Member, Result}; mod asc_desc; mod attribute; @@ -307,34 +307,10 @@ pub fn resolve_query_tree( match query_tree { And(ops) => { - let mut ops = ops - .iter() - .map(|op| resolve_operation(ctx, op, wdcache)) - .collect::>>()?; - - ops.sort_unstable_by_key(|cds| cds.len()); - - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for docids in ops { - if first_loop { - candidates = docids; - first_loop = false; - } else { - candidates &= &docids; - } - } - Ok(candidates) + ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).intersection() } + Or(_, ops) => ops.into_iter().map(|op| resolve_operation(ctx, op, wdcache)).union(), Phrase(words) => resolve_phrase(ctx, &words), - Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, wdcache)?; - candidates |= docids; - } - Ok(candidates) - } Query(q) => Ok(query_docids(ctx, q, wdcache)?), } } @@ -343,41 +319,22 @@ pub fn resolve_query_tree( } pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; let winsize = phrase.len().min(7); - for win in phrase.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { - match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. - bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } - Ok(candidates) + phrase + .windows(winsize) + .flat_map(|win| { + win.iter().enumerate().flat_map(move |(offset, s1)| { + win.iter().skip(offset + 1).enumerate().map(move |(dist, s2)| { + ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1) + // If there are no document for this distance, there will be no + // results for the phrase query. + .map(|m| m.unwrap_or_default()) + }) + }) + }) + .intersection() + .map_err(Error::from) } fn all_word_pair_proximity_docids, U: AsRef>( @@ -386,15 +343,16 @@ fn all_word_pair_proximity_docids, U: AsRef>( right_words: &[(U, u8)], proximity: u8, ) -> Result { - let mut docids = RoaringBitmap::new(); - for (left, _l_typo) in left_words { - for (right, _r_typo) in right_words { - let current_docids = ctx - .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? - .unwrap_or_default(); - docids |= current_docids; - } - } + let docids = left_words + .into_iter() + .flat_map(|(left, _l_typo)| { + right_words.into_iter().map(move |(right, _r_typo)| { + ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity) + .map(|res| res.unwrap_or_default()) + }) + }) + .union()?; + Ok(docids) } @@ -414,15 +372,20 @@ fn query_docids( Ok(docids) } else if query.prefix { let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - docids |= ctx.word_docids(&word)?.unwrap_or_default(); - // only add the exact docids if the word hasn't been derived - if *original_typo == 0 { - docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); - } - } - Ok(docids) + + words + .into_iter() + .flat_map(|(word, _typo)| { + let current_docids = + ctx.word_docids(&word).map(|word| word.unwrap_or_default()); + let typo = (*original_typo == 0).then(|| { + ctx.exact_word_docids(&word).map(|word| word.unwrap_or_default()) + }); + + std::iter::once(current_docids).chain(typo) + }) + .union() + .map_err(Error::from) } else { let mut docids = ctx.word_docids(&word)?.unwrap_or_default(); // only add the exact docids if the word hasn't been derived @@ -434,15 +397,19 @@ fn query_docids( } QueryKind::Tolerant { typo, word } => { let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; - let mut docids = RoaringBitmap::new(); - for (word, typo) in words { - let mut current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - if *typo == 0 { - current_docids |= ctx.exact_word_docids(&word)?.unwrap_or_default() - } - docids |= current_docids; - } - Ok(docids) + + words + .into_iter() + .flat_map(|(word, typo)| { + let current_docids = + ctx.word_docids(&word).map(|word| word.unwrap_or_default()); + let typo = (*typo == 0) + .then(|| ctx.exact_word_docids(&word).map(|word| word.unwrap_or_default())); + + std::iter::once(current_docids).chain(typo) + }) + .union() + .map_err(Error::from) } } } @@ -486,23 +453,28 @@ fn query_pair_proximity_docids( let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - let current_docids = match ctx.word_prefix_pair_proximity_docids( - left.as_str(), - right.as_str(), - proximity, - )? { - Some(docids) => Ok(docids), - None => { - let r_words = - word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + l_words + .into_iter() + .map(|(left, _)| -> Result<_> { + match ctx.word_prefix_pair_proximity_docids( + left.as_str(), + right.as_str(), + proximity, + )? { + Some(docids) => Ok(docids), + None => { + let r_words = + word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; + all_word_pair_proximity_docids( + ctx, + &[(left, 0)], + &r_words, + proximity, + ) + } } - }?; - docids |= current_docids; - } - Ok(docids) + }) + .union() } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index e942a7bef..ef8718686 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -3,7 +3,7 @@ use std::collections::hash_map::HashMap; use std::mem::take; use log::debug; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use super::{ query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, @@ -257,17 +257,14 @@ fn resolve_candidates<'t>( for (ll, lr, lcandidates) in lefts { for (rl, rr, rcandidates) in rights { - let mut candidates = + let candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; - if lcandidates.len() < rcandidates.len() { - candidates &= lcandidates; - candidates &= rcandidates; - } else { - candidates &= rcandidates; - candidates &= lcandidates; - } if !candidates.is_empty() { - output.push((ll.clone(), rr.clone(), candidates)); + output.push(( + ll.clone(), + rr.clone(), + [&candidates, lcandidates, rcandidates].intersection(), + )); } } } @@ -318,10 +315,11 @@ fn resolve_candidates<'t>( } } - let mut candidates = RoaringBitmap::new(); - for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? { - candidates |= cds; - } + let candidates = resolve_operation(ctx, query_tree, proximity, cache, wdcache)? + .into_iter() + .map(|(_, _, cds)| cds) + .union(); + Ok(candidates) } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7241dab2b..220e98891 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -6,7 +6,7 @@ use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; use log::debug; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use super::FacetNumberRange; use crate::error::{Error, UserError}; @@ -365,13 +365,18 @@ impl<'a> Filter<'a> { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - let mut bitmap = RoaringBitmap::new(); + let bitmap = els + .into_iter() + .map(|el| { + Self::evaluate_operator( + rtxn, + index, + fid, + &Condition::Equal(el.clone()), + ) + }) + .union()?; - for el in els { - let op = Condition::Equal(el.clone()); - let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; - bitmap |= el_bitmap; - } Ok(bitmap) } else { Ok(RoaringBitmap::new()) @@ -413,39 +418,14 @@ impl<'a> Filter<'a> { } } } - FilterCondition::Or(subfilters) => { - let mut bitmap = RoaringBitmap::new(); - for f in subfilters { - bitmap |= - Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; - } - Ok(bitmap) - } - FilterCondition::And(subfilters) => { - let mut subfilters_iter = subfilters.iter(); - if let Some(first_subfilter) = subfilters_iter.next() { - let mut bitmap = Self::inner_evaluate( - &(first_subfilter.clone()).into(), - rtxn, - index, - filterable_fields, - )?; - for f in subfilters_iter { - if bitmap.is_empty() { - return Ok(bitmap); - } - bitmap &= Self::inner_evaluate( - &(f.clone()).into(), - rtxn, - index, - filterable_fields, - )?; - } - Ok(bitmap) - } else { - Ok(RoaringBitmap::new()) - } - } + FilterCondition::Or(subfilters) => subfilters + .into_iter() + .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .union(), + FilterCondition::And(subfilters) => subfilters + .into_iter() + .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .intersection(), FilterCondition::GeoLowerThan { point, radius } => { if filterable_fields.contains("_geo") { let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?]; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 108acae4f..048c2e3f4 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -132,7 +132,7 @@ use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, BytesEncode, Error}; use log::debug; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use time::OffsetDateTime; use crate::error::InternalError; @@ -301,9 +301,7 @@ fn compute_facet_number_levels<'t>( first_level_size, level_group_size, &mut |bitmaps, _, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } + number_document_ids |= bitmaps.union(); Ok(()) }, &|_i, (_field_id, _level, left, _right)| *left, @@ -316,11 +314,11 @@ fn compute_facet_number_levels<'t>( Ok((subwriters, number_document_ids)) } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, docids) = result?; - documents_ids |= docids; - } + let documents_ids = db + .range(rtxn, &(level_0_start..))? + .take(first_level_size) + .map(|result| result.map(|(_key, docids)| docids)) + .union()?; Ok((vec![], documents_ids)) } @@ -389,11 +387,11 @@ fn compute_facet_strings_levels<'t>( Ok((subwriters, strings_document_ids)) } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, (_original_value, docids)) = result?; - documents_ids |= docids; - } + let documents_ids = db + .range(rtxn, &(level_0_start..))? + .take(first_level_size) + .map(|result| result.map(|(_key, (_original_value, docids))| docids)) + .union()?; Ok((vec![], documents_ids)) } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index c5385e347..8c1c15054 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::io; use std::result::Result as StdResult; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use super::read_u32_ne_bytes; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; @@ -41,8 +41,7 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul .map(AsRef::as_ref) .map(RoaringBitmap::deserialize_from) .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); + .union(); let mut buffer = Vec::new(); serialize_roaring_bitmap(&merged, &mut buffer)?; Ok(Cow::Owned(buffer)) @@ -65,8 +64,7 @@ pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( .map(|(_, bitmap_bytes)| bitmap_bytes) .map(RoaringBitmap::deserialize_from) .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); + .union(); let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); let mut buffer = Vec::with_capacity(cap);