diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index d8d23225da0..2ccb875c421 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -5259,7 +5259,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 [[package]]
 name = "ownedbytes"
 version = "0.7.0"
-source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925"
+source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65"
 dependencies = [
  "stable_deref_trait",
 ]
@@ -6996,10 +6996,12 @@ dependencies = [
  "proptest",
  "quickwit-common",
  "quickwit-datetime",
+ "regex",
  "serde",
  "serde_json",
  "serde_with",
  "tantivy",
+ "tantivy-fst",
  "thiserror 1.0.69",
  "time",
  "whichlang",
@@ -7062,6 +7064,7 @@ dependencies = [
  "serde",
  "serde_json",
  "tantivy",
+ "tantivy-fst",
  "thiserror 1.0.69",
  "tokio",
  "tokio-stream",
@@ -8875,7 +8878,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
 [[package]]
 name = "tantivy"
 version = "0.23.0"
-source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925"
+source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65"
 dependencies = [
  "aho-corasick",
  "arc-swap",
@@ -8890,6 +8893,7 @@ dependencies = [
  "fastdivide",
  "fnv",
  "fs4",
+ "futures-channel",
  "futures-util",
  "htmlescape",
  "hyperloglogplus",
@@ -8919,7 +8923,7 @@ dependencies = [
  "tantivy-stacker",
  "tantivy-tokenizer-api",
  "tempfile",
- "thiserror 2.0.6",
+ "thiserror 2.0.7",
  "time",
  "uuid",
  "winapi 0.3.9",
@@ -8929,7 +8933,7 @@ dependencies = [
 [[package]]
 name = "tantivy-bitpacker"
 version = "0.6.0"
-source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925"
+source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65"
 dependencies = [
  "bitpacking",
 ]
@@ -8937,7 +8941,7 @@ dependencies = [
 [[package]]
 name = "tantivy-columnar"
 version = "0.3.0"
-source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925"
+source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65"
 dependencies = [
  "downcast-rs",
  "fastdivide",
@@ -8952,7 +8956,7 @@ dependencies = [
 [[package]]
 name = "tantivy-common"
 version = "0.7.0"
-source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925"
+source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65"
 dependencies = [
  "async-trait",
  "byteorder",
@@ -8975,7 +8979,7 @@ dependencies = [
 [[package]]
 name = "tantivy-query-grammar"
 version = "0.22.0"
-source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925"
+source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65"
 dependencies = [
  "nom",
 ]
@@ -8983,8 +8987,10 @@ dependencies = [
 [[package]]
 name = "tantivy-sstable"
 version = "0.3.0"
-source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925"
+source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65"
 dependencies = [
+ "futures-util",
+ "itertools 0.13.0",
  "tantivy-bitpacker",
  "tantivy-common",
  "tantivy-fst",
@@ -8994,7 +9000,7 @@ dependencies = [
 [[package]]
 name = "tantivy-stacker"
 version = "0.3.0"
-source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "murmurhash32", "rand_distr", @@ -9004,7 +9010,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=71cf198#71cf19870b6bf6b78ace1c8f887f888207a01925" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "serde", ] @@ -9076,11 +9082,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.6" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47" +checksum = "93605438cbd668185516ab499d589afb7ee1859ea3d5fc8f6b0755e1c7443767" dependencies = [ - "thiserror-impl 2.0.6", + "thiserror-impl 2.0.7", ] [[package]] @@ -9096,9 +9102,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.6" +version = "2.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312" +checksum = "e1d8749b4531af2117677a5fcd12b1348a3fe2b81e36e61ffeac5c4aa3273e36" dependencies = [ "proc-macro2", "quote", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 735076a90d8..8fd46f5266b 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -328,12 +328,13 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "71cf198", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "d281ca3", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", "zstd-compression", ] } +tantivy-fst = "0.5" # This is actually not used directly the goal is to fix the version # used by reqwest. diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index c0aaf27d487..b5bca08f1cb 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -85,6 +85,15 @@ pub struct TermRange { pub limit: Option, } +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +/// Supported automaton types to warmup +pub enum Automaton { + /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq, and the path if + /// inside a json field + Regex(Option>, String), + // we could add termset query here, instead of downloading the whole dictionary +} + /// Description of how a fast field should be warmed up #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct FastFieldWarmupInfo { @@ -109,6 +118,8 @@ pub struct WarmupInfo { pub terms_grouped_by_field: HashMap>, /// Term ranges to warmup, and whether their position is needed too. 
     pub term_ranges_grouped_by_field: HashMap<Field, HashMap<TermRange, bool>>,
+    /// Automatons to warmup
+    pub automatons_grouped_by_field: HashMap<Field, HashSet<Automaton>>,
 }
 
 impl WarmupInfo {
@@ -143,6 +154,11 @@ impl WarmupInfo {
                 *sub_map.entry(term_range).or_default() |= include_position;
             }
         }
+
+        for (field, automatons) in other.automatons_grouped_by_field.into_iter() {
+            let sub_map = self.automatons_grouped_by_field.entry(field).or_default();
+            sub_map.extend(automatons);
+        }
     }
 
     /// Simplify a WarmupInfo, removing some redundant tasks
@@ -599,6 +615,13 @@ mod tests {
             .collect()
     }
 
+    fn automaton_hashset(elements: &[&str]) -> HashSet<Automaton> {
+        elements
+            .iter()
+            .map(|elem| Automaton::Regex(None, elem.to_string()))
+            .collect()
+    }
+
     fn hashset_field(elements: &[u32]) -> HashSet<Field> {
         elements
             .iter()
@@ -648,6 +671,12 @@ mod tests {
                 (2, "term1", false),
                 (2, "term2", false),
             ]),
+            automatons_grouped_by_field: [(
+                Field::from_field_id(1),
+                automaton_hashset(&["my_reg.*ex"]),
+            )]
+            .into_iter()
+            .collect(),
         };
 
         // merging with default has no impact
@@ -665,6 +694,12 @@ mod tests {
                 (3, "term1", false),
                 (2, "term2", true),
             ]),
+            automatons_grouped_by_field: [
+                (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
+                (Field::from_field_id(2), automaton_hashset(&["my_reg.*ex"])),
+            ]
+            .into_iter()
+            .collect(),
         };
 
         wi_base.merge(wi_2.clone());
@@ -712,6 +747,17 @@ mod tests {
             );
         }
 
+        let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")];
+        for (field, regex) in expected_automatons {
+            let field = Field::from_field_id(field);
+            let automaton = Automaton::Regex(None, regex.to_string());
+            assert!(wi_base
+                .automatons_grouped_by_field
+                .get(&field)
+                .unwrap()
+                .contains(&automaton));
+        }
+
         // merge is idempotent
         let mut wi_cloned = wi_base.clone();
         wi_cloned.merge(wi_2);
@@ -734,6 +780,13 @@ mod tests {
                 (1, "term2", true),
                 (2, "term3", false),
             ]),
+            automatons_grouped_by_field: [
+                (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
+                (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
+                (Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])),
+            ]
+            .into_iter()
+            .collect(),
         };
         let expected = WarmupInfo {
             term_dict_fields: hashset_field(&[1]),
@@ -744,6 +797,12 @@ mod tests {
                 (1, "term2", true),
                 (2, "term3", false),
             ]),
+            automatons_grouped_by_field: [
+                (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])),
+                (Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])),
+            ]
+            .into_iter()
+            .collect(),
         };
 
         warmup_info.simplify();
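Editor's note on the invariant these tests exercise: `WarmupInfo::merge` unions the per-field automaton sets, and duplicates collapse because `Automaton` derives `Eq` and `Hash`, which is also what makes the merge idempotent. A self-contained sketch of just that behavior — `Field` and `Automaton` here are simplified stand-ins for the tantivy and doc-mapper types, not the real definitions:

```rust
use std::collections::{HashMap, HashSet};

// Stand-in for tantivy's Field (a u32 id under the hood).
type Field = u32;

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum Automaton {
    Regex(Option<Vec<u8>>, String),
}

// Mirrors the loop added to `WarmupInfo::merge` in the hunk above.
fn merge_automatons(
    dst: &mut HashMap<Field, HashSet<Automaton>>,
    src: HashMap<Field, HashSet<Automaton>>,
) {
    for (field, automatons) in src.into_iter() {
        dst.entry(field).or_default().extend(automatons);
    }
}

fn main() {
    let mut base: HashMap<Field, HashSet<Automaton>> = HashMap::new();
    base.entry(1)
        .or_default()
        .insert(Automaton::Regex(None, "my_reg.*ex".to_string()));

    let mut other: HashMap<Field, HashSet<Automaton>> = HashMap::new();
    other
        .entry(1)
        .or_default()
        .insert(Automaton::Regex(None, "other-re.ex".to_string()));
    // a duplicate of an existing entry merges to nothing: the HashSet deduplicates
    other
        .entry(1)
        .or_default()
        .insert(Automaton::Regex(None, "my_reg.*ex".to_string()));

    merge_automatons(&mut base, other);
    assert_eq!(base[&1].len(), 2);
}
```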
diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs
index 9c940076a5a..5279004424c 100644
--- a/quickwit/quickwit-doc-mapper/src/lib.rs
+++ b/quickwit/quickwit-doc-mapper/src/lib.rs
@@ -35,7 +35,7 @@ mod routing_expression;
 pub mod tag_pruning;
 
 pub use doc_mapper::{
-    analyze_text, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo,
+    analyze_text, Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FastFieldWarmupInfo,
     FieldMappingEntry, FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions,
     QuickwitJsonOptions, TermRange, TokenizerConfig, TokenizerEntry, WarmupInfo,
 };
diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs
index 047cb9ea131..58d9c31eb0a 100644
--- a/quickwit/quickwit-doc-mapper/src/query_builder.rs
+++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs
@@ -23,7 +23,7 @@ use std::ops::Bound;
 
 use quickwit_query::query_ast::{
     FieldPresenceQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, QueryAstVisitor, RangeQuery,
-    TermSetQuery, WildcardQuery,
+    RegexQuery, TermSetQuery, WildcardQuery,
 };
 use quickwit_query::tokenizers::TokenizerManager;
 use quickwit_query::{find_field_or_hit_dynamic, InvalidQuery};
@@ -33,7 +33,7 @@ use tantivy::Term;
 use tracing::error;
 
 use crate::doc_mapper::FastFieldWarmupInfo;
-use crate::{QueryParserError, TermRange, WarmupInfo};
+use crate::{Automaton, QueryParserError, TermRange, WarmupInfo};
 
 #[derive(Default)]
 struct RangeQueryFields {
@@ -124,8 +124,8 @@ pub(crate) fn build_query(
     )?;
 
     let term_set_query_fields = extract_term_set_query_fields(query_ast, &schema)?;
-    let term_ranges_grouped_by_field =
-        extract_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?;
+    let (term_ranges_grouped_by_field, automatons_grouped_by_field) =
+        extract_prefix_term_ranges_and_automaton(query_ast, &schema, tokenizer_manager)?;
 
     let mut terms_grouped_by_field: HashMap<Field, HashMap<Term, bool>> = Default::default();
     query.query_terms(&mut |term, need_position| {
@@ -142,6 +142,7 @@ pub(crate) fn build_query(
         terms_grouped_by_field,
         term_ranges_grouped_by_field,
         fast_fields,
+        automatons_grouped_by_field,
         ..WarmupInfo::default()
     };
 
@@ -211,6 +212,7 @@ struct ExtractPrefixTermRanges<'a> {
     schema: &'a Schema,
     tokenizer_manager: &'a TokenizerManager,
     term_ranges_to_warm_up: HashMap<Field, HashMap<TermRange, bool>>,
+    automatons_to_warm_up: HashMap<Field, HashSet<Automaton>>,
 }
 
 impl<'a> ExtractPrefixTermRanges<'a> {
@@ -219,6 +221,7 @@ impl<'a> ExtractPrefixTermRanges<'a> {
             schema,
             tokenizer_manager,
             term_ranges_to_warm_up: HashMap::new(),
+            automatons_to_warm_up: HashMap::new(),
         }
     }
 
@@ -242,6 +245,13 @@ impl<'a> ExtractPrefixTermRanges<'a> {
             .entry(term_range)
             .or_default() |= position_needed;
     }
+
+    fn add_automaton(&mut self, field: Field, automaton: Automaton) {
+        self.automatons_to_warm_up
+            .entry(field)
+            .or_default()
+            .insert(automaton);
+    }
 }
 
 impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> {
@@ -277,25 +287,44 @@
     }
 
     fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> {
-        let term = match wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager) {
-            Ok((_, term)) => term,
+        let (field, path, regex) =
+            match wildcard_query.to_regex(self.schema, self.tokenizer_manager) {
+                Ok(res) => res,
+                /* the query will be nullified when casting to a tantivy ast */
+                Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()),
+                Err(e) => return Err(e),
+            };
+
+        self.add_automaton(field, Automaton::Regex(path, regex));
+        Ok(())
+    }
+
+    fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> {
+        let (field, path, regex) = match regex_query.to_field_and_regex(self.schema) {
+            Ok(res) => res,
             /* the query will be nullified when casting to a tantivy ast */
             Err(InvalidQuery::FieldDoesNotExist { .. }) => return Ok(()),
             Err(e) => return Err(e),
         };
-        self.add_prefix_term(term, u32::MAX, false);
+        self.add_automaton(field, Automaton::Regex(path, regex));
         Ok(())
     }
 }
 
-fn extract_prefix_term_ranges(
+type TermRangeWarmupInfo = HashMap<Field, HashMap<TermRange, bool>>;
+type AutomatonWarmupInfo = HashMap<Field, HashSet<Automaton>>;
+
+fn extract_prefix_term_ranges_and_automaton(
     query_ast: &QueryAst,
     schema: &Schema,
     tokenizer_manager: &TokenizerManager,
-) -> anyhow::Result<HashMap<Field, HashMap<TermRange, bool>>> {
+) -> anyhow::Result<(TermRangeWarmupInfo, AutomatonWarmupInfo)> {
     let mut visitor = ExtractPrefixTermRanges::with_schema(schema, tokenizer_manager);
     visitor.visit(query_ast)?;
-    Ok(visitor.term_ranges_to_warm_up)
+    Ok((
+        visitor.term_ranges_to_warm_up,
+        visitor.automatons_to_warm_up,
+    ))
 }
 
 #[cfg(test)]
@@ -563,21 +592,13 @@ mod test {
 
     #[test]
     fn test_wildcard_query() {
-        check_build_query_static_mode(
-            "title:hello*",
-            Vec::new(),
-            TestExpectation::Ok("PhrasePrefixQuery"),
-        );
+        check_build_query_static_mode("title:hello*", Vec::new(), TestExpectation::Ok("Regex"));
         check_build_query_static_mode(
             "foo:bar*",
             Vec::new(),
             TestExpectation::Err("invalid query: field does not exist: `foo`"),
         );
-        check_build_query_static_mode(
-            "title:hello*yo",
-            Vec::new(),
-            TestExpectation::Err("Wildcard query contains wildcard in non final position"),
-        );
+        check_build_query_static_mode("title:hello*yo", Vec::new(), TestExpectation::Ok("Regex"));
     }
 
     #[test]
diff --git a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs
index e3b2255efc3..76082b34bb4 100644
--- a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs
+++ b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs
@@ -118,6 +118,7 @@ fn extract_unsimplified_tags_filter_ast(query_ast: QueryAst) -> UnsimplifiedTagF
             panic!("Extract unsimplified should only be called on AST without UserInputQuery.");
         }
         QueryAst::FieldPresence(_) => UnsimplifiedTagFilterAst::Uninformative,
+        QueryAst::Regex(_) => UnsimplifiedTagFilterAst::Uninformative,
     }
 }
 
diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml
index e94f8aef4ec..00ffd59ff11 100644
--- a/quickwit/quickwit-query/Cargo.toml
+++ b/quickwit/quickwit-query/Cargo.toml
@@ -18,10 +18,12 @@ lindera-core = { workspace = true, optional = true }
 lindera-dictionary = { workspace = true, optional = true }
 lindera-tokenizer = { workspace = true, optional = true }
 once_cell = { workspace = true }
+regex = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 serde_with = { workspace = true }
 tantivy = { workspace = true }
+tantivy-fst = { workspace = true }
 time = { workspace = true }
 thiserror = { workspace = true }
 whichlang = { workspace = true, optional = true }
diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs
index 2140b659138..7e8afea9995 100644
--- a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs
+++ b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs
@@ -29,6 +29,7 @@ mod one_field_map;
 mod phrase_prefix_query;
 mod query_string_query;
 mod range_query;
+mod regex_query;
 mod string_or_struct;
 mod term_query;
 mod terms_query;
@@ -46,6 +47,7 @@ use crate::elastic_query_dsl::match_bool_prefix::MatchBoolPrefixQuery;
 use crate::elastic_query_dsl::match_phrase_query::MatchPhraseQuery;
 use crate::elastic_query_dsl::match_query::MatchQuery;
 use crate::elastic_query_dsl::multi_match::MultiMatchQuery;
+use crate::elastic_query_dsl::regex_query::RegexQuery;
 use crate::elastic_query_dsl::terms_query::TermsQuery;
 use crate::not_nan_f32::NotNaNf32;
 use crate::query_ast::QueryAst;
@@ -87,6 +89,7 @@ pub(crate) enum ElasticQueryDslInner {
     MultiMatch(MultiMatchQuery),
     Range(RangeQuery),
     Exists(ExistsQuery),
+    Regexp(RegexQuery),
 }
 
 #[derive(Deserialize, Debug, Eq, PartialEq, Clone)]
@@ -134,6 +137,7 @@ impl ConvertibleToQueryAst for ElasticQueryDslInner {
             Self::Match(match_query) => match_query.convert_to_query_ast(),
             Self::Exists(exists_query) => exists_query.convert_to_query_ast(),
             Self::MultiMatch(multi_match_query) => multi_match_query.convert_to_query_ast(),
+            Self::Regexp(regex_query) => regex_query.convert_to_query_ast(),
         }
     }
 }
diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs
new file mode 100644
index 00000000000..55b76131571
--- /dev/null
+++ b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs
@@ -0,0 +1,43 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+use serde::Deserialize;
+
+use crate::elastic_query_dsl::one_field_map::OneFieldMap;
+use crate::elastic_query_dsl::ConvertibleToQueryAst;
+use crate::query_ast::{QueryAst, RegexQuery as AstRegexQuery};
+
+#[derive(Deserialize, Debug, Default, Eq, PartialEq, Clone)]
+#[serde(deny_unknown_fields)]
+pub struct RegexQueryParams {
+    value: String,
+    // we could probably add case_insensitive
+}
+
+pub type RegexQuery = OneFieldMap<RegexQueryParams>;
+
+impl ConvertibleToQueryAst for RegexQuery {
+    fn convert_to_query_ast(self) -> anyhow::Result<QueryAst> {
+        Ok(AstRegexQuery {
+            field: self.field,
+            regex: self.value.value,
+        }
+        .into())
+    }
+}
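For reference, the Elasticsearch-compatible payload this file accepts has the shape `{"regexp": {"<field>": {"value": "<regex>"}}}`. Since `OneFieldMap` and `ConvertibleToQueryAst` are crate-private, the sketch below imitates the conversion with a plain `HashMap` and local structs; only the JSON shape and the resulting `field`/`regex` pairing are taken from the code above:

```rust
use std::collections::HashMap;

use serde::Deserialize;

// Mirrors `RegexQueryParams` above; `case_insensitive` would slot in here later.
#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
struct RegexQueryParams {
    value: String,
}

// Mirrors the query_ast::RegexQuery that the conversion targets.
#[derive(Debug, PartialEq)]
struct RegexQueryAst {
    field: String,
    regex: String,
}

fn main() {
    // The inner object of an ES `regexp` query: one field name mapped to its params.
    let body = r#"{ "payload.description": { "value": "jour.*" } }"#;
    let one_field: HashMap<String, RegexQueryParams> = serde_json::from_str(body).unwrap();
    let (field, params) = one_field.into_iter().next().unwrap();

    let ast = RegexQueryAst { field, regex: params.value };
    assert_eq!(
        ast,
        RegexQueryAst {
            field: "payload.description".to_string(),
            regex: "jour.*".to_string(),
        }
    );
}
```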
diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs
index 8699e05d238..31d53ff65f9 100644
--- a/quickwit/quickwit-query/src/query_ast/mod.rs
+++ b/quickwit/quickwit-query/src/query_ast/mod.rs
@@ -28,6 +28,7 @@ mod field_presence;
 mod full_text_query;
 mod phrase_prefix_query;
 mod range_query;
+mod regex_query;
 mod tantivy_query_ast;
 mod term_query;
 mod term_set_query;
@@ -41,6 +42,7 @@ pub use field_presence::FieldPresenceQuery;
 pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery};
 pub use phrase_prefix_query::PhrasePrefixQuery;
 pub use range_query::RangeQuery;
+pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery};
 use tantivy_query_ast::TantivyQueryAst;
 pub use term_query::TermQuery;
 pub use term_set_query::TermSetQuery;
@@ -63,6 +65,7 @@ pub enum QueryAst {
     Range(RangeQuery),
     UserInput(UserInputQuery),
     Wildcard(WildcardQuery),
+    Regex(RegexQuery),
     MatchAll,
     MatchNone,
     Boost {
@@ -105,7 +108,8 @@ impl QueryAst {
             | ast @ QueryAst::MatchNone
             | ast @ QueryAst::FieldPresence(_)
             | ast @ QueryAst::Range(_)
-            | ast @ QueryAst::Wildcard(_) => Ok(ast),
+            | ast @ QueryAst::Wildcard(_)
+            | ast @ QueryAst::Regex(_) => Ok(ast),
             QueryAst::UserInput(user_text_query) => {
                 user_text_query.parse_user_query(default_search_fields)
             }
@@ -249,6 +253,12 @@ impl BuildTantivyAst for QueryAst {
                 search_fields,
                 with_validation,
             ),
+            QueryAst::Regex(regex) => regex.build_tantivy_ast_call(
+                schema,
+                tokenizer_manager,
+                search_fields,
+                with_validation,
+            ),
         }
     }
 }
diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs
new file mode 100644
index 00000000000..66b24ede640
--- /dev/null
+++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs
@@ -0,0 +1,382 @@
+// Copyright (C) 2024 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+use std::sync::Arc;
+
+use anyhow::Context;
+pub use prefix::{AutomatonQuery, JsonPathPrefix};
+use serde::{Deserialize, Serialize};
+use tantivy::schema::{Field, FieldType, Schema as TantivySchema};
+use tantivy::Term;
+
+use super::{BuildTantivyAst, QueryAst};
+use crate::query_ast::TantivyQueryAst;
+use crate::tokenizers::TokenizerManager;
+use crate::{find_field_or_hit_dynamic, InvalidQuery};
+
+/// A Regex query
+#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
+pub struct RegexQuery {
+    pub field: String,
+    pub regex: String,
+}
+
+impl From<RegexQuery> for QueryAst {
+    fn from(regex_query: RegexQuery) -> Self {
+        Self::Regex(regex_query)
+    }
+}
+
+impl RegexQuery {
+    #[cfg(test)]
+    pub fn from_field_value(field: impl ToString, regex: impl ToString) -> Self {
+        Self {
+            field: field.to_string(),
+            regex: regex.to_string(),
+        }
+    }
+}
+
+impl RegexQuery {
+    pub fn to_field_and_regex(
+        &self,
+        schema: &TantivySchema,
+    ) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> {
+        let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema)
+        else {
+            return Err(InvalidQuery::FieldDoesNotExist {
+                full_path: self.field.clone(),
+            });
+        };
+        let field_type = field_entry.field_type();
+
+        match field_type {
+            FieldType::Str(ref text_options) => {
+                text_options.get_indexing_options().ok_or_else(|| {
+                    InvalidQuery::SchemaError(format!(
+                        "field {} is not full-text searchable",
+                        field_entry.name()
+                    ))
+                })?;
+
+                Ok((field, None, self.regex.clone()))
+            }
+            FieldType::JsonObject(json_options) => {
+                json_options.get_text_indexing_options().ok_or_else(|| {
+                    InvalidQuery::SchemaError(format!(
+                        "field {} is not full-text searchable",
+                        field_entry.name()
+                    ))
+                })?;
+
+                let mut term_for_path = Term::from_field_json_path(
+                    field,
+                    json_path,
+                    json_options.is_expand_dots_enabled(),
+                );
+                term_for_path.append_type_and_str("");
+
+                let value = term_for_path.value();
+                // We skip the 1st byte which is a marker to tell this is json. This isn't present
+                // in the dictionary
+                let byte_path_prefix = value.as_serialized()[1..].to_owned();
+                Ok((field, Some(byte_path_prefix), self.regex.clone()))
+            }
+            _ => Err(InvalidQuery::SchemaError(
+                "trying to run a regex query on a non-text field".to_string(),
+            )),
+        }
+    }
+}
+
+impl BuildTantivyAst for RegexQuery {
+    fn build_tantivy_ast_impl(
+        &self,
+        schema: &TantivySchema,
+        _tokenizer_manager: &TokenizerManager,
+        _search_fields: &[String],
+        _with_validation: bool,
+    ) -> Result<TantivyQueryAst, InvalidQuery> {
+        let (field, path, regex) = self.to_field_and_regex(schema)?;
+        let regex = tantivy_fst::Regex::new(&regex).context("failed to parse regex")?;
+        let regex_automaton_with_path = JsonPathPrefix {
+            prefix: path.unwrap_or_default(),
+            automaton: regex.into(),
+        };
+        let regex_query_with_path = AutomatonQuery {
+            field,
+            automaton: Arc::new(regex_automaton_with_path),
+        };
+        Ok(regex_query_with_path.into())
+    }
+}
+
+mod prefix {
+    use std::sync::Arc;
+
+    use tantivy::query::{AutomatonWeight, EnableScoring, Query, Weight};
+    use tantivy::schema::Field;
+    use tantivy_fst::Automaton;
+
+    pub struct JsonPathPrefix<A> {
+        pub prefix: Vec<u8>,
+        pub automaton: Arc<A>,
+    }
+
+    // we need to implement this manually because the std derive adds an unnecessary bound `A: Clone`
+    impl<A> Clone for JsonPathPrefix<A> {
+        fn clone(&self) -> Self {
+            JsonPathPrefix {
+                prefix: self.prefix.clone(),
+                automaton: self.automaton.clone(),
+            }
+        }
+    }
+
+    #[derive(Clone, Debug, PartialEq)]
+    pub enum JsonPathPrefixState<A> {
+        Prefix(usize),
+        Inner(A),
+        PrefixFailed,
+    }
+
+    impl<A: Automaton> Automaton for JsonPathPrefix<A> {
+        type State = JsonPathPrefixState<A::State>;
+
+        fn start(&self) -> Self::State {
+            if self.prefix.is_empty() {
+                JsonPathPrefixState::Inner(self.automaton.start())
+            } else {
+                JsonPathPrefixState::Prefix(0)
+            }
+        }
+
+        fn is_match(&self, state: &Self::State) -> bool {
+            match state {
+                JsonPathPrefixState::Prefix(_) => false,
+                JsonPathPrefixState::Inner(inner_state) => self.automaton.is_match(inner_state),
+                JsonPathPrefixState::PrefixFailed => false,
+            }
+        }
+
+        fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
+            match state {
+                JsonPathPrefixState::Prefix(i) => {
+                    if self.prefix.get(*i) != Some(&byte) {
+                        return JsonPathPrefixState::PrefixFailed;
+                    }
+                    let next_pos = i + 1;
+                    if next_pos == self.prefix.len() {
+                        JsonPathPrefixState::Inner(self.automaton.start())
+                    } else {
+                        JsonPathPrefixState::Prefix(next_pos)
+                    }
+                }
+                JsonPathPrefixState::Inner(inner_state) => {
+                    JsonPathPrefixState::Inner(self.automaton.accept(inner_state, byte))
+                }
+                JsonPathPrefixState::PrefixFailed => JsonPathPrefixState::PrefixFailed,
+            }
+        }
+
+        fn can_match(&self, state: &Self::State) -> bool {
+            match state {
+                JsonPathPrefixState::Prefix(_) => true,
+                JsonPathPrefixState::Inner(inner_state) => self.automaton.can_match(inner_state),
+                JsonPathPrefixState::PrefixFailed => false,
+            }
+        }
+
+        fn will_always_match(&self, state: &Self::State) -> bool {
+            match state {
+                JsonPathPrefixState::Prefix(_) => false,
+                JsonPathPrefixState::Inner(inner_state) => {
+                    self.automaton.will_always_match(inner_state)
+                }
+                JsonPathPrefixState::PrefixFailed => false,
+            }
+        }
+    }
+
+    // we don't use RegexQuery to handle our path. We could tinker with the regex to embed
+    // the json field path inside it, but that seems less clean, and it would prevent
+    // supporting case-insensitive search in the future (we would also make the path
+    // insensitive, which we shouldn't)
+    pub struct AutomatonQuery<A> {
+        pub automaton: Arc<A>,
+        pub field: Field,
+    }
+
+    impl<A> std::fmt::Debug for AutomatonQuery<A> {
+        fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+            f.debug_struct("AutomatonQuery")
+                .field("field", &self.field)
+                .field("automaton", &std::any::type_name::<A>())
+                .finish()
+        }
+    }
+
+    impl<A> Clone for AutomatonQuery<A> {
+        fn clone(&self) -> Self {
+            AutomatonQuery {
+                automaton: self.automaton.clone(),
+                field: self.field,
+            }
+        }
+    }
+
+    impl<A: Automaton + Send + Sync + 'static> Query for AutomatonQuery<A>
+    where A::State: Clone
+    {
+        fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> tantivy::Result<Box<dyn Weight>> {
+            Ok(Box::new(AutomatonWeight::<A>::new(
+                self.field,
+                self.automaton.clone(),
+            )))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use tantivy::schema::{Schema as TantivySchema, TEXT};
+    use tantivy_fst::{Automaton, Regex};
+
+    use super::prefix::JsonPathPrefixState;
+    use super::{JsonPathPrefix, RegexQuery};
+
+    #[test]
+    fn test_regex_query_text_field() {
+        let mut schema_builder = TantivySchema::builder();
+        schema_builder.add_text_field("field", TEXT);
+        let schema = schema_builder.build();
+
+        let query = RegexQuery {
+            field: "field".to_string(),
+            regex: "abc.*xyz".to_string(),
+        };
+        let (field, path, regex) = query.to_field_and_regex(&schema).unwrap();
+        assert_eq!(field, schema.get_field("field").unwrap());
+        assert!(path.is_none());
+        assert_eq!(regex, query.regex);
+    }
+
+    #[test]
+    fn test_regex_query_json_field() {
+        let mut schema_builder = TantivySchema::builder();
+        schema_builder.add_json_field("field", TEXT);
+        let schema = schema_builder.build();
+
+        let query = RegexQuery {
+            field: "field.sub.field".to_string(),
+            regex: "abc.*xyz".to_string(),
+        };
+        let (field, path, regex) = query.to_field_and_regex(&schema).unwrap();
+        assert_eq!(field, schema.get_field("field").unwrap());
+        assert_eq!(path.unwrap(), b"sub\x01field\0s");
+        assert_eq!(regex, query.regex);
+
+        // I believe this is how concatenated fields behave
+        let query_empty_path = RegexQuery {
+            field: "field".to_string(),
+            regex: "abc.*xyz".to_string(),
+        };
+        let (field, path, regex) = query_empty_path.to_field_and_regex(&schema).unwrap();
+        assert_eq!(field, schema.get_field("field").unwrap());
+        assert_eq!(path.unwrap(), b"\0s");
+        assert_eq!(regex, query_empty_path.regex);
+    }
+
+    #[test]
+    fn test_json_prefix_automaton_empty_path() {
+        let regex = Arc::new(Regex::new("e(f|g.*)").unwrap());
+        let empty_path_automaton = JsonPathPrefix {
+            prefix: Vec::new(),
+            automaton: regex.clone(),
+        };
+
+        let start = empty_path_automaton.start();
+        assert_eq!(start, JsonPathPrefixState::Inner(regex.start()));
+    }
+
+    #[test]
+    fn test_json_prefix_automaton() {
+        let regex = Arc::new(Regex::new("e(f|g.*)").unwrap());
+        let automaton = JsonPathPrefix {
+            prefix: b"ab".to_vec(),
+            automaton: regex.clone(),
+        };
+
+        let start = automaton.start();
+        assert!(matches!(start, JsonPathPrefixState::Prefix(_)));
+        assert!(automaton.can_match(&start));
+        assert!(!automaton.is_match(&start));
+
+        let miss = automaton.accept(&start, b'g');
+        assert_eq!(miss, JsonPathPrefixState::PrefixFailed);
+        // supporting this is important for optimisation
+        assert!(!automaton.can_match(&miss));
+        assert!(!automaton.is_match(&miss));
+
+        let a = automaton.accept(&start, b'a');
+        assert!(matches!(a, JsonPathPrefixState::Prefix(_)));
+        assert!(automaton.can_match(&a));
+        assert!(!automaton.is_match(&a));
+
+        let ab = automaton.accept(&a, b'b');
+        assert_eq!(ab, JsonPathPrefixState::Inner(regex.start()));
+        assert!(automaton.can_match(&ab));
+        assert!(!automaton.is_match(&ab));
+
+        // starting here, we just check that we pass through correctly,
+        // and reply to can_match as well as possible
+        // (we don't test will_always_match because Regex doesn't support it)
+        let abc = automaton.accept(&ab, b'c');
+        assert!(matches!(abc, JsonPathPrefixState::Inner(_)));
+        assert!(!automaton.can_match(&abc));
+        assert!(!automaton.is_match(&abc));
+
+        let abe = automaton.accept(&ab, b'e');
+        assert!(matches!(abe, JsonPathPrefixState::Inner(_)));
+        assert!(automaton.can_match(&abe));
+        assert!(!automaton.is_match(&abe));
+
+        let abef = automaton.accept(&abe, b'f');
+        assert!(matches!(abef, JsonPathPrefixState::Inner(_)));
+        assert!(automaton.can_match(&abef));
+        assert!(automaton.is_match(&abef));
+
+        let abefg = automaton.accept(&abef, b'g');
+        assert!(matches!(abefg, JsonPathPrefixState::Inner(_)));
+        assert!(!automaton.can_match(&abefg));
+        assert!(!automaton.is_match(&abefg));
+
+        let abeg = automaton.accept(&abe, b'g');
+        assert!(matches!(abeg, JsonPathPrefixState::Inner(_)));
+        assert!(automaton.can_match(&abeg));
+        assert!(automaton.is_match(&abeg));
+
+        let abegh = automaton.accept(&abeg, b'h');
+        assert!(matches!(abegh, JsonPathPrefixState::Inner(_)));
+        assert!(automaton.can_match(&abegh));
+        assert!(automaton.is_match(&abegh));
+    }
+}
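A usage sketch to make the traversal concrete: for a JSON field, term-dictionary keys look like `sub\x01field\0s<token>` — the JSON path (with `\x01` separating path segments), a `\0s` type marker, then the token — and the composed automaton must consume that prefix byte-for-byte before delegating to the regex. `accepts` is a hypothetical helper written against the same `tantivy_fst::Automaton` trait methods the module uses; only `JsonPathPrefix` and its public fields come from the code above:

```rust
use std::sync::Arc;

use quickwit_query::query_ast::JsonPathPrefix;
use tantivy_fst::{Automaton, Regex};

// Run an automaton over a full dictionary key, the way a term-dictionary scan
// conceptually does; `can_match` lets it bail out of dead ranges early.
fn accepts<A: Automaton>(automaton: &A, key: &[u8]) -> bool {
    let mut state = automaton.start();
    for &byte in key {
        if !automaton.can_match(&state) {
            return false;
        }
        state = automaton.accept(&state, byte);
    }
    automaton.is_match(&state)
}

fn main() {
    let regex = Arc::new(Regex::new("jour.*").unwrap());
    let automaton = JsonPathPrefix {
        prefix: b"sub\x01field\0s".to_vec(),
        automaton: regex,
    };
    // Only keys carrying the exact JSON path prefix reach the inner regex.
    assert!(accepts(&automaton, b"sub\x01field\0sjournal"));
    assert!(!accepts(&automaton, b"other\0sjournal"));
    assert!(!accepts(&automaton, b"sub\x01field\0sunix"));
}
```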
diff --git a/quickwit/quickwit-query/src/query_ast/visitor.rs b/quickwit/quickwit-query/src/query_ast/visitor.rs
index bd85a71d64e..c9ce180b518 100644
--- a/quickwit/quickwit-query/src/query_ast/visitor.rs
+++ b/quickwit/quickwit-query/src/query_ast/visitor.rs
@@ -21,8 +21,8 @@ use crate::not_nan_f32::NotNaNf32;
 use crate::query_ast::field_presence::FieldPresenceQuery;
 use crate::query_ast::user_input_query::UserInputQuery;
 use crate::query_ast::{
-    BoolQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, RangeQuery, TermQuery, TermSetQuery,
-    WildcardQuery,
+    BoolQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, RangeQuery, RegexQuery, TermQuery,
+    TermSetQuery, WildcardQuery,
 };
 
 /// Simple trait to implement a Visitor over the QueryAst.
@@ -45,6 +45,7 @@ pub trait QueryAstVisitor<'a> {
             QueryAst::UserInput(user_text_query) => self.visit_user_text(user_text_query),
             QueryAst::FieldPresence(exists) => self.visit_exists(exists),
             QueryAst::Wildcard(wildcard) => self.visit_wildcard(wildcard),
+            QueryAst::Regex(regex) => self.visit_regex(regex),
         }
     }
 
@@ -111,6 +112,10 @@ pub trait QueryAstVisitor<'a> {
     fn visit_wildcard(&mut self, _wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> {
         Ok(())
     }
+
+    fn visit_regex(&mut self, _regex_query: &'a RegexQuery) -> Result<(), Self::Err> {
+        Ok(())
+    }
 }
 
 /// Simple trait to implement a Visitor over the QueryAst.
@@ -133,6 +138,7 @@ pub trait QueryAstTransformer {
             QueryAst::UserInput(user_text_query) => self.transform_user_text(user_text_query),
             QueryAst::FieldPresence(exists) => self.transform_exists(exists),
             QueryAst::Wildcard(wildcard) => self.transform_wildcard(wildcard),
+            QueryAst::Regex(regex) => self.transform_regex(regex),
         }
     }
 
@@ -231,4 +237,8 @@ pub trait QueryAstTransformer {
     ) -> Result<Option<QueryAst>, Self::Err> {
         Ok(Some(QueryAst::Wildcard(wildcard_query)))
     }
+
+    fn transform_regex(&mut self, regex_query: RegexQuery) -> Result<Option<QueryAst>, Self::Err> {
+        Ok(Some(QueryAst::Regex(regex_query)))
+    }
 }
diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs
index 81a44932f22..a5bdaf1b430 100644
--- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs
+++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs
@@ -17,19 +17,20 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-use anyhow::{anyhow, bail, Context};
+use std::borrow::Cow;
+use std::sync::Arc;
+
+use anyhow::{bail, Context};
 use serde::{Deserialize, Serialize};
 use tantivy::schema::{Field, FieldType, Schema as TantivySchema};
 use tantivy::Term;
 
 use super::{BuildTantivyAst, QueryAst};
-use crate::query_ast::TantivyQueryAst;
+use crate::query_ast::{AutomatonQuery, JsonPathPrefix, TantivyQueryAst};
 use crate::tokenizers::TokenizerManager;
 use crate::{find_field_or_hit_dynamic, InvalidQuery};
 
 /// A Wildcard query allows to match 'bond' with a query like 'b*d'.
-///
-/// At the moment, only wildcard at end of term is supported.
 #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)]
 pub struct WildcardQuery {
     pub field: String,
@@ -44,72 +45,78 @@ impl From<WildcardQuery> for QueryAst {
     }
 }
 
-fn extract_unique_token(mut tokens: Vec<Term>) -> anyhow::Result<Term> {
-    let term = tokens
-        .pop()
-        .with_context(|| "wildcard query generated no term")?;
-    if !tokens.is_empty() {
-        anyhow::bail!("wildcard query generated more than one term");
+fn parse_wildcard_query(mut query: &str) -> Vec<SubQuery> {
+    let mut res = Vec::new();
+    while let Some(pos) = query.find(['*', '?', '\\']) {
+        if pos > 0 {
+            res.push(SubQuery::Text(query[..pos].to_string()));
+        }
+        let chr = &query[pos..pos + 1];
+        query = &query[pos + 1..];
+        match chr {
+            "*" => res.push(SubQuery::Wildcard),
+            "?" => res.push(SubQuery::QuestionMark),
+            "\\" => {
+                if let Some(chr) = query.chars().next() {
+                    res.push(SubQuery::Text(chr.to_string()));
+                    query = &query[chr.len_utf8()..];
+                } else {
+                    // escaping at the end is invalid, handle it as if that escape sequence wasn't
+                    // present
+                    break;
+                }
+            }
+            _ => unreachable!("find shouldn't return a non-matching position"),
+        }
+    }
+    if !query.is_empty() {
+        res.push(SubQuery::Text(query.to_string()));
     }
-    Ok(term)
+    res
 }
 
-fn unescape_with_final_wildcard(phrase: &str) -> anyhow::Result<String> {
-    enum State {
-        Normal,
-        Escaped,
-    }
+enum SubQuery {
+    Text(String),
+    Wildcard,
+    QuestionMark,
+}
 
-    // we keep this state outside of scan because we want to query it after
-    let mut saw_wildcard = false;
-    let saw_wildcard = &mut saw_wildcard;
+fn sub_query_parts_to_regex(
+    sub_query_parts: Vec<SubQuery>,
+    tokenizer_name: &str,
+    tokenizer_manager: &TokenizerManager,
+) -> anyhow::Result<String> {
+    let mut normalizer = tokenizer_manager
+        .get_normalizer(tokenizer_name)
+        .with_context(|| format!("no tokenizer named `{tokenizer_name}` is registered"))?;
 
-    let phrase = phrase
-        .chars()
-        .scan(State::Normal, |state, c| {
-            if *saw_wildcard {
-                return Some(Some(Err(anyhow!(
-                    "Wildcard query contains wildcard in non final position"
-                ))));
-            }
-            match state {
-                State::Escaped => {
-                    *state = State::Normal;
-                    Some(Some(Ok(c)))
-                }
-                State::Normal => {
-                    if c == '*' {
-                        *saw_wildcard = true;
-                        Some(None)
-                    } else if c == '\\' {
-                        *state = State::Escaped;
-                        Some(None)
-                    } else if c == '?' {
-                        Some(Some(Err(anyhow!("Wildcard query contains `?`"))))
-                    } else {
-                        Some(Some(Ok(c)))
-                    }
+    sub_query_parts
+        .into_iter()
+        .map(|part| match part {
+            SubQuery::Text(text) => {
+                let mut token_stream = normalizer.token_stream(&text);
+                let expected_token = token_stream
+                    .next()
+                    .context("normalizer generated no content")?
+                    .text
+                    .clone();
+                if let Some(_unexpected_token) = token_stream.next() {
+                    bail!("normalizer generated multiple tokens")
                 }
+                Ok(Cow::Owned(regex::escape(&expected_token)))
+            }
+            SubQuery::Wildcard => Ok(Cow::Borrowed(".*")),
+            SubQuery::QuestionMark => Ok(Cow::Borrowed(".")),
         })
-        // we have an iterator of Option<Result<char>>
-        .flatten()
-        // we have an iterator of Result<char>
-        .collect::<Result<String, _>>()?;
-    if !*saw_wildcard {
-        bail!("Wildcard query doesn't contain a wildcard");
-    }
-    Ok(phrase)
+        .collect::<anyhow::Result<String>>()
 }
 
 impl WildcardQuery {
-    // TODO this method will probably disappear once we support the full semantic of
-    // wildcard queries
-    pub fn extract_prefix_term(
+    pub fn to_regex(
         &self,
         schema: &TantivySchema,
         tokenizer_manager: &TokenizerManager,
-    ) -> Result<(Field, Term), InvalidQuery> {
+    ) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> {
         let Some((field, field_entry, json_path)) = find_field_or_hit_dynamic(&self.field, schema)
         else {
             return Err(InvalidQuery::FieldDoesNotExist {
@@ -118,7 +125,7 @@ impl WildcardQuery {
         };
         let field_type = field_entry.field_type();
 
-        let prefix = unescape_with_final_wildcard(&self.value)?;
+        let sub_query_parts = parse_wildcard_query(&self.value);
 
         match field_type {
             FieldType::Str(ref text_options) => {
@@ -129,19 +136,10 @@ impl WildcardQuery {
                         ))
                     })?;
                 let tokenizer_name = text_field_indexing.tokenizer();
-                let mut normalizer = tokenizer_manager
-                    .get_normalizer(tokenizer_name)
-                    .with_context(|| {
-                        format!("no tokenizer named `{}` is registered", tokenizer_name)
-                    })?;
-                let mut token_stream = normalizer.token_stream(&prefix);
-                let mut tokens = Vec::new();
-                token_stream.process(&mut |token| {
-                    let term: Term = Term::from_field_text(field, &token.text);
-                    tokens.push(term);
-                });
-                let term = extract_unique_token(tokens)?;
-                Ok((field, term))
+                let regex =
+                    sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?;
+
+                Ok((field, None, regex))
             }
             FieldType::JsonObject(json_options) => {
                 let text_field_indexing =
@@ -152,25 +150,22 @@ impl WildcardQuery {
                        ))
                    })?;
                 let tokenizer_name = text_field_indexing.tokenizer();
-                let mut normalizer = tokenizer_manager
-                    .get_normalizer(tokenizer_name)
-                    .with_context(|| {
-                        format!("no tokenizer named `{}` is registered", tokenizer_name)
-                    })?;
-                let mut token_stream = normalizer.token_stream(&prefix);
-                let mut tokens = Vec::new();
+                let regex =
+                    sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?;
+
+                let mut term_for_path = Term::from_field_json_path(
+                    field,
+                    json_path,
+                    json_options.is_expand_dots_enabled(),
+                );
+                term_for_path.append_type_and_str("");
 
-                token_stream.process(&mut |token| {
-                    let mut term = Term::from_field_json_path(
-                        field,
-                        json_path,
-                        json_options.is_expand_dots_enabled(),
-                    );
-                    term.append_type_and_str(&token.text);
-                    tokens.push(term);
-                });
-                let term = extract_unique_token(tokens)?;
-                Ok((field, term))
+                let value = term_for_path.value();
+                // We skip the 1st byte which is a marker to tell this is json. This isn't present
This isn't present + // in the dictionary + let byte_path_prefix = value.as_serialized()[1..].to_owned(); + + Ok((field, Some(byte_path_prefix), regex)) } _ => Err(InvalidQuery::SchemaError( "trying to run a Wildcard query on a non-text field".to_string(), @@ -187,18 +182,24 @@ impl BuildTantivyAst for WildcardQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (_, term) = match self.extract_prefix_term(schema, tokenizer_manager) { + let (field, path, regex) = match self.to_regex(schema, tokenizer_manager) { Ok(res) => res, Err(InvalidQuery::FieldDoesNotExist { .. }) if self.lenient => { return Ok(TantivyQueryAst::match_none()) } Err(e) => return Err(e), }; - - let mut phrase_prefix_query = - tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]); - phrase_prefix_query.set_max_expansions(u32::MAX); - Ok(phrase_prefix_query.into()) + let regex = + tantivy_fst::Regex::new(®ex).context("failed to parse regex built from wildcard")?; + let regex_automaton_with_path = JsonPathPrefix { + prefix: path.unwrap_or_default(), + automaton: regex.into(), + }; + let regex_query_with_path = AutomatonQuery { + field, + automaton: Arc::new(regex_automaton_with_path), + }; + Ok(regex_query_with_path.into()) } } @@ -218,21 +219,24 @@ mod tests { } #[test] - fn test_extract_term_for_wildcard() { + fn test_wildcard_query_to_regex_on_text() { let query = WildcardQuery { - field: "my_field".to_string(), - value: "MyString Wh1ch a nOrMal Tokenizer would cut*".to_string(), + field: "text_field".to_string(), + value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(), lenient: false, }; + let tokenizer_manager = create_default_quickwit_tokenizer_manager(); for tokenizer in ["raw", "whitespace"] { - let schema = single_text_field_schema("my_field", tokenizer); - let (_field, term) = query - .extract_prefix_term(&schema, &tokenizer_manager) - .unwrap(); - let value = term.value(); - let text = value.as_str().unwrap(); - assert_eq!(text, query.value.trim_end_matches('*')); + let mut schema_builder = TantivySchema::builder(); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); + schema_builder.add_text_field("text_field", text_options); + let schema = schema_builder.build(); + + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"); + assert!(path.is_none()); } for tokenizer in [ @@ -244,18 +248,64 @@ mod tests { "source_code_default", "source_code_with_hex", ] { - let schema = single_text_field_schema("my_field", tokenizer); - let (_field, term) = query - .extract_prefix_term(&schema, &tokenizer_manager) - .unwrap(); - let value = term.value(); - let text = value.as_str().unwrap(); - assert_eq!(text, &query.value.trim_end_matches('*').to_lowercase()); + let mut schema_builder = TantivySchema::builder(); + let text_options = TextOptions::default() + .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); + schema_builder.add_text_field("text_field", text_options); + let schema = schema_builder.build(); + + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut"); + assert!(path.is_none()); + } + } + + #[test] + fn test_wildcard_query_to_regex_on_json() { + let query = WildcardQuery { + // this volontarily contains uppercase and regex-unsafe char to make sure we properly + // keep the case, 
+            field: "json_field.Inner.Fie*ld".to_string(),
+            value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
+            lenient: false,
+        };
+
+        let tokenizer_manager = create_default_quickwit_tokenizer_manager();
+        for tokenizer in ["raw", "whitespace"] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_json_field("json_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+            assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut");
+            assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes());
+        }
+
+        for tokenizer in [
+            "raw_lowercase",
+            "lowercase",
+            "default",
+            "en_stem",
+            "chinese_compatible",
+            "source_code_default",
+            "source_code_with_hex",
+        ] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_json_field("json_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+            assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut");
+            assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes());
         }
     }
 
     #[test]
-    fn test_extract_term_for_wildcard_missing_field() {
+    fn test_extract_regex_wildcard_missing_field() {
         let query = WildcardQuery {
             field: "my_missing_field".to_string(),
             value: "My query value*".to_string(),
@@ -263,9 +313,7 @@ mod tests {
             lenient: false,
         };
         let tokenizer_manager = create_default_quickwit_tokenizer_manager();
         let schema = single_text_field_schema("my_field", "whitespace");
-        let err = query
-            .extract_prefix_term(&schema, &tokenizer_manager)
-            .unwrap_err();
+        let err = query.to_regex(&schema, &tokenizer_manager).unwrap_err();
         let InvalidQuery::FieldDoesNotExist {
             full_path: missing_field_full_path,
         } = err
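Taken together, `parse_wildcard_query` and `sub_query_parts_to_regex` implement the usual wildcard-to-regex translation, with the extra twist that literal runs go through the field's normalizer before being escaped. Setting the normalizer aside, the mapping can be restated compactly — this is a simplified sketch, not the code above (it escapes character by character, whereas the real version escapes whole normalized tokens):

```rust
// `*` → `.*`, `?` → `.`, `\x` → literal `x`, everything else regex-escaped.
fn wildcard_to_regex(pattern: &str) -> String {
    let mut out = String::new();
    let mut chars = pattern.chars();
    while let Some(c) = chars.next() {
        match c {
            '*' => out.push_str(".*"),
            '?' => out.push('.'),
            '\\' => {
                // a trailing backslash is silently dropped, like in parse_wildcard_query
                if let Some(escaped) = chars.next() {
                    out.push_str(&regex::escape(&escaped.to_string()));
                }
            }
            other => out.push_str(&regex::escape(&other.to_string())),
        }
    }
    out
}

fn main() {
    assert_eq!(wildcard_to_regex("hello*"), "hello.*");
    assert_eq!(wildcard_to_regex("hello*yo"), "hello.*yo");
    assert_eq!(wildcard_to_regex("b?nd.com"), r"b.nd\.com");
    assert_eq!(wildcard_to_regex(r"a\*b"), r"a\*b");
}
```

This also explains the test changes above: `title:hello*yo`, previously rejected because only trailing wildcards were supported, now compiles to a `Regex` query.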
diff --git a/quickwit/quickwit-search/Cargo.toml b/quickwit/quickwit-search/Cargo.toml
index 3aeb47a6eb7..ce03e38a78e 100644
--- a/quickwit/quickwit-search/Cargo.toml
+++ b/quickwit/quickwit-search/Cargo.toml
@@ -28,6 +28,7 @@ rayon = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 tantivy = { workspace = true }
+tantivy-fst = { workspace = true }
 thiserror = { workspace = true }
 tokio = { workspace = true }
 tokio-stream = { workspace = true }
diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs
index 7242e62c409..03e6a0ebc8b 100644
--- a/quickwit/quickwit-search/src/leaf.rs
+++ b/quickwit/quickwit-search/src/leaf.rs
@@ -29,7 +29,7 @@ use bytesize::ByteSize;
 use futures::future::try_join_all;
 use quickwit_common::pretty::PrettySample;
 use quickwit_directories::{CachingDirectory, HotDirectory, StorageDirectory};
-use quickwit_doc_mapper::{DocMapper, FastFieldWarmupInfo, TermRange, WarmupInfo};
+use quickwit_doc_mapper::{Automaton, DocMapper, FastFieldWarmupInfo, TermRange, WarmupInfo};
 use quickwit_proto::search::{
     CountHits, LeafSearchRequest, LeafSearchResponse, PartialHit, ResourceStats, SearchRequest,
     SortOrder, SortValue, SplitIdAndFooterOffsets, SplitSearchError,
@@ -226,6 +226,9 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any
     // TODO merge warm_up_postings into warm_up_term_dict_fields
     let warm_up_postings_future = warm_up_postings(searcher, &warmup_info.term_dict_fields)
         .instrument(debug_span!("warm_up_postings"));
+    let warm_up_automatons_future =
+        warm_up_automatons(searcher, &warmup_info.automatons_grouped_by_field)
+            .instrument(debug_span!("warm_up_automatons"));
 
     tokio::try_join!(
         warm_up_terms_future,
@@ -234,6 +237,7 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any
         warm_up_term_dict_future,
         warm_up_fieldnorms_future,
         warm_up_postings_future,
+        warm_up_automatons_future,
     )?;
 
     Ok(())
@@ -351,6 +355,47 @@ async fn warm_up_term_ranges(
     Ok(())
 }
 
+async fn warm_up_automatons(
+    searcher: &Searcher,
+    terms_grouped_by_field: &HashMap<Field, HashSet<Automaton>>,
+) -> anyhow::Result<()> {
+    let mut warm_up_futures = Vec::new();
+    let cpu_intensive_executor = |task| async {
+        crate::search_thread_pool()
+            .run_cpu_intensive(task)
+            .await
+            .map_err(|_| std::io::Error::other("task panicked"))?
+    };
+    for (field, automatons) in terms_grouped_by_field {
+        for segment_reader in searcher.segment_readers() {
+            let inv_idx = segment_reader.inverted_index(*field)?;
+            for automaton in automatons {
+                let inv_idx_clone = inv_idx.clone();
+                warm_up_futures.push(async move {
+                    match automaton {
+                        Automaton::Regex(path, regex_str) => {
+                            let regex = tantivy_fst::Regex::new(regex_str)
+                                .context("failed to parse regex during warmup")?;
+                            inv_idx_clone
+                                .warm_postings_automaton(
+                                    quickwit_query::query_ast::JsonPathPrefix {
+                                        automaton: regex.into(),
+                                        prefix: path.clone().unwrap_or_default(),
+                                    },
+                                    cpu_intensive_executor,
+                                )
+                                .await
+                                .context("failed to load automaton")
+                        }
+                    }
+                });
+            }
+        }
+    }
+    try_join_all(warm_up_futures).await?;
+    Ok(())
+}
+
 async fn warm_up_fieldnorms(searcher: &Searcher, requires_scoring: bool) -> anyhow::Result<()> {
     if !requires_scoring {
         return Ok(());
diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
index 668e4877cfc..d4d33233b1f 100644
--- a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
+++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
@@ -158,7 +158,7 @@ json:
     query_string:
       default_field: payload.description
       lenient: true
-      query: "Jour* AND unix"
+      query: "Jou*al AND unix"
 expected:
   hits:
     total:
@@ -170,7 +170,7 @@ json:
     query_string:
       default_field: payload.description
      lenient: true
-      query: "jour* AND unix"
+      query: "jou*al AND unix"
 expected:
   hits:
     total:
@@ -200,6 +200,16 @@ expected:
     total:
       value: 1
 ---
+json:
+  query:
+    regexp:
+      payload.description:
+        value: "jour.*"
+expected:
+  hits:
+    total:
+      value: 3
+---
 json:
   query:
     query_string: