From 6bbddfaab61709e464481c4471b1b69db242c930 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Tue, 17 Dec 2024 17:09:17 +0100 Subject: [PATCH 1/7] run wildcard as automatons --- quickwit/Cargo.lock | 152 +++++---- quickwit/Cargo.toml | 3 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 18 + quickwit/quickwit-doc-mapper/src/lib.rs | 6 +- .../quickwit-doc-mapper/src/query_builder.rs | 28 +- quickwit/quickwit-query/Cargo.toml | 1 + .../src/query_ast/wildcard_query.rs | 311 ++++++++++-------- quickwit/quickwit-search/Cargo.toml | 1 + quickwit/quickwit-search/src/leaf.rs | 35 +- 9 files changed, 338 insertions(+), 217 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 1662803ed0b..3a5b45ed497 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -679,7 +679,7 @@ dependencies = [ "regex-lite", "roxmltree 0.14.1", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1046,7 +1046,7 @@ dependencies = [ "miniserde", "peakmem-alloc", "perf-event", - "rustc-hash 2.0.0", + "rustc-hash", "rustop", "unicode-width 0.1.14", "yansi", @@ -2833,7 +2833,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -2848,7 +2848,7 @@ checksum = "f8bdaaa4bc036e8318274d1b25f0f2265b3e95418b765fd1ea1c7ef938fd69bd" dependencies = [ "google-cloud-token", "http 0.2.12", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-retry", "tonic", @@ -2874,7 +2874,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96e4ad0802d3f416f62e7ce01ac1460898ee0efc98f8b45cd4aab7611607012f" dependencies = [ "reqwest", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -2891,7 +2891,7 @@ dependencies = [ "google-cloud-googleapis", "google-cloud-token", "prost-types 0.11.9", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -3637,9 +3637,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" dependencies = [ "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", ] [[package]] @@ -4053,7 +4050,7 @@ dependencies = [ "log", "once_cell", "serde", - "thiserror", + "thiserror 1.0.69", "yada", ] @@ -4333,11 +4330,10 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measure_time" -version = "0.8.3" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc" +checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e" dependencies = [ - "instant", "log", ] @@ -4467,7 +4463,7 @@ dependencies = [ "rustc_version", "smallvec", "tagptr", - "thiserror", + "thiserror 1.0.69", "triomphe", "uuid", ] @@ -4481,7 +4477,7 @@ dependencies = [ "crc32fast", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tracing", ] @@ -4777,7 +4773,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "sha2", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -4908,7 +4904,7 @@ dependencies = [ "serde_plain", "serde_with 1.14.0", "subtle", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -5005,7 +5001,7 @@ dependencies = [ "opentelemetry_sdk", "prost 0.11.9", "reqwest", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", ] @@ -5043,7 +5039,7 @@ dependencies = [ "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "urlencoding", ] @@ -5065,7 +5061,7 @@ dependencies = [ "rand 0.8.5", "regex", "serde_json", - 
"thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", ] @@ -5147,7 +5143,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "stable_deref_trait", ] @@ -5325,7 +5321,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442" dependencies = [ "memchr", - "thiserror", + "thiserror 1.0.69", "ucd-trie", ] @@ -5678,7 +5674,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -5872,7 +5868,7 @@ dependencies = [ "parking_lot", "procfs", "protobuf", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -6158,7 +6154,7 @@ dependencies = [ "serde", "serde_json", "sync_wrapper", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -6222,7 +6218,7 @@ dependencies = [ "serde_json", "tabled", "tempfile", - "thiserror", + "thiserror 1.0.69", "thousands", "tikv-jemalloc-ctl", "tikv-jemallocator", @@ -6292,7 +6288,7 @@ dependencies = [ "quickwit-common", "quickwit-proto", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tonic", @@ -6330,7 +6326,7 @@ dependencies = [ "serde_json", "siphasher", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-metrics", "tokio-stream", @@ -6461,7 +6457,7 @@ dependencies = [ "serde_yaml", "siphasher", "tantivy", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "utoipa", @@ -6481,7 +6477,7 @@ dependencies = [ "quickwit-metastore", "quickwit-proto", "quickwit-storage", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6539,7 +6535,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6577,7 +6573,7 @@ dependencies = [ "serde_json", "serde_json_borrow", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", "tower", @@ -6672,7 +6668,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6766,7 +6762,7 @@ dependencies = [ "serial_test", "sqlx", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -6793,7 +6789,7 @@ dependencies = [ "quickwit-proto", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tonic", @@ -6824,7 +6820,7 @@ dependencies = [ "serde", "serde_json", "sqlx", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", "tonic-build", @@ -6851,11 +6847,12 @@ dependencies = [ "proptest", "quickwit-common", "quickwit-datetime", + "regex", "serde", "serde_json", "serde_with 3.11.0", "tantivy", - "thiserror", + "thiserror 1.0.69", "time", "whichlang", ] @@ -6877,7 +6874,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "wiremock", @@ -6917,7 +6914,8 @@ dependencies = [ "serde", "serde_json", "tantivy", - "thiserror", + "tantivy-fst", + "thiserror 1.0.69", "tokio", "tokio-stream", "tower", @@ -6980,7 +6978,7 @@ dependencies = [ "serde_qs 0.12.0", "serde_with 3.11.0", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -7032,7 +7030,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - 
"thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-util", @@ -7251,7 +7249,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -7575,12 +7573,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.0.0" @@ -7810,7 +7802,7 @@ dependencies = [ "proc-macro2", "quote", "syn 2.0.89", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -7958,7 +7950,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" dependencies = [ "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -7970,7 +7962,7 @@ dependencies = [ "futures", "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", "tracing", "warp", ] @@ -8204,7 +8196,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -8389,7 +8381,7 @@ dependencies = [ "sha2", "smallvec", "sqlformat", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -8474,7 +8466,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "whoami", @@ -8513,7 +8505,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "whoami", @@ -8740,7 +8732,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "aho-corasick", "arc-swap", @@ -8769,7 +8761,7 @@ dependencies = [ "rayon", "regex", "rust-stemmers", - "rustc-hash 1.1.0", + "rustc-hash", "serde", "serde_json", "sketches-ddsketch", @@ -8783,7 +8775,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror", + "thiserror 2.0.7", "time", "uuid", "winapi 0.3.9", @@ -8793,7 +8785,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "bitpacking", ] @@ -8801,7 +8793,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "downcast-rs", "fastdivide", @@ -8816,7 +8808,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = 
"git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "async-trait", "byteorder", @@ -8839,7 +8831,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "nom", ] @@ -8847,8 +8839,10 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ + "futures-util", + "itertools 0.13.0", "tantivy-bitpacker", "tantivy-common", "tantivy-fst", @@ -8858,7 +8852,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "murmurhash32", "rand_distr", @@ -8868,7 +8862,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "serde", ] @@ -8935,7 +8929,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93605438cbd668185516ab499d589afb7ee1859ea3d5fc8f6b0755e1c7443767" +dependencies = [ + "thiserror-impl 2.0.7", ] [[package]] @@ -8949,6 +8952,17 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "thiserror-impl" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d8749b4531af2117677a5fcd12b1348a3fe2b81e36e61ffeac5c4aa3273e36" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", +] + [[package]] name = "thousands" version = "0.2.0" @@ -9025,7 +9039,7 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78bfd61bca99323ce96911bd2c443259115460615e44f1d449cee8cb3831a1dd" dependencies = [ - "thiserror", + "thiserror 1.0.69", "time", ] @@ -9487,7 +9501,7 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "url", "utf-8", ] @@ -9822,7 +9836,7 @@ dependencies = [ "strip-ansi-escapes", "syslog_loose", "termcolor", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "uaparser", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index c3e3051470c..6e7fa89c9a5 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -328,12 +328,13 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = 
"https://github.com/quickwit-oss/tantivy/", rev = "2f2db16", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", branch = "trinity/sstable-partial-automaton", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", "zstd-compression", ] } +tantivy-fst = "0.5" # This is actually not used directly the goal is to fix the version # used by reqwest. diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 146c2f1f51c..059fed62222 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -85,6 +85,13 @@ pub struct TermRange { pub limit: Option, } +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +/// Supported automaton types to warmup +pub enum Automaton { + /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq + Regex(String), +} + /// Information about what a DocMapper think should be warmed up before /// running the query. #[derive(Debug, Default, Clone, PartialEq, Eq)] @@ -100,6 +107,8 @@ pub struct WarmupInfo { pub terms_grouped_by_field: HashMap>, /// Term ranges to warmup, and whether their position is needed too. pub term_ranges_grouped_by_field: HashMap>, + /// Automatons to warmup + pub automatons_grouped_by_field: HashMap>, } impl WarmupInfo { @@ -125,6 +134,11 @@ impl WarmupInfo { *sub_map.entry(term_range).or_default() |= include_position; } } + + for (field, automatons) in other.automatons_grouped_by_field.into_iter() { + let sub_map = self.automatons_grouped_by_field.entry(field).or_default(); + sub_map.extend(automatons); + } } /// Simplify a WarmupInfo, removing some redundant tasks @@ -624,6 +638,7 @@ mod tests { (2, "term1", false), (2, "term2", false), ]), + automatons_grouped_by_field: HashMap::new(), // TODO complete tests }; // merging with default has no impact @@ -641,6 +656,7 @@ mod tests { (3, "term1", false), (2, "term2", true), ]), + automatons_grouped_by_field: HashMap::new(), // TODO complete tests }; wi_base.merge(wi_2.clone()); @@ -710,6 +726,7 @@ mod tests { (1, "term2", true), (2, "term3", false), ]), + automatons_grouped_by_field: HashMap::new(), // TODO complete tests }; let expected = WarmupInfo { term_dict_fields: hashset_field(&[1]), @@ -720,6 +737,7 @@ mod tests { (1, "term2", true), (2, "term3", false), ]), + automatons_grouped_by_field: HashMap::new(), // TODO complete tests }; warmup_info.simplify(); diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index c592616e86a..312d2e69d69 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -35,9 +35,9 @@ mod routing_expression; pub mod tag_pruning; pub use doc_mapper::{ - analyze_text, BinaryFormat, DocMapper, DocMapperBuilder, FieldMappingEntry, FieldMappingType, - JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, TokenizerConfig, - TokenizerEntry, WarmupInfo, + analyze_text, Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FieldMappingEntry, + FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, + TokenizerConfig, TokenizerEntry, WarmupInfo, }; use doc_mapper::{ FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema, diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index dbc663794e5..6dae2e29590 100644 --- 
a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -31,7 +31,7 @@ use tantivy::query::Query; use tantivy::schema::{Field, Schema}; use tantivy::Term; -use crate::{QueryParserError, TermRange, WarmupInfo}; +use crate::{Automaton, QueryParserError, TermRange, WarmupInfo}; #[derive(Default)] struct RangeQueryFields { @@ -101,7 +101,7 @@ pub(crate) fn build_query( )?; let term_set_query_fields = extract_term_set_query_fields(query_ast, &schema)?; - let term_ranges_grouped_by_field = + let (term_ranges_grouped_by_field, automatons_grouped_by_field) = extract_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?; let mut terms_grouped_by_field: HashMap> = Default::default(); @@ -119,6 +119,7 @@ pub(crate) fn build_query( terms_grouped_by_field, term_ranges_grouped_by_field, fast_field_names, + automatons_grouped_by_field, ..WarmupInfo::default() }; @@ -194,6 +195,7 @@ struct ExtractPrefixTermRanges<'a> { schema: &'a Schema, tokenizer_manager: &'a TokenizerManager, term_ranges_to_warm_up: HashMap>, + automatons_to_warm_up: HashMap>, } impl<'a> ExtractPrefixTermRanges<'a> { @@ -202,6 +204,7 @@ impl<'a> ExtractPrefixTermRanges<'a> { schema, tokenizer_manager, term_ranges_to_warm_up: HashMap::new(), + automatons_to_warm_up: HashMap::new(), } } @@ -225,6 +228,13 @@ impl<'a> ExtractPrefixTermRanges<'a> { .entry(term_range) .or_default() |= position_needed; } + + fn add_automaton(&mut self, field: Field, automaton: Automaton) { + self.automatons_to_warm_up + .entry(field) + .or_default() + .insert(automaton); + } } impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { @@ -258,8 +268,8 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> { - let (_, term) = wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager)?; - self.add_prefix_term(term, u32::MAX, false); + let (field, regex) = wildcard_query.to_regex(self.schema, self.tokenizer_manager)?; + self.add_automaton(field, Automaton::Regex(regex)); Ok(()) } } @@ -268,10 +278,16 @@ fn extract_prefix_term_ranges( query_ast: &QueryAst, schema: &Schema, tokenizer_manager: &TokenizerManager, -) -> anyhow::Result>> { +) -> anyhow::Result<( + HashMap>, + HashMap>, +)> { let mut visitor = ExtractPrefixTermRanges::with_schema(schema, tokenizer_manager); visitor.visit(query_ast)?; - Ok(visitor.term_ranges_to_warm_up) + Ok(( + visitor.term_ranges_to_warm_up, + visitor.automatons_to_warm_up, + )) } #[cfg(test)] diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index bee650198c8..35ddeaac479 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -18,6 +18,7 @@ lindera-core = { workspace = true, optional = true } lindera-dictionary = { workspace = true, optional = true } lindera-tokenizer = { workspace = true, optional = true } once_cell = { workspace = true } +regex = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 86afb68a7d3..173872528d2 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -17,7 +17,9 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. 
If not, see . -use anyhow::{anyhow, bail, Context}; +use std::borrow::Cow; + +use anyhow::{bail, Context}; use serde::{Deserialize, Serialize}; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use tantivy::Term; @@ -52,76 +54,52 @@ impl WildcardQuery { } } -fn extract_unique_token(mut tokens: Vec) -> anyhow::Result { - let term = tokens - .pop() - .with_context(|| "wildcard query generated no term")?; - if !tokens.is_empty() { - anyhow::bail!("wildcard query generated more than one term"); - } - Ok(term) -} - -fn unescape_with_final_wildcard(phrase: &str) -> anyhow::Result { - enum State { - Normal, - Escaped, - } - - // we keep this state outside of scan because we want to query if after - let mut saw_wildcard = false; - let saw_wildcard = &mut saw_wildcard; - - let phrase = phrase - .chars() - .scan(State::Normal, |state, c| { - if *saw_wildcard { - return Some(Some(Err(anyhow!( - "Wildcard iquery contains wildcard in non final position" - )))); - } - match state { - State::Escaped => { - *state = State::Normal; - Some(Some(Ok(c))) - } - State::Normal => { - if c == '*' { - *saw_wildcard = true; - Some(None) - } else if c == '\\' { - *state = State::Escaped; - Some(None) - } else if c == '?' { - Some(Some(Err(anyhow!("Wildcard query contains `?`")))) - } else { - Some(Some(Ok(c))) - } +fn parse_wildcard_query(mut query: &str) -> Vec { + let mut res = Vec::new(); + while let Some(pos) = query.find(['*', '?', '\\']) { + if pos > 0 { + res.push(SubQuery::Text(query[..pos].to_string())); + } + let chr = &query[pos..pos + 1]; + query = &query[pos + 1..]; + match chr { + "*" => res.push(SubQuery::Wildcard), + "?" => res.push(SubQuery::QuestionMark), + "\\" => { + if let Some(chr) = query.chars().next() { + res.push(SubQuery::Text(chr.to_string())); + query = &query[chr.len_utf8()..]; + } else { + // this is invalid, but let's just ignore that escape sequence + break; } } - }) - // we have an iterator of Option> - .flatten() - // we have an iterator of Result - .collect::>()?; - if !*saw_wildcard { - bail!("Wildcard query doesn't contain a wildcard"); + _ => unreachable!("find shouldn't return non-matching position"), + } + } + if !query.is_empty() { + res.push(SubQuery::Text(query.to_string())); } - Ok(phrase) + res +} + +enum SubQuery { + Text(String), + Wildcard, + QuestionMark, } impl WildcardQuery { - // TODO this method will probably disappear once we support the full semantic of - // wildcard queries - pub fn extract_prefix_term( + pub fn to_regex( &self, schema: &TantivySchema, tokenizer_manager: &TokenizerManager, - ) -> Result<(Field, Term), InvalidQuery> { + ) -> Result<(Field, String), InvalidQuery> { let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; let field_type = field_entry.field_type(); - let prefix = unescape_with_final_wildcard(&self.value)?; + let sub_query_parts = parse_wildcard_query(&self.value); + // TODO handle json_path match field_type { FieldType::Str(ref text_options) => { @@ -137,14 +115,28 @@ impl WildcardQuery { .with_context(|| { format!("no tokenizer named `{}` is registered", tokenizer_name) })?; - let mut token_stream = normalizer.token_stream(&prefix); - let mut tokens = Vec::new(); - token_stream.process(&mut |token| { - let term: Term = Term::from_field_text(field, &token.text); - tokens.push(term); - }); - let term = extract_unique_token(tokens)?; - Ok((field, term)) + + let regex = sub_query_parts + .into_iter() + .map(|part| match part { + SubQuery::Text(text) => { + let mut token_stream = 
normalizer.token_stream(&text); + let expected_token = token_stream + .next() + .context("normalizer generated no content")? + .text + .clone(); + if let Some(_unexpected_token) = token_stream.next() { + bail!("normalizer generated multiple tokens") + } + Ok(Cow::Owned(regex::escape(&expected_token))) + } + SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), + SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), + }) + .collect::>()?; + + Ok((field, regex)) } FieldType::JsonObject(json_options) => { let text_field_indexing = @@ -160,26 +152,115 @@ impl WildcardQuery { .with_context(|| { format!("no tokenizer named `{}` is registered", tokenizer_name) })?; - let mut token_stream = normalizer.token_stream(&prefix); - let mut tokens = Vec::new(); - - token_stream.process(&mut |token| { - let mut term = Term::from_field_json_path( - field, - json_path, - json_options.is_expand_dots_enabled(), - ); - term.append_type_and_str(&token.text); - tokens.push(term); - }); - let term = extract_unique_token(tokens)?; - Ok((field, term)) + + let mut term_for_path = Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ); + term_for_path.append_type_and_str(""); + + let value = term_for_path.value(); + // this shouldn't error: json path was a string, and all things added while encoding + // the path are valid ascii (and valid utf-8). We also skip the 1st byte which is a + // marker to tell this is json. This isn't present in the dictionary + let path_prefix = std::str::from_utf8(&value.as_serialized()[1..]) + .context("failed to extract json path from term")?; + let regex = std::iter::once(Ok(Cow::Owned(regex::escape(path_prefix)))) + .chain(sub_query_parts.into_iter().map(|part| match part { + SubQuery::Text(text) => { + let mut token_stream = normalizer.token_stream(&text); + let expected_token = token_stream + .next() + .context("normalizer generated no content")? 
+ .text + .clone(); + if let Some(_unexpected_token) = token_stream.next() { + bail!("normalizer generated multiple tokens") + } + Ok(Cow::Owned(regex::escape(&expected_token))) + } + SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), + SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), + })) + .collect::>()?; + Ok((field, regex)) } _ => Err(InvalidQuery::SchemaError( "trying to run a Wildcard query on a non-text field".to_string(), )), } } + + /* + pub fn extract_prefix_term( + &self, + schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, + ) -> Result<(Field, Term), InvalidQuery> { + let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; + let field_type = field_entry.field_type(); + + let prefix = unescape_with_final_wildcard(&self.value)?; + + match field_type { + FieldType::Str(ref text_options) => { + let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + let tokenizer_name = text_field_indexing.tokenizer(); + let mut normalizer = tokenizer_manager + .get_normalizer(tokenizer_name) + .with_context(|| { + format!("no tokenizer named `{}` is registered", tokenizer_name) + })?; + let mut token_stream = normalizer.token_stream(&prefix); + let mut tokens = Vec::new(); + token_stream.process(&mut |token| { + let term: Term = Term::from_field_text(field, &token.text); + tokens.push(term); + }); + let term = extract_unique_token(tokens)?; + Ok((field, term)) + } + FieldType::JsonObject(json_options) => { + let text_field_indexing = + json_options.get_text_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + let tokenizer_name = text_field_indexing.tokenizer(); + let mut normalizer = tokenizer_manager + .get_normalizer(tokenizer_name) + .with_context(|| { + format!("no tokenizer named `{}` is registered", tokenizer_name) + })?; + let mut token_stream = normalizer.token_stream(&prefix); + let mut tokens = Vec::new(); + + token_stream.process(&mut |token| { + let mut term = Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ); + term.append_type_and_str(&token.text); + tokens.push(term); + }); + let term = extract_unique_token(tokens)?; + Ok((field, term)) + } + _ => Err(InvalidQuery::SchemaError( + "trying to run a Wildcard query on a non-text field".to_string(), + )), + } + } + */ } impl BuildTantivyAst for WildcardQuery { @@ -190,66 +271,22 @@ impl BuildTantivyAst for WildcardQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?; + /* + let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?; - let mut phrase_prefix_query = - tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]); - phrase_prefix_query.set_max_expansions(u32::MAX); - Ok(phrase_prefix_query.into()) + let mut phrase_prefix_query = + tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]); + phrase_prefix_query.set_max_expansions(u32::MAX); + Ok(phrase_prefix_query.into()) + */ + let (field, regex) = self.to_regex(schema, tokenizer_manager)?; + let regex_query = tantivy::query::RegexQuery::from_pattern(®ex, field) + .context("failed to build regex from wildcard")?; + Ok(regex_query.into()) } } #[cfg(test)] mod tests { - use tantivy::schema::{TextFieldIndexing, TextOptions}; - - use super::*; - 
use crate::create_default_quickwit_tokenizer_manager; - - #[test] - fn test_extract_term_for_wildcard() { - let query = WildcardQuery { - field: "my_field".to_string(), - value: "MyString Wh1ch a nOrMal Tokenizer would cut*".to_string(), - }; - let tokenizer_manager = create_default_quickwit_tokenizer_manager(); - for tokenizer in ["raw", "whitespace"] { - let mut schema_builder = TantivySchema::builder(); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); - schema_builder.add_text_field("my_field", text_options); - let schema = schema_builder.build(); - - let (_field, term) = query - .extract_prefix_term(&schema, &tokenizer_manager) - .unwrap(); - let value = term.value(); - let text = value.as_str().unwrap(); - assert_eq!(text, query.value.trim_end_matches('*')); - } - - for tokenizer in [ - "raw_lowercase", - "lowercase", - "default", - "en_stem", - "chinese_compatible", - "source_code_default", - "source_code_with_hex", - ] { - let mut schema_builder = TantivySchema::builder(); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); - schema_builder.add_text_field("my_field", text_options); - let schema = schema_builder.build(); - - let (_field, term) = query - .extract_prefix_term(&schema, &tokenizer_manager) - .unwrap(); - - let value = term.value(); - let text = value.as_str().unwrap(); - assert_eq!(text, &query.value.trim_end_matches('*').to_lowercase()); - } - } + // TODO add test } diff --git a/quickwit/quickwit-search/Cargo.toml b/quickwit/quickwit-search/Cargo.toml index 3aeb47a6eb7..ce03e38a78e 100644 --- a/quickwit/quickwit-search/Cargo.toml +++ b/quickwit/quickwit-search/Cargo.toml @@ -28,6 +28,7 @@ rayon = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tantivy = { workspace = true } +tantivy-fst = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true } tokio-stream = { workspace = true } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 5ad92f63aa2..7c99a04f013 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -27,7 +27,7 @@ use anyhow::Context; use futures::future::try_join_all; use quickwit_common::pretty::PrettySample; use quickwit_directories::{CachingDirectory, HotDirectory, StorageDirectory}; -use quickwit_doc_mapper::{DocMapper, TermRange, WarmupInfo}; +use quickwit_doc_mapper::{Automaton, DocMapper, TermRange, WarmupInfo}; use quickwit_proto::search::{ CountHits, LeafSearchRequest, LeafSearchResponse, PartialHit, SearchRequest, SortOrder, SortValue, SplitIdAndFooterOffsets, SplitSearchError, @@ -218,6 +218,9 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any // TODO merge warm_up_postings into warm_up_term_dict_fields let warm_up_postings_future = warm_up_postings(searcher, &warmup_info.term_dict_fields) .instrument(debug_span!("warm_up_postings")); + let warm_up_automatons_future = + warm_up_automatons(searcher, &warmup_info.automatons_grouped_by_field) + .instrument(debug_span!("warm_up_automatons")); tokio::try_join!( warm_up_terms_future, @@ -226,6 +229,7 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any warm_up_term_dict_future, warm_up_fieldnorms_future, warm_up_postings_future, + warm_up_automatons_future, )?; Ok(()) @@ -337,6 +341,35 @@ async fn warm_up_term_ranges( Ok(()) } +async fn 
warm_up_automatons( + searcher: &Searcher, + terms_grouped_by_field: &HashMap>, +) -> anyhow::Result<()> { + let mut warm_up_futures = Vec::new(); + for (field, automatons) in terms_grouped_by_field { + for segment_reader in searcher.segment_readers() { + let inv_idx = segment_reader.inverted_index(*field)?; + for automaton in automatons { + let inv_idx_clone = inv_idx.clone(); + warm_up_futures.push(async move { + match automaton { + Automaton::Regex(regex_str) => { + let regex = tantivy_fst::Regex::new(regex_str) + .context("failed parsing regex during warmup")?; + inv_idx_clone + .warm_postings_automaton(®ex) + .await + .context("failed loading automaton") + } + } + }); + } + } + } + try_join_all(warm_up_futures).await?; + Ok(()) +} + async fn warm_up_fieldnorms(searcher: &Searcher, requires_scoring: bool) -> anyhow::Result<()> { if !requires_scoring { return Ok(()); From 9d83c5f84f2e6e4744afff802e66e27d3683407b Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Wed, 18 Dec 2024 11:04:09 +0100 Subject: [PATCH 2/7] add tests for new wildcard queries --- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 47 ++++- .../quickwit-doc-mapper/src/query_builder.rs | 12 +- .../src/query_ast/wildcard_query.rs | 172 ++++++++++-------- .../0005-query_string_query.yaml | 4 +- 4 files changed, 143 insertions(+), 92 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 059fed62222..d5b27fa6204 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -589,6 +589,13 @@ mod tests { elements.iter().map(|elem| elem.to_string()).collect() } + fn automaton_hashset(elements: &[&str]) -> HashSet { + elements + .iter() + .map(|elem| Automaton::Regex(elem.to_string())) + .collect() + } + fn hashset_field(elements: &[u32]) -> HashSet { elements .iter() @@ -638,7 +645,12 @@ mod tests { (2, "term1", false), (2, "term2", false), ]), - automatons_grouped_by_field: HashMap::new(), // TODO complete tests + automatons_grouped_by_field: [( + Field::from_field_id(1), + automaton_hashset(&["my_reg.*ex"]), + )] + .into_iter() + .collect(), }; // merging with default has no impact @@ -656,7 +668,12 @@ mod tests { (3, "term1", false), (2, "term2", true), ]), - automatons_grouped_by_field: HashMap::new(), // TODO complete tests + automatons_grouped_by_field: [ + (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])), + (Field::from_field_id(2), automaton_hashset(&["my_reg.*ex"])), + ] + .into_iter() + .collect(), }; wi_base.merge(wi_2.clone()); @@ -704,6 +721,17 @@ mod tests { ); } + let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")]; + for (field, regex) in expected_automatons { + let field = Field::from_field_id(field); + let automaton = Automaton::Regex(regex.to_string()); + assert!(wi_base + .automatons_grouped_by_field + .get(&field) + .unwrap() + .contains(&automaton)); + } + // merge is idempotent let mut wi_cloned = wi_base.clone(); wi_cloned.merge(wi_2); @@ -726,7 +754,13 @@ mod tests { (1, "term2", true), (2, "term3", false), ]), - automatons_grouped_by_field: HashMap::new(), // TODO complete tests + automatons_grouped_by_field: [ + (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])), + (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])), + (Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])), + ] + .into_iter() + .collect(), }; let expected = WarmupInfo { term_dict_fields: hashset_field(&[1]), @@ 
-737,7 +771,12 @@ mod tests { (1, "term2", true), (2, "term3", false), ]), - automatons_grouped_by_field: HashMap::new(), // TODO complete tests + automatons_grouped_by_field: [ + (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])), + (Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])), + ] + .into_iter() + .collect(), }; warmup_info.simplify(); diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 6dae2e29590..e28d665046b 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -102,7 +102,7 @@ pub(crate) fn build_query( let term_set_query_fields = extract_term_set_query_fields(query_ast, &schema)?; let (term_ranges_grouped_by_field, automatons_grouped_by_field) = - extract_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?; + extract_prefix_term_ranges_and_automaton(query_ast, &schema, tokenizer_manager)?; let mut terms_grouped_by_field: HashMap> = Default::default(); query.query_terms(&mut |term, need_position| { @@ -274,14 +274,14 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } } -fn extract_prefix_term_ranges( +type TermRangeWarmupInfo = HashMap>; +type AutomatonWarmupInfo = HashMap>; + +fn extract_prefix_term_ranges_and_automaton( query_ast: &QueryAst, schema: &Schema, tokenizer_manager: &TokenizerManager, -) -> anyhow::Result<( - HashMap>, - HashMap>, -)> { +) -> anyhow::Result<(TermRangeWarmupInfo, AutomatonWarmupInfo)> { let mut visitor = ExtractPrefixTermRanges::with_schema(schema, tokenizer_manager); visitor.visit(query_ast)?; Ok(( diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 173872528d2..c459151d021 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -99,7 +99,6 @@ impl WildcardQuery { let field_type = field_entry.field_type(); let sub_query_parts = parse_wildcard_query(&self.value); - // TODO handle json_path match field_type { FieldType::Str(ref text_options) => { @@ -191,76 +190,6 @@ impl WildcardQuery { )), } } - - /* - pub fn extract_prefix_term( - &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - ) -> Result<(Field, Term), InvalidQuery> { - let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; - let field_type = field_entry.field_type(); - - let prefix = unescape_with_final_wildcard(&self.value)?; - - match field_type { - FieldType::Str(ref text_options) => { - let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| { - InvalidQuery::SchemaError(format!( - "field {} is not full-text searchable", - field_entry.name() - )) - })?; - let tokenizer_name = text_field_indexing.tokenizer(); - let mut normalizer = tokenizer_manager - .get_normalizer(tokenizer_name) - .with_context(|| { - format!("no tokenizer named `{}` is registered", tokenizer_name) - })?; - let mut token_stream = normalizer.token_stream(&prefix); - let mut tokens = Vec::new(); - token_stream.process(&mut |token| { - let term: Term = Term::from_field_text(field, &token.text); - tokens.push(term); - }); - let term = extract_unique_token(tokens)?; - Ok((field, term)) - } - FieldType::JsonObject(json_options) => { - let text_field_indexing = - json_options.get_text_indexing_options().ok_or_else(|| { - InvalidQuery::SchemaError(format!( - "field {} is not full-text searchable", - 
field_entry.name()
-                    ))
-                })?;
-                let tokenizer_name = text_field_indexing.tokenizer();
-                let mut normalizer = tokenizer_manager
-                    .get_normalizer(tokenizer_name)
-                    .with_context(|| {
-                        format!("no tokenizer named `{}` is registered", tokenizer_name)
-                    })?;
-                let mut token_stream = normalizer.token_stream(&prefix);
-                let mut tokens = Vec::new();
-
-                token_stream.process(&mut |token| {
-                    let mut term = Term::from_field_json_path(
-                        field,
-                        json_path,
-                        json_options.is_expand_dots_enabled(),
-                    );
-                    term.append_type_and_str(&token.text);
-                    tokens.push(term);
-                });
-                let term = extract_unique_token(tokens)?;
-                Ok((field, term))
-            }
-            _ => Err(InvalidQuery::SchemaError(
-                "trying to run a Wildcard query on a non-text field".to_string(),
-            )),
-        }
-    }
-    }
-    */
 }
 
 impl BuildTantivyAst for WildcardQuery {
@@ -271,14 +200,6 @@ impl BuildTantivyAst for WildcardQuery {
         _search_fields: &[String],
         _with_validation: bool,
     ) -> Result<TantivyQueryAst, InvalidQuery> {
-        /*
-        let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?;
-
-        let mut phrase_prefix_query =
-            tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]);
-        phrase_prefix_query.set_max_expansions(u32::MAX);
-        Ok(phrase_prefix_query.into())
-        */
         let (field, regex) = self.to_regex(schema, tokenizer_manager)?;
         let regex_query = tantivy::query::RegexQuery::from_pattern(&regex, field)
             .context("failed to build regex from wildcard")?;
@@ -288,5 +209,96 @@ impl BuildTantivyAst for WildcardQuery {
 
 #[cfg(test)]
 mod tests {
-    // TODO add test
+    use tantivy::schema::{TextFieldIndexing, TextOptions};
+
+    use super::*;
+    use crate::create_default_quickwit_tokenizer_manager;
+
+    #[test]
+    fn test_wildcard_query_to_regex_on_text() {
+        let query = WildcardQuery {
+            field: "text_field".to_string(),
+            value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
+        };
+
+        let tokenizer_manager = create_default_quickwit_tokenizer_manager();
+        for tokenizer in ["raw", "whitespace"] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_text_field("text_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+            assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut");
+        }
+
+        for tokenizer in [
+            "raw_lowercase",
+            "lowercase",
+            "default",
+            "en_stem",
+            "chinese_compatible",
+            "source_code_default",
+            "source_code_with_hex",
+        ] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_text_field("text_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+
+            assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut");
+        }
+    }
+
+    #[test]
+    fn test_wildcard_query_to_regex_on_json() {
+        let query = WildcardQuery {
+            // this voluntarily contains uppercase and regex-unsafe chars to make sure we
+            // properly keep the case, but sanitize special chars
+            field: "json_field.Inner.Fie*ld".to_string(),
+            value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
+        };
+
+        let tokenizer_manager = create_default_quickwit_tokenizer_manager();
+        for tokenizer in ["raw", "whitespace"] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_json_field("json_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+            assert_eq!(
+                regex,
+                "Inner\u{1}Fie\\*ld\0sMyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"
+            );
+        }
+
+        for tokenizer in [
+            "raw_lowercase",
+            "lowercase",
+            "default",
+            "en_stem",
+            "chinese_compatible",
+            "source_code_default",
+            "source_code_with_hex",
+        ] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_json_field("json_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+
+            assert_eq!(
+                regex,
+                "Inner\u{1}Fie\\*ld\0smystring wh1ch.a\\.normal tokenizer would.*cut"
+            );
+        }
+    }
 }
diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
index 8cb495379c3..84c6bb3d790 100644
--- a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
+++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
@@ -158,7 +158,7 @@ json:
     query_string:
       default_field: payload.description
       lenient: true
-      query: "Jour* AND unix"
+      query: "Jou*al AND unix"
 expected:
   hits:
     total:
@@ -170,7 +170,7 @@ json:
     query_string:
       default_field: payload.description
       lenient: true
-      query: "jour* AND unix"
+      query: "jou*al AND unix"
 expected:
   hits:
     total:

From f65193832fd887561844681c3f12a62c3f1c5a7b Mon Sep 17 00:00:00 2001
From: trinity Pointard
Date: Wed, 18 Dec 2024 14:22:40 +0100
Subject: [PATCH 3/7] refactor json path handling for automaton queries

it should make it easier to use other kinds of queries
---
 quickwit/Cargo.lock                                |   1 +
 .../quickwit-doc-mapper/src/doc_mapper/mod.rs      |   9 +-
 .../quickwit-doc-mapper/src/query_builder.rs       |   4 +-
 quickwit/quickwit-query/Cargo.toml                 |   1 +
 quickwit/quickwit-query/src/query_ast/mod.rs       |   2 +-
 .../src/query_ast/wildcard_query.rs                | 174 +++++++++++++++---
 quickwit/quickwit-search/src/leaf.rs               |   9 +-
 7 files changed, 162 insertions(+), 38 deletions(-)

diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index 3a5b45ed497..d8ac98ead61 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -6852,6 +6852,7 @@ dependencies = [
  "serde_json",
  "serde_with 3.11.0",
  "tantivy",
+ "tantivy-fst",
  "thiserror 1.0.69",
  "time",
  "whichlang",
 ]

diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
index d5b27fa6204..1504e26c743 100644
--- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
+++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
@@ -88,8 +88,9 @@ pub struct TermRange {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 /// Supported automaton types to warmup
 pub enum Automaton {
-    /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq
-    Regex(String),
+    /// A regex in its str representation, as tantivy_fst::Regex isn't PartialEq, and the path
+    /// if it's inside a json field
+    Regex(Option<Vec<u8>>, String),
 }
 
 /// Information about what a DocMapper think should be warmed up before
@@ -592,7 +593,7 @@ mod tests {
     fn automaton_hashset(elements: &[&str]) -> HashSet<Automaton> {
         elements
             .iter()
            .map(|elem|
Automaton::Regex(None, elem.to_string())) .collect() } @@ -724,7 +725,7 @@ mod tests { let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")]; for (field, regex) in expected_automatons { let field = Field::from_field_id(field); - let automaton = Automaton::Regex(regex.to_string()); + let automaton = Automaton::Regex(None, regex.to_string()); assert!(wi_base .automatons_grouped_by_field .get(&field) diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index e28d665046b..f42231afef8 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -268,8 +268,8 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> { - let (field, regex) = wildcard_query.to_regex(self.schema, self.tokenizer_manager)?; - self.add_automaton(field, Automaton::Regex(regex)); + let (field, path, regex) = wildcard_query.to_regex(self.schema, self.tokenizer_manager)?; + self.add_automaton(field, Automaton::Regex(path, regex)); Ok(()) } } diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 35ddeaac479..733f8769056 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -23,6 +23,7 @@ serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } tantivy = { workspace = true } +tantivy-fst = { workspace = true } time = { workspace = true } thiserror = { workspace = true } whichlang = { workspace = true, optional = true } diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index 8699e05d238..b99d5079d41 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -46,7 +46,7 @@ pub use term_query::TermQuery; pub use term_set_query::TermSetQuery; pub use user_input_query::UserInputQuery; pub use visitor::{QueryAstTransformer, QueryAstVisitor}; -pub use wildcard_query::WildcardQuery; +pub use wildcard_query::{JsonPathPrefix, WildcardQuery}; use crate::{BooleanOperand, InvalidQuery, NotNaNf32}; diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index c459151d021..64bc47ac082 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -18,8 +18,10 @@ // along with this program. If not, see . 
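// ---------------------------------------------------------------------------
// Illustration (not part of the diff): a minimal sketch of what
// `WildcardQuery::to_regex` produces after this commit, assuming the public
// API shown in this series (`pub field`/`pub value` on `WildcardQuery`, a
// public `to_regex`, and `create_default_quickwit_tokenizer_manager`).
// `*` maps to `.*`, `?` maps to `.`, and literal text is normalized and then
// regex-escaped.
//
//     use quickwit_query::create_default_quickwit_tokenizer_manager;
//     use quickwit_query::query_ast::WildcardQuery;
//     use tantivy::schema::{Schema, TextFieldIndexing, TextOptions};
//
//     let query = WildcardQuery {
//         field: "text_field".to_string(),
//         value: "qui?kwit w*ld".to_string(),
//     };
//     let mut schema_builder = Schema::builder();
//     let text_options = TextOptions::default()
//         .set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw"));
//     schema_builder.add_text_field("text_field", text_options);
//     let schema = schema_builder.build();
//     let tokenizer_manager = create_default_quickwit_tokenizer_manager();
//
//     // the "raw" normalizer keeps literal text as-is; wildcards are
//     // translated while everything else is regex-escaped
//     let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
//     assert_eq!(regex, "qui.kwit w.*ld");
//     assert!(path.is_none()); // only JSON fields carry a serialized path prefix
// ---------------------------------------------------------------------------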
 use std::borrow::Cow;
+use std::sync::Arc;
 
 use anyhow::{bail, Context};
+pub use prefix::JsonPathPrefix;
 use serde::{Deserialize, Serialize};
 use tantivy::schema::{Field, FieldType, Schema as TantivySchema};
 use tantivy::Term;
@@ -94,7 +96,7 @@ impl WildcardQuery {
         &self,
         schema: &TantivySchema,
         tokenizer_manager: &TokenizerManager,
-    ) -> Result<(Field, String), InvalidQuery> {
+    ) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> {
         let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?;
         let field_type = field_entry.field_type();
@@ -135,7 +137,7 @@ impl WildcardQuery {
                     })
                     .collect::<Result<String, _>>()?;
 
-                Ok((field, regex))
+                Ok((field, None, regex))
             }
             FieldType::JsonObject(json_options) => {
                 let text_field_indexing =
@@ -160,13 +162,12 @@ impl WildcardQuery {
                 term_for_path.append_type_and_str("");
 
                 let value = term_for_path.value();
-                // this shouldn't error: json path was a string, and all things added while encoding
-                // the path are valid ascii (and valid utf-8). We also skip the 1st byte which is a
-                // marker to tell this is json. This isn't present in the dictionary
-                let path_prefix = std::str::from_utf8(&value.as_serialized()[1..])
-                    .context("failed to extract json path from term")?;
-                let regex = std::iter::once(Ok(Cow::Owned(regex::escape(path_prefix))))
-                    .chain(sub_query_parts.into_iter().map(|part| match part {
+                // We skip the 1st byte which is a marker to tell this is json. This isn't present
+                // in the dictionary
+                let byte_path_prefix = value.as_serialized()[1..].to_owned();
+                let regex = sub_query_parts
+                    .into_iter()
+                    .map(|part| match part {
                         SubQuery::Text(text) => {
                             let mut token_stream = normalizer.token_stream(&text);
                             let expected_token = token_stream
@@ -181,9 +182,9 @@ impl WildcardQuery {
                         }
                         SubQuery::Wildcard => Ok(Cow::Borrowed(".*")),
                         SubQuery::QuestionMark => Ok(Cow::Borrowed(".")),
-                    }))
+                    })
                     .collect::<Result<String, _>>()?;
-                Ok((field, regex))
+                Ok((field, Some(byte_path_prefix), regex))
             }
             _ => Err(InvalidQuery::SchemaError(
                 "trying to run a Wildcard query on a non-text field".to_string(),
@@ -200,10 +201,129 @@ impl BuildTantivyAst for WildcardQuery {
         _search_fields: &[String],
         _with_validation: bool,
     ) -> Result<TantivyQueryAst, InvalidQuery> {
-        let (field, regex) = self.to_regex(schema, tokenizer_manager)?;
-        let regex_query = tantivy::query::RegexQuery::from_pattern(&regex, field)
-            .context("failed to build regex from wildcard")?;
-        Ok(regex_query.into())
+        let (field, path, regex) = self.to_regex(schema, tokenizer_manager)?;
+        let regex =
+            tantivy_fst::Regex::new(&regex).context("failed to parse regex built from wildcard")?;
+        let regex_automaton_with_path = prefix::JsonPathPrefix {
+            prefix: path.unwrap_or_default(),
+            automaton: regex,
+        };
+        let regex_query_with_path = prefix::AutomatonQuery {
+            field,
+            automaton: Arc::new(regex_automaton_with_path),
+        };
+        Ok(regex_query_with_path.into())
+    }
+}
+
+mod prefix {
+    use std::sync::Arc;
+
+    use tantivy::query::{AutomatonWeight, EnableScoring, Query, Weight};
+    use tantivy::schema::Field;
+    use tantivy_fst::Automaton;
+
+    pub struct JsonPathPrefix<A> {
+        pub prefix: Vec<u8>,
+        pub automaton: A,
+    }
+
+    #[derive(Clone)]
+    pub enum JsonPathPrefixState<A> {
+        Prefix(usize),
+        Inner(A),
+        PrefixFailed,
+    }
+
+    impl<A: Automaton> Automaton for JsonPathPrefix<A> {
+        type State = JsonPathPrefixState<A::State>;
+
+        fn start(&self) -> Self::State {
+            if self.prefix.is_empty() {
+                JsonPathPrefixState::Inner(self.automaton.start())
+            } else {
+                JsonPathPrefixState::Prefix(0)
+            }
+        }
+
+        fn is_match(&self, state: &Self::State) -> bool {
+            match state {
JsonPathPrefixState::Prefix(_) => false, + JsonPathPrefixState::Inner(inner_state) => self.automaton.is_match(inner_state), + JsonPathPrefixState::PrefixFailed => false, + } + } + + fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + match state { + JsonPathPrefixState::Prefix(i) => { + if self.prefix.get(*i) != Some(&byte) { + return JsonPathPrefixState::PrefixFailed; + } + let next_pos = i + 1; + if next_pos == self.prefix.len() { + JsonPathPrefixState::Inner(self.automaton.start()) + } else { + JsonPathPrefixState::Prefix(next_pos) + } + } + JsonPathPrefixState::Inner(inner_state) => { + JsonPathPrefixState::Inner(self.automaton.accept(inner_state, byte)) + } + JsonPathPrefixState::PrefixFailed => JsonPathPrefixState::PrefixFailed, + } + } + + fn can_match(&self, state: &Self::State) -> bool { + match state { + JsonPathPrefixState::Prefix(_) => true, + JsonPathPrefixState::Inner(inner_state) => self.automaton.can_match(inner_state), + JsonPathPrefixState::PrefixFailed => false, + } + } + + fn will_always_match(&self, state: &Self::State) -> bool { + match state { + JsonPathPrefixState::Prefix(_) => false, + JsonPathPrefixState::Inner(inner_state) => { + self.automaton.will_always_match(inner_state) + } + JsonPathPrefixState::PrefixFailed => false, + } + } + } + + pub struct AutomatonQuery { + pub automaton: Arc, + pub field: Field, + } + + impl std::fmt::Debug for AutomatonQuery { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("AutomatonQuery") + .field("field", &self.field) + .field("automaton", &std::any::type_name::()) + .finish() + } + } + + impl Clone for AutomatonQuery { + fn clone(&self) -> Self { + AutomatonQuery { + automaton: self.automaton.clone(), + field: self.field, + } + } + } + + impl Query for AutomatonQuery + where A::State: Clone + { + fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> tantivy::Result> { + Ok(Box::new(AutomatonWeight::::new( + self.field, + self.automaton.clone(), + ))) + } } } @@ -229,8 +349,9 @@ mod tests { schema_builder.add_text_field("text_field", text_options); let schema = schema_builder.build(); - let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"); + assert!(path.is_none()); } for tokenizer in [ @@ -248,9 +369,9 @@ mod tests { schema_builder.add_text_field("text_field", text_options); let schema = schema_builder.build(); - let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); - + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut"); + assert!(path.is_none()); } } @@ -271,11 +392,9 @@ mod tests { schema_builder.add_json_field("json_field", text_options); let schema = schema_builder.build(); - let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); - assert_eq!( - regex, - "Inner\u{1}Fie\\*ld\0sMyString Wh1ch.a\\.nOrMal Tokenizer would.*cut" - ); + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"); + assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes()); } for tokenizer in [ @@ -293,12 +412,9 @@ mod tests { schema_builder.add_json_field("json_field", text_options); let schema = schema_builder.build(); - let (_field, regex) = query.to_regex(&schema, 
&tokenizer_manager).unwrap(); - - assert_eq!( - regex, - "Inner\u{1}Fie\\*ld\0smystring wh1ch.a\\.normal tokenizer would.*cut" - ); + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut"); + assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes()); } } } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 7c99a04f013..c321b4fe662 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -353,11 +353,16 @@ async fn warm_up_automatons( let inv_idx_clone = inv_idx.clone(); warm_up_futures.push(async move { match automaton { - Automaton::Regex(regex_str) => { + Automaton::Regex(path, regex_str) => { let regex = tantivy_fst::Regex::new(regex_str) .context("failed parsing regex during warmup")?; inv_idx_clone - .warm_postings_automaton(®ex) + .warm_postings_automaton( + &quickwit_query::query_ast::JsonPathPrefix { + automaton: regex, + prefix: path.clone().unwrap_or_default(), + }, + ) .await .context("failed loading automaton") } From 4aada41c084e2ae7641d843455a9b1b355145ebe Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Wed, 18 Dec 2024 15:54:24 +0100 Subject: [PATCH 4/7] regex support --- .../quickwit-doc-mapper/src/query_builder.rs | 8 +- .../quickwit-doc-mapper/src/tag_pruning.rs | 1 + .../src/elastic_query_dsl/mod.rs | 4 + .../src/elastic_query_dsl/regex_query.rs | 42 ++++++ quickwit/quickwit-query/src/query_ast/mod.rs | 14 +- .../src/query_ast/regex_query.rs | 124 ++++++++++++++++++ .../quickwit-query/src/query_ast/visitor.rs | 14 +- .../src/query_ast/wildcard_query.rs | 2 +- 8 files changed, 203 insertions(+), 6 deletions(-) create mode 100644 quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs create mode 100644 quickwit/quickwit-query/src/query_ast/regex_query.rs diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index f42231afef8..77fda563987 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -23,7 +23,7 @@ use std::ops::Bound; use quickwit_query::query_ast::{ FieldPresenceQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, QueryAstVisitor, RangeQuery, - TermSetQuery, WildcardQuery, + RegexQuery, TermSetQuery, WildcardQuery, }; use quickwit_query::tokenizers::TokenizerManager; use quickwit_query::{find_field_or_hit_dynamic, InvalidQuery}; @@ -272,6 +272,12 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { self.add_automaton(field, Automaton::Regex(path, regex)); Ok(()) } + + fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> { + let (field, path, regex) = regex_query.to_regex(self.schema)?; + self.add_automaton(field, Automaton::Regex(path, regex)); + Ok(()) + } } type TermRangeWarmupInfo = HashMap>; diff --git a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs index e3b2255efc3..76082b34bb4 100644 --- a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs +++ b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs @@ -118,6 +118,7 @@ fn extract_unsimplified_tags_filter_ast(query_ast: QueryAst) -> UnsimplifiedTagF panic!("Extract unsimplified should only be called on AST without UserInputQuery."); } QueryAst::FieldPresence(_) => UnsimplifiedTagFilterAst::Uninformative, + QueryAst::Regex(_) => UnsimplifiedTagFilterAst::Uninformative, } } diff --git 
a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs index 9e49c866d95..8b841f43f7f 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs @@ -29,6 +29,7 @@ mod one_field_map; mod phrase_prefix_query; mod query_string_query; mod range_query; +mod regex_query; mod string_or_struct; mod term_query; mod terms_query; @@ -46,6 +47,7 @@ use crate::elastic_query_dsl::match_bool_prefix::MatchBoolPrefixQuery; use crate::elastic_query_dsl::match_phrase_query::MatchPhraseQuery; use crate::elastic_query_dsl::match_query::MatchQuery; use crate::elastic_query_dsl::multi_match::MultiMatchQuery; +use crate::elastic_query_dsl::regex_query::RegexQuery; use crate::elastic_query_dsl::terms_query::TermsQuery; use crate::not_nan_f32::NotNaNf32; use crate::query_ast::QueryAst; @@ -79,6 +81,7 @@ pub(crate) enum ElasticQueryDslInner { MultiMatch(MultiMatchQuery), Range(RangeQuery), Exists(ExistsQuery), + Regexp(RegexQuery), } #[derive(Deserialize, Debug, Eq, PartialEq, Clone)] @@ -126,6 +129,7 @@ impl ConvertibleToQueryAst for ElasticQueryDslInner { Self::Match(match_query) => match_query.convert_to_query_ast(), Self::Exists(exists_query) => exists_query.convert_to_query_ast(), Self::MultiMatch(multi_match_query) => multi_match_query.convert_to_query_ast(), + Self::Regexp(regex_query) => regex_query.convert_to_query_ast(), } } } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs new file mode 100644 index 00000000000..f3c2d0427f0 --- /dev/null +++ b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs @@ -0,0 +1,42 @@ +// Copyright (C) 2024 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
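
For orientation before the file body: this new module maps the Elasticsearch-style `regexp` leaf onto the query AST. A minimal round-trip sketch, assuming the `OneFieldMap` wrapper exposes the target field as `field` and the params as `value` as in the code below; the field name and pattern are illustrative, and the call is assumed to run in a function returning anyhow::Result:

    // Hypothetical: deserialize an ES `regexp` body into the alias defined below.
    let query: RegexQuery =
        serde_json::from_str(r#"{ "payload.description": { "value": "jour.*" } }"#)?;
    assert_eq!(query.field, "payload.description");
    assert_eq!(query.value.value, "jour.*");

The outer {"regexp": ...} key itself is dispatched one level up, by the Regexp variant added to ElasticQueryDslInner in mod.rs above.
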
+ +use serde::Deserialize; + +use crate::elastic_query_dsl::one_field_map::OneFieldMap; +use crate::elastic_query_dsl::ConvertibleToQueryAst; +use crate::query_ast::{QueryAst, RegexQuery as AstRegexQuery}; + +#[derive(Deserialize, Debug, Default, Eq, PartialEq, Clone)] +#[serde(deny_unknown_fields)] +pub struct RegexQueryParams { + value: String, +} + +pub type RegexQuery = OneFieldMap; + +impl ConvertibleToQueryAst for RegexQuery { + fn convert_to_query_ast(self) -> anyhow::Result { + Ok(AstRegexQuery { + field: self.field, + regex: self.value.value, + } + .into()) + } +} diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index b99d5079d41..9f2c5b34f61 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -28,6 +28,7 @@ mod field_presence; mod full_text_query; mod phrase_prefix_query; mod range_query; +mod regex_query; mod tantivy_query_ast; mod term_query; mod term_set_query; @@ -41,12 +42,13 @@ pub use field_presence::FieldPresenceQuery; pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery}; pub use phrase_prefix_query::PhrasePrefixQuery; pub use range_query::RangeQuery; +pub use regex_query::RegexQuery; use tantivy_query_ast::TantivyQueryAst; pub use term_query::TermQuery; pub use term_set_query::TermSetQuery; pub use user_input_query::UserInputQuery; pub use visitor::{QueryAstTransformer, QueryAstVisitor}; -pub use wildcard_query::{JsonPathPrefix, WildcardQuery}; +pub use wildcard_query::{AutomatonQuery, JsonPathPrefix, WildcardQuery}; use crate::{BooleanOperand, InvalidQuery, NotNaNf32}; @@ -63,6 +65,7 @@ pub enum QueryAst { Range(RangeQuery), UserInput(UserInputQuery), Wildcard(WildcardQuery), + Regex(RegexQuery), MatchAll, MatchNone, Boost { @@ -105,7 +108,8 @@ impl QueryAst { | ast @ QueryAst::MatchNone | ast @ QueryAst::FieldPresence(_) | ast @ QueryAst::Range(_) - | ast @ QueryAst::Wildcard(_) => Ok(ast), + | ast @ QueryAst::Wildcard(_) + | ast @ QueryAst::Regex(_) => Ok(ast), QueryAst::UserInput(user_text_query) => { user_text_query.parse_user_query(default_search_fields) } @@ -249,6 +253,12 @@ impl BuildTantivyAst for QueryAst { search_fields, with_validation, ), + QueryAst::Regex(regex) => regex.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + ), } } } diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs new file mode 100644 index 00000000000..513cec1a9b6 --- /dev/null +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -0,0 +1,124 @@ +// Copyright (C) 2024 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
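
Before the implementation: the new AST node reuses the JsonPathPrefix automaton introduced for wildcard queries. A hand-driven sketch of how the composed automaton consumes the serialized JSON path (the \x01 byte separates path segments, \0 plus the `s` type byte marks a string value) before handing off to the inner regex; the prefix bytes and pattern are illustrative, and tantivy_fst::Automaton must be in scope:

    // Illustrative: the prefix must match byte-for-byte before the regex starts.
    let automaton = JsonPathPrefix {
        prefix: b"sub\x01field\0s".to_vec(),
        automaton: tantivy_fst::Regex::new("jour.*").unwrap(),
    };
    let mut state = automaton.start();
    for &byte in b"sub\x01field\0sjournal" {
        state = automaton.accept(&state, byte);
    }
    // "journal" matches "jour.*" once the path prefix has been consumed.
    assert!(automaton.is_match(&state));
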
+ +use std::sync::Arc; + +use anyhow::Context; +use serde::{Deserialize, Serialize}; +use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; +use tantivy::Term; + +use super::{BuildTantivyAst, QueryAst}; +use crate::query_ast::{AutomatonQuery, JsonPathPrefix, TantivyQueryAst}; +use crate::tokenizers::TokenizerManager; +use crate::{find_field_or_hit_dynamic, InvalidQuery}; + +/// A Wildcard query allows to match 'bond' with a query like 'b*d'. +/// +/// At the moment, only wildcard at end of term is supported. +#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] +pub struct RegexQuery { + pub field: String, + pub regex: String, +} + +impl From for QueryAst { + fn from(regex_query: RegexQuery) -> Self { + Self::Regex(regex_query) + } +} + +impl RegexQuery { + #[cfg(test)] + pub fn from_field_value(field: impl ToString, regex: impl ToString) -> Self { + Self { + field: field.to_string(), + regex: regex.to_string(), + } + } +} + +impl RegexQuery { + pub fn to_regex( + &self, + schema: &TantivySchema, + ) -> Result<(Field, Option>, String), InvalidQuery> { + let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; + let field_type = field_entry.field_type(); + + match field_type { + FieldType::Str(ref text_options) => { + text_options.get_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + + Ok((field, None, self.regex.clone())) + } + FieldType::JsonObject(json_options) => { + json_options.get_text_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + + let mut term_for_path = Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ); + term_for_path.append_type_and_str(""); + + let value = term_for_path.value(); + // We skip the 1st byte which is a marker to tell this is json. 
This isn't present + // in the dictionary + let byte_path_prefix = value.as_serialized()[1..].to_owned(); + Ok((field, Some(byte_path_prefix), self.regex.clone())) + } + _ => Err(InvalidQuery::SchemaError( + "trying to run a regex query on a non-text field".to_string(), + )), + } + } +} + +impl BuildTantivyAst for RegexQuery { + fn build_tantivy_ast_impl( + &self, + schema: &TantivySchema, + _tokenizer_manager: &TokenizerManager, + _search_fields: &[String], + _with_validation: bool, + ) -> Result { + let (field, path, regex) = self.to_regex(schema)?; + let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex")?; + let regex_automaton_with_path = JsonPathPrefix { + prefix: path.unwrap_or_default(), + automaton: regex, + }; + let regex_query_with_path = AutomatonQuery { + field, + automaton: Arc::new(regex_automaton_with_path), + }; + Ok(regex_query_with_path.into()) + } +} diff --git a/quickwit/quickwit-query/src/query_ast/visitor.rs b/quickwit/quickwit-query/src/query_ast/visitor.rs index bd85a71d64e..c9ce180b518 100644 --- a/quickwit/quickwit-query/src/query_ast/visitor.rs +++ b/quickwit/quickwit-query/src/query_ast/visitor.rs @@ -21,8 +21,8 @@ use crate::not_nan_f32::NotNaNf32; use crate::query_ast::field_presence::FieldPresenceQuery; use crate::query_ast::user_input_query::UserInputQuery; use crate::query_ast::{ - BoolQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, RangeQuery, TermQuery, TermSetQuery, - WildcardQuery, + BoolQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, RangeQuery, RegexQuery, TermQuery, + TermSetQuery, WildcardQuery, }; /// Simple trait to implement a Visitor over the QueryAst. @@ -45,6 +45,7 @@ pub trait QueryAstVisitor<'a> { QueryAst::UserInput(user_text_query) => self.visit_user_text(user_text_query), QueryAst::FieldPresence(exists) => self.visit_exists(exists), QueryAst::Wildcard(wildcard) => self.visit_wildcard(wildcard), + QueryAst::Regex(regex) => self.visit_regex(regex), } } @@ -111,6 +112,10 @@ pub trait QueryAstVisitor<'a> { fn visit_wildcard(&mut self, _wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> { Ok(()) } + + fn visit_regex(&mut self, _regex_query: &'a RegexQuery) -> Result<(), Self::Err> { + Ok(()) + } } /// Simple trait to implement a Visitor over the QueryAst. 
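
Because both hooks default to a no-op, downstream visitors compile unchanged and only opt in explicitly. A minimal sketch of opting in (the counting visitor is hypothetical):

    // Hypothetical visitor that counts regex leaves in a query AST.
    struct RegexCounter {
        count: usize,
    }

    impl<'a> QueryAstVisitor<'a> for RegexCounter {
        type Err = std::convert::Infallible;

        fn visit_regex(&mut self, _regex_query: &'a RegexQuery) -> Result<(), Self::Err> {
            self.count += 1;
            Ok(())
        }
    }

    // Usage: let mut counter = RegexCounter { count: 0 };
    //        counter.visit(&ast)?;
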
@@ -133,6 +138,7 @@ pub trait QueryAstTransformer { QueryAst::UserInput(user_text_query) => self.transform_user_text(user_text_query), QueryAst::FieldPresence(exists) => self.transform_exists(exists), QueryAst::Wildcard(wildcard) => self.transform_wildcard(wildcard), + QueryAst::Regex(regex) => self.transform_regex(regex), } } @@ -231,4 +237,8 @@ pub trait QueryAstTransformer { ) -> Result, Self::Err> { Ok(Some(QueryAst::Wildcard(wildcard_query))) } + + fn transform_regex(&mut self, regex_query: RegexQuery) -> Result, Self::Err> { + Ok(Some(QueryAst::Regex(regex_query))) + } } diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 64bc47ac082..2c184883a88 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -21,7 +21,7 @@ use std::borrow::Cow; use std::sync::Arc; use anyhow::{bail, Context}; -pub use prefix::JsonPathPrefix; +pub use prefix::{AutomatonQuery, JsonPathPrefix}; use serde::{Deserialize, Serialize}; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use tantivy::Term; From 314fd9337dce52e961f1571be42c2356422bd453 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Mon, 6 Jan 2025 13:47:12 +0100 Subject: [PATCH 5/7] run automaton in search thread pool --- quickwit/Cargo.lock | 19 ++++++++++--------- .../src/query_ast/regex_query.rs | 2 +- .../src/query_ast/wildcard_query.rs | 15 +++++++++++++-- quickwit/quickwit-search/src/leaf.rs | 11 +++++++++-- 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index d8ac98ead61..aa681d503cd 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -5143,7 +5143,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "stable_deref_trait", ] @@ -8733,7 +8733,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "aho-corasick", "arc-swap", @@ -8747,6 +8747,7 @@ dependencies = [ "fastdivide", "fnv", "fs4", + "futures-channel", "futures-util", "htmlescape", "hyperloglogplus", @@ -8786,7 +8787,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "bitpacking", ] @@ -8794,7 +8795,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = 
"git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "downcast-rs", "fastdivide", @@ -8809,7 +8810,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "async-trait", "byteorder", @@ -8832,7 +8833,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "nom", ] @@ -8840,7 +8841,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "futures-util", "itertools 0.13.0", @@ -8853,7 +8854,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "murmurhash32", "rand_distr", @@ -8863,7 +8864,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "serde", ] diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs index 513cec1a9b6..bea193c76eb 100644 --- a/quickwit/quickwit-query/src/query_ast/regex_query.rs +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -113,7 +113,7 @@ impl BuildTantivyAst for RegexQuery { let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex")?; let regex_automaton_with_path = JsonPathPrefix { prefix: path.unwrap_or_default(), - automaton: regex, + automaton: regex.into(), }; let regex_query_with_path = AutomatonQuery { field, diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 2c184883a88..cc36bc7480e 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -206,7 +206,7 @@ impl BuildTantivyAst for WildcardQuery { tantivy_fst::Regex::new(®ex).context("failed to parse regex built from wildcard")?; let regex_automaton_with_path = prefix::JsonPathPrefix { prefix: path.unwrap_or_default(), - automaton: regex, + automaton: regex.into(), }; let regex_query_with_path = prefix::AutomatonQuery { field, @@ -222,9 +222,20 @@ mod prefix { use tantivy::query::{AutomatonWeight, 
EnableScoring, Query, Weight}; use tantivy::schema::Field; use tantivy_fst::Automaton; + pub struct JsonPathPrefix { pub prefix: Vec, - pub automaton: A, + pub automaton: Arc, + } + + // we need to implement manually because the std adds an unnecessary bound `A: Clone` + impl Clone for JsonPathPrefix { + fn clone(&self) -> Self { + JsonPathPrefix { + prefix: self.prefix.clone(), + automaton: self.automaton.clone(), + } + } } #[derive(Clone)] diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index c321b4fe662..c93960bb137 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -346,6 +346,12 @@ async fn warm_up_automatons( terms_grouped_by_field: &HashMap>, ) -> anyhow::Result<()> { let mut warm_up_futures = Vec::new(); + let cpu_intensive_executor = |task| async { + crate::search_thread_pool() + .run_cpu_intensive(task) + .await + .map_err(|_| std::io::Error::other("task panicked"))? + }; for (field, automatons) in terms_grouped_by_field { for segment_reader in searcher.segment_readers() { let inv_idx = segment_reader.inverted_index(*field)?; @@ -358,10 +364,11 @@ async fn warm_up_automatons( .context("failed parsing regex during warmup")?; inv_idx_clone .warm_postings_automaton( - &quickwit_query::query_ast::JsonPathPrefix { - automaton: regex, + quickwit_query::query_ast::JsonPathPrefix { + automaton: regex.into(), prefix: path.clone().unwrap_or_default(), }, + cpu_intensive_executor, ) .await .context("failed loading automaton") From a7f502b91f3991533054d54bebbef83c07993127 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Thu, 9 Jan 2025 16:58:44 +0100 Subject: [PATCH 6/7] cleanup, refactor and test --- quickwit/Cargo.lock | 44 ++- quickwit/Cargo.toml | 2 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 1 + .../quickwit-doc-mapper/src/query_builder.rs | 2 +- .../src/elastic_query_dsl/regex_query.rs | 1 + .../src/query_ast/field_presence.rs | 2 +- quickwit/quickwit-query/src/query_ast/mod.rs | 4 +- .../src/query_ast/regex_query.rs | 265 +++++++++++++++++- .../src/query_ast/wildcard_query.rs | 218 +++----------- .../0005-query_string_query.yaml | 10 + 10 files changed, 351 insertions(+), 198 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index aa681d503cd..298c2626237 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -1121,6 +1121,31 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bon" +version = "3.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe7acc34ff59877422326db7d6f2d845a582b16396b6b08194942bf34c6528ab" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4159dd617a7fbc9be6a692fe69dc2954f8e6bb6bb5e4d7578467441390d77fd0" +dependencies = [ + "darling 0.20.10", + "ident_case", + "prettyplease 0.2.25", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.89", +] + [[package]] name = "borsh" version = "1.5.3" @@ -5143,7 +5168,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "stable_deref_trait", ] @@ -8733,12 +8758,13 @@ checksum = 
"7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "aho-corasick", "arc-swap", "base64 0.22.1", "bitpacking", + "bon", "byteorder", "census", "crc32fast", @@ -8787,7 +8813,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "bitpacking", ] @@ -8795,7 +8821,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "downcast-rs", "fastdivide", @@ -8810,7 +8836,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "async-trait", "byteorder", @@ -8833,7 +8859,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "nom", ] @@ -8841,7 +8867,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "futures-util", "itertools 0.13.0", @@ -8854,7 +8880,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "murmurhash32", "rand_distr", @@ -8864,7 +8890,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "serde", ] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 6e7fa89c9a5..11a04ec0f0f 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -328,7 +328,7 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", 
branch = "trinity/sstable-partial-automaton", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "d281ca3", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 1504e26c743..4754a153873 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -91,6 +91,7 @@ pub enum Automaton { /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq, and the path if /// inside a json field Regex(Option>, String), + // we could add termset query here, instead of downloading the whole dictionary } /// Information about what a DocMapper think should be warmed up before diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 77fda563987..36d03ca23b7 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -274,7 +274,7 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> { - let (field, path, regex) = regex_query.to_regex(self.schema)?; + let (field, path, regex) = regex_query.to_field_and_regex(self.schema)?; self.add_automaton(field, Automaton::Regex(path, regex)); Ok(()) } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs index f3c2d0427f0..55b76131571 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs @@ -27,6 +27,7 @@ use crate::query_ast::{QueryAst, RegexQuery as AstRegexQuery}; #[serde(deny_unknown_fields)] pub struct RegexQueryParams { value: String, + // we could probably add case_insensitive } pub type RegexQuery = OneFieldMap; diff --git a/quickwit/quickwit-query/src/query_ast/field_presence.rs b/quickwit/quickwit-query/src/query_ast/field_presence.rs index df82ab3591a..1544bd5ac46 100644 --- a/quickwit/quickwit-query/src/query_ast/field_presence.rs +++ b/quickwit/quickwit-query/src/query_ast/field_presence.rs @@ -87,7 +87,7 @@ impl BuildTantivyAst for FieldPresenceQuery { } else { format!("{}.{}", field_entry.name(), path) }; - let exists_query = tantivy::query::ExistsQuery::new_exists_query(full_path); + let exists_query = tantivy::query::ExistsQuery::new(full_path, true); Ok(TantivyQueryAst::from(exists_query)) } else { // fallback to the presence field diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index 9f2c5b34f61..31d53ff65f9 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -42,13 +42,13 @@ pub use field_presence::FieldPresenceQuery; pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery}; pub use phrase_prefix_query::PhrasePrefixQuery; pub use range_query::RangeQuery; -pub use regex_query::RegexQuery; +pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery}; use tantivy_query_ast::TantivyQueryAst; pub use term_query::TermQuery; pub use term_set_query::TermSetQuery; pub use user_input_query::UserInputQuery; pub use visitor::{QueryAstTransformer, QueryAstVisitor}; -pub use wildcard_query::{AutomatonQuery, JsonPathPrefix, WildcardQuery}; +pub use 
wildcard_query::WildcardQuery; use crate::{BooleanOperand, InvalidQuery, NotNaNf32}; diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs index bea193c76eb..09960caf3cc 100644 --- a/quickwit/quickwit-query/src/query_ast/regex_query.rs +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -20,18 +20,17 @@ use std::sync::Arc; use anyhow::Context; +pub use prefix::{AutomatonQuery, JsonPathPrefix}; use serde::{Deserialize, Serialize}; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use tantivy::Term; use super::{BuildTantivyAst, QueryAst}; -use crate::query_ast::{AutomatonQuery, JsonPathPrefix, TantivyQueryAst}; +use crate::query_ast::TantivyQueryAst; use crate::tokenizers::TokenizerManager; use crate::{find_field_or_hit_dynamic, InvalidQuery}; -/// A Wildcard query allows to match 'bond' with a query like 'b*d'. -/// -/// At the moment, only wildcard at end of term is supported. +/// A Regex query #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] pub struct RegexQuery { pub field: String, @@ -55,7 +54,7 @@ impl RegexQuery { } impl RegexQuery { - pub fn to_regex( + pub fn to_field_and_regex( &self, schema: &TantivySchema, ) -> Result<(Field, Option>, String), InvalidQuery> { @@ -109,7 +108,7 @@ impl BuildTantivyAst for RegexQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (field, path, regex) = self.to_regex(schema)?; + let (field, path, regex) = self.to_field_and_regex(schema)?; let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex")?; let regex_automaton_with_path = JsonPathPrefix { prefix: path.unwrap_or_default(), @@ -122,3 +121,257 @@ impl BuildTantivyAst for RegexQuery { Ok(regex_query_with_path.into()) } } + +mod prefix { + use std::sync::Arc; + + use tantivy::query::{AutomatonWeight, EnableScoring, Query, Weight}; + use tantivy::schema::Field; + use tantivy_fst::Automaton; + + pub struct JsonPathPrefix { + pub prefix: Vec, + pub automaton: Arc, + } + + // we need to implement manually because the std adds an unnecessary bound `A: Clone` + impl Clone for JsonPathPrefix { + fn clone(&self) -> Self { + JsonPathPrefix { + prefix: self.prefix.clone(), + automaton: self.automaton.clone(), + } + } + } + + #[derive(Clone, Debug, PartialEq)] + pub enum JsonPathPrefixState { + Prefix(usize), + Inner(A), + PrefixFailed, + } + + impl Automaton for JsonPathPrefix { + type State = JsonPathPrefixState; + + fn start(&self) -> Self::State { + if self.prefix.is_empty() { + JsonPathPrefixState::Inner(self.automaton.start()) + } else { + JsonPathPrefixState::Prefix(0) + } + } + + fn is_match(&self, state: &Self::State) -> bool { + match state { + JsonPathPrefixState::Prefix(_) => false, + JsonPathPrefixState::Inner(inner_state) => self.automaton.is_match(inner_state), + JsonPathPrefixState::PrefixFailed => false, + } + } + + fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + match state { + JsonPathPrefixState::Prefix(i) => { + if self.prefix.get(*i) != Some(&byte) { + return JsonPathPrefixState::PrefixFailed; + } + let next_pos = i + 1; + if next_pos == self.prefix.len() { + JsonPathPrefixState::Inner(self.automaton.start()) + } else { + JsonPathPrefixState::Prefix(next_pos) + } + } + JsonPathPrefixState::Inner(inner_state) => { + JsonPathPrefixState::Inner(self.automaton.accept(inner_state, byte)) + } + JsonPathPrefixState::PrefixFailed => JsonPathPrefixState::PrefixFailed, + } + } + + fn can_match(&self, state: &Self::State) -> bool { 
+ match state { + JsonPathPrefixState::Prefix(_) => true, + JsonPathPrefixState::Inner(inner_state) => self.automaton.can_match(inner_state), + JsonPathPrefixState::PrefixFailed => false, + } + } + + fn will_always_match(&self, state: &Self::State) -> bool { + match state { + JsonPathPrefixState::Prefix(_) => false, + JsonPathPrefixState::Inner(inner_state) => { + self.automaton.will_always_match(inner_state) + } + JsonPathPrefixState::PrefixFailed => false, + } + } + } + + // we don't use RegexQuery to handle our path. We could tinker with the regex to embed + // json field path inside, but that seems not as clean, and would prevent support of + // case-insensitive search in the future (we would also make the path insensitive, + // which we shouldn't) + pub struct AutomatonQuery { + pub automaton: Arc, + pub field: Field, + } + + impl std::fmt::Debug for AutomatonQuery { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("AutomatonQuery") + .field("field", &self.field) + .field("automaton", &std::any::type_name::()) + .finish() + } + } + + impl Clone for AutomatonQuery { + fn clone(&self) -> Self { + AutomatonQuery { + automaton: self.automaton.clone(), + field: self.field, + } + } + } + + impl Query for AutomatonQuery + where A::State: Clone + { + fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> tantivy::Result> { + Ok(Box::new(AutomatonWeight::::new( + self.field, + self.automaton.clone(), + ))) + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use tantivy::schema::{Schema as TantivySchema, TEXT}; + use tantivy_fst::{Automaton, Regex}; + + use super::prefix::JsonPathPrefixState; + use super::{JsonPathPrefix, RegexQuery}; + + #[test] + fn test_regex_query_text_field() { + let mut schema_builder = TantivySchema::builder(); + schema_builder.add_text_field("field", TEXT); + let schema = schema_builder.build(); + + let query = RegexQuery { + field: "field".to_string(), + regex: "abc.*xyz".to_string(), + }; + let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); + assert_eq!(field, schema.get_field("field").unwrap()); + assert!(path.is_none()); + assert_eq!(regex, query.regex); + } + + #[test] + fn test_regex_query_json_field() { + let mut schema_builder = TantivySchema::builder(); + schema_builder.add_json_field("field", TEXT); + let schema = schema_builder.build(); + + let query = RegexQuery { + field: "field.sub.field".to_string(), + regex: "abc.*xyz".to_string(), + }; + let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); + assert_eq!(field, schema.get_field("field").unwrap()); + assert_eq!(path.unwrap(), b"sub\x01field\0s"); + assert_eq!(regex, query.regex); + + // i believe this is how concatenated field behave + let query_empty_path = RegexQuery { + field: "field".to_string(), + regex: "abc.*xyz".to_string(), + }; + let (field, path, regex) = query_empty_path.to_field_and_regex(&schema).unwrap(); + assert_eq!(field, schema.get_field("field").unwrap()); + assert_eq!(path.unwrap(), b"\0s"); + assert_eq!(regex, query_empty_path.regex); + } + + #[test] + fn test_json_prefix_automaton_empty_path() { + let regex = Arc::new(Regex::new("e(f|g.*)").unwrap()); + let empty_path_automaton = JsonPathPrefix { + prefix: Vec::new(), + automaton: regex.clone(), + }; + + let start = empty_path_automaton.start(); + assert_eq!(start, JsonPathPrefixState::Inner(regex.start())); + } + + #[test] + fn test_json_prefix_automaton() { + let regex = Arc::new(Regex::new("e(f|g.*)").unwrap()); + let automaton = 
JsonPathPrefix { + prefix: b"ab".to_vec(), + automaton: regex.clone(), + }; + + let start = automaton.start(); + assert!(matches!(start, JsonPathPrefixState::Prefix(_))); + assert!(automaton.can_match(&start)); + assert!(!automaton.is_match(&start)); + + let miss = automaton.accept(&start, b'g'); + assert_eq!(miss, JsonPathPrefixState::PrefixFailed); + // supporting this is important for optimisation + assert!(!automaton.can_match(&miss)); + assert!(!automaton.is_match(&miss)); + + let a = automaton.accept(&start, b'a'); + assert!(matches!(a, JsonPathPrefixState::Prefix(_))); + assert!(automaton.can_match(&a)); + assert!(!automaton.is_match(&a)); + + let ab = automaton.accept(&a, b'b'); + assert_eq!(ab, JsonPathPrefixState::Inner(regex.start())); + assert!(automaton.can_match(&ab)); + assert!(!automaton.is_match(&ab)); + + // starting here, we just take that we passthrough correctly, + // and reply to can_match as well as possible + // (we don't test will_always_match because Regex doesn't support it) + let abc = automaton.accept(&ab, b'c'); + assert!(matches!(abc, JsonPathPrefixState::Inner(_))); + assert!(!automaton.can_match(&abc)); + assert!(!automaton.is_match(&abc)); + + let abe = automaton.accept(&ab, b'e'); + assert!(matches!(abe, JsonPathPrefixState::Inner(_))); + assert!(automaton.can_match(&abe)); + assert!(!automaton.is_match(&abe)); + + let abef = automaton.accept(&abe, b'f'); + assert!(matches!(abef, JsonPathPrefixState::Inner(_))); + assert!(automaton.can_match(&abef)); + assert!(automaton.is_match(&abef)); + + let abefg = automaton.accept(&abef, b'g'); + assert!(matches!(abefg, JsonPathPrefixState::Inner(_))); + assert!(!automaton.can_match(&abefg)); + assert!(!automaton.is_match(&abefg)); + + let abeg = automaton.accept(&abe, b'g'); + assert!(matches!(abeg, JsonPathPrefixState::Inner(_))); + assert!(automaton.can_match(&abeg)); + assert!(automaton.is_match(&abeg)); + + let abegh = automaton.accept(&abeg, b'h'); + assert!(matches!(abegh, JsonPathPrefixState::Inner(_))); + assert!(automaton.can_match(&abegh)); + assert!(automaton.is_match(&abegh)); + } +} diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index cc36bc7480e..78640e6d6b0 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -21,19 +21,16 @@ use std::borrow::Cow; use std::sync::Arc; use anyhow::{bail, Context}; -pub use prefix::{AutomatonQuery, JsonPathPrefix}; use serde::{Deserialize, Serialize}; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use tantivy::Term; use super::{BuildTantivyAst, QueryAst}; -use crate::query_ast::TantivyQueryAst; +use crate::query_ast::{AutomatonQuery, JsonPathPrefix, TantivyQueryAst}; use crate::tokenizers::TokenizerManager; use crate::{find_field_or_hit_dynamic, InvalidQuery}; /// A Wildcard query allows to match 'bond' with a query like 'b*d'. -/// -/// At the moment, only wildcard at end of term is supported. 
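
To make that doc line concrete with the helpers this patch factors out below (a sketch only: it assumes a &TokenizerManager in scope, that "default" names a registered normalizer, and a surrounding function returning anyhow::Result):

    let parts = parse_wildcard_query("b*nd");
    // literal parts are normalized then regex-escaped; `*` -> `.*`, `?` -> `.`
    let regex = sub_query_parts_to_regex(parts, "default", &tokenizer_manager)?;
    assert_eq!(regex, "b.*nd");
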
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] pub struct WildcardQuery { pub field: String, @@ -72,7 +69,8 @@ fn parse_wildcard_query(mut query: &str) -> Vec { res.push(SubQuery::Text(chr.to_string())); query = &query[chr.len_utf8()..]; } else { - // this is invalid, but let's just ignore that escape sequence + // escaping at the end is invalid, handle it as if that escape sequence wasn't + // present break; } } @@ -91,6 +89,36 @@ enum SubQuery { QuestionMark, } +fn sub_query_parts_to_regex( + sub_query_parts: Vec, + tokenizer_name: &str, + tokenizer_manager: &TokenizerManager, +) -> anyhow::Result { + let mut normalizer = tokenizer_manager + .get_normalizer(tokenizer_name) + .with_context(|| format!("no tokenizer named `{}` is registered", tokenizer_name))?; + + sub_query_parts + .into_iter() + .map(|part| match part { + SubQuery::Text(text) => { + let mut token_stream = normalizer.token_stream(&text); + let expected_token = token_stream + .next() + .context("normalizer generated no content")? + .text + .clone(); + if let Some(_unexpected_token) = token_stream.next() { + bail!("normalizer generated multiple tokens") + } + Ok(Cow::Owned(regex::escape(&expected_token))) + } + SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), + SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), + }) + .collect::>() +} + impl WildcardQuery { pub fn to_regex( &self, @@ -111,31 +139,8 @@ impl WildcardQuery { )) })?; let tokenizer_name = text_field_indexing.tokenizer(); - let mut normalizer = tokenizer_manager - .get_normalizer(tokenizer_name) - .with_context(|| { - format!("no tokenizer named `{}` is registered", tokenizer_name) - })?; - - let regex = sub_query_parts - .into_iter() - .map(|part| match part { - SubQuery::Text(text) => { - let mut token_stream = normalizer.token_stream(&text); - let expected_token = token_stream - .next() - .context("normalizer generated no content")? - .text - .clone(); - if let Some(_unexpected_token) = token_stream.next() { - bail!("normalizer generated multiple tokens") - } - Ok(Cow::Owned(regex::escape(&expected_token))) - } - SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), - SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), - }) - .collect::>()?; + let regex = + sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?; Ok((field, None, regex)) } @@ -148,11 +153,8 @@ impl WildcardQuery { )) })?; let tokenizer_name = text_field_indexing.tokenizer(); - let mut normalizer = tokenizer_manager - .get_normalizer(tokenizer_name) - .with_context(|| { - format!("no tokenizer named `{}` is registered", tokenizer_name) - })?; + let regex = + sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?; let mut term_for_path = Term::from_field_json_path( field, @@ -165,25 +167,7 @@ impl WildcardQuery { // We skip the 1st byte which is a marker to tell this is json. This isn't present // in the dictionary let byte_path_prefix = value.as_serialized()[1..].to_owned(); - let regex = sub_query_parts - .into_iter() - .map(|part| match part { - SubQuery::Text(text) => { - let mut token_stream = normalizer.token_stream(&text); - let expected_token = token_stream - .next() - .context("normalizer generated no content")? 
- .text - .clone(); - if let Some(_unexpected_token) = token_stream.next() { - bail!("normalizer generated multiple tokens") - } - Ok(Cow::Owned(regex::escape(&expected_token))) - } - SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), - SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), - }) - .collect::>()?; + Ok((field, Some(byte_path_prefix), regex)) } _ => Err(InvalidQuery::SchemaError( @@ -204,11 +188,11 @@ impl BuildTantivyAst for WildcardQuery { let (field, path, regex) = self.to_regex(schema, tokenizer_manager)?; let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex built from wildcard")?; - let regex_automaton_with_path = prefix::JsonPathPrefix { + let regex_automaton_with_path = JsonPathPrefix { prefix: path.unwrap_or_default(), automaton: regex.into(), }; - let regex_query_with_path = prefix::AutomatonQuery { + let regex_query_with_path = AutomatonQuery { field, automaton: Arc::new(regex_automaton_with_path), }; @@ -216,128 +200,6 @@ impl BuildTantivyAst for WildcardQuery { } } -mod prefix { - use std::sync::Arc; - - use tantivy::query::{AutomatonWeight, EnableScoring, Query, Weight}; - use tantivy::schema::Field; - use tantivy_fst::Automaton; - - pub struct JsonPathPrefix { - pub prefix: Vec, - pub automaton: Arc, - } - - // we need to implement manually because the std adds an unnecessary bound `A: Clone` - impl Clone for JsonPathPrefix { - fn clone(&self) -> Self { - JsonPathPrefix { - prefix: self.prefix.clone(), - automaton: self.automaton.clone(), - } - } - } - - #[derive(Clone)] - pub enum JsonPathPrefixState { - Prefix(usize), - Inner(A), - PrefixFailed, - } - - impl Automaton for JsonPathPrefix { - type State = JsonPathPrefixState; - - fn start(&self) -> Self::State { - if self.prefix.is_empty() { - JsonPathPrefixState::Inner(self.automaton.start()) - } else { - JsonPathPrefixState::Prefix(0) - } - } - - fn is_match(&self, state: &Self::State) -> bool { - match state { - JsonPathPrefixState::Prefix(_) => false, - JsonPathPrefixState::Inner(inner_state) => self.automaton.is_match(inner_state), - JsonPathPrefixState::PrefixFailed => false, - } - } - - fn accept(&self, state: &Self::State, byte: u8) -> Self::State { - match state { - JsonPathPrefixState::Prefix(i) => { - if self.prefix.get(*i) != Some(&byte) { - return JsonPathPrefixState::PrefixFailed; - } - let next_pos = i + 1; - if next_pos == self.prefix.len() { - JsonPathPrefixState::Inner(self.automaton.start()) - } else { - JsonPathPrefixState::Prefix(next_pos) - } - } - JsonPathPrefixState::Inner(inner_state) => { - JsonPathPrefixState::Inner(self.automaton.accept(inner_state, byte)) - } - JsonPathPrefixState::PrefixFailed => JsonPathPrefixState::PrefixFailed, - } - } - - fn can_match(&self, state: &Self::State) -> bool { - match state { - JsonPathPrefixState::Prefix(_) => true, - JsonPathPrefixState::Inner(inner_state) => self.automaton.can_match(inner_state), - JsonPathPrefixState::PrefixFailed => false, - } - } - - fn will_always_match(&self, state: &Self::State) -> bool { - match state { - JsonPathPrefixState::Prefix(_) => false, - JsonPathPrefixState::Inner(inner_state) => { - self.automaton.will_always_match(inner_state) - } - JsonPathPrefixState::PrefixFailed => false, - } - } - } - - pub struct AutomatonQuery { - pub automaton: Arc, - pub field: Field, - } - - impl std::fmt::Debug for AutomatonQuery { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - f.debug_struct("AutomatonQuery") - .field("field", &self.field) - .field("automaton", &std::any::type_name::()) - .finish() - 
} - } - - impl Clone for AutomatonQuery { - fn clone(&self) -> Self { - AutomatonQuery { - automaton: self.automaton.clone(), - field: self.field, - } - } - } - - impl Query for AutomatonQuery - where A::State: Clone - { - fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> tantivy::Result> { - Ok(Box::new(AutomatonWeight::::new( - self.field, - self.automaton.clone(), - ))) - } - } -} - #[cfg(test)] mod tests { use tantivy::schema::{TextFieldIndexing, TextOptions}; diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml index 84c6bb3d790..3e11e8aa561 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml @@ -200,6 +200,16 @@ expected: total: value: 1 --- +json: + query: + regexp: + payload.description: + value: "jour.*" +expected: + hits: + total: + value: 3 +--- json: query: query_string: From 3f65e615d39c46a7fff6b65c68d4998b06d95424 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 10 Jan 2025 15:56:25 +0100 Subject: [PATCH 7/7] improve error messages Co-authored-by: Adrien Guillo --- quickwit/quickwit-query/src/query_ast/wildcard_query.rs | 2 +- quickwit/quickwit-search/src/leaf.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index e52268bf636..a5bdaf1b430 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -88,7 +88,7 @@ fn sub_query_parts_to_regex( ) -> anyhow::Result { let mut normalizer = tokenizer_manager .get_normalizer(tokenizer_name) - .with_context(|| format!("no tokenizer named `{}` is registered", tokenizer_name))?; + .with_context(|| format!("no tokenizer named `{tokenizer_name}` is registered"))?; sub_query_parts .into_iter() diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 5c10931501d..03e6a0ebc8b 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -375,7 +375,7 @@ async fn warm_up_automatons( match automaton { Automaton::Regex(path, regex_str) => { let regex = tantivy_fst::Regex::new(regex_str) - .context("failed parsing regex during warmup")?; + .context("failed to parse regex during warmup")?; inv_idx_clone .warm_postings_automaton( quickwit_query::query_ast::JsonPathPrefix { @@ -385,7 +385,7 @@ async fn warm_up_automatons( cpu_intensive_executor, ) .await - .context("failed loading automaton") + .context("failed to load automaton") } } });
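
Taken together, the regexp scenario added to the REST suite above exercises the same path a caller gets by constructing the AST node directly. A closing sketch (field and pattern mirror that YAML scenario):

    // The elastic `regexp` DSL lowers to this AST node via ConvertibleToQueryAst.
    let ast: QueryAst = RegexQuery {
        field: "payload.description".to_string(),
        regex: "jour.*".to_string(),
    }
    .into();

At query time the pattern is compiled with tantivy_fst::Regex, wrapped in JsonPathPrefix when the target field lives inside a JSON object, and executed as an AutomatonQuery; warm-up runs the same composed automaton through the search thread pool (patch 5), keeping the dictionary scan off the async runtime.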