From 6bbddfaab61709e464481c4471b1b69db242c930 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Tue, 17 Dec 2024 17:09:17 +0100 Subject: [PATCH 1/7] run wildcard as automatons --- quickwit/Cargo.lock | 152 +++++---- quickwit/Cargo.toml | 3 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 18 + quickwit/quickwit-doc-mapper/src/lib.rs | 6 +- .../quickwit-doc-mapper/src/query_builder.rs | 28 +- quickwit/quickwit-query/Cargo.toml | 1 + .../src/query_ast/wildcard_query.rs | 311 ++++++++++-------- quickwit/quickwit-search/Cargo.toml | 1 + quickwit/quickwit-search/src/leaf.rs | 35 +- 9 files changed, 338 insertions(+), 217 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 1662803ed0b..3a5b45ed497 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -679,7 +679,7 @@ dependencies = [ "regex-lite", "roxmltree 0.14.1", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -1046,7 +1046,7 @@ dependencies = [ "miniserde", "peakmem-alloc", "perf-event", - "rustc-hash 2.0.0", + "rustc-hash", "rustop", "unicode-width 0.1.14", "yansi", @@ -2833,7 +2833,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -2848,7 +2848,7 @@ checksum = "f8bdaaa4bc036e8318274d1b25f0f2265b3e95418b765fd1ea1c7ef938fd69bd" dependencies = [ "google-cloud-token", "http 0.2.12", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-retry", "tonic", @@ -2874,7 +2874,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96e4ad0802d3f416f62e7ce01ac1460898ee0efc98f8b45cd4aab7611607012f" dependencies = [ "reqwest", - "thiserror", + "thiserror 1.0.69", "tokio", ] @@ -2891,7 +2891,7 @@ dependencies = [ "google-cloud-googleapis", "google-cloud-token", "prost-types 0.11.9", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -3637,9 +3637,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" dependencies = [ "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", ] [[package]] @@ -4053,7 +4050,7 @@ dependencies = [ "log", "once_cell", "serde", - "thiserror", + "thiserror 1.0.69", "yada", ] @@ -4333,11 +4330,10 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measure_time" -version = "0.8.3" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc" +checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e" dependencies = [ - "instant", "log", ] @@ -4467,7 +4463,7 @@ dependencies = [ "rustc_version", "smallvec", "tagptr", - "thiserror", + "thiserror 1.0.69", "triomphe", "uuid", ] @@ -4481,7 +4477,7 @@ dependencies = [ "crc32fast", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tracing", ] @@ -4777,7 +4773,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "sha2", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -4908,7 +4904,7 @@ dependencies = [ "serde_plain", "serde_with 1.14.0", "subtle", - "thiserror", + "thiserror 1.0.69", "url", ] @@ -5005,7 +5001,7 @@ dependencies = [ "opentelemetry_sdk", "prost 0.11.9", "reqwest", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", ] @@ -5043,7 +5039,7 @@ dependencies = [ "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "urlencoding", ] @@ -5065,7 +5061,7 @@ dependencies = [ "rand 0.8.5", "regex", "serde_json", - 
"thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", ] @@ -5147,7 +5143,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "stable_deref_trait", ] @@ -5325,7 +5321,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "879952a81a83930934cbf1786752d6dedc3b1f29e8f8fb2ad1d0a36f377cf442" dependencies = [ "memchr", - "thiserror", + "thiserror 1.0.69", "ucd-trie", ] @@ -5678,7 +5674,7 @@ dependencies = [ "smallvec", "symbolic-demangle", "tempfile", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -5872,7 +5868,7 @@ dependencies = [ "parking_lot", "procfs", "protobuf", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -6158,7 +6154,7 @@ dependencies = [ "serde", "serde_json", "sync_wrapper", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -6222,7 +6218,7 @@ dependencies = [ "serde_json", "tabled", "tempfile", - "thiserror", + "thiserror 1.0.69", "thousands", "tikv-jemalloc-ctl", "tikv-jemallocator", @@ -6292,7 +6288,7 @@ dependencies = [ "quickwit-common", "quickwit-proto", "serde", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tonic", @@ -6330,7 +6326,7 @@ dependencies = [ "serde_json", "siphasher", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-metrics", "tokio-stream", @@ -6461,7 +6457,7 @@ dependencies = [ "serde_yaml", "siphasher", "tantivy", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "utoipa", @@ -6481,7 +6477,7 @@ dependencies = [ "quickwit-metastore", "quickwit-proto", "quickwit-storage", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6539,7 +6535,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6577,7 +6573,7 @@ dependencies = [ "serde_json", "serde_json_borrow", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", "tower", @@ -6672,7 +6668,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tracing", @@ -6766,7 +6762,7 @@ dependencies = [ "serial_test", "sqlx", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -6793,7 +6789,7 @@ dependencies = [ "quickwit-proto", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tonic", @@ -6824,7 +6820,7 @@ dependencies = [ "serde", "serde_json", "sqlx", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic", "tonic-build", @@ -6851,11 +6847,12 @@ dependencies = [ "proptest", "quickwit-common", "quickwit-datetime", + "regex", "serde", "serde_json", "serde_with 3.11.0", "tantivy", - "thiserror", + "thiserror 1.0.69", "time", "whichlang", ] @@ -6877,7 +6874,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "wiremock", @@ -6917,7 +6914,8 @@ dependencies = [ "serde", "serde_json", "tantivy", - "thiserror", + "tantivy-fst", + "thiserror 1.0.69", "tokio", "tokio-stream", "tower", @@ -6980,7 +6978,7 @@ dependencies = [ "serde_qs 0.12.0", "serde_with 3.11.0", "tempfile", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -7032,7 +7030,7 @@ dependencies = [ "serde_json", "tantivy", "tempfile", - 
"thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", "tokio-util", @@ -7251,7 +7249,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -7575,12 +7573,6 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustc-hash" version = "2.0.0" @@ -7810,7 +7802,7 @@ dependencies = [ "proc-macro2", "quote", "syn 2.0.89", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -7958,7 +7950,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" dependencies = [ "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -7970,7 +7962,7 @@ dependencies = [ "futures", "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", "tracing", "warp", ] @@ -8204,7 +8196,7 @@ checksum = "adc4e5204eb1910f40f9cfa375f6f05b68c3abac4b6fd879c8ff5e7ae8a0a085" dependencies = [ "num-bigint", "num-traits", - "thiserror", + "thiserror 1.0.69", "time", ] @@ -8389,7 +8381,7 @@ dependencies = [ "sha2", "smallvec", "sqlformat", - "thiserror", + "thiserror 1.0.69", "time", "tokio", "tokio-stream", @@ -8474,7 +8466,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "whoami", @@ -8513,7 +8505,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror", + "thiserror 1.0.69", "time", "tracing", "whoami", @@ -8740,7 +8732,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "aho-corasick", "arc-swap", @@ -8769,7 +8761,7 @@ dependencies = [ "rayon", "regex", "rust-stemmers", - "rustc-hash 1.1.0", + "rustc-hash", "serde", "serde_json", "sketches-ddsketch", @@ -8783,7 +8775,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror", + "thiserror 2.0.7", "time", "uuid", "winapi 0.3.9", @@ -8793,7 +8785,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "bitpacking", ] @@ -8801,7 +8793,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "downcast-rs", "fastdivide", @@ -8816,7 +8808,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = 
"git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "async-trait", "byteorder", @@ -8839,7 +8831,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "nom", ] @@ -8847,8 +8839,10 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ + "futures-util", + "itertools 0.13.0", "tantivy-bitpacker", "tantivy-common", "tantivy-fst", @@ -8858,7 +8852,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "murmurhash32", "rand_distr", @@ -8868,7 +8862,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=2f2db16#2f2db16ec10656f9a7ef37018d38e0c6fb5edbe5" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" dependencies = [ "serde", ] @@ -8935,7 +8929,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93605438cbd668185516ab499d589afb7ee1859ea3d5fc8f6b0755e1c7443767" +dependencies = [ + "thiserror-impl 2.0.7", ] [[package]] @@ -8949,6 +8952,17 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "thiserror-impl" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d8749b4531af2117677a5fcd12b1348a3fe2b81e36e61ffeac5c4aa3273e36" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.89", +] + [[package]] name = "thousands" version = "0.2.0" @@ -9025,7 +9039,7 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78bfd61bca99323ce96911bd2c443259115460615e44f1d449cee8cb3831a1dd" dependencies = [ - "thiserror", + "thiserror 1.0.69", "time", ] @@ -9487,7 +9501,7 @@ dependencies = [ "log", "rand 0.8.5", "sha1", - "thiserror", + "thiserror 1.0.69", "url", "utf-8", ] @@ -9822,7 +9836,7 @@ dependencies = [ "strip-ansi-escapes", "syslog_loose", "termcolor", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", "uaparser", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index c3e3051470c..6e7fa89c9a5 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -328,12 +328,13 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = 
"https://github.com/quickwit-oss/tantivy/", rev = "2f2db16", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", branch = "trinity/sstable-partial-automaton", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", "zstd-compression", ] } +tantivy-fst = "0.5" # This is actually not used directly the goal is to fix the version # used by reqwest. diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 146c2f1f51c..059fed62222 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -85,6 +85,13 @@ pub struct TermRange { pub limit: Option, } +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +/// Supported automaton types to warmup +pub enum Automaton { + /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq + Regex(String), +} + /// Information about what a DocMapper think should be warmed up before /// running the query. #[derive(Debug, Default, Clone, PartialEq, Eq)] @@ -100,6 +107,8 @@ pub struct WarmupInfo { pub terms_grouped_by_field: HashMap>, /// Term ranges to warmup, and whether their position is needed too. pub term_ranges_grouped_by_field: HashMap>, + /// Automatons to warmup + pub automatons_grouped_by_field: HashMap>, } impl WarmupInfo { @@ -125,6 +134,11 @@ impl WarmupInfo { *sub_map.entry(term_range).or_default() |= include_position; } } + + for (field, automatons) in other.automatons_grouped_by_field.into_iter() { + let sub_map = self.automatons_grouped_by_field.entry(field).or_default(); + sub_map.extend(automatons); + } } /// Simplify a WarmupInfo, removing some redundant tasks @@ -624,6 +638,7 @@ mod tests { (2, "term1", false), (2, "term2", false), ]), + automatons_grouped_by_field: HashMap::new(), // TODO complete tests }; // merging with default has no impact @@ -641,6 +656,7 @@ mod tests { (3, "term1", false), (2, "term2", true), ]), + automatons_grouped_by_field: HashMap::new(), // TODO complete tests }; wi_base.merge(wi_2.clone()); @@ -710,6 +726,7 @@ mod tests { (1, "term2", true), (2, "term3", false), ]), + automatons_grouped_by_field: HashMap::new(), // TODO complete tests }; let expected = WarmupInfo { term_dict_fields: hashset_field(&[1]), @@ -720,6 +737,7 @@ mod tests { (1, "term2", true), (2, "term3", false), ]), + automatons_grouped_by_field: HashMap::new(), // TODO complete tests }; warmup_info.simplify(); diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index c592616e86a..312d2e69d69 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -35,9 +35,9 @@ mod routing_expression; pub mod tag_pruning; pub use doc_mapper::{ - analyze_text, BinaryFormat, DocMapper, DocMapperBuilder, FieldMappingEntry, FieldMappingType, - JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, TokenizerConfig, - TokenizerEntry, WarmupInfo, + analyze_text, Automaton, BinaryFormat, DocMapper, DocMapperBuilder, FieldMappingEntry, + FieldMappingType, JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, + TokenizerConfig, TokenizerEntry, WarmupInfo, }; use doc_mapper::{ FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema, diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index dbc663794e5..6dae2e29590 100644 --- 
a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -31,7 +31,7 @@ use tantivy::query::Query; use tantivy::schema::{Field, Schema}; use tantivy::Term; -use crate::{QueryParserError, TermRange, WarmupInfo}; +use crate::{Automaton, QueryParserError, TermRange, WarmupInfo}; #[derive(Default)] struct RangeQueryFields { @@ -101,7 +101,7 @@ pub(crate) fn build_query( )?; let term_set_query_fields = extract_term_set_query_fields(query_ast, &schema)?; - let term_ranges_grouped_by_field = + let (term_ranges_grouped_by_field, automatons_grouped_by_field) = extract_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?; let mut terms_grouped_by_field: HashMap> = Default::default(); @@ -119,6 +119,7 @@ pub(crate) fn build_query( terms_grouped_by_field, term_ranges_grouped_by_field, fast_field_names, + automatons_grouped_by_field, ..WarmupInfo::default() }; @@ -194,6 +195,7 @@ struct ExtractPrefixTermRanges<'a> { schema: &'a Schema, tokenizer_manager: &'a TokenizerManager, term_ranges_to_warm_up: HashMap>, + automatons_to_warm_up: HashMap>, } impl<'a> ExtractPrefixTermRanges<'a> { @@ -202,6 +204,7 @@ impl<'a> ExtractPrefixTermRanges<'a> { schema, tokenizer_manager, term_ranges_to_warm_up: HashMap::new(), + automatons_to_warm_up: HashMap::new(), } } @@ -225,6 +228,13 @@ impl<'a> ExtractPrefixTermRanges<'a> { .entry(term_range) .or_default() |= position_needed; } + + fn add_automaton(&mut self, field: Field, automaton: Automaton) { + self.automatons_to_warm_up + .entry(field) + .or_default() + .insert(automaton); + } } impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { @@ -258,8 +268,8 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> { - let (_, term) = wildcard_query.extract_prefix_term(self.schema, self.tokenizer_manager)?; - self.add_prefix_term(term, u32::MAX, false); + let (field, regex) = wildcard_query.to_regex(self.schema, self.tokenizer_manager)?; + self.add_automaton(field, Automaton::Regex(regex)); Ok(()) } } @@ -268,10 +278,16 @@ fn extract_prefix_term_ranges( query_ast: &QueryAst, schema: &Schema, tokenizer_manager: &TokenizerManager, -) -> anyhow::Result>> { +) -> anyhow::Result<( + HashMap>, + HashMap>, +)> { let mut visitor = ExtractPrefixTermRanges::with_schema(schema, tokenizer_manager); visitor.visit(query_ast)?; - Ok(visitor.term_ranges_to_warm_up) + Ok(( + visitor.term_ranges_to_warm_up, + visitor.automatons_to_warm_up, + )) } #[cfg(test)] diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index bee650198c8..35ddeaac479 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -18,6 +18,7 @@ lindera-core = { workspace = true, optional = true } lindera-dictionary = { workspace = true, optional = true } lindera-tokenizer = { workspace = true, optional = true } once_cell = { workspace = true } +regex = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 86afb68a7d3..173872528d2 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -17,7 +17,9 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. 
If not, see . -use anyhow::{anyhow, bail, Context}; +use std::borrow::Cow; + +use anyhow::{bail, Context}; use serde::{Deserialize, Serialize}; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use tantivy::Term; @@ -52,76 +54,52 @@ impl WildcardQuery { } } -fn extract_unique_token(mut tokens: Vec) -> anyhow::Result { - let term = tokens - .pop() - .with_context(|| "wildcard query generated no term")?; - if !tokens.is_empty() { - anyhow::bail!("wildcard query generated more than one term"); - } - Ok(term) -} - -fn unescape_with_final_wildcard(phrase: &str) -> anyhow::Result { - enum State { - Normal, - Escaped, - } - - // we keep this state outside of scan because we want to query if after - let mut saw_wildcard = false; - let saw_wildcard = &mut saw_wildcard; - - let phrase = phrase - .chars() - .scan(State::Normal, |state, c| { - if *saw_wildcard { - return Some(Some(Err(anyhow!( - "Wildcard iquery contains wildcard in non final position" - )))); - } - match state { - State::Escaped => { - *state = State::Normal; - Some(Some(Ok(c))) - } - State::Normal => { - if c == '*' { - *saw_wildcard = true; - Some(None) - } else if c == '\\' { - *state = State::Escaped; - Some(None) - } else if c == '?' { - Some(Some(Err(anyhow!("Wildcard query contains `?`")))) - } else { - Some(Some(Ok(c))) - } +fn parse_wildcard_query(mut query: &str) -> Vec { + let mut res = Vec::new(); + while let Some(pos) = query.find(['*', '?', '\\']) { + if pos > 0 { + res.push(SubQuery::Text(query[..pos].to_string())); + } + let chr = &query[pos..pos + 1]; + query = &query[pos + 1..]; + match chr { + "*" => res.push(SubQuery::Wildcard), + "?" => res.push(SubQuery::QuestionMark), + "\\" => { + if let Some(chr) = query.chars().next() { + res.push(SubQuery::Text(chr.to_string())); + query = &query[chr.len_utf8()..]; + } else { + // this is invalid, but let's just ignore that escape sequence + break; } } - }) - // we have an iterator of Option> - .flatten() - // we have an iterator of Result - .collect::>()?; - if !*saw_wildcard { - bail!("Wildcard query doesn't contain a wildcard"); + _ => unreachable!("find shouldn't return non-matching position"), + } + } + if !query.is_empty() { + res.push(SubQuery::Text(query.to_string())); } - Ok(phrase) + res +} + +enum SubQuery { + Text(String), + Wildcard, + QuestionMark, } impl WildcardQuery { - // TODO this method will probably disappear once we support the full semantic of - // wildcard queries - pub fn extract_prefix_term( + pub fn to_regex( &self, schema: &TantivySchema, tokenizer_manager: &TokenizerManager, - ) -> Result<(Field, Term), InvalidQuery> { + ) -> Result<(Field, String), InvalidQuery> { let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; let field_type = field_entry.field_type(); - let prefix = unescape_with_final_wildcard(&self.value)?; + let sub_query_parts = parse_wildcard_query(&self.value); + // TODO handle json_path match field_type { FieldType::Str(ref text_options) => { @@ -137,14 +115,28 @@ impl WildcardQuery { .with_context(|| { format!("no tokenizer named `{}` is registered", tokenizer_name) })?; - let mut token_stream = normalizer.token_stream(&prefix); - let mut tokens = Vec::new(); - token_stream.process(&mut |token| { - let term: Term = Term::from_field_text(field, &token.text); - tokens.push(term); - }); - let term = extract_unique_token(tokens)?; - Ok((field, term)) + + let regex = sub_query_parts + .into_iter() + .map(|part| match part { + SubQuery::Text(text) => { + let mut token_stream = 
normalizer.token_stream(&text); + let expected_token = token_stream + .next() + .context("normalizer generated no content")? + .text + .clone(); + if let Some(_unexpected_token) = token_stream.next() { + bail!("normalizer generated multiple tokens") + } + Ok(Cow::Owned(regex::escape(&expected_token))) + } + SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), + SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), + }) + .collect::>()?; + + Ok((field, regex)) } FieldType::JsonObject(json_options) => { let text_field_indexing = @@ -160,26 +152,115 @@ impl WildcardQuery { .with_context(|| { format!("no tokenizer named `{}` is registered", tokenizer_name) })?; - let mut token_stream = normalizer.token_stream(&prefix); - let mut tokens = Vec::new(); - - token_stream.process(&mut |token| { - let mut term = Term::from_field_json_path( - field, - json_path, - json_options.is_expand_dots_enabled(), - ); - term.append_type_and_str(&token.text); - tokens.push(term); - }); - let term = extract_unique_token(tokens)?; - Ok((field, term)) + + let mut term_for_path = Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ); + term_for_path.append_type_and_str(""); + + let value = term_for_path.value(); + // this shouldn't error: json path was a string, and all things added while encoding + // the path are valid ascii (and valid utf-8). We also skip the 1st byte which is a + // marker to tell this is json. This isn't present in the dictionary + let path_prefix = std::str::from_utf8(&value.as_serialized()[1..]) + .context("failed to extract json path from term")?; + let regex = std::iter::once(Ok(Cow::Owned(regex::escape(path_prefix)))) + .chain(sub_query_parts.into_iter().map(|part| match part { + SubQuery::Text(text) => { + let mut token_stream = normalizer.token_stream(&text); + let expected_token = token_stream + .next() + .context("normalizer generated no content")? 
+ .text + .clone(); + if let Some(_unexpected_token) = token_stream.next() { + bail!("normalizer generated multiple tokens") + } + Ok(Cow::Owned(regex::escape(&expected_token))) + } + SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), + SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), + })) + .collect::>()?; + Ok((field, regex)) } _ => Err(InvalidQuery::SchemaError( "trying to run a Wildcard query on a non-text field".to_string(), )), } } + + /* + pub fn extract_prefix_term( + &self, + schema: &TantivySchema, + tokenizer_manager: &TokenizerManager, + ) -> Result<(Field, Term), InvalidQuery> { + let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; + let field_type = field_entry.field_type(); + + let prefix = unescape_with_final_wildcard(&self.value)?; + + match field_type { + FieldType::Str(ref text_options) => { + let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + let tokenizer_name = text_field_indexing.tokenizer(); + let mut normalizer = tokenizer_manager + .get_normalizer(tokenizer_name) + .with_context(|| { + format!("no tokenizer named `{}` is registered", tokenizer_name) + })?; + let mut token_stream = normalizer.token_stream(&prefix); + let mut tokens = Vec::new(); + token_stream.process(&mut |token| { + let term: Term = Term::from_field_text(field, &token.text); + tokens.push(term); + }); + let term = extract_unique_token(tokens)?; + Ok((field, term)) + } + FieldType::JsonObject(json_options) => { + let text_field_indexing = + json_options.get_text_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + let tokenizer_name = text_field_indexing.tokenizer(); + let mut normalizer = tokenizer_manager + .get_normalizer(tokenizer_name) + .with_context(|| { + format!("no tokenizer named `{}` is registered", tokenizer_name) + })?; + let mut token_stream = normalizer.token_stream(&prefix); + let mut tokens = Vec::new(); + + token_stream.process(&mut |token| { + let mut term = Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ); + term.append_type_and_str(&token.text); + tokens.push(term); + }); + let term = extract_unique_token(tokens)?; + Ok((field, term)) + } + _ => Err(InvalidQuery::SchemaError( + "trying to run a Wildcard query on a non-text field".to_string(), + )), + } + } + */ } impl BuildTantivyAst for WildcardQuery { @@ -190,66 +271,22 @@ impl BuildTantivyAst for WildcardQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?; + /* + let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?; - let mut phrase_prefix_query = - tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]); - phrase_prefix_query.set_max_expansions(u32::MAX); - Ok(phrase_prefix_query.into()) + let mut phrase_prefix_query = + tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]); + phrase_prefix_query.set_max_expansions(u32::MAX); + Ok(phrase_prefix_query.into()) + */ + let (field, regex) = self.to_regex(schema, tokenizer_manager)?; + let regex_query = tantivy::query::RegexQuery::from_pattern(®ex, field) + .context("failed to build regex from wildcard")?; + Ok(regex_query.into()) } } #[cfg(test)] mod tests { - use tantivy::schema::{TextFieldIndexing, TextOptions}; - - use super::*; - 
use crate::create_default_quickwit_tokenizer_manager; - - #[test] - fn test_extract_term_for_wildcard() { - let query = WildcardQuery { - field: "my_field".to_string(), - value: "MyString Wh1ch a nOrMal Tokenizer would cut*".to_string(), - }; - let tokenizer_manager = create_default_quickwit_tokenizer_manager(); - for tokenizer in ["raw", "whitespace"] { - let mut schema_builder = TantivySchema::builder(); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); - schema_builder.add_text_field("my_field", text_options); - let schema = schema_builder.build(); - - let (_field, term) = query - .extract_prefix_term(&schema, &tokenizer_manager) - .unwrap(); - let value = term.value(); - let text = value.as_str().unwrap(); - assert_eq!(text, query.value.trim_end_matches('*')); - } - - for tokenizer in [ - "raw_lowercase", - "lowercase", - "default", - "en_stem", - "chinese_compatible", - "source_code_default", - "source_code_with_hex", - ] { - let mut schema_builder = TantivySchema::builder(); - let text_options = TextOptions::default() - .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer)); - schema_builder.add_text_field("my_field", text_options); - let schema = schema_builder.build(); - - let (_field, term) = query - .extract_prefix_term(&schema, &tokenizer_manager) - .unwrap(); - - let value = term.value(); - let text = value.as_str().unwrap(); - assert_eq!(text, &query.value.trim_end_matches('*').to_lowercase()); - } - } + // TODO add test } diff --git a/quickwit/quickwit-search/Cargo.toml b/quickwit/quickwit-search/Cargo.toml index 3aeb47a6eb7..ce03e38a78e 100644 --- a/quickwit/quickwit-search/Cargo.toml +++ b/quickwit/quickwit-search/Cargo.toml @@ -28,6 +28,7 @@ rayon = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tantivy = { workspace = true } +tantivy-fst = { workspace = true } thiserror = { workspace = true } tokio = { workspace = true } tokio-stream = { workspace = true } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 5ad92f63aa2..7c99a04f013 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -27,7 +27,7 @@ use anyhow::Context; use futures::future::try_join_all; use quickwit_common::pretty::PrettySample; use quickwit_directories::{CachingDirectory, HotDirectory, StorageDirectory}; -use quickwit_doc_mapper::{DocMapper, TermRange, WarmupInfo}; +use quickwit_doc_mapper::{Automaton, DocMapper, TermRange, WarmupInfo}; use quickwit_proto::search::{ CountHits, LeafSearchRequest, LeafSearchResponse, PartialHit, SearchRequest, SortOrder, SortValue, SplitIdAndFooterOffsets, SplitSearchError, @@ -218,6 +218,9 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any // TODO merge warm_up_postings into warm_up_term_dict_fields let warm_up_postings_future = warm_up_postings(searcher, &warmup_info.term_dict_fields) .instrument(debug_span!("warm_up_postings")); + let warm_up_automatons_future = + warm_up_automatons(searcher, &warmup_info.automatons_grouped_by_field) + .instrument(debug_span!("warm_up_automatons")); tokio::try_join!( warm_up_terms_future, @@ -226,6 +229,7 @@ pub(crate) async fn warmup(searcher: &Searcher, warmup_info: &WarmupInfo) -> any warm_up_term_dict_future, warm_up_fieldnorms_future, warm_up_postings_future, + warm_up_automatons_future, )?; Ok(()) @@ -337,6 +341,35 @@ async fn warm_up_term_ranges( Ok(()) } +async fn 
warm_up_automatons( + searcher: &Searcher, + terms_grouped_by_field: &HashMap>, +) -> anyhow::Result<()> { + let mut warm_up_futures = Vec::new(); + for (field, automatons) in terms_grouped_by_field { + for segment_reader in searcher.segment_readers() { + let inv_idx = segment_reader.inverted_index(*field)?; + for automaton in automatons { + let inv_idx_clone = inv_idx.clone(); + warm_up_futures.push(async move { + match automaton { + Automaton::Regex(regex_str) => { + let regex = tantivy_fst::Regex::new(regex_str) + .context("failed parsing regex during warmup")?; + inv_idx_clone + .warm_postings_automaton(®ex) + .await + .context("failed loading automaton") + } + } + }); + } + } + } + try_join_all(warm_up_futures).await?; + Ok(()) +} + async fn warm_up_fieldnorms(searcher: &Searcher, requires_scoring: bool) -> anyhow::Result<()> { if !requires_scoring { return Ok(()); From 9d83c5f84f2e6e4744afff802e66e27d3683407b Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Wed, 18 Dec 2024 11:04:09 +0100 Subject: [PATCH 2/7] add tests for new wildcard queries --- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 47 ++++- .../quickwit-doc-mapper/src/query_builder.rs | 12 +- .../src/query_ast/wildcard_query.rs | 172 ++++++++++-------- .../0005-query_string_query.yaml | 4 +- 4 files changed, 143 insertions(+), 92 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 059fed62222..d5b27fa6204 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -589,6 +589,13 @@ mod tests { elements.iter().map(|elem| elem.to_string()).collect() } + fn automaton_hashset(elements: &[&str]) -> HashSet { + elements + .iter() + .map(|elem| Automaton::Regex(elem.to_string())) + .collect() + } + fn hashset_field(elements: &[u32]) -> HashSet { elements .iter() @@ -638,7 +645,12 @@ mod tests { (2, "term1", false), (2, "term2", false), ]), - automatons_grouped_by_field: HashMap::new(), // TODO complete tests + automatons_grouped_by_field: [( + Field::from_field_id(1), + automaton_hashset(&["my_reg.*ex"]), + )] + .into_iter() + .collect(), }; // merging with default has no impact @@ -656,7 +668,12 @@ mod tests { (3, "term1", false), (2, "term2", true), ]), - automatons_grouped_by_field: HashMap::new(), // TODO complete tests + automatons_grouped_by_field: [ + (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])), + (Field::from_field_id(2), automaton_hashset(&["my_reg.*ex"])), + ] + .into_iter() + .collect(), }; wi_base.merge(wi_2.clone()); @@ -704,6 +721,17 @@ mod tests { ); } + let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")]; + for (field, regex) in expected_automatons { + let field = Field::from_field_id(field); + let automaton = Automaton::Regex(regex.to_string()); + assert!(wi_base + .automatons_grouped_by_field + .get(&field) + .unwrap() + .contains(&automaton)); + } + // merge is idempotent let mut wi_cloned = wi_base.clone(); wi_cloned.merge(wi_2); @@ -726,7 +754,13 @@ mod tests { (1, "term2", true), (2, "term3", false), ]), - automatons_grouped_by_field: HashMap::new(), // TODO complete tests + automatons_grouped_by_field: [ + (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])), + (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])), + (Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])), + ] + .into_iter() + .collect(), }; let expected = WarmupInfo { term_dict_fields: hashset_field(&[1]), @@ 
-737,7 +771,12 @@ mod tests { (1, "term2", true), (2, "term3", false), ]), - automatons_grouped_by_field: HashMap::new(), // TODO complete tests + automatons_grouped_by_field: [ + (Field::from_field_id(1), automaton_hashset(&["other-re.ex"])), + (Field::from_field_id(2), automaton_hashset(&["my_reg.ex"])), + ] + .into_iter() + .collect(), }; warmup_info.simplify(); diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 6dae2e29590..e28d665046b 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -102,7 +102,7 @@ pub(crate) fn build_query( let term_set_query_fields = extract_term_set_query_fields(query_ast, &schema)?; let (term_ranges_grouped_by_field, automatons_grouped_by_field) = - extract_prefix_term_ranges(query_ast, &schema, tokenizer_manager)?; + extract_prefix_term_ranges_and_automaton(query_ast, &schema, tokenizer_manager)?; let mut terms_grouped_by_field: HashMap> = Default::default(); query.query_terms(&mut |term, need_position| { @@ -274,14 +274,14 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } } -fn extract_prefix_term_ranges( +type TermRangeWarmupInfo = HashMap>; +type AutomatonWarmupInfo = HashMap>; + +fn extract_prefix_term_ranges_and_automaton( query_ast: &QueryAst, schema: &Schema, tokenizer_manager: &TokenizerManager, -) -> anyhow::Result<( - HashMap>, - HashMap>, -)> { +) -> anyhow::Result<(TermRangeWarmupInfo, AutomatonWarmupInfo)> { let mut visitor = ExtractPrefixTermRanges::with_schema(schema, tokenizer_manager); visitor.visit(query_ast)?; Ok(( diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 173872528d2..c459151d021 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -99,7 +99,6 @@ impl WildcardQuery { let field_type = field_entry.field_type(); let sub_query_parts = parse_wildcard_query(&self.value); - // TODO handle json_path match field_type { FieldType::Str(ref text_options) => { @@ -191,76 +190,6 @@ impl WildcardQuery { )), } } - - /* - pub fn extract_prefix_term( - &self, - schema: &TantivySchema, - tokenizer_manager: &TokenizerManager, - ) -> Result<(Field, Term), InvalidQuery> { - let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; - let field_type = field_entry.field_type(); - - let prefix = unescape_with_final_wildcard(&self.value)?; - - match field_type { - FieldType::Str(ref text_options) => { - let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| { - InvalidQuery::SchemaError(format!( - "field {} is not full-text searchable", - field_entry.name() - )) - })?; - let tokenizer_name = text_field_indexing.tokenizer(); - let mut normalizer = tokenizer_manager - .get_normalizer(tokenizer_name) - .with_context(|| { - format!("no tokenizer named `{}` is registered", tokenizer_name) - })?; - let mut token_stream = normalizer.token_stream(&prefix); - let mut tokens = Vec::new(); - token_stream.process(&mut |token| { - let term: Term = Term::from_field_text(field, &token.text); - tokens.push(term); - }); - let term = extract_unique_token(tokens)?; - Ok((field, term)) - } - FieldType::JsonObject(json_options) => { - let text_field_indexing = - json_options.get_text_indexing_options().ok_or_else(|| { - InvalidQuery::SchemaError(format!( - "field {} is not full-text searchable", - 
field_entry.name()
-                    ))
-                })?;
-                let tokenizer_name = text_field_indexing.tokenizer();
-                let mut normalizer = tokenizer_manager
-                    .get_normalizer(tokenizer_name)
-                    .with_context(|| {
-                        format!("no tokenizer named `{}` is registered", tokenizer_name)
-                    })?;
-                let mut token_stream = normalizer.token_stream(&prefix);
-                let mut tokens = Vec::new();
-
-                token_stream.process(&mut |token| {
-                    let mut term = Term::from_field_json_path(
-                        field,
-                        json_path,
-                        json_options.is_expand_dots_enabled(),
-                    );
-                    term.append_type_and_str(&token.text);
-                    tokens.push(term);
-                });
-                let term = extract_unique_token(tokens)?;
-                Ok((field, term))
-            }
-            _ => Err(InvalidQuery::SchemaError(
-                "trying to run a Wildcard query on a non-text field".to_string(),
-            )),
-        }
-    }
-    }
-    */
 }
 
 impl BuildTantivyAst for WildcardQuery {
@@ -271,14 +200,6 @@ impl BuildTantivyAst for WildcardQuery {
         _search_fields: &[String],
         _with_validation: bool,
     ) -> Result<TantivyQueryAst, InvalidQuery> {
-        /*
-        let (_, term) = self.extract_prefix_term(schema, tokenizer_manager)?;
-
-        let mut phrase_prefix_query =
-            tantivy::query::PhrasePrefixQuery::new_with_offset(vec![(0, term)]);
-        phrase_prefix_query.set_max_expansions(u32::MAX);
-        Ok(phrase_prefix_query.into())
-        */
         let (field, regex) = self.to_regex(schema, tokenizer_manager)?;
         let regex_query = tantivy::query::RegexQuery::from_pattern(&regex, field)
             .context("failed to build regex from wildcard")?;
@@ -288,5 +209,96 @@ impl BuildTantivyAst for WildcardQuery {
 
 #[cfg(test)]
 mod tests {
-    // TODO add test
+    use tantivy::schema::{TextFieldIndexing, TextOptions};
+
+    use super::*;
+    use crate::create_default_quickwit_tokenizer_manager;
+
+    #[test]
+    fn test_wildcard_query_to_regex_on_text() {
+        let query = WildcardQuery {
+            field: "text_field".to_string(),
+            value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
+        };
+
+        let tokenizer_manager = create_default_quickwit_tokenizer_manager();
+        for tokenizer in ["raw", "whitespace"] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_text_field("text_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+            assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut");
+        }
+
+        for tokenizer in [
+            "raw_lowercase",
+            "lowercase",
+            "default",
+            "en_stem",
+            "chinese_compatible",
+            "source_code_default",
+            "source_code_with_hex",
+        ] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_text_field("text_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+
+            assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut");
+        }
+    }
+
+    #[test]
+    fn test_wildcard_query_to_regex_on_json() {
+        let query = WildcardQuery {
+            // this voluntarily contains uppercase and regex-unsafe chars to make sure we
+            // properly keep the case, but sanitize special chars
+            field: "json_field.Inner.Fie*ld".to_string(),
+            value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
+        };
+
+        let tokenizer_manager = create_default_quickwit_tokenizer_manager();
+        for tokenizer in ["raw", "whitespace"] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_json_field("json_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+            assert_eq!(
+                regex,
+                "Inner\u{1}Fie\\*ld\0sMyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"
+            );
+        }
+
+        for tokenizer in [
+            "raw_lowercase",
+            "lowercase",
+            "default",
+            "en_stem",
+            "chinese_compatible",
+            "source_code_default",
+            "source_code_with_hex",
+        ] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_json_field("json_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
+
+            assert_eq!(
+                regex,
+                "Inner\u{1}Fie\\*ld\0smystring wh1ch.a\\.normal tokenizer would.*cut"
+            );
+        }
+    }
 }
diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
index 8cb495379c3..84c6bb3d790 100644
--- a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
+++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
@@ -158,7 +158,7 @@ json:
     query_string:
       default_field: payload.description
       lenient: true
-      query: "Jour* AND unix"
+      query: "Jou*al AND unix"
 expected:
   hits:
     total:
@@ -170,7 +170,7 @@ json:
     query_string:
       default_field: payload.description
       lenient: true
-      query: "jour* AND unix"
+      query: "jou*al AND unix"
 expected:
   hits:
     total:

From f65193832fd887561844681c3f12a62c3f1c5a7b Mon Sep 17 00:00:00 2001
From: trinity Pointard
Date: Wed, 18 Dec 2024 14:22:40 +0100
Subject: [PATCH 3/7] refactor json path handling for automaton queries

it should make it easier to use other kinds of queries
---
 quickwit/Cargo.lock                                |   1 +
 .../quickwit-doc-mapper/src/doc_mapper/mod.rs      |   9 +-
 .../quickwit-doc-mapper/src/query_builder.rs       |   4 +-
 quickwit/quickwit-query/Cargo.toml                 |   1 +
 quickwit/quickwit-query/src/query_ast/mod.rs       |   2 +-
 .../src/query_ast/wildcard_query.rs                | 174 +++++++++++++++---
 quickwit/quickwit-search/src/leaf.rs               |   9 +-
 7 files changed, 162 insertions(+), 38 deletions(-)

diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
index 3a5b45ed497..d8ac98ead61 100644
--- a/quickwit/Cargo.lock
+++ b/quickwit/Cargo.lock
@@ -6852,6 +6852,7 @@ dependencies = [
  "serde_json",
  "serde_with 3.11.0",
  "tantivy",
+ "tantivy-fst",
  "thiserror 1.0.69",
  "time",
  "whichlang",
 ]

diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
index d5b27fa6204..1504e26c743 100644
--- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
+++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs
@@ -88,8 +88,9 @@ pub struct TermRange {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 /// Supported automaton types to warmup
 pub enum Automaton {
-    /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq
-    Regex(String),
+    /// A regex in its str representation, as tantivy_fst::Regex isn't PartialEq, and the path
+    /// if it's inside a json field
+    Regex(Option<Vec<u8>>, String),
 }
 
 /// Information about what a DocMapper think should be warmed up before
@@ -592,7 +593,7 @@ mod tests {
     fn automaton_hashset(elements: &[&str]) -> HashSet<Automaton> {
         elements
             .iter()
            .map(|elem|
Automaton::Regex(None, elem.to_string())) .collect() } @@ -724,7 +725,7 @@ mod tests { let expected_automatons = [(1, "my_reg.*ex"), (1, "other-re.ex"), (2, "my_reg.*ex")]; for (field, regex) in expected_automatons { let field = Field::from_field_id(field); - let automaton = Automaton::Regex(regex.to_string()); + let automaton = Automaton::Regex(None, regex.to_string()); assert!(wi_base .automatons_grouped_by_field .get(&field) diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index e28d665046b..f42231afef8 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -268,8 +268,8 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_wildcard(&mut self, wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> { - let (field, regex) = wildcard_query.to_regex(self.schema, self.tokenizer_manager)?; - self.add_automaton(field, Automaton::Regex(regex)); + let (field, path, regex) = wildcard_query.to_regex(self.schema, self.tokenizer_manager)?; + self.add_automaton(field, Automaton::Regex(path, regex)); Ok(()) } } diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 35ddeaac479..733f8769056 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -23,6 +23,7 @@ serde = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } tantivy = { workspace = true } +tantivy-fst = { workspace = true } time = { workspace = true } thiserror = { workspace = true } whichlang = { workspace = true, optional = true } diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index 8699e05d238..b99d5079d41 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -46,7 +46,7 @@ pub use term_query::TermQuery; pub use term_set_query::TermSetQuery; pub use user_input_query::UserInputQuery; pub use visitor::{QueryAstTransformer, QueryAstVisitor}; -pub use wildcard_query::WildcardQuery; +pub use wildcard_query::{JsonPathPrefix, WildcardQuery}; use crate::{BooleanOperand, InvalidQuery, NotNaNf32}; diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index c459151d021..64bc47ac082 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -18,8 +18,10 @@ // along with this program. If not, see . 
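// ---------------------------------------------------------------------------
// Illustration (not part of the diff): a minimal sketch of what
// `WildcardQuery::to_regex` produces after this commit, assuming the public
// API shown in this series (`pub field`/`pub value` on `WildcardQuery`, a
// public `to_regex`, and `create_default_quickwit_tokenizer_manager`).
// `*` maps to `.*`, `?` maps to `.`, and literal text is normalized and then
// regex-escaped.
//
//     use quickwit_query::create_default_quickwit_tokenizer_manager;
//     use quickwit_query::query_ast::WildcardQuery;
//     use tantivy::schema::{Schema, TextFieldIndexing, TextOptions};
//
//     let query = WildcardQuery {
//         field: "text_field".to_string(),
//         value: "qui?kwit w*ld".to_string(),
//     };
//     let mut schema_builder = Schema::builder();
//     let text_options = TextOptions::default()
//         .set_indexing_options(TextFieldIndexing::default().set_tokenizer("raw"));
//     schema_builder.add_text_field("text_field", text_options);
//     let schema = schema_builder.build();
//     let tokenizer_manager = create_default_quickwit_tokenizer_manager();
//
//     // the "raw" normalizer keeps literal text as-is; wildcards are
//     // translated while everything else is regex-escaped
//     let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
//     assert_eq!(regex, "qui.kwit w.*ld");
//     assert!(path.is_none()); // only JSON fields carry a serialized path prefix
// ---------------------------------------------------------------------------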
 use std::borrow::Cow;
+use std::sync::Arc;
 
 use anyhow::{bail, Context};
+pub use prefix::JsonPathPrefix;
 use serde::{Deserialize, Serialize};
 use tantivy::schema::{Field, FieldType, Schema as TantivySchema};
 use tantivy::Term;
@@ -94,7 +96,7 @@ impl WildcardQuery {
         &self,
         schema: &TantivySchema,
         tokenizer_manager: &TokenizerManager,
-    ) -> Result<(Field, String), InvalidQuery> {
+    ) -> Result<(Field, Option<Vec<u8>>, String), InvalidQuery> {
         let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?;
         let field_type = field_entry.field_type();
@@ -135,7 +137,7 @@ impl WildcardQuery {
                     })
                     .collect::<Result<String, _>>()?;
 
-                Ok((field, regex))
+                Ok((field, None, regex))
             }
             FieldType::JsonObject(json_options) => {
                 let text_field_indexing =
@@ -160,13 +162,12 @@ impl WildcardQuery {
                 term_for_path.append_type_and_str("");
 
                 let value = term_for_path.value();
-                // this shouldn't error: json path was a string, and all things added while encoding
-                // the path are valid ascii (and valid utf-8). We also skip the 1st byte which is a
-                // marker to tell this is json. This isn't present in the dictionary
-                let path_prefix = std::str::from_utf8(&value.as_serialized()[1..])
-                    .context("failed to extract json path from term")?;
-                let regex = std::iter::once(Ok(Cow::Owned(regex::escape(path_prefix))))
-                    .chain(sub_query_parts.into_iter().map(|part| match part {
+                // We skip the 1st byte which is a marker to tell this is json. This isn't present
+                // in the dictionary
+                let byte_path_prefix = value.as_serialized()[1..].to_owned();
+                let regex = sub_query_parts
+                    .into_iter()
+                    .map(|part| match part {
                         SubQuery::Text(text) => {
                             let mut token_stream = normalizer.token_stream(&text);
                             let expected_token = token_stream
@@ -181,9 +182,9 @@ impl WildcardQuery {
                         }
                         SubQuery::Wildcard => Ok(Cow::Borrowed(".*")),
                         SubQuery::QuestionMark => Ok(Cow::Borrowed(".")),
-                    }))
+                    })
                     .collect::<Result<String, _>>()?;
-                Ok((field, regex))
+                Ok((field, Some(byte_path_prefix), regex))
             }
             _ => Err(InvalidQuery::SchemaError(
                 "trying to run a Wildcard query on a non-text field".to_string(),
@@ -200,10 +201,129 @@ impl BuildTantivyAst for WildcardQuery {
         _search_fields: &[String],
         _with_validation: bool,
     ) -> Result<TantivyQueryAst, InvalidQuery> {
-        let (field, regex) = self.to_regex(schema, tokenizer_manager)?;
-        let regex_query = tantivy::query::RegexQuery::from_pattern(&regex, field)
-            .context("failed to build regex from wildcard")?;
-        Ok(regex_query.into())
+        let (field, path, regex) = self.to_regex(schema, tokenizer_manager)?;
+        let regex =
+            tantivy_fst::Regex::new(&regex).context("failed to parse regex built from wildcard")?;
+        let regex_automaton_with_path = prefix::JsonPathPrefix {
+            prefix: path.unwrap_or_default(),
+            automaton: regex,
+        };
+        let regex_query_with_path = prefix::AutomatonQuery {
+            field,
+            automaton: Arc::new(regex_automaton_with_path),
+        };
+        Ok(regex_query_with_path.into())
+    }
+}
+
+mod prefix {
+    use std::sync::Arc;
+
+    use tantivy::query::{AutomatonWeight, EnableScoring, Query, Weight};
+    use tantivy::schema::Field;
+    use tantivy_fst::Automaton;
+
+    pub struct JsonPathPrefix<A> {
+        pub prefix: Vec<u8>,
+        pub automaton: A,
+    }
+
+    #[derive(Clone)]
+    pub enum JsonPathPrefixState<A> {
+        Prefix(usize),
+        Inner(A),
+        PrefixFailed,
+    }
+
+    impl<A: Automaton> Automaton for JsonPathPrefix<A> {
+        type State = JsonPathPrefixState<A::State>;
+
+        fn start(&self) -> Self::State {
+            if self.prefix.is_empty() {
+                JsonPathPrefixState::Inner(self.automaton.start())
+            } else {
+                JsonPathPrefixState::Prefix(0)
+            }
+        }
+
+        fn is_match(&self, state: &Self::State) -> bool {
+            match state {
JsonPathPrefixState::Prefix(_) => false, + JsonPathPrefixState::Inner(inner_state) => self.automaton.is_match(inner_state), + JsonPathPrefixState::PrefixFailed => false, + } + } + + fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + match state { + JsonPathPrefixState::Prefix(i) => { + if self.prefix.get(*i) != Some(&byte) { + return JsonPathPrefixState::PrefixFailed; + } + let next_pos = i + 1; + if next_pos == self.prefix.len() { + JsonPathPrefixState::Inner(self.automaton.start()) + } else { + JsonPathPrefixState::Prefix(next_pos) + } + } + JsonPathPrefixState::Inner(inner_state) => { + JsonPathPrefixState::Inner(self.automaton.accept(inner_state, byte)) + } + JsonPathPrefixState::PrefixFailed => JsonPathPrefixState::PrefixFailed, + } + } + + fn can_match(&self, state: &Self::State) -> bool { + match state { + JsonPathPrefixState::Prefix(_) => true, + JsonPathPrefixState::Inner(inner_state) => self.automaton.can_match(inner_state), + JsonPathPrefixState::PrefixFailed => false, + } + } + + fn will_always_match(&self, state: &Self::State) -> bool { + match state { + JsonPathPrefixState::Prefix(_) => false, + JsonPathPrefixState::Inner(inner_state) => { + self.automaton.will_always_match(inner_state) + } + JsonPathPrefixState::PrefixFailed => false, + } + } + } + + pub struct AutomatonQuery { + pub automaton: Arc, + pub field: Field, + } + + impl std::fmt::Debug for AutomatonQuery { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("AutomatonQuery") + .field("field", &self.field) + .field("automaton", &std::any::type_name::()) + .finish() + } + } + + impl Clone for AutomatonQuery { + fn clone(&self) -> Self { + AutomatonQuery { + automaton: self.automaton.clone(), + field: self.field, + } + } + } + + impl Query for AutomatonQuery + where A::State: Clone + { + fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> tantivy::Result> { + Ok(Box::new(AutomatonWeight::::new( + self.field, + self.automaton.clone(), + ))) + } } } @@ -229,8 +349,9 @@ mod tests { schema_builder.add_text_field("text_field", text_options); let schema = schema_builder.build(); - let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"); + assert!(path.is_none()); } for tokenizer in [ @@ -248,9 +369,9 @@ mod tests { schema_builder.add_text_field("text_field", text_options); let schema = schema_builder.build(); - let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); - + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut"); + assert!(path.is_none()); } } @@ -271,11 +392,9 @@ mod tests { schema_builder.add_json_field("json_field", text_options); let schema = schema_builder.build(); - let (_field, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); - assert_eq!( - regex, - "Inner\u{1}Fie\\*ld\0sMyString Wh1ch.a\\.nOrMal Tokenizer would.*cut" - ); + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + assert_eq!(regex, "MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut"); + assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes()); } for tokenizer in [ @@ -293,12 +412,9 @@ mod tests { schema_builder.add_json_field("json_field", text_options); let schema = schema_builder.build(); - let (_field, regex) = query.to_regex(&schema, 
&tokenizer_manager).unwrap(); - - assert_eq!( - regex, - "Inner\u{1}Fie\\*ld\0smystring wh1ch.a\\.normal tokenizer would.*cut" - ); + let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap(); + assert_eq!(regex, "mystring wh1ch.a\\.normal tokenizer would.*cut"); + assert_eq!(path.unwrap(), "Inner\u{1}Fie*ld\0s".as_bytes()); } } } diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 7c99a04f013..c321b4fe662 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -353,11 +353,16 @@ async fn warm_up_automatons( let inv_idx_clone = inv_idx.clone(); warm_up_futures.push(async move { match automaton { - Automaton::Regex(regex_str) => { + Automaton::Regex(path, regex_str) => { let regex = tantivy_fst::Regex::new(regex_str) .context("failed parsing regex during warmup")?; inv_idx_clone - .warm_postings_automaton(®ex) + .warm_postings_automaton( + &quickwit_query::query_ast::JsonPathPrefix { + automaton: regex, + prefix: path.clone().unwrap_or_default(), + }, + ) .await .context("failed loading automaton") } From 4aada41c084e2ae7641d843455a9b1b355145ebe Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Wed, 18 Dec 2024 15:54:24 +0100 Subject: [PATCH 4/7] regex support --- .../quickwit-doc-mapper/src/query_builder.rs | 8 +- .../quickwit-doc-mapper/src/tag_pruning.rs | 1 + .../src/elastic_query_dsl/mod.rs | 4 + .../src/elastic_query_dsl/regex_query.rs | 42 ++++++ quickwit/quickwit-query/src/query_ast/mod.rs | 14 +- .../src/query_ast/regex_query.rs | 124 ++++++++++++++++++ .../quickwit-query/src/query_ast/visitor.rs | 14 +- .../src/query_ast/wildcard_query.rs | 2 +- 8 files changed, 203 insertions(+), 6 deletions(-) create mode 100644 quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs create mode 100644 quickwit/quickwit-query/src/query_ast/regex_query.rs diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index f42231afef8..77fda563987 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -23,7 +23,7 @@ use std::ops::Bound; use quickwit_query::query_ast::{ FieldPresenceQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, QueryAstVisitor, RangeQuery, - TermSetQuery, WildcardQuery, + RegexQuery, TermSetQuery, WildcardQuery, }; use quickwit_query::tokenizers::TokenizerManager; use quickwit_query::{find_field_or_hit_dynamic, InvalidQuery}; @@ -272,6 +272,12 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { self.add_automaton(field, Automaton::Regex(path, regex)); Ok(()) } + + fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> { + let (field, path, regex) = regex_query.to_regex(self.schema)?; + self.add_automaton(field, Automaton::Regex(path, regex)); + Ok(()) + } } type TermRangeWarmupInfo = HashMap>; diff --git a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs index e3b2255efc3..76082b34bb4 100644 --- a/quickwit/quickwit-doc-mapper/src/tag_pruning.rs +++ b/quickwit/quickwit-doc-mapper/src/tag_pruning.rs @@ -118,6 +118,7 @@ fn extract_unsimplified_tags_filter_ast(query_ast: QueryAst) -> UnsimplifiedTagF panic!("Extract unsimplified should only be called on AST without UserInputQuery."); } QueryAst::FieldPresence(_) => UnsimplifiedTagFilterAst::Uninformative, + QueryAst::Regex(_) => UnsimplifiedTagFilterAst::Uninformative, } } diff --git 
a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs index 9e49c866d95..8b841f43f7f 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/mod.rs @@ -29,6 +29,7 @@ mod one_field_map; mod phrase_prefix_query; mod query_string_query; mod range_query; +mod regex_query; mod string_or_struct; mod term_query; mod terms_query; @@ -46,6 +47,7 @@ use crate::elastic_query_dsl::match_bool_prefix::MatchBoolPrefixQuery; use crate::elastic_query_dsl::match_phrase_query::MatchPhraseQuery; use crate::elastic_query_dsl::match_query::MatchQuery; use crate::elastic_query_dsl::multi_match::MultiMatchQuery; +use crate::elastic_query_dsl::regex_query::RegexQuery; use crate::elastic_query_dsl::terms_query::TermsQuery; use crate::not_nan_f32::NotNaNf32; use crate::query_ast::QueryAst; @@ -79,6 +81,7 @@ pub(crate) enum ElasticQueryDslInner { MultiMatch(MultiMatchQuery), Range(RangeQuery), Exists(ExistsQuery), + Regexp(RegexQuery), } #[derive(Deserialize, Debug, Eq, PartialEq, Clone)] @@ -126,6 +129,7 @@ impl ConvertibleToQueryAst for ElasticQueryDslInner { Self::Match(match_query) => match_query.convert_to_query_ast(), Self::Exists(exists_query) => exists_query.convert_to_query_ast(), Self::MultiMatch(multi_match_query) => multi_match_query.convert_to_query_ast(), + Self::Regexp(regex_query) => regex_query.convert_to_query_ast(), } } } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs new file mode 100644 index 00000000000..f3c2d0427f0 --- /dev/null +++ b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs @@ -0,0 +1,42 @@ +// Copyright (C) 2024 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
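
For orientation before the file body: this new module maps the Elasticsearch-style `regexp` leaf onto the query AST. A minimal round-trip sketch, assuming the `OneFieldMap` wrapper exposes the target field as `field` and the params as `value` as in the code below; the field name and pattern are illustrative, and the call is assumed to run in a function returning anyhow::Result:

    // Hypothetical: deserialize an ES `regexp` body into the alias defined below.
    let query: RegexQuery =
        serde_json::from_str(r#"{ "payload.description": { "value": "jour.*" } }"#)?;
    assert_eq!(query.field, "payload.description");
    assert_eq!(query.value.value, "jour.*");

The outer {"regexp": ...} key itself is dispatched one level up, by the Regexp variant added to ElasticQueryDslInner in mod.rs above.
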
+ +use serde::Deserialize; + +use crate::elastic_query_dsl::one_field_map::OneFieldMap; +use crate::elastic_query_dsl::ConvertibleToQueryAst; +use crate::query_ast::{QueryAst, RegexQuery as AstRegexQuery}; + +#[derive(Deserialize, Debug, Default, Eq, PartialEq, Clone)] +#[serde(deny_unknown_fields)] +pub struct RegexQueryParams { + value: String, +} + +pub type RegexQuery = OneFieldMap; + +impl ConvertibleToQueryAst for RegexQuery { + fn convert_to_query_ast(self) -> anyhow::Result { + Ok(AstRegexQuery { + field: self.field, + regex: self.value.value, + } + .into()) + } +} diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index b99d5079d41..9f2c5b34f61 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -28,6 +28,7 @@ mod field_presence; mod full_text_query; mod phrase_prefix_query; mod range_query; +mod regex_query; mod tantivy_query_ast; mod term_query; mod term_set_query; @@ -41,12 +42,13 @@ pub use field_presence::FieldPresenceQuery; pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery}; pub use phrase_prefix_query::PhrasePrefixQuery; pub use range_query::RangeQuery; +pub use regex_query::RegexQuery; use tantivy_query_ast::TantivyQueryAst; pub use term_query::TermQuery; pub use term_set_query::TermSetQuery; pub use user_input_query::UserInputQuery; pub use visitor::{QueryAstTransformer, QueryAstVisitor}; -pub use wildcard_query::{JsonPathPrefix, WildcardQuery}; +pub use wildcard_query::{AutomatonQuery, JsonPathPrefix, WildcardQuery}; use crate::{BooleanOperand, InvalidQuery, NotNaNf32}; @@ -63,6 +65,7 @@ pub enum QueryAst { Range(RangeQuery), UserInput(UserInputQuery), Wildcard(WildcardQuery), + Regex(RegexQuery), MatchAll, MatchNone, Boost { @@ -105,7 +108,8 @@ impl QueryAst { | ast @ QueryAst::MatchNone | ast @ QueryAst::FieldPresence(_) | ast @ QueryAst::Range(_) - | ast @ QueryAst::Wildcard(_) => Ok(ast), + | ast @ QueryAst::Wildcard(_) + | ast @ QueryAst::Regex(_) => Ok(ast), QueryAst::UserInput(user_text_query) => { user_text_query.parse_user_query(default_search_fields) } @@ -249,6 +253,12 @@ impl BuildTantivyAst for QueryAst { search_fields, with_validation, ), + QueryAst::Regex(regex) => regex.build_tantivy_ast_call( + schema, + tokenizer_manager, + search_fields, + with_validation, + ), } } } diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs new file mode 100644 index 00000000000..513cec1a9b6 --- /dev/null +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -0,0 +1,124 @@ +// Copyright (C) 2024 Quickwit, Inc. +// +// Quickwit is offered under the AGPL v3.0 and as commercial software. +// For commercial licensing, contact us at hello@quickwit.io. +// +// AGPL: +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as +// published by the Free Software Foundation, either version 3 of the +// License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . 
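
Before the implementation: the new AST node reuses the JsonPathPrefix automaton introduced for wildcard queries. A hand-driven sketch of how the composed automaton consumes the serialized JSON path (the \x01 byte separates path segments, \0 plus the `s` type byte marks a string value) before handing off to the inner regex; the prefix bytes and pattern are illustrative, and tantivy_fst::Automaton must be in scope:

    // Illustrative: the prefix must match byte-for-byte before the regex starts.
    let automaton = JsonPathPrefix {
        prefix: b"sub\x01field\0s".to_vec(),
        automaton: tantivy_fst::Regex::new("jour.*").unwrap(),
    };
    let mut state = automaton.start();
    for &byte in b"sub\x01field\0sjournal" {
        state = automaton.accept(&state, byte);
    }
    // "journal" matches "jour.*" once the path prefix has been consumed.
    assert!(automaton.is_match(&state));
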
+ +use std::sync::Arc; + +use anyhow::Context; +use serde::{Deserialize, Serialize}; +use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; +use tantivy::Term; + +use super::{BuildTantivyAst, QueryAst}; +use crate::query_ast::{AutomatonQuery, JsonPathPrefix, TantivyQueryAst}; +use crate::tokenizers::TokenizerManager; +use crate::{find_field_or_hit_dynamic, InvalidQuery}; + +/// A Wildcard query allows to match 'bond' with a query like 'b*d'. +/// +/// At the moment, only wildcard at end of term is supported. +#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] +pub struct RegexQuery { + pub field: String, + pub regex: String, +} + +impl From for QueryAst { + fn from(regex_query: RegexQuery) -> Self { + Self::Regex(regex_query) + } +} + +impl RegexQuery { + #[cfg(test)] + pub fn from_field_value(field: impl ToString, regex: impl ToString) -> Self { + Self { + field: field.to_string(), + regex: regex.to_string(), + } + } +} + +impl RegexQuery { + pub fn to_regex( + &self, + schema: &TantivySchema, + ) -> Result<(Field, Option>, String), InvalidQuery> { + let (field, field_entry, json_path) = find_field_or_hit_dynamic(&self.field, schema)?; + let field_type = field_entry.field_type(); + + match field_type { + FieldType::Str(ref text_options) => { + text_options.get_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + + Ok((field, None, self.regex.clone())) + } + FieldType::JsonObject(json_options) => { + json_options.get_text_indexing_options().ok_or_else(|| { + InvalidQuery::SchemaError(format!( + "field {} is not full-text searchable", + field_entry.name() + )) + })?; + + let mut term_for_path = Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ); + term_for_path.append_type_and_str(""); + + let value = term_for_path.value(); + // We skip the 1st byte which is a marker to tell this is json. 
This isn't present + // in the dictionary + let byte_path_prefix = value.as_serialized()[1..].to_owned(); + Ok((field, Some(byte_path_prefix), self.regex.clone())) + } + _ => Err(InvalidQuery::SchemaError( + "trying to run a regex query on a non-text field".to_string(), + )), + } + } +} + +impl BuildTantivyAst for RegexQuery { + fn build_tantivy_ast_impl( + &self, + schema: &TantivySchema, + _tokenizer_manager: &TokenizerManager, + _search_fields: &[String], + _with_validation: bool, + ) -> Result { + let (field, path, regex) = self.to_regex(schema)?; + let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex")?; + let regex_automaton_with_path = JsonPathPrefix { + prefix: path.unwrap_or_default(), + automaton: regex, + }; + let regex_query_with_path = AutomatonQuery { + field, + automaton: Arc::new(regex_automaton_with_path), + }; + Ok(regex_query_with_path.into()) + } +} diff --git a/quickwit/quickwit-query/src/query_ast/visitor.rs b/quickwit/quickwit-query/src/query_ast/visitor.rs index bd85a71d64e..c9ce180b518 100644 --- a/quickwit/quickwit-query/src/query_ast/visitor.rs +++ b/quickwit/quickwit-query/src/query_ast/visitor.rs @@ -21,8 +21,8 @@ use crate::not_nan_f32::NotNaNf32; use crate::query_ast::field_presence::FieldPresenceQuery; use crate::query_ast::user_input_query::UserInputQuery; use crate::query_ast::{ - BoolQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, RangeQuery, TermQuery, TermSetQuery, - WildcardQuery, + BoolQuery, FullTextQuery, PhrasePrefixQuery, QueryAst, RangeQuery, RegexQuery, TermQuery, + TermSetQuery, WildcardQuery, }; /// Simple trait to implement a Visitor over the QueryAst. @@ -45,6 +45,7 @@ pub trait QueryAstVisitor<'a> { QueryAst::UserInput(user_text_query) => self.visit_user_text(user_text_query), QueryAst::FieldPresence(exists) => self.visit_exists(exists), QueryAst::Wildcard(wildcard) => self.visit_wildcard(wildcard), + QueryAst::Regex(regex) => self.visit_regex(regex), } } @@ -111,6 +112,10 @@ pub trait QueryAstVisitor<'a> { fn visit_wildcard(&mut self, _wildcard_query: &'a WildcardQuery) -> Result<(), Self::Err> { Ok(()) } + + fn visit_regex(&mut self, _regex_query: &'a RegexQuery) -> Result<(), Self::Err> { + Ok(()) + } } /// Simple trait to implement a Visitor over the QueryAst. 
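
Because both hooks default to a no-op, downstream visitors compile unchanged and only opt in explicitly. A minimal sketch of opting in (the counting visitor is hypothetical):

    // Hypothetical visitor that counts regex leaves in a query AST.
    struct RegexCounter {
        count: usize,
    }

    impl<'a> QueryAstVisitor<'a> for RegexCounter {
        type Err = std::convert::Infallible;

        fn visit_regex(&mut self, _regex_query: &'a RegexQuery) -> Result<(), Self::Err> {
            self.count += 1;
            Ok(())
        }
    }

    // Usage: let mut counter = RegexCounter { count: 0 };
    //        counter.visit(&ast)?;
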
@@ -133,6 +138,7 @@ pub trait QueryAstTransformer { QueryAst::UserInput(user_text_query) => self.transform_user_text(user_text_query), QueryAst::FieldPresence(exists) => self.transform_exists(exists), QueryAst::Wildcard(wildcard) => self.transform_wildcard(wildcard), + QueryAst::Regex(regex) => self.transform_regex(regex), } } @@ -231,4 +237,8 @@ pub trait QueryAstTransformer { ) -> Result, Self::Err> { Ok(Some(QueryAst::Wildcard(wildcard_query))) } + + fn transform_regex(&mut self, regex_query: RegexQuery) -> Result, Self::Err> { + Ok(Some(QueryAst::Regex(regex_query))) + } } diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 64bc47ac082..2c184883a88 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -21,7 +21,7 @@ use std::borrow::Cow; use std::sync::Arc; use anyhow::{bail, Context}; -pub use prefix::JsonPathPrefix; +pub use prefix::{AutomatonQuery, JsonPathPrefix}; use serde::{Deserialize, Serialize}; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use tantivy::Term; From 314fd9337dce52e961f1571be42c2356422bd453 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Mon, 6 Jan 2025 13:47:12 +0100 Subject: [PATCH 5/7] run automaton in search thread pool --- quickwit/Cargo.lock | 19 ++++++++++--------- .../src/query_ast/regex_query.rs | 2 +- .../src/query_ast/wildcard_query.rs | 15 +++++++++++++-- quickwit/quickwit-search/src/leaf.rs | 11 +++++++++-- 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index d8ac98ead61..aa681d503cd 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -5143,7 +5143,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "stable_deref_trait", ] @@ -8733,7 +8733,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "aho-corasick", "arc-swap", @@ -8747,6 +8747,7 @@ dependencies = [ "fastdivide", "fnv", "fs4", + "futures-channel", "futures-util", "htmlescape", "hyperloglogplus", @@ -8786,7 +8787,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "bitpacking", ] @@ -8794,7 +8795,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = 
"git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "downcast-rs", "fastdivide", @@ -8809,7 +8810,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "async-trait", "byteorder", @@ -8832,7 +8833,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "nom", ] @@ -8840,7 +8841,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "futures-util", "itertools 0.13.0", @@ -8853,7 +8854,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "murmurhash32", "rand_distr", @@ -8863,7 +8864,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#24c5dc2398024de68c09a54e21f37ab2f844b30b" +source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" dependencies = [ "serde", ] diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs index 513cec1a9b6..bea193c76eb 100644 --- a/quickwit/quickwit-query/src/query_ast/regex_query.rs +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -113,7 +113,7 @@ impl BuildTantivyAst for RegexQuery { let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex")?; let regex_automaton_with_path = JsonPathPrefix { prefix: path.unwrap_or_default(), - automaton: regex, + automaton: regex.into(), }; let regex_query_with_path = AutomatonQuery { field, diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 2c184883a88..cc36bc7480e 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -206,7 +206,7 @@ impl BuildTantivyAst for WildcardQuery { tantivy_fst::Regex::new(®ex).context("failed to parse regex built from wildcard")?; let regex_automaton_with_path = prefix::JsonPathPrefix { prefix: path.unwrap_or_default(), - automaton: regex, + automaton: regex.into(), }; let regex_query_with_path = prefix::AutomatonQuery { field, @@ -222,9 +222,20 @@ mod prefix { use tantivy::query::{AutomatonWeight, 
EnableScoring, Query, Weight}; use tantivy::schema::Field; use tantivy_fst::Automaton; + pub struct JsonPathPrefix { pub prefix: Vec, - pub automaton: A, + pub automaton: Arc, + } + + // we need to implement manually because the std adds an unnecessary bound `A: Clone` + impl Clone for JsonPathPrefix { + fn clone(&self) -> Self { + JsonPathPrefix { + prefix: self.prefix.clone(), + automaton: self.automaton.clone(), + } + } } #[derive(Clone)] diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index c321b4fe662..c93960bb137 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -346,6 +346,12 @@ async fn warm_up_automatons( terms_grouped_by_field: &HashMap>, ) -> anyhow::Result<()> { let mut warm_up_futures = Vec::new(); + let cpu_intensive_executor = |task| async { + crate::search_thread_pool() + .run_cpu_intensive(task) + .await + .map_err(|_| std::io::Error::other("task panicked"))? + }; for (field, automatons) in terms_grouped_by_field { for segment_reader in searcher.segment_readers() { let inv_idx = segment_reader.inverted_index(*field)?; @@ -358,10 +364,11 @@ async fn warm_up_automatons( .context("failed parsing regex during warmup")?; inv_idx_clone .warm_postings_automaton( - &quickwit_query::query_ast::JsonPathPrefix { - automaton: regex, + quickwit_query::query_ast::JsonPathPrefix { + automaton: regex.into(), prefix: path.clone().unwrap_or_default(), }, + cpu_intensive_executor, ) .await .context("failed loading automaton") From a7f502b91f3991533054d54bebbef83c07993127 Mon Sep 17 00:00:00 2001 From: trinity Pointard Date: Thu, 9 Jan 2025 16:58:44 +0100 Subject: [PATCH 6/7] cleanup, refactor and test --- quickwit/Cargo.lock | 44 ++- quickwit/Cargo.toml | 2 +- .../quickwit-doc-mapper/src/doc_mapper/mod.rs | 1 + .../quickwit-doc-mapper/src/query_builder.rs | 2 +- .../src/elastic_query_dsl/regex_query.rs | 1 + .../src/query_ast/field_presence.rs | 2 +- quickwit/quickwit-query/src/query_ast/mod.rs | 4 +- .../src/query_ast/regex_query.rs | 265 +++++++++++++++++- .../src/query_ast/wildcard_query.rs | 218 +++----------- .../0005-query_string_query.yaml | 10 + 10 files changed, 351 insertions(+), 198 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index aa681d503cd..298c2626237 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -1121,6 +1121,31 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bon" +version = "3.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe7acc34ff59877422326db7d6f2d845a582b16396b6b08194942bf34c6528ab" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4159dd617a7fbc9be6a692fe69dc2954f8e6bb6bb5e4d7578467441390d77fd0" +dependencies = [ + "darling 0.20.10", + "ident_case", + "prettyplease 0.2.25", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.89", +] + [[package]] name = "borsh" version = "1.5.3" @@ -5143,7 +5168,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "stable_deref_trait", ] @@ -8733,12 +8758,13 @@ checksum = 
"7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "aho-corasick", "arc-swap", "base64 0.22.1", "bitpacking", + "bon", "byteorder", "census", "crc32fast", @@ -8787,7 +8813,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "bitpacking", ] @@ -8795,7 +8821,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "downcast-rs", "fastdivide", @@ -8810,7 +8836,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "async-trait", "byteorder", @@ -8833,7 +8859,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "nom", ] @@ -8841,7 +8867,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "futures-util", "itertools 0.13.0", @@ -8854,7 +8880,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "murmurhash32", "rand_distr", @@ -8864,7 +8890,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?branch=trinity/sstable-partial-automaton#037d12c9c9b8c96c09288297cacc7e20d88ea842" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=d281ca3#d281ca3e659d9eaedd5c64dd1613c5b9d11d6c65" dependencies = [ "serde", ] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 6e7fa89c9a5..11a04ec0f0f 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -328,7 +328,7 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", 
branch = "trinity/sstable-partial-automaton", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "d281ca3", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index 1504e26c743..4754a153873 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -91,6 +91,7 @@ pub enum Automaton { /// A regex in it's str representation as tantivy_fst::Regex isn't PartialEq, and the path if /// inside a json field Regex(Option>, String), + // we could add termset query here, instead of downloading the whole dictionary } /// Information about what a DocMapper think should be warmed up before diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs index 77fda563987..36d03ca23b7 100644 --- a/quickwit/quickwit-doc-mapper/src/query_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/query_builder.rs @@ -274,7 +274,7 @@ impl<'a, 'b: 'a> QueryAstVisitor<'a> for ExtractPrefixTermRanges<'b> { } fn visit_regex(&mut self, regex_query: &'a RegexQuery) -> Result<(), Self::Err> { - let (field, path, regex) = regex_query.to_regex(self.schema)?; + let (field, path, regex) = regex_query.to_field_and_regex(self.schema)?; self.add_automaton(field, Automaton::Regex(path, regex)); Ok(()) } diff --git a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs index f3c2d0427f0..55b76131571 100644 --- a/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs +++ b/quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs @@ -27,6 +27,7 @@ use crate::query_ast::{QueryAst, RegexQuery as AstRegexQuery}; #[serde(deny_unknown_fields)] pub struct RegexQueryParams { value: String, + // we could probably add case_insensitive } pub type RegexQuery = OneFieldMap; diff --git a/quickwit/quickwit-query/src/query_ast/field_presence.rs b/quickwit/quickwit-query/src/query_ast/field_presence.rs index df82ab3591a..1544bd5ac46 100644 --- a/quickwit/quickwit-query/src/query_ast/field_presence.rs +++ b/quickwit/quickwit-query/src/query_ast/field_presence.rs @@ -87,7 +87,7 @@ impl BuildTantivyAst for FieldPresenceQuery { } else { format!("{}.{}", field_entry.name(), path) }; - let exists_query = tantivy::query::ExistsQuery::new_exists_query(full_path); + let exists_query = tantivy::query::ExistsQuery::new(full_path, true); Ok(TantivyQueryAst::from(exists_query)) } else { // fallback to the presence field diff --git a/quickwit/quickwit-query/src/query_ast/mod.rs b/quickwit/quickwit-query/src/query_ast/mod.rs index 9f2c5b34f61..31d53ff65f9 100644 --- a/quickwit/quickwit-query/src/query_ast/mod.rs +++ b/quickwit/quickwit-query/src/query_ast/mod.rs @@ -42,13 +42,13 @@ pub use field_presence::FieldPresenceQuery; pub use full_text_query::{FullTextMode, FullTextParams, FullTextQuery}; pub use phrase_prefix_query::PhrasePrefixQuery; pub use range_query::RangeQuery; -pub use regex_query::RegexQuery; +pub use regex_query::{AutomatonQuery, JsonPathPrefix, RegexQuery}; use tantivy_query_ast::TantivyQueryAst; pub use term_query::TermQuery; pub use term_set_query::TermSetQuery; pub use user_input_query::UserInputQuery; pub use visitor::{QueryAstTransformer, QueryAstVisitor}; -pub use wildcard_query::{AutomatonQuery, JsonPathPrefix, WildcardQuery}; +pub use 
wildcard_query::WildcardQuery; use crate::{BooleanOperand, InvalidQuery, NotNaNf32}; diff --git a/quickwit/quickwit-query/src/query_ast/regex_query.rs b/quickwit/quickwit-query/src/query_ast/regex_query.rs index bea193c76eb..09960caf3cc 100644 --- a/quickwit/quickwit-query/src/query_ast/regex_query.rs +++ b/quickwit/quickwit-query/src/query_ast/regex_query.rs @@ -20,18 +20,17 @@ use std::sync::Arc; use anyhow::Context; +pub use prefix::{AutomatonQuery, JsonPathPrefix}; use serde::{Deserialize, Serialize}; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use tantivy::Term; use super::{BuildTantivyAst, QueryAst}; -use crate::query_ast::{AutomatonQuery, JsonPathPrefix, TantivyQueryAst}; +use crate::query_ast::TantivyQueryAst; use crate::tokenizers::TokenizerManager; use crate::{find_field_or_hit_dynamic, InvalidQuery}; -/// A Wildcard query allows to match 'bond' with a query like 'b*d'. -/// -/// At the moment, only wildcard at end of term is supported. +/// A Regex query #[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] pub struct RegexQuery { pub field: String, @@ -55,7 +54,7 @@ impl RegexQuery { } impl RegexQuery { - pub fn to_regex( + pub fn to_field_and_regex( &self, schema: &TantivySchema, ) -> Result<(Field, Option>, String), InvalidQuery> { @@ -109,7 +108,7 @@ impl BuildTantivyAst for RegexQuery { _search_fields: &[String], _with_validation: bool, ) -> Result { - let (field, path, regex) = self.to_regex(schema)?; + let (field, path, regex) = self.to_field_and_regex(schema)?; let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex")?; let regex_automaton_with_path = JsonPathPrefix { prefix: path.unwrap_or_default(), @@ -122,3 +121,257 @@ impl BuildTantivyAst for RegexQuery { Ok(regex_query_with_path.into()) } } + +mod prefix { + use std::sync::Arc; + + use tantivy::query::{AutomatonWeight, EnableScoring, Query, Weight}; + use tantivy::schema::Field; + use tantivy_fst::Automaton; + + pub struct JsonPathPrefix { + pub prefix: Vec, + pub automaton: Arc, + } + + // we need to implement manually because the std adds an unnecessary bound `A: Clone` + impl Clone for JsonPathPrefix { + fn clone(&self) -> Self { + JsonPathPrefix { + prefix: self.prefix.clone(), + automaton: self.automaton.clone(), + } + } + } + + #[derive(Clone, Debug, PartialEq)] + pub enum JsonPathPrefixState { + Prefix(usize), + Inner(A), + PrefixFailed, + } + + impl Automaton for JsonPathPrefix { + type State = JsonPathPrefixState; + + fn start(&self) -> Self::State { + if self.prefix.is_empty() { + JsonPathPrefixState::Inner(self.automaton.start()) + } else { + JsonPathPrefixState::Prefix(0) + } + } + + fn is_match(&self, state: &Self::State) -> bool { + match state { + JsonPathPrefixState::Prefix(_) => false, + JsonPathPrefixState::Inner(inner_state) => self.automaton.is_match(inner_state), + JsonPathPrefixState::PrefixFailed => false, + } + } + + fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + match state { + JsonPathPrefixState::Prefix(i) => { + if self.prefix.get(*i) != Some(&byte) { + return JsonPathPrefixState::PrefixFailed; + } + let next_pos = i + 1; + if next_pos == self.prefix.len() { + JsonPathPrefixState::Inner(self.automaton.start()) + } else { + JsonPathPrefixState::Prefix(next_pos) + } + } + JsonPathPrefixState::Inner(inner_state) => { + JsonPathPrefixState::Inner(self.automaton.accept(inner_state, byte)) + } + JsonPathPrefixState::PrefixFailed => JsonPathPrefixState::PrefixFailed, + } + } + + fn can_match(&self, state: &Self::State) -> bool { 
+ match state { + JsonPathPrefixState::Prefix(_) => true, + JsonPathPrefixState::Inner(inner_state) => self.automaton.can_match(inner_state), + JsonPathPrefixState::PrefixFailed => false, + } + } + + fn will_always_match(&self, state: &Self::State) -> bool { + match state { + JsonPathPrefixState::Prefix(_) => false, + JsonPathPrefixState::Inner(inner_state) => { + self.automaton.will_always_match(inner_state) + } + JsonPathPrefixState::PrefixFailed => false, + } + } + } + + // we don't use RegexQuery to handle our path. We could tinker with the regex to embed + // json field path inside, but that seems not as clean, and would prevent support of + // case-insensitive search in the future (we would also make the path insensitive, + // which we shouldn't) + pub struct AutomatonQuery { + pub automaton: Arc, + pub field: Field, + } + + impl std::fmt::Debug for AutomatonQuery { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("AutomatonQuery") + .field("field", &self.field) + .field("automaton", &std::any::type_name::()) + .finish() + } + } + + impl Clone for AutomatonQuery { + fn clone(&self) -> Self { + AutomatonQuery { + automaton: self.automaton.clone(), + field: self.field, + } + } + } + + impl Query for AutomatonQuery + where A::State: Clone + { + fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> tantivy::Result> { + Ok(Box::new(AutomatonWeight::::new( + self.field, + self.automaton.clone(), + ))) + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use tantivy::schema::{Schema as TantivySchema, TEXT}; + use tantivy_fst::{Automaton, Regex}; + + use super::prefix::JsonPathPrefixState; + use super::{JsonPathPrefix, RegexQuery}; + + #[test] + fn test_regex_query_text_field() { + let mut schema_builder = TantivySchema::builder(); + schema_builder.add_text_field("field", TEXT); + let schema = schema_builder.build(); + + let query = RegexQuery { + field: "field".to_string(), + regex: "abc.*xyz".to_string(), + }; + let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); + assert_eq!(field, schema.get_field("field").unwrap()); + assert!(path.is_none()); + assert_eq!(regex, query.regex); + } + + #[test] + fn test_regex_query_json_field() { + let mut schema_builder = TantivySchema::builder(); + schema_builder.add_json_field("field", TEXT); + let schema = schema_builder.build(); + + let query = RegexQuery { + field: "field.sub.field".to_string(), + regex: "abc.*xyz".to_string(), + }; + let (field, path, regex) = query.to_field_and_regex(&schema).unwrap(); + assert_eq!(field, schema.get_field("field").unwrap()); + assert_eq!(path.unwrap(), b"sub\x01field\0s"); + assert_eq!(regex, query.regex); + + // i believe this is how concatenated field behave + let query_empty_path = RegexQuery { + field: "field".to_string(), + regex: "abc.*xyz".to_string(), + }; + let (field, path, regex) = query_empty_path.to_field_and_regex(&schema).unwrap(); + assert_eq!(field, schema.get_field("field").unwrap()); + assert_eq!(path.unwrap(), b"\0s"); + assert_eq!(regex, query_empty_path.regex); + } + + #[test] + fn test_json_prefix_automaton_empty_path() { + let regex = Arc::new(Regex::new("e(f|g.*)").unwrap()); + let empty_path_automaton = JsonPathPrefix { + prefix: Vec::new(), + automaton: regex.clone(), + }; + + let start = empty_path_automaton.start(); + assert_eq!(start, JsonPathPrefixState::Inner(regex.start())); + } + + #[test] + fn test_json_prefix_automaton() { + let regex = Arc::new(Regex::new("e(f|g.*)").unwrap()); + let automaton = 
JsonPathPrefix { + prefix: b"ab".to_vec(), + automaton: regex.clone(), + }; + + let start = automaton.start(); + assert!(matches!(start, JsonPathPrefixState::Prefix(_))); + assert!(automaton.can_match(&start)); + assert!(!automaton.is_match(&start)); + + let miss = automaton.accept(&start, b'g'); + assert_eq!(miss, JsonPathPrefixState::PrefixFailed); + // supporting this is important for optimisation + assert!(!automaton.can_match(&miss)); + assert!(!automaton.is_match(&miss)); + + let a = automaton.accept(&start, b'a'); + assert!(matches!(a, JsonPathPrefixState::Prefix(_))); + assert!(automaton.can_match(&a)); + assert!(!automaton.is_match(&a)); + + let ab = automaton.accept(&a, b'b'); + assert_eq!(ab, JsonPathPrefixState::Inner(regex.start())); + assert!(automaton.can_match(&ab)); + assert!(!automaton.is_match(&ab)); + + // starting here, we just take that we passthrough correctly, + // and reply to can_match as well as possible + // (we don't test will_always_match because Regex doesn't support it) + let abc = automaton.accept(&ab, b'c'); + assert!(matches!(abc, JsonPathPrefixState::Inner(_))); + assert!(!automaton.can_match(&abc)); + assert!(!automaton.is_match(&abc)); + + let abe = automaton.accept(&ab, b'e'); + assert!(matches!(abe, JsonPathPrefixState::Inner(_))); + assert!(automaton.can_match(&abe)); + assert!(!automaton.is_match(&abe)); + + let abef = automaton.accept(&abe, b'f'); + assert!(matches!(abef, JsonPathPrefixState::Inner(_))); + assert!(automaton.can_match(&abef)); + assert!(automaton.is_match(&abef)); + + let abefg = automaton.accept(&abef, b'g'); + assert!(matches!(abefg, JsonPathPrefixState::Inner(_))); + assert!(!automaton.can_match(&abefg)); + assert!(!automaton.is_match(&abefg)); + + let abeg = automaton.accept(&abe, b'g'); + assert!(matches!(abeg, JsonPathPrefixState::Inner(_))); + assert!(automaton.can_match(&abeg)); + assert!(automaton.is_match(&abeg)); + + let abegh = automaton.accept(&abeg, b'h'); + assert!(matches!(abegh, JsonPathPrefixState::Inner(_))); + assert!(automaton.can_match(&abegh)); + assert!(automaton.is_match(&abegh)); + } +} diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index cc36bc7480e..78640e6d6b0 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -21,19 +21,16 @@ use std::borrow::Cow; use std::sync::Arc; use anyhow::{bail, Context}; -pub use prefix::{AutomatonQuery, JsonPathPrefix}; use serde::{Deserialize, Serialize}; use tantivy::schema::{Field, FieldType, Schema as TantivySchema}; use tantivy::Term; use super::{BuildTantivyAst, QueryAst}; -use crate::query_ast::TantivyQueryAst; +use crate::query_ast::{AutomatonQuery, JsonPathPrefix, TantivyQueryAst}; use crate::tokenizers::TokenizerManager; use crate::{find_field_or_hit_dynamic, InvalidQuery}; /// A Wildcard query allows to match 'bond' with a query like 'b*d'. -/// -/// At the moment, only wildcard at end of term is supported. 
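
To make that doc line concrete with the helpers this patch factors out below (a sketch only: it assumes a &TokenizerManager in scope, that "default" names a registered normalizer, and a surrounding function returning anyhow::Result):

    let parts = parse_wildcard_query("b*nd");
    // literal parts are normalized then regex-escaped; `*` -> `.*`, `?` -> `.`
    let regex = sub_query_parts_to_regex(parts, "default", &tokenizer_manager)?;
    assert_eq!(regex, "b.*nd");
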
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize, Clone)] pub struct WildcardQuery { pub field: String, @@ -72,7 +69,8 @@ fn parse_wildcard_query(mut query: &str) -> Vec { res.push(SubQuery::Text(chr.to_string())); query = &query[chr.len_utf8()..]; } else { - // this is invalid, but let's just ignore that escape sequence + // escaping at the end is invalid, handle it as if that escape sequence wasn't + // present break; } } @@ -91,6 +89,36 @@ enum SubQuery { QuestionMark, } +fn sub_query_parts_to_regex( + sub_query_parts: Vec, + tokenizer_name: &str, + tokenizer_manager: &TokenizerManager, +) -> anyhow::Result { + let mut normalizer = tokenizer_manager + .get_normalizer(tokenizer_name) + .with_context(|| format!("no tokenizer named `{}` is registered", tokenizer_name))?; + + sub_query_parts + .into_iter() + .map(|part| match part { + SubQuery::Text(text) => { + let mut token_stream = normalizer.token_stream(&text); + let expected_token = token_stream + .next() + .context("normalizer generated no content")? + .text + .clone(); + if let Some(_unexpected_token) = token_stream.next() { + bail!("normalizer generated multiple tokens") + } + Ok(Cow::Owned(regex::escape(&expected_token))) + } + SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), + SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), + }) + .collect::>() +} + impl WildcardQuery { pub fn to_regex( &self, @@ -111,31 +139,8 @@ impl WildcardQuery { )) })?; let tokenizer_name = text_field_indexing.tokenizer(); - let mut normalizer = tokenizer_manager - .get_normalizer(tokenizer_name) - .with_context(|| { - format!("no tokenizer named `{}` is registered", tokenizer_name) - })?; - - let regex = sub_query_parts - .into_iter() - .map(|part| match part { - SubQuery::Text(text) => { - let mut token_stream = normalizer.token_stream(&text); - let expected_token = token_stream - .next() - .context("normalizer generated no content")? - .text - .clone(); - if let Some(_unexpected_token) = token_stream.next() { - bail!("normalizer generated multiple tokens") - } - Ok(Cow::Owned(regex::escape(&expected_token))) - } - SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), - SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), - }) - .collect::>()?; + let regex = + sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?; Ok((field, None, regex)) } @@ -148,11 +153,8 @@ impl WildcardQuery { )) })?; let tokenizer_name = text_field_indexing.tokenizer(); - let mut normalizer = tokenizer_manager - .get_normalizer(tokenizer_name) - .with_context(|| { - format!("no tokenizer named `{}` is registered", tokenizer_name) - })?; + let regex = + sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?; let mut term_for_path = Term::from_field_json_path( field, @@ -165,25 +167,7 @@ impl WildcardQuery { // We skip the 1st byte which is a marker to tell this is json. This isn't present // in the dictionary let byte_path_prefix = value.as_serialized()[1..].to_owned(); - let regex = sub_query_parts - .into_iter() - .map(|part| match part { - SubQuery::Text(text) => { - let mut token_stream = normalizer.token_stream(&text); - let expected_token = token_stream - .next() - .context("normalizer generated no content")? 
- .text - .clone(); - if let Some(_unexpected_token) = token_stream.next() { - bail!("normalizer generated multiple tokens") - } - Ok(Cow::Owned(regex::escape(&expected_token))) - } - SubQuery::Wildcard => Ok(Cow::Borrowed(".*")), - SubQuery::QuestionMark => Ok(Cow::Borrowed(".")), - }) - .collect::>()?; + Ok((field, Some(byte_path_prefix), regex)) } _ => Err(InvalidQuery::SchemaError( @@ -204,11 +188,11 @@ impl BuildTantivyAst for WildcardQuery { let (field, path, regex) = self.to_regex(schema, tokenizer_manager)?; let regex = tantivy_fst::Regex::new(®ex).context("failed to parse regex built from wildcard")?; - let regex_automaton_with_path = prefix::JsonPathPrefix { + let regex_automaton_with_path = JsonPathPrefix { prefix: path.unwrap_or_default(), automaton: regex.into(), }; - let regex_query_with_path = prefix::AutomatonQuery { + let regex_query_with_path = AutomatonQuery { field, automaton: Arc::new(regex_automaton_with_path), }; @@ -216,128 +200,6 @@ impl BuildTantivyAst for WildcardQuery { } } -mod prefix { - use std::sync::Arc; - - use tantivy::query::{AutomatonWeight, EnableScoring, Query, Weight}; - use tantivy::schema::Field; - use tantivy_fst::Automaton; - - pub struct JsonPathPrefix { - pub prefix: Vec, - pub automaton: Arc, - } - - // we need to implement manually because the std adds an unnecessary bound `A: Clone` - impl Clone for JsonPathPrefix { - fn clone(&self) -> Self { - JsonPathPrefix { - prefix: self.prefix.clone(), - automaton: self.automaton.clone(), - } - } - } - - #[derive(Clone)] - pub enum JsonPathPrefixState { - Prefix(usize), - Inner(A), - PrefixFailed, - } - - impl Automaton for JsonPathPrefix { - type State = JsonPathPrefixState; - - fn start(&self) -> Self::State { - if self.prefix.is_empty() { - JsonPathPrefixState::Inner(self.automaton.start()) - } else { - JsonPathPrefixState::Prefix(0) - } - } - - fn is_match(&self, state: &Self::State) -> bool { - match state { - JsonPathPrefixState::Prefix(_) => false, - JsonPathPrefixState::Inner(inner_state) => self.automaton.is_match(inner_state), - JsonPathPrefixState::PrefixFailed => false, - } - } - - fn accept(&self, state: &Self::State, byte: u8) -> Self::State { - match state { - JsonPathPrefixState::Prefix(i) => { - if self.prefix.get(*i) != Some(&byte) { - return JsonPathPrefixState::PrefixFailed; - } - let next_pos = i + 1; - if next_pos == self.prefix.len() { - JsonPathPrefixState::Inner(self.automaton.start()) - } else { - JsonPathPrefixState::Prefix(next_pos) - } - } - JsonPathPrefixState::Inner(inner_state) => { - JsonPathPrefixState::Inner(self.automaton.accept(inner_state, byte)) - } - JsonPathPrefixState::PrefixFailed => JsonPathPrefixState::PrefixFailed, - } - } - - fn can_match(&self, state: &Self::State) -> bool { - match state { - JsonPathPrefixState::Prefix(_) => true, - JsonPathPrefixState::Inner(inner_state) => self.automaton.can_match(inner_state), - JsonPathPrefixState::PrefixFailed => false, - } - } - - fn will_always_match(&self, state: &Self::State) -> bool { - match state { - JsonPathPrefixState::Prefix(_) => false, - JsonPathPrefixState::Inner(inner_state) => { - self.automaton.will_always_match(inner_state) - } - JsonPathPrefixState::PrefixFailed => false, - } - } - } - - pub struct AutomatonQuery { - pub automaton: Arc, - pub field: Field, - } - - impl std::fmt::Debug for AutomatonQuery { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - f.debug_struct("AutomatonQuery") - .field("field", &self.field) - .field("automaton", &std::any::type_name::()) - .finish() - 
} - } - - impl Clone for AutomatonQuery { - fn clone(&self) -> Self { - AutomatonQuery { - automaton: self.automaton.clone(), - field: self.field, - } - } - } - - impl Query for AutomatonQuery - where A::State: Clone - { - fn weight(&self, _enabled_scoring: EnableScoring<'_>) -> tantivy::Result> { - Ok(Box::new(AutomatonWeight::::new( - self.field, - self.automaton.clone(), - ))) - } - } -} - #[cfg(test)] mod tests { use tantivy::schema::{TextFieldIndexing, TextOptions}; diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml index 84c6bb3d790..3e11e8aa561 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml @@ -200,6 +200,16 @@ expected: total: value: 1 --- +json: + query: + regexp: + payload.description: + value: "jour.*" +expected: + hits: + total: + value: 3 +--- json: query: query_string: From 3f65e615d39c46a7fff6b65c68d4998b06d95424 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 10 Jan 2025 15:56:25 +0100 Subject: [PATCH 7/7] improve error messages Co-authored-by: Adrien Guillo --- quickwit/quickwit-query/src/query_ast/wildcard_query.rs | 2 +- quickwit/quickwit-search/src/leaf.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index e52268bf636..a5bdaf1b430 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -88,7 +88,7 @@ fn sub_query_parts_to_regex( ) -> anyhow::Result { let mut normalizer = tokenizer_manager .get_normalizer(tokenizer_name) - .with_context(|| format!("no tokenizer named `{}` is registered", tokenizer_name))?; + .with_context(|| format!("no tokenizer named `{tokenizer_name}` is registered"))?; sub_query_parts .into_iter() diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 5c10931501d..03e6a0ebc8b 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -375,7 +375,7 @@ async fn warm_up_automatons( match automaton { Automaton::Regex(path, regex_str) => { let regex = tantivy_fst::Regex::new(regex_str) - .context("failed parsing regex during warmup")?; + .context("failed to parse regex during warmup")?; inv_idx_clone .warm_postings_automaton( quickwit_query::query_ast::JsonPathPrefix { @@ -385,7 +385,7 @@ async fn warm_up_automatons( cpu_intensive_executor, ) .await - .context("failed loading automaton") + .context("failed to load automaton") } } });
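
Taken together, the regexp scenario added to the REST suite above exercises the same path a caller gets by constructing the AST node directly. A closing sketch (field and pattern mirror that YAML scenario):

    // The elastic `regexp` DSL lowers to this AST node via ConvertibleToQueryAst.
    let ast: QueryAst = RegexQuery {
        field: "payload.description".to_string(),
        regex: "jour.*".to_string(),
    }
    .into();

At query time the pattern is compiled with tantivy_fst::Regex, wrapped in JsonPathPrefix when the target field lives inside a JSON object, and executed as an AutomatonQuery; warm-up runs the same composed automaton through the search thread pool (patch 5), keeping the dictionary scan off the async runtime.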