diff --git a/.cargo/config b/.cargo/config new file mode 100644 index 0000000..33272c4 --- /dev/null +++ b/.cargo/config @@ -0,0 +1,5 @@ +[build] +rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"] + +[env] +CXXFLAGS = "-std=c++11 -Wno-unused-parameter" diff --git a/Cargo.toml b/Cargo.toml index 66cbacf..195b9a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,10 +16,11 @@ crate-type = ["lib"] [dependencies] -rustc-serialize = "0.3.24" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" stemmer = "0.3.2" -unicode-normalization = "0.1.19" -unicode-segmentation = "1.8.0" +unicode-normalization = "0.1.25" +unicode-segmentation = "1.12.0" noise_search_deps_rocksdb = "0.1.1" varint = "0.9.0" -uuid = { version = "1.1.2", features = ["v4"] } +uuid = { version = "1.18.1", features = ["v4"] } diff --git a/src/bin/debug_bbox.rs b/src/bin/debug_bbox.rs new file mode 100644 index 0000000..f34a7fa --- /dev/null +++ b/src/bin/debug_bbox.rs @@ -0,0 +1,74 @@ +use noise_search::index::{Batch, Index, OpenOptions}; +extern crate rocksdb; +use std::convert::TryInto; + +fn main() { + // Clean start + let dbname = "target/tests/debug_bbox_db"; + let _ = Index::drop(dbname); + + // Create index + let mut index = Index::open(dbname, Some(OpenOptions::Create)).unwrap(); + let mut batch = Batch::new(); + + // Add a point + println!("Adding point at [10.9, 48.4]"); + index.add(r#"{"_id": "point1", "geometry": {"type": "Point", "coordinates": [10.9, 48.4]}}"#, &mut batch).unwrap(); + index.flush(batch).unwrap(); + + println!("\nChecking what's stored in rtree:"); + let rtree_cf = index.rocks.cf_handle("rtree").unwrap(); + let iter = index.rocks.iterator_cf(rtree_cf, rocksdb::IteratorMode::Start).unwrap(); + + for (key, _value) in iter { + println!("Key length: {}", key.len()); + + // Parse the key + let mut offset = 0; + + // Read keypath length (varint) + let keypath_len = key[offset] as usize; + offset += 1; + + // Read keypath + let keypath = &key[offset..offset + keypath_len]; + let keypath_str = String::from_utf8_lossy(keypath); + println!("Keypath: '{}'", keypath_str); + offset += keypath_len; + + // Read IID (u64) + let iid_bytes: [u8; 8] = key[offset..offset + 8] + .try_into() + .expect("iid bytes"); + let iid = u64::from_le_bytes(iid_bytes); + println!("IID: {}", iid); + offset += 8; + + // Read bounding box (4 f64 values) + if key.len() >= offset + 32 { + let mut bbox = [0.0_f64; 4]; + for (idx, value) in bbox.iter_mut().enumerate() { + let start = offset + (idx * 8); + let bytes: [u8; 8] = key[start..start + 8] + .try_into() + .expect("bbox bytes"); + *value = f64::from_le_bytes(bytes); + } + println!("Bounding box: [{}, {}, {}, {}]", bbox[0], bbox[1], bbox[2], bbox[3]); + + // Check if point [10.9, 48.4] should be inside various test bboxes + println!("\nChecking query matches:"); + let test_bboxes = vec![ + ([-1000.0, -1000.0, 99.0, 1000.0], "[-1000, -1000, 99, 1000]"), + ([0.0, 0.0, 20.0, 50.0], "[0, 0, 20, 50]"), + ([10.0, 48.0, 11.0, 49.0], "[10, 48, 11, 49]"), + ]; + + for (query_bbox, desc) in test_bboxes { + let intersects = bbox[0] <= query_bbox[2] && bbox[2] >= query_bbox[0] && + bbox[1] <= query_bbox[3] && bbox[3] >= query_bbox[1]; + println!(" Query {}: intersects = {}", desc, intersects); + } + } + } +} diff --git a/src/bin/debug_bind_filter.rs b/src/bin/debug_bind_filter.rs new file mode 100644 index 0000000..2a964cd --- /dev/null +++ b/src/bin/debug_bind_filter.rs @@ -0,0 +1,64 @@ +use noise_search::index::{Batch, Index, OpenOptions}; +extern crate rocksdb; +use std::convert::TryInto; + +fn main() { + let dbname = "target/tests/debug_bind_filter"; + let _ = Index::drop(dbname); + + let mut index = Index::open(dbname, Some(OpenOptions::Create)).unwrap(); + let mut batch = Batch::new(); + + // Add document with array of points + index.add(r#"{"_id": "arraypoint", "area": [{"type": "Point", "coordinates": [10.9, 48.4]}, {"type": "Point", "coordinates": [-5.0, -20.1]}]}"#, &mut batch).unwrap(); + index.flush(batch).unwrap(); + + // Check what's in the rtree + println!("=== RTree entries ==="); + let rtree_cf = index.rocks.cf_handle("rtree").unwrap(); + let iter = index.rocks.iterator_cf(rtree_cf, rocksdb::IteratorMode::Start).unwrap(); + + for (key, value) in iter { + // Parse key + let mut offset = 0; + let keypath_len = key[offset] as usize; + offset += 1; + let keypath = String::from_utf8_lossy(&key[offset..offset + keypath_len]); + offset += keypath_len; + + let iid_bytes: [u8; 8] = key[offset..offset + 8] + .try_into() + .expect("iid bytes"); + let iid = u64::from_le_bytes(iid_bytes); + offset += 8; + + let mut bbox = [0.0_f64; 4]; + for (idx, value) in bbox.iter_mut().enumerate() { + let start = offset + (idx * 8); + let bytes: [u8; 8] = key[start..start + 8] + .try_into() + .expect("bbox bytes"); + *value = f64::from_le_bytes(bytes); + } + + // Parse value (arraypath) + let arraypath = if value.len() >= 8 { + let bytes: [u8; 8] = value[..8].try_into().expect("array path bytes"); + vec![u64::from_le_bytes(bytes)] + } else { + vec![] + }; + + println!("Keypath: '{}', IID: {}, BBox: {:?}, ArrayPath: {:?}", keypath, iid, bbox, arraypath); + } + + // Check main CF entries for area + println!("\n=== Main CF area entries ==="); + let main_iter = index.rocks.iterator(rocksdb::IteratorMode::Start); + for (key, _value) in main_iter { + let key_str = String::from_utf8_lossy(&key); + if key_str.contains(".area") && key_str.contains("coordinates") { + println!(" {}", key_str); + } + } +} diff --git a/src/bin/test_rtree_raw.rs b/src/bin/test_rtree_raw.rs new file mode 100644 index 0000000..258db1d --- /dev/null +++ b/src/bin/test_rtree_raw.rs @@ -0,0 +1,86 @@ +use noise_search::index::{Batch, Index, OpenOptions}; +extern crate rocksdb; +use rocksdb::IteratorMode; +use std::convert::TryInto; + +fn decode_rtree_key(key: &[u8]) -> (String, u64) { + let mut offset = 0; + + // Read keypath length (varint) + let keypath_len = key[offset] as usize; + offset += 1; + + // Read keypath + let keypath = &key[offset..offset + keypath_len]; + let keypath_str = String::from_utf8_lossy(keypath).to_string(); + offset += keypath_len; + + // Read IID (u64) + let iid_bytes: [u8; 8] = key[offset..offset + 8] + .try_into() + .expect("iid bytes"); + let iid = u64::from_le_bytes(iid_bytes); + + (keypath_str, iid) +} + +fn main() { + let dbname = "target/tests/test_rtree_raw"; + let _ = Index::drop(dbname); + + let mut index = Index::open(dbname, Some(OpenOptions::Create)).unwrap(); + let mut batch = Batch::new(); + + // Add test documents + println!("Adding documents..."); + index.add(r#"{"_id": "point1", "geometry": {"type": "Point", "coordinates": [10.9, 48.4]}}"#, &mut batch).unwrap(); + index.add(r#"{"_id": "point2", "geometry": {"type": "Point", "coordinates": [50.0, 50.0]}}"#, &mut batch).unwrap(); + index.add(r#"{"_id": "point3", "other": {"type": "Point", "coordinates": [10.0, 10.0]}}"#, &mut batch).unwrap(); + index.flush(batch).unwrap(); + + // Test 1: List all rtree entries + println!("\n=== All RTree entries ==="); + let rtree_cf = index.rocks.cf_handle("rtree").unwrap(); + let iter = index.rocks.iterator_cf(rtree_cf, IteratorMode::Start).unwrap(); + for (key, _value) in iter { + let (keypath, iid) = decode_rtree_key(&key); + println!(" Keypath: '{}', IID: {}", keypath, iid); + } + + // Test 2: Test RTree iterator with query + println!("\n=== Testing RTree spatial query ==="); + let snapshot = index.rocks.snapshot(); + + // Build a query bbox that should match point1 + let query_bbox = vec![ + 0.0f64.to_le_bytes().to_vec(), // west + 0.0f64.to_le_bytes().to_vec(), // south + 20.0f64.to_le_bytes().to_vec(), // east + 50.0f64.to_le_bytes().to_vec(), // north + ].concat(); + + println!("Query bbox: [0, 0, 20, 50] (should match point1)"); + + // Create the full query key + let keypath = ".geometry"; + let mut query = Vec::new(); + query.push(keypath.len() as u8); + query.extend_from_slice(keypath.as_bytes()); + query.extend_from_slice(&0u64.to_le_bytes()); // min IID + query.extend_from_slice(&u64::MAX.to_le_bytes()); // max IID + query.extend_from_slice(&query_bbox); + + println!("Query key length: {}", query.len()); + + let rtree_iter = snapshot.rtree_iterator(&query); + let mut count = 0; + for (key, _value) in rtree_iter { + count += 1; + let (keypath, iid) = decode_rtree_key(&key); + println!(" Result {}: Keypath: '{}', IID: {}", count, keypath, iid); + } + + if count == 0 { + println!(" No results returned by RTree iterator!"); + } +} diff --git a/src/filters.rs b/src/filters.rs index 61ba102..8ff5fcb 100644 --- a/src/filters.rs +++ b/src/filters.rs @@ -3,9 +3,10 @@ extern crate varint; use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::HashSet; +use std::convert::TryInto; use std::io::Cursor; use std::rc::Rc; -use std::{self, mem, str}; +use std::{self, str}; use self::varint::VarintRead; @@ -451,10 +452,11 @@ impl QueryRuntimeFilter for RangeFilter { } // Else it's a range query on numbers - let number = unsafe { - let array = *(value[..].as_ptr() as *const [_; 8]); - mem::transmute::<[u8; 8], f64>(array) - }; + let number = f64::from_le_bytes( + value[..8] + .try_into() + .expect("number bytes must be 8 long"), + ); let min_condition = match self.min { Some(RangeOperator::Inclusive(min)) => number >= min, @@ -506,15 +508,18 @@ pub struct BboxFilter<'a> { kb: KeyBuilder, bbox: Vec, term_ordinal: Option, + last_returned: Option<(u64, Vec)>, // (seq, arraypath) } impl<'a> BboxFilter<'a> { pub fn new(snapshot: Rc>, kb: KeyBuilder, bbox: [f64; 4]) -> BboxFilter<'a> { let mut bbox_vec = Vec::with_capacity(32); - bbox_vec.extend_from_slice(&bbox[0].to_le_bytes()); - bbox_vec.extend_from_slice(&bbox[2].to_le_bytes()); - bbox_vec.extend_from_slice(&bbox[1].to_le_bytes()); - bbox_vec.extend_from_slice(&bbox[3].to_le_bytes()); + // RTree expects bbox in order: [x_min, x_max, y_min, y_max] + // Input is [west, south, east, north] = [x_min, y_min, x_max, y_max] + bbox_vec.extend_from_slice(&bbox[0].to_le_bytes()); // x_min (west) + bbox_vec.extend_from_slice(&bbox[2].to_le_bytes()); // x_max (east) + bbox_vec.extend_from_slice(&bbox[1].to_le_bytes()); // y_min (south) + bbox_vec.extend_from_slice(&bbox[3].to_le_bytes()); // y_max (north) BboxFilter { snapshot, @@ -522,49 +527,84 @@ impl<'a> BboxFilter<'a> { kb, bbox: bbox_vec, term_ordinal: None, + last_returned: None, } } /// Function to deserialize the Arraypaths fn from_u8_slice(slice: &[u8]) -> Vec { - let u64_slice = - unsafe { std::slice::from_raw_parts(slice.as_ptr() as *const u64, slice.len() / 8) }; - u64_slice.to_vec() + slice + .chunks_exact(8) + .map(|chunk| { + u64::from_le_bytes( + chunk + .try_into() + .expect("array path entries are multiples of 8 bytes"), + ) + }) + .collect() } } impl<'a> QueryRuntimeFilter for BboxFilter<'a> { fn first_result(&mut self, start: &DocResult) -> Option { - let query = self - .kb - .rtree_query_key(start.seq, std::u64::MAX, &self.bbox); + // For array queries, we need to consider the context + let min_seq = if self.kb.arraypath.is_empty() { + // Not an array query, use normal start seq + start.seq + } else if start.seq > 0 && start.arraypath.len() == self.kb.arraypath.len() { + // Array query for a specific document - search within that document + // by using seq-1 to ensure we get all array elements + start.seq.saturating_sub(1) + } else { + start.seq + }; + + let query = self.kb.rtree_query_key(min_seq, std::u64::MAX, &self.bbox); self.iter = Some(self.snapshot.new_rtree_iterator(&query)); + // Reset last_returned to handle the start properly + self.last_returned = None; self.next_result() } fn next_result(&mut self) -> Option { let iter = self.iter.as_mut().unwrap(); - if let Some((key, value)) = iter.next() { - let mut vec = Vec::with_capacity(key.len()); - vec.extend_from_slice(&key); - let mut read = Cursor::new(vec); - let key_len = read.read_unsigned_varint_32().unwrap(); - let offset = read.position() as usize; - - let iid = unsafe { - let array = *(key[offset + key_len as usize..].as_ptr() as *const [_; 8]); - mem::transmute::<[u8; 8], u64>(array) - }; + + loop { + if let Some((key, value)) = iter.next() { + let mut vec = Vec::with_capacity(key.len()); + vec.extend_from_slice(&key); + let mut read = Cursor::new(vec); + let key_len = read.read_unsigned_varint_32().unwrap(); + let offset = read.position() as usize; + + let iid = u64::from_le_bytes( + key[offset + key_len as usize..offset + key_len as usize + 8] + .try_into() + .expect("iid bytes"), + ); + + // Get arraypath for this result + let arraypath = BboxFilter::from_u8_slice(&value); + + // Skip if we've already returned this exact result + if let Some((last_seq, ref last_arraypath)) = self.last_returned { + if iid < last_seq || (iid == last_seq && arraypath == *last_arraypath) { + continue; + } + } - let mut dr = DocResult::new(); - dr.seq = iid; - dr.arraypath = BboxFilter::from_u8_slice(&value); - if self.term_ordinal.is_some() { - dr.add_score(self.term_ordinal.unwrap(), 1.0); + let mut dr = DocResult::new(); + dr.seq = iid; + dr.arraypath = arraypath.clone(); + if self.term_ordinal.is_some() { + dr.add_score(self.term_ordinal.unwrap(), 1.0); + } + self.last_returned = Some((iid, arraypath)); + return Some(dr); + } else { + return None; } - Some(dr) - } else { - None } } @@ -1099,12 +1139,14 @@ impl<'a> BindFilter<'a> { fn collect_results(&mut self, mut first: DocResult) -> Option { let value_key = self.kb.kp_value_key_from_doc_result(&first); - first.add_bind_name_result(&self.bind_var_name, value_key); + first.add_bind_name_result(&self.bind_var_name, value_key.clone()); + + // Collect all results for the same document at the same array depth while let Some(next) = self.filter.next_result() { - if next.seq == first.seq { + if next.seq == first.seq && next.arraypath.len() == first.arraypath.len() { let value_key = self.kb.kp_value_key_from_doc_result(&next); - first.add_bind_name_result(&self.bind_var_name, value_key); + first.add_bind_name_result(&self.bind_var_name, value_key.clone()); } else { self.option_next = Some(next); return Some(first); diff --git a/src/index.rs b/src/index.rs index 65b5848..f4028cf 100644 --- a/src/index.rs +++ b/src/index.rs @@ -4,10 +4,10 @@ extern crate varint; use self::uuid::Uuid; use std::cmp::Ordering; +use std::convert::TryInto; use std::collections::{BTreeMap, HashSet}; use std::io::Cursor; use std::io::Write; -use std::mem; use std::str; use std::sync::{LockResult, Mutex, MutexGuard}; @@ -119,7 +119,7 @@ impl Index { &self.name } - pub fn new_snapshot(&self) -> Snapshot { + pub fn new_snapshot(&self) -> Snapshot<'_> { Snapshot::new(RocksSnapshot::new(&self.rocks)) } @@ -179,7 +179,11 @@ impl Index { } /// Query the index with the string and use the parameters for values if passed. - pub fn query(&self, query: &str, parameters: Option) -> Result { + pub fn query( + &self, + query: &str, + parameters: Option, + ) -> Result, Error> { QueryResults::new_query_results(query, parameters, self.new_snapshot()) } @@ -235,14 +239,17 @@ impl Index { /// since only one header is in the database it's not a problem with excess size. fn convert_bytes_to_u64(bytes: &[u8]) -> u64 { debug_assert!(bytes.len() == 8); - let mut buffer = [0; 8]; - for (n, b) in bytes.iter().enumerate() { - buffer[n] = *b; - } - unsafe { mem::transmute(buffer) } + u64::from_le_bytes( + bytes + .try_into() + .expect("convert_bytes_to_u64 requires exactly 8 bytes"), + ) } pub fn convert_bytes_to_i32(bytes: &[u8]) -> i32 { + if bytes.is_empty() { + return 0; + } let mut vec = Vec::with_capacity(bytes.len()); vec.extend(bytes.iter()); let mut read = Cursor::new(vec); @@ -299,6 +306,11 @@ impl Index { existing_val: Option<&[u8]>, operands: &mut MergeOperands, ) -> Vec { + // Safety check: ensure new_key is not empty + if new_key.is_empty() { + return Vec::new(); + } + if !(new_key[0] as char == key_builder::KEY_PREFIX_FIELD_COUNT || new_key[0] as char == key_builder::KEY_PREFIX_WORD_COUNT) { @@ -306,13 +318,19 @@ impl Index { } let mut count = if let Some(bytes) = existing_val { - Index::convert_bytes_to_i32(bytes) + if bytes.is_empty() { + 0 + } else { + Index::convert_bytes_to_i32(bytes) + } } else { 0 }; for bytes in operands { - count += Index::convert_bytes_to_i32(bytes); + if !bytes.is_empty() { + count += Index::convert_bytes_to_i32(bytes); + } } Index::convert_i32_to_bytes(count) } @@ -352,28 +370,38 @@ impl Index { } // Keypaths are the same, compare the Internal Ids value - let seq_aa = unsafe { - let array = *(aa[(offset_aa)..].as_ptr() as *const [_; 8]); - mem::transmute::<[u8; 8], u64>(array) - }; - let seq_bb = unsafe { - let array = *(bb[(offset_bb)..].as_ptr() as *const [_; 8]); - mem::transmute::<[u8; 8], u64>(array) - }; + let seq_aa = u64::from_le_bytes( + aa[offset_aa..offset_aa + 8] + .try_into() + .expect("sequence bytes"), + ); + let seq_bb = u64::from_le_bytes( + bb[offset_bb..offset_bb + 8] + .try_into() + .expect("sequence bytes"), + ); let seq_compare = seq_aa.cmp(&seq_bb); if seq_compare != Ordering::Equal { return seq_compare; } // Internal Ids are the same, compare the bounding box - let bbox_aa = unsafe { - let array = *(aa[(offset_aa + 8)..].as_ptr() as *const [_; 32]); - mem::transmute::<[u8; 32], [f64; 4]>(array) - }; - let bbox_bb = unsafe { - let array = *(bb[(offset_bb + 8)..].as_ptr() as *const [_; 32]); - mem::transmute::<[u8; 32], [f64; 4]>(array) - }; + let mut bbox_aa = [0.0_f64; 4]; + let mut bbox_bb = [0.0_f64; 4]; + for idx in 0..4 { + let start_aa = offset_aa + 8 + (idx * 8); + bbox_aa[idx] = f64::from_le_bytes( + aa[start_aa..start_aa + 8] + .try_into() + .expect("bbox bytes"), + ); + let start_bb = offset_bb + 8 + (idx * 8); + bbox_bb[idx] = f64::from_le_bytes( + bb[start_bb..start_bb + 8] + .try_into() + .expect("bbox bytes"), + ); + } for (value_aa, value_bb) in bbox_aa.iter().zip(bbox_bb.iter()) { let value_compare = value_aa.partial_cmp(value_bb).unwrap(); @@ -406,7 +434,7 @@ impl MvccRwLock { unsafe { &*self.raw } } - pub fn write(&self) -> LockResult>> { + pub fn write(&self) -> LockResult>> { self.lock.lock() } } diff --git a/src/json_shred.rs b/src/json_shred.rs index efcaaaa..aea310d 100644 --- a/src/json_shred.rs +++ b/src/json_shred.rs @@ -1,28 +1,19 @@ extern crate rocksdb; -extern crate rustc_serialize; extern crate varint; use std::collections::{BTreeMap, HashMap}; use std::io::Cursor; use std::io::Write; -use std::str::Chars; use std::{self, f64, str}; -use self::rustc_serialize::json::{JsonEvent, Parser, StackElement}; use self::varint::VarintWrite; +use serde_json::{self, Value as SerdeJsonValue}; use crate::error::Error; use crate::index::Index; use crate::key_builder::KeyBuilder; use crate::stems::Stems; -// Good example of using rustc_serialize: -// https://github.com/ajroetker/beautician/blob/master/src/lib.rs -// Callback based JSON streaming parser: -// https://github.com/gyscos/json-streamer.rs -// Another parser pased on rustc_serializ: -// https://github.com/isagalaev/ijson-rust/blob/master/src/test.rs#L11 - /// Key-value pairs, where the key is the path to the value, the value is the actual value. pub(crate) type KeyValues = BTreeMap>; @@ -45,6 +36,27 @@ enum ObjectKeyTypes { NoKey, } +#[derive(Debug)] +struct GeometryState { + seen_type: bool, + seen_coordinates: bool, + coordinates_depth: u32, + is_first: bool, + bounding_box: [f64; 4], +} + +impl GeometryState { + fn new() -> Self { + GeometryState { + seen_type: false, + seen_coordinates: false, + coordinates_depth: 0, + is_first: true, + bounding_box: [f64::MAX, f64::MAX, f64::MIN, f64::MIN], + } + } +} + #[derive(Debug)] pub struct Shredder { kb: KeyBuilder, @@ -52,17 +64,7 @@ pub struct Shredder { object_keys_indexed: Vec, shredded_key_values: KeyValues, existing_key_value_to_delete: KeyValues, - // Whether the current object is a GeoJSON geometry or not. It's a counter that increases - // the more of the geometry was seen. It is increased for the following cases: - // - a `type` field with a valid GeoJSON geometry type as value - // - a `coordinates` field with an array with numbers. It is not verified if the coordinates - // actually match the type. - // So if it is equal to 2, it is considered being a proper GeoJSON geometry - maybe_geometry: u8, - // Currently only 2D GeoJSON is supported. Hence only distinguish between first and other - // coordinates - is_first: bool, - bounding_box: [f64; 4], + geometry_stack: Vec, } impl Shredder { @@ -73,9 +75,7 @@ impl Shredder { object_keys_indexed: Vec::new(), shredded_key_values: BTreeMap::new(), existing_key_value_to_delete: BTreeMap::new(), - maybe_geometry: 0, - is_first: true, - bounding_box: [f64::MAX, f64::MAX, f64::MIN, f64::MIN], + geometry_stack: Vec::new(), } } @@ -208,15 +208,17 @@ impl Shredder { } fn calc_mbb(&mut self, value: f64) { - if self.maybe_geometry > 0 { - if self.is_first { - self.bounding_box[0] = self.bounding_box[0].min(value); - self.bounding_box[2] = self.bounding_box[0].max(value); - } else { - self.bounding_box[1] = self.bounding_box[1].min(value); - self.bounding_box[3] = self.bounding_box[3].max(value); + if let Some(state) = self.geometry_stack.last_mut() { + if state.coordinates_depth > 0 { + if state.is_first { + state.bounding_box[0] = state.bounding_box[0].min(value); + state.bounding_box[2] = state.bounding_box[0].max(value); + } else { + state.bounding_box[1] = state.bounding_box[1].min(value); + state.bounding_box[3] = state.bounding_box[3].max(value); + } + state.is_first = !state.is_first; } - self.is_first = !self.is_first; } } @@ -229,13 +231,8 @@ impl Shredder { Ok(()) } - fn maybe_add_value( - &mut self, - parser: &Parser, - code: char, - value: &[u8], - ) -> Result<(), Error> { - match self.extract_key(parser.stack().top()) { + fn maybe_add_value(&mut self, key: Option<&str>, code: char, value: &[u8]) -> Result<(), Error> { + match self.extract_key(key) { ObjectKeyTypes::Id => { if code != 's' && self.kb.kp_segments_len() == 1 { //nested fields can be _id, not root fields @@ -250,14 +247,6 @@ impl Shredder { self.add_value(code, value)?; } ObjectKeyTypes::Key(key) => { - if key == "type" { - let is_valid_type = GEOJSON_TYPES - .iter() - .position(|&tt| tt == unsafe { str::from_utf8_unchecked(value) }); - if is_valid_type.is_some() { - self.maybe_geometry += 1; - } - } // Pop the dummy object that makes ObjectEnd happy // or the previous object key self.kb.pop_object_key(); @@ -276,9 +265,9 @@ impl Shredder { // Extract key if it exists and indicates if it's a special type of key // It is called when the value is a primitive type. If it is an array or object, then // `maybe_push_key` is called. - fn extract_key(&mut self, stack_element: Option) -> ObjectKeyTypes { - match stack_element { - Some(StackElement::Key(key)) => { + fn extract_key(&mut self, key: Option<&str>) -> ObjectKeyTypes { + match key { + Some(key) => { if self.kb.kp_segments_len() == 1 && key == "_id" { ObjectKeyTypes::Id } else { @@ -293,12 +282,9 @@ impl Shredder { // Don't push them if they are reserved fields (starting with underscore) // It is called when the value is an array or object. If it is a primitive type, then // `extract_key` is called. - fn maybe_push_key( - &mut self, - stack_element: Option, - ) -> Result, Error> { - match stack_element { - Some(StackElement::Key(key)) => { + fn maybe_push_key(&mut self, key: Option<&str>) -> Result, Error> { + match key { + Some(key) => { if self.kb.kp_segments_len() == 1 && key == "_id" { return Err(Error::Shred( "Expected string for `_id` field, got another type".to_string(), @@ -316,6 +302,120 @@ impl Shredder { } } + fn walk_value(&mut self, key: Option<&str>, value: &SerdeJsonValue) -> Result<(), Error> { + match value { + SerdeJsonValue::Object(map) => self.walk_object(key, map), + SerdeJsonValue::Array(values) => self.walk_array(key, values), + SerdeJsonValue::String(text) => { + if let Some("type") = key { + if GEOJSON_TYPES + .iter() + .any(|&tt| tt == text.as_str()) + { + if let Some(state) = self.geometry_stack.last_mut() { + state.seen_type = true; + } + } + } + self.maybe_add_value(key, 's', text.as_bytes()) + } + SerdeJsonValue::Bool(tf) => { + let code = if *tf { 'T' } else { 'F' }; + self.maybe_add_value(key, code, &[]) + } + SerdeJsonValue::Number(num) => { + if let Some(i) = num.as_i64() { + let f = i as f64; + self.calc_mbb(f); + self.maybe_add_value(key, 'f', &f.to_le_bytes()) + } else if let Some(u) = num.as_u64() { + let f = u as f64; + self.calc_mbb(f); + self.maybe_add_value(key, 'f', &f.to_le_bytes()) + } else if let Some(f) = num.as_f64() { + self.calc_mbb(f); + self.maybe_add_value(key, 'f', &f.to_le_bytes()) + } else { + Err(Error::Shred("Unable to parse number value".to_string())) + } + } + SerdeJsonValue::Null => self.maybe_add_value(key, 'N', &[]), + } + } + + fn walk_object( + &mut self, + key: Option<&str>, + map: &serde_json::Map, + ) -> Result<(), Error> { + self.maybe_push_key(key)?; + self.kb.push_object_key(""); + self.object_keys_indexed.push(false); + self.geometry_stack.push(GeometryState::new()); + + for (child_key, child_value) in map.iter() { + self.walk_value(Some(child_key), child_value)?; + } + + self.kb.pop_object_key(); + let indexed = self.object_keys_indexed.pop().unwrap_or(false); + if self.kb.kp_segments_len() > 0 && !indexed { + self.maybe_add_value(key, 'o', &[])?; + } + + if let Some(state) = self.geometry_stack.pop() { + if state.seen_type && state.seen_coordinates { + let mut encoded_bbox = Vec::new(); + encoded_bbox.extend_from_slice(&state.bounding_box[0].to_le_bytes()); + encoded_bbox.extend_from_slice(&state.bounding_box[2].to_le_bytes()); + encoded_bbox.extend_from_slice(&state.bounding_box[1].to_le_bytes()); + encoded_bbox.extend_from_slice(&state.bounding_box[3].to_le_bytes()); + let _ = self.add_value('r', encoded_bbox.as_slice()); + } + } + + self.kb.inc_top_array_index(); + Ok(()) + } + + fn walk_array(&mut self, key: Option<&str>, values: &[SerdeJsonValue]) -> Result<(), Error> { + self.maybe_push_key(key)?; + let mut increased_depth = false; + if let Some(state) = self.geometry_stack.last_mut() { + if key == Some("coordinates") { + state.seen_coordinates = true; + state.coordinates_depth += 1; + increased_depth = true; + } else if state.coordinates_depth > 0 { + state.coordinates_depth += 1; + increased_depth = true; + } + } + self.kb.push_array(); + + if values.is_empty() { + self.kb.pop_array(); + self.maybe_add_value(key, 'a', &[])?; + self.kb.inc_top_array_index(); + return Ok(()); + } + + for value in values { + self.walk_value(None, value)?; + } + + self.kb.pop_array(); + if increased_depth { + if let Some(state) = self.geometry_stack.last_mut() { + if state.coordinates_depth > 0 { + state.coordinates_depth -= 1; + } + } + } + self.kb.inc_top_array_index(); + Ok(()) + } + pub fn add_all_to_batch( &mut self, seq: u64, @@ -488,95 +588,17 @@ impl Shredder { } pub fn shred(&mut self, json: &str) -> Result, Error> { - let mut parser = Parser::new(json.chars()); - loop { - // Get the next token, so that in case of an `ObjectStart` the key is already - // on the stack. - match parser.next().take() { - Some(JsonEvent::ObjectStart) => { - self.maybe_push_key(parser.stack().top())?; - // Just push something to make `ObjectEnd` happy - self.kb.push_object_key(""); - self.object_keys_indexed.push(false); - } - Some(JsonEvent::ObjectEnd) => { - self.kb.pop_object_key(); - if self.kb.kp_segments_len() > 0 && !self.object_keys_indexed.pop().unwrap() { - // this means we never wrote a key because the object was empty. - // So preserve the empty object by writing a special value. - // but not for the root object. it will always have _id field added. - self.maybe_add_value(&parser, 'o', &[])?; - } - if self.maybe_geometry == 2 { - let mut encoded_bbox = Vec::new(); - encoded_bbox.extend_from_slice(&self.bounding_box[0].to_le_bytes()); - encoded_bbox.extend_from_slice(&self.bounding_box[2].to_le_bytes()); - encoded_bbox.extend_from_slice(&self.bounding_box[1].to_le_bytes()); - encoded_bbox.extend_from_slice(&self.bounding_box[3].to_le_bytes()); - - let _ = self.add_value('r', encoded_bbox.as_slice()); - } - // Reset the values as it either wasn't a valid geometry, or it was already - // succcessfully processed - self.maybe_geometry = 0; - self.bounding_box = [f64::MAX, f64::MAX, f64::MIN, f64::MIN]; + self.doc_id = None; + self.kb.clear(); + self.object_keys_indexed.clear(); + self.shredded_key_values.clear(); + self.existing_key_value_to_delete.clear(); + self.geometry_stack.clear(); + + let value: SerdeJsonValue = + serde_json::from_str(json).map_err(|err| Error::Shred(err.to_string()))?; + self.walk_value(None, &value)?; - self.kb.inc_top_array_index(); - } - Some(JsonEvent::ArrayStart) => { - let key = self.maybe_push_key(parser.stack().top())?; - if key == Some("coordinates".to_string()) { - self.maybe_geometry += 1; - } - self.kb.push_array(); - } - #[allow(clippy::branches_sharing_code)] // false positive - Some(JsonEvent::ArrayEnd) => { - if self.kb.peek_array_index() == 0 { - // this means we never wrote a value because the object was empty. - // So preserve the empty array by writing a special value. - self.kb.pop_array(); - self.maybe_add_value(&parser, 'a', &[])?; - } else { - self.kb.pop_array(); - } - self.kb.inc_top_array_index(); - } - Some(JsonEvent::StringValue(value)) => { - self.maybe_add_value(&parser, 's', value.as_bytes())?; - } - Some(JsonEvent::BooleanValue(tf)) => { - let code = if tf { 'T' } else { 'F' }; - self.maybe_add_value(&parser, code, &[])?; - } - Some(JsonEvent::I64Value(i)) => { - let f = i as f64; - self.calc_mbb(f); - let bytes = f.to_le_bytes(); - self.maybe_add_value(&parser, 'f', &bytes[..])?; - } - Some(JsonEvent::U64Value(u)) => { - let f = u as f64; - self.calc_mbb(f); - let bytes = f.to_le_bytes(); - self.maybe_add_value(&parser, 'f', &bytes[..])?; - } - Some(JsonEvent::F64Value(f)) => { - self.calc_mbb(f); - let bytes = f.to_le_bytes(); - self.maybe_add_value(&parser, 'f', &bytes[..])?; - } - Some(JsonEvent::NullValue) => { - self.maybe_add_value(&parser, 'N', &[])?; - } - Some(JsonEvent::Error(error)) => { - return Err(Error::Shred(error.to_string())); - } - None => { - break; - } - }; - } Ok(self.doc_id.clone()) } } diff --git a/src/key_builder.rs b/src/key_builder.rs index 3ed0ca5..197822c 100644 --- a/src/key_builder.rs +++ b/src/key_builder.rs @@ -119,6 +119,15 @@ impl KeyBuilder { pub fn parse_seq_key(key: &str) -> Option { key.strip_prefix('S')?.parse().ok() } + + /// Get the keypath as a concatenated string + pub fn keypath_string(&self) -> String { + let mut keypath = String::with_capacity(100); + for segment in &self.keypath { + keypath.push_str(segment); + } + keypath + } /// Build key to query an R-tree pub fn rtree_query_key(&self, seq_min: u64, seq_max: u64, bbox: &[u8]) -> Vec { diff --git a/src/lib.rs b/src/lib.rs index ed64703..c1d6418 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,3 +13,4 @@ pub mod repl; mod returnable; mod snapshot; mod stems; +mod test_geometry; diff --git a/src/query.rs b/src/query.rs index 150178e..71f5c41 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,5 +1,3 @@ -extern crate rustc_serialize; - use std::cmp::Ordering; use std::collections::HashMap; use std::collections::VecDeque; @@ -9,7 +7,6 @@ use std::rc::Rc; use std::str; use std::usize; -use self::rustc_serialize::json::{JsonEvent, Parser as JsonParser, StackElement}; use crate::aggregates::{AggregateActionFun, AggregateExtractFun, AggregateFun, AggregateInitFun}; use crate::error::Error; use crate::filters::QueryRuntimeFilter; @@ -17,6 +14,7 @@ use crate::json_value::JsonValue; use crate::parser::Parser; use crate::returnable::{RetHidden, RetScore, RetValue, ReturnPath, Returnable}; use crate::snapshot::{JsonFetcher, Snapshot}; +use serde_json::Value as SerdeJsonValue; #[derive(Clone)] pub struct DocResult { @@ -224,7 +222,7 @@ impl<'a> QueryResults<'a> { return Err(Error::Parse( "query cannot be made up of only logical not. Must have at \ least one match clause not negated." - .to_string(), + .to_owned(), )); } @@ -242,7 +240,7 @@ impl<'a> QueryResults<'a> { returnable = if has_ordering && has_ags { return Err(Error::Parse( - "Cannot have aggregates and ordering in the same query".to_string(), + "Cannot have aggregates and ordering in the same query".to_owned(), )); } else if has_ordering { returnable.take_order_for_matching_fields(&mut orders); @@ -282,9 +280,8 @@ impl<'a> QueryResults<'a> { for option_ag in ags.iter() { if option_ag.is_none() { return Err(Error::Parse( - "Return keypaths must either all have \ - aggregate functions, or none can them." - .to_string(), + "Return keypaths must either all have aggregate functions, or none can them." + .to_owned(), )); } } @@ -383,81 +380,39 @@ impl<'a> QueryResults<'a> { } fn parse_parameters(params: &str) -> Result, Error> { - let mut parser = JsonParser::new(params.chars()); let err_msg: String = "Parameterized query values must be String, Number, / True, False, or Null" - .to_string(); - if parser.next().take() != Some(JsonEvent::ObjectStart) { - return Err(Error::Parse("Parameters must be json object".to_string())); - } + .to_owned(); + let value: SerdeJsonValue = + serde_json::from_str(params).map_err(|err| Error::Parse(err.to_string()))?; + let object = match value { + SerdeJsonValue::Object(map) => map, + _ => return Err(Error::Parse("Parameters must be json object".to_owned())), + }; + let mut map: HashMap = HashMap::new(); - loop { - // Get the next token, so that in case of an `ObjectStart` the key is already - // on the stack. - match parser.next().take() { - Some(JsonEvent::ObjectStart) => return Err(Error::Parse(err_msg)), - Some(JsonEvent::ObjectEnd) => (), - Some(JsonEvent::ArrayStart) => return Err(Error::Parse(err_msg)), - Some(JsonEvent::ArrayEnd) => (), - Some(JsonEvent::StringValue(value)) => { - if let Some(StackElement::Key(key)) = parser.stack().top() { - map.insert(key.to_string(), JsonValue::String(value)); - } else { - panic!("Top of stack isn't a key!"); - } - } - Some(JsonEvent::BooleanValue(tf)) => { - if let Some(StackElement::Key(key)) = parser.stack().top() { - map.insert( - key.to_string(), - if tf { - JsonValue::True - } else { - JsonValue::False - }, - ); - } else { - panic!("Top of stack isn't a key!"); - } - } - Some(JsonEvent::I64Value(i)) => { - let f = i as f64; - if let Some(StackElement::Key(key)) = parser.stack().top() { - map.insert(key.to_string(), JsonValue::Number(f)); - } else { - panic!("Top of stack isn't a key!"); - } - } - Some(JsonEvent::U64Value(u)) => { - let f = u as f64; - if let Some(StackElement::Key(key)) = parser.stack().top() { - map.insert(key.to_string(), JsonValue::Number(f)); + for (key, json_value) in object { + let parsed_value = match json_value { + SerdeJsonValue::String(text) => JsonValue::String(text), + SerdeJsonValue::Number(number) => { + if let Some(f) = number.as_f64() { + JsonValue::Number(f) + } else if let Some(i) = number.as_i64() { + JsonValue::Number(i as f64) + } else if let Some(u) = number.as_u64() { + JsonValue::Number(u as f64) } else { - panic!("Top of stack isn't a key!"); + return Err(Error::Parse(err_msg.clone())); } } - Some(JsonEvent::F64Value(f)) => { - if let Some(StackElement::Key(key)) = parser.stack().top() { - map.insert(key.to_string(), JsonValue::Number(f)); - } else { - panic!("Top of stack isn't a key!"); - } - } - Some(JsonEvent::NullValue) => { - if let Some(StackElement::Key(key)) = parser.stack().top() { - map.insert(key.to_string(), JsonValue::Null); - } else { - panic!("Top of stack isn't a key!"); - } - } - Some(JsonEvent::Error(error)) => { - return Err(Error::Parse(format!("Error parsing parameters: {}", error))); - } - None => { - break; - } + SerdeJsonValue::Bool(true) => JsonValue::True, + SerdeJsonValue::Bool(false) => JsonValue::False, + SerdeJsonValue::Null => JsonValue::Null, + _ => return Err(Error::Parse(err_msg.clone())), }; + map.insert(key, parsed_value); } + Ok(map) } @@ -798,8 +753,6 @@ pub struct OrderInfo { #[cfg(test)] mod tests { - extern crate rustc_serialize; - use crate::index::{Batch, Index, OpenOptions}; #[test] diff --git a/src/snapshot.rs b/src/snapshot.rs index 83be04f..7803ecd 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -2,10 +2,10 @@ use rocksdb::{self, DBIterator, IteratorMode, Snapshot as RocksSnapshot}; extern crate varint; +use std::convert::TryInto; use std::f32; use std::io::Cursor; use std::iter::Peekable; -use std::mem::transmute; use std::str; use self::varint::VarintRead; @@ -210,11 +210,11 @@ impl JsonFetcher { } 'f' => { assert!(bytes.len() == 9); - let mut bytes2: [u8; 8] = [0; 8]; - for (n, b) in bytes[1..9].iter().enumerate() { - bytes2[n] = *b; - } - let double: f64 = unsafe { transmute(bytes2) }; + let double = f64::from_le_bytes( + bytes[1..9] + .try_into() + .expect("number bytes must be 8 long"), + ); JsonValue::Number(double) } 'T' => JsonValue::True, diff --git a/src/stems.rs b/src/stems.rs index 2c00031..caebbf7 100644 --- a/src/stems.rs +++ b/src/stems.rs @@ -21,7 +21,7 @@ pub struct StemmedWord { } impl<'a> Stems<'a> { - pub fn new(text: &str) -> Stems { + pub fn new(text: &'a str) -> Stems<'a> { Stems { words: text.split_word_bound_indices(), stemmer: Stemmer::new("english").unwrap(),