Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor and optimize document parsing #38

Merged
merged 7 commits into from
Dec 18, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
opt: checking whitespace using lookup
liuq19 committed Dec 15, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 361b28ae441058959f1f581f4e6422d113cfdd30
25 changes: 19 additions & 6 deletions src/parser.rs
Original file line number Diff line number Diff line change
@@ -71,8 +71,11 @@ fn get_escaped_branchless_u64(prev_escaped: &mut u64, backslash: u64) -> u64 {
}

/// Returns `true` when `ch` is one of the four whitespace bytes tested here:
/// space, carriage return, line feed, or horizontal tab.
#[inline(always)]
fn is_whitespace(ch: u8) -> bool {
ch == b' ' || ch == b'\r' || ch == b'\n' || ch == b'\t'
pub(crate) fn is_whitespace(ch: u8) -> bool {
// NOTE: the compiler does not optimize the comparison chain into a lookup, so the bitmask is hard-coded here.
// Bit `n` of `SPACE_MASK` is set exactly when byte value `n` is one of the four whitespace bytes.
const SPACE_MASK: u64 = (1u64 << b' ') | (1u64 << b'\r') | (1u64 << b'\n') | (1u64 << b'\t');
// `checked_shl` yields `None` for shift amounts >= 64, so bytes that do not fit in the mask report `false`.
1u64.checked_shl(ch as u32)
.is_some_and(|v| v & SPACE_MASK != 0)
}

#[inline(always)]
@@ -1066,10 +1069,20 @@ where
// Parse the colon `:` of an object entry.
// Yields `Ok(())` when a `:` is found, reports `ExpectedColon` through `perr!` when a
// different byte is found, and reports `EofWhileParsing` when no byte is available.
#[inline(always)]
pub(crate) fn parse_object_clo(&mut self) -> Result<()> {
match self.skip_space() {
Some(b':') => Ok(()),
Some(_) => perr!(self, ExpectedColon),
None => perr!(self, EofWhileParsing),
if let Some(ch) = self.read.peek() {
// Fast path for compact JSON: the very next byte is the colon, so consume it
// directly without going through `skip_space`.
if ch == b':' {
self.read.eat(1);
return Ok(());
}

// Slow path: let `skip_space` produce the byte to inspect
// (presumably the first non-whitespace byte — confirm against `skip_space`).
match self.skip_space() {
Some(b':') => Ok(()),
Some(_) => perr!(self, ExpectedColon),
None => perr!(self, EofWhileParsing),
}
} else {
// `peek` returned `None`: there is no byte left to read.
perr!(self, EofWhileParsing)
}
}

2 changes: 2 additions & 0 deletions src/reader.rs
Original file line number Diff line number Diff line change
@@ -60,6 +60,8 @@ pub trait Reader<'de>: Sealed {
fn at(&self, index: usize) -> u8;
fn set_index(&mut self, index: usize);
fn next_n(&mut self, n: usize) -> Option<&'de [u8]>;

#[inline(always)]
fn next(&mut self) -> Option<u8> {
self.peek().map(|a| {
self.eat(1);
35 changes: 30 additions & 5 deletions src/serde/de.rs
Original file line number Diff line number Diff line change
@@ -2,14 +2,19 @@

// The code is cloned from [serde_json](https://github.com/serde-rs/json), with the necessary parts modified.

use crate::error::{
Error,
ErrorCode::{self, EofWhileParsing, RecursionLimitExceeded},
Result,
};
use std::ptr::slice_from_raw_parts;

use crate::parser::{as_str, Parser};
use crate::reader::{Reader, Reference, SliceRead};
use crate::util::num::ParserNumber;
use crate::{
error::{
Error,
ErrorCode::{self, EofWhileParsing, RecursionLimitExceeded},
Result,
},
Document,
};

use serde::de::{self, Expected, Unexpected};
use serde::forward_to_deserialize_any;
@@ -235,6 +240,24 @@ impl<'de, R: Reader<'de>> Deserializer<R> {
visitor.visit_borrowed_str(raw)
}

/// Parses the next JSON value into a `Document` and hands it to `visitor`
/// as the raw bytes of the `Document` value itself, via `visit_bytes`.
///
/// The visitor is expected to rebuild the `Document` by copying those bytes,
/// which transfers ownership to it. This function therefore never drops the
/// `Document` it created.
///
/// # Errors
/// Propagates any error from skipping the value, from building the document,
/// or from the visitor. NOTE(review): if the visitor fails without taking
/// the bytes, the document's resources are leaked rather than freed (the
/// same as before this change) — confirm this is acceptable.
fn deserialize_document<V>(&mut self, visitor: V) -> Result<V::Value>
where
    V: de::Visitor<'de>,
{
    // SAFETY: per the original author's note, the JSON has been validated
    // before this point, which is what the `*_unchecked` calls rely on.
    // The byte view below covers exactly `size_of::<Document>()` bytes of a
    // live `Document` that stays in place for the whole `visit_bytes` call.
    unsafe {
        let raw = as_str(self.parser.skip_one_unchecked()?);
        // `ManuallyDrop` gives up ownership without moving the value.
        // Calling `mem::forget(dom)` after taking a pointer to `dom` would
        // move it first, leaving the byte view pointing at a moved-from local.
        let dom = std::mem::ManuallyDrop::new(crate::dom_from_slice_unchecked(raw.as_bytes())?);
        let binary = &*slice_from_raw_parts(
            &*dom as *const Document as *const u8,
            std::mem::size_of::<Document>(),
        );
        visitor.visit_bytes(binary)
    }
}

// we deserialize json number from string or number types
fn deserialize_json_number<V>(&mut self, visitor: V) -> Result<V::Value>
where
@@ -560,6 +583,8 @@ impl<'de, 'a, R: Reader<'de>> de::Deserializer<'de> for &'a mut Deserializer<R>
return self.deserialize_json_number(visitor);
} else if name == crate::lazyvalue::TOKEN {
return self.deserialize_lazy_value(visitor);
} else if name == crate::value::TOKEN {
return self.deserialize_document(visitor);
}
}

1 change: 1 addition & 0 deletions src/util/arch/x86_64.rs
Original file line number Diff line number Diff line change
@@ -9,6 +9,7 @@ pub unsafe fn prefix_xor(bitmask: u64) -> u64 {
}
}

#[inline(always)]
pub unsafe fn get_nonspace_bits(data: &[u8; 64]) -> u64 {
unsafe {
let lo: std::arch::x86_64::__m256i = _mm256_loadu_si256(data.as_ptr() as *const __m256i);
1 change: 1 addition & 0 deletions src/value/mod.rs
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@ mod value_trait;

pub use crate::RawValue;
pub use index::{Index, IndexMut};
pub(crate) use node::TOKEN;
pub use node::{
dom_from_slice, dom_from_slice_unchecked, dom_from_str, Array, ArrayMut, Document, Object,
ObjectMut, Value, ValueMut,
59 changes: 58 additions & 1 deletion src/value/node.rs
Original file line number Diff line number Diff line change
@@ -12,10 +12,12 @@ use crate::visitor::JsonVisitor;
use crate::{to_string, Number};
use bumpalo::Bump;
use core::mem::size_of;
use serde::de::Visitor;
use serde::ser::{Error, Serialize, SerializeMap, SerializeSeq};
use serde::Deserialize;
use std::alloc::Layout;
use std::marker::PhantomData;
use std::mem::transmute;
use std::mem::{transmute, MaybeUninit};
use std::ops;
use std::ptr::NonNull;
use std::slice::{from_raw_parts, from_raw_parts_mut};
@@ -1249,6 +1251,49 @@ impl JsonValue for Document {
}
}

use serde::de;
use std::result::Result as StdResult;
// NOTE(review): `DomKey` is not referenced in the visible code — confirm it is still needed.
struct DomKey;

/// Private newtype-struct name that `Document`'s `Deserialize` impl passes to
/// `deserialize_newtype_struct`. The crate's deserializer compares the struct
/// name against this token to route the call to `deserialize_document`.
pub(crate) const TOKEN: &str = "$sonic_rs::private::Document";

/// Serde deserialization for [`Document`].
///
/// The impl calls `deserialize_newtype_struct` with the private [`TOKEN`]
/// name. This crate's deserializer recognizes that name, builds the
/// `Document` itself, and passes the raw bytes of that value to
/// `visit_bytes`, where ownership is taken over by copying them out.
impl<'de> Deserialize<'de> for Document {
    fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error>
    where
        D: ::serde::Deserializer<'de>,
    {
        /// Visitor that rebuilds a `Document` from its in-memory bytes.
        struct DomVisitor;

        impl<'de> Visitor<'de> for DomVisitor {
            type Value = Document;

            fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                formatter.write_str("a valid json")
            }

            /// Reconstructs the `Document` whose bytes are in `dom_binary`.
            ///
            /// Returns an `invalid_length` error (instead of panicking) when
            /// the slice is not exactly `size_of::<Document>()` bytes long.
            fn visit_bytes<E>(self, dom_binary: &[u8]) -> StdResult<Self::Value, E>
            where
                E: de::Error,
            {
                if dom_binary.len() != size_of::<Document>() {
                    return Err(E::invalid_length(dom_binary.len(), &self));
                }

                let mut dom = MaybeUninit::<Document>::uninit();
                // SAFETY: the length check above guarantees the source holds
                // exactly `size_of::<Document>()` bytes, and the copy is done
                // byte-wise so the source slice needs no particular alignment.
                // All bytes of `dom` are written before `assume_init`.
                // NOTE(review): this relies on the bytes being those of a
                // `Document` whose ownership the producer has relinquished
                // (as the crate's `deserialize_document` does) — other
                // deserializers must never reach this path with arbitrary bytes.
                unsafe {
                    std::ptr::copy_nonoverlapping(
                        dom_binary.as_ptr(),
                        dom.as_mut_ptr().cast::<u8>(),
                        size_of::<Document>(),
                    );
                    Ok(dom.assume_init())
                }
            }
        }

        deserializer.deserialize_newtype_struct(TOKEN, DomVisitor)
    }
}

impl Serialize for Document {
#[inline]
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
@@ -1670,4 +1715,16 @@ mod test {
f64::MAX
);
}

/// A `Document` field inside a derived struct deserializes from nested JSON
/// and its members can be read back.
#[test]
fn test_document_serde() {
    use crate::{Deserialize, Serialize};

    // Wrapper struct whose only field is an arbitrary JSON document.
    #[derive(Serialize, Deserialize)]
    struct Person {
        any: Document,
    }

    let input = r#"{"any": {"name": "John", "age": 30}}"#;
    let parsed: Person = crate::from_str(input).unwrap();

    // The nested object must be reachable through the `Document` field.
    assert_eq!(parsed.any.get("name").as_str().unwrap(), "John");
}
}