From 41a4c8f4d5c23bb220074889e10285b0dc12b838 Mon Sep 17 00:00:00 2001 From: Joe Neeman Date: Thu, 18 Jul 2024 14:57:48 +0700 Subject: [PATCH] Evaluate yaml-rust2 --- Cargo.lock | 43 +++++++ Cargo.toml | 1 + core/Cargo.toml | 1 + core/src/cache.rs | 36 +----- core/src/lib.rs | 1 + core/src/yaml.rs | 310 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 358 insertions(+), 34 deletions(-) create mode 100644 core/src/yaml.rs diff --git a/Cargo.lock b/Cargo.lock index 6a2d92545e..4cd7256138 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -39,6 +39,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" + [[package]] name = "anes" version = "0.1.6" @@ -108,6 +114,12 @@ version = "1.0.80" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" +[[package]] +name = "arraydeque" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236" + [[package]] name = "arrayvec" version = "0.5.2" @@ -909,6 +921,15 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "encoding_rs" +version = "0.8.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" +dependencies = [ + "cfg-if", +] + [[package]] name = "endian-type" version = "0.1.2" @@ -1214,6 +1235,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" dependencies = [ "ahash", + "allocator-api2", +] + +[[package]] +name = "hashlink" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +dependencies = [ + "hashbrown 0.14.3", ] [[package]] @@ -1802,6 +1833,7 @@ dependencies = [ "unicode-segmentation", "void", "wasm-bindgen", + "yaml-rust2", ] [[package]] @@ -3607,6 +3639,17 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "yaml-rust2" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8902160c4e6f2fb145dbe9d6760a75e3c9522d8bf796ed7047c85919ac7115f8" +dependencies = [ + "arraydeque", + "encoding_rs", + "hashlink", +] + [[package]] name = "yansi" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index edf8872f80..0fcc19b304 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,7 @@ toml = "0.8" typed-arena = "2.0.2" unicode-segmentation = "1.10.1" void = "1" +yaml-rust2 = "0.8.1" metrics = "0.21" metrics-util = "0.15" diff --git a/core/Cargo.toml b/core/Cargo.toml index de6963701c..a409c56720 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -54,6 +54,7 @@ sha2.workspace = true md-5.workspace = true unicode-segmentation.workspace = true indoc.workspace = true +yaml-rust2.workspace = true termimad = { workspace = true, optional = true } ansi_term = { workspace = true, optional = true } diff --git a/core/src/cache.rs b/core/src/cache.rs index e5f962f3ea..58f925db90 100644 --- a/core/src/cache.rs +++ b/core/src/cache.rs @@ -540,40 +540,8 @@ impl Cache { .map(|t| (attach_pos(t), ParseErrors::default())) .map_err(|err| ParseError::from_serde_json(err, file_id, &self.files)), InputFormat::Yaml => { - // YAML files can contain multiple documents. If there is only - // one we transparently deserialize it. If there are multiple, - // we deserialize the file as an array. - let de = serde_yaml::Deserializer::from_str(self.files.source(file_id)); - let mut terms = de - .map(|de| { - RichTerm::deserialize(de) - .map(attach_pos) - .map_err(|err| (ParseError::from_serde_yaml(err, file_id))) - }) - .collect::, _>>()?; - - if terms.is_empty() { - unreachable!( - "serde always produces at least one document, \ - the empty string turns into `null`" - ) - } else if terms.len() == 1 { - Ok(( - terms.pop().expect("we just checked the length"), - ParseErrors::default(), - )) - } else { - Ok(( - attach_pos( - Term::Array( - Array::new(Rc::from(terms.into_boxed_slice())), - Default::default(), - ) - .into(), - ), - ParseErrors::default(), - )) - } + let rt = crate::yaml::load(self.files.source(file_id), file_id).unwrap(); + Ok((rt, ParseErrors::default())) } InputFormat::Toml => { crate::serialize::toml_deser::from_str(self.files.source(file_id), file_id) diff --git a/core/src/lib.rs b/core/src/lib.rs index e1785af775..c21c69a72b 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -20,6 +20,7 @@ pub mod term; pub mod transform; pub mod typ; pub mod typecheck; +pub mod yaml; pub(crate) mod metrics; diff --git a/core/src/yaml.rs b/core/src/yaml.rs new file mode 100644 index 0000000000..387c99e63c --- /dev/null +++ b/core/src/yaml.rs @@ -0,0 +1,310 @@ +use std::collections::BTreeMap; +use std::num::NonZero; +use std::rc::Rc; + +use codespan::{ByteIndex, ByteOffset, FileId}; +use indexmap::IndexMap; +use yaml_rust2::parser::{Event, MarkedEventReceiver, Tag}; +use yaml_rust2::scanner::{Marker, ScanError, TScalarStyle}; + +use crate::identifier::LocIdent; +use crate::position::RawSpan; +use crate::term::array::{Array, ArrayAttrs}; +use crate::term::record::{Field, RecordData}; +use crate::term::{Number, RichTerm, Term}; + +// parse f64 as Core schema +// See: https://github.com/chyh1990/yaml-rust/issues/51 +// FIXME: nickel doesn't support inf/nan anyway... +fn parse_f64(v: &str) -> Option { + match v { + ".inf" | ".Inf" | ".INF" | "+.inf" | "+.Inf" | "+.INF" => Some(f64::INFINITY), + "-.inf" | "-.Inf" | "-.INF" => Some(f64::NEG_INFINITY), + ".nan" | "NaN" | ".NAN" => Some(f64::NAN), + _ => v.parse::().ok(), + } +} + +// yaml_rust2 uses usizes for optional anchor ids, with zero meaning "no id". +// We represent this using Option> instead. +type AnchorId = NonZero; + +enum Entry { + Array { + start: ByteIndex, + elems: Vec, + }, + Map { + start: ByteIndex, + // We only support string keys. + fields: IndexMap, + last_key: Option, + }, + Scalar(RichTerm), +} + +impl Entry { + fn new_array(start: ByteIndex) -> Self { + Self::Array { + start, + elems: Vec::new(), + } + } + + fn new_map(start: ByteIndex) -> Self { + Self::Map { + start, + fields: IndexMap::new(), + last_key: None, + } + } + + fn finish(self, src_id: FileId, end: ByteIndex) -> Result { + match self { + Entry::Array { start, elems } => { + let term = Term::Array( + Array::new(Rc::from(elems.into_boxed_slice())), + ArrayAttrs::default(), + ); + Ok(RichTerm::from(term).with_pos(RawSpan { src_id, start, end }.into())) + } + Entry::Map { + start, + fields, + last_key, + } => { + if last_key.is_some() { + panic!(); + } + let term = Term::Record(RecordData { + fields, + ..Default::default() + }); + Ok(RichTerm::from(term).with_pos(RawSpan { src_id, start, end }.into())) + } + Entry::Scalar(rt) => Ok(rt), + } + } +} + +struct YamlLoader { + docs: Vec, + src_id: FileId, + // states + doc_stack: Vec<(Entry, Option)>, + anchor_map: BTreeMap, + /// An error, if one was encountered. + error: Option, +} + +pub fn load(input: &str, src_id: FileId) -> Result { + let mut loader = YamlLoader { + docs: Vec::new(), + src_id, + doc_stack: Vec::new(), + anchor_map: BTreeMap::new(), + error: None, + }; + + let mut parser = yaml_rust2::parser::Parser::new_from_str(input); + parser.load(&mut loader, true).map_err(LoadError::Scan)?; + + // YAML files can contain multiple documents. If there is only + // one we transparently deserialize it. If there are multiple, + // we deserialize the file as an array. + if loader.docs.is_empty() { + Ok(RichTerm::from(Term::Null).with_pos( + RawSpan { + src_id, + start: 0.into(), + end: 0.into(), + } + .into(), + )) + } else if loader.docs.len() == 1 { + Ok(loader.docs.pop().unwrap()) + } else { + let term = Term::Array( + Array::new(Rc::from(loader.docs.into_boxed_slice())), + Default::default(), + ); + Ok(term.into()) // TODO: attach a position + } +} + +impl MarkedEventReceiver for YamlLoader { + fn on_event(&mut self, ev: Event, mark: Marker) { + if self.error.is_some() { + return; + } + if let Err(e) = self.on_event_impl(ev, mark) { + self.error = Some(e); + } + } +} + +/// An error that happened when loading a YAML document. +#[derive(Debug)] +pub enum LoadError { + /// An I/O error. + IO(std::io::Error), + /// An error within the scanner. This indicates a malformed YAML input. + Scan(ScanError), + /// A decoding error (e.g.: Invalid UTF-8). + Decode(std::borrow::Cow<'static, str>), +} + +impl From for LoadError { + fn from(error: std::io::Error) -> Self { + LoadError::IO(error) + } +} + +impl YamlLoader { + fn key_slot(&mut self) -> Option<&mut Option> { + match self.doc_stack.last_mut() { + Some(( + Entry::Map { + last_key: x @ None, .. + }, + _, + )) => Some(x), + _ => None, + } + } + + fn on_event_impl(&mut self, ev: Event, mark: Marker) -> Result<(), ScanError> { + let byte = ByteIndex::from(mark.index() as u32); + match ev { + Event::DocumentStart | Event::Nothing | Event::StreamStart | Event::StreamEnd => { + // do nothing + } + Event::DocumentEnd => { + assert_eq!(self.doc_stack.len(), 1); + let doc = self.doc_stack.pop().unwrap().0; + match doc { + Entry::Scalar(s) => self.docs.push(s), + _ => panic!(), + } + } + Event::SequenceStart(aid, _) => { + self.doc_stack + .push((Entry::new_array(byte), AnchorId::new(aid))); + } + Event::MappingStart(aid, _) => { + self.doc_stack + .push((Entry::new_map(byte), AnchorId::new(aid))); + } + Event::SequenceEnd | Event::MappingEnd => { + let (node, aid) = self.doc_stack.pop().unwrap(); + self.insert_new_node(node.finish(self.src_id, byte)?, aid, mark)?; + } + Event::Scalar(v, style, aid, tag) => { + let aid = AnchorId::new(aid); + let end = byte + ByteOffset::from(v.len() as i64); + let span = RawSpan { + src_id: self.src_id, + start: byte, + end, + }; + if let Some(slot) = self.key_slot() { + *slot = Some(LocIdent::new(v).with_pos(span.into())); + } else { + let term = if style != TScalarStyle::Plain { + Term::Str(v.into()) + } else if let Some(Tag { + ref handle, + ref suffix, + }) = tag + { + if handle == "tag:yaml.org,2002:" { + match suffix.as_ref() { + "bool" => match v.parse::() { + Err(_) => todo!(), + Ok(v) => Term::Bool(v), + }, + "int" => match v.parse::() { + Err(_) => todo!(), + Ok(v) => Term::Num(v.into()), + }, + "float" => match parse_f64(&v) { + Some(n) => { + Term::Num(Number::try_from_float_simplest(n).unwrap()) + } // FIXME + None => todo!(), + }, + "null" => match v.as_ref() { + "~" | "null" => Term::Null, + _ => todo!(), + }, + _ => Term::Str(v.into()), + } + } else { + Term::Str(v.into()) + } + } else if let Ok(b) = v.parse::() { + Term::Bool(b) + } else if let Ok(i) = v.parse::() { + Term::Num(i.into()) + } else if let Some(n) = parse_f64(&v) { + Term::Num(Number::try_from_float_simplest(n).unwrap()) + } else { + // Datatype is not specified, or unrecognized + Term::Str(v.into()) + }; + + let rt = RichTerm::from(term).with_pos( + RawSpan { + src_id: self.src_id, + start: byte, + end, + } + .into(), + ); + + self.insert_new_node(rt, aid, mark)?; + } + } + Event::Alias(id) => { + let n = match AnchorId::new(id).and_then(|id| self.anchor_map.get(&id)) { + Some(v) => v.clone(), + None => todo!(), + }; + self.insert_new_node(n, None, mark)?; + } + } + Ok(()) + } + + fn insert_new_node( + &mut self, + node: RichTerm, + aid: Option, + mark: Marker, + ) -> Result<(), ScanError> { + // valid anchor id starts from 1 + if let Some(aid) = aid { + self.anchor_map.insert(aid, node.clone()); + } + if self.doc_stack.is_empty() { + self.doc_stack.push((Entry::Scalar(node), None)); + } else { + let parent = self.doc_stack.last_mut().unwrap(); + match &mut parent.0 { + Entry::Array { elems, .. } => elems.push(node), + Entry::Map { + fields, last_key, .. + } => { + let Some(key) = last_key.take() else { + panic!(); + }; + if fields.insert(key, node.into()).is_some() { + panic!("duplicated value"); + } + } + Entry::Scalar(_) => unreachable!(), + } + } + Ok(()) + } +}