From c897b287ec57281684764364dcb8a72fc84e35da Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Tue, 14 Apr 2026 14:42:09 -0400 Subject: [PATCH 1/2] remove metadata from `DynArray` Signed-off-by: Connor Tsui --- encodings/alp/src/alp/plugin.rs | 6 +- encodings/bytebool/src/array.rs | 3 +- encodings/fastlanes/src/bitpacking/plugin.rs | 6 +- encodings/parquet-variant/src/vtable.rs | 9 +- encodings/pco/src/lib.rs | 5 +- encodings/pco/src/{test.rs => tests.rs} | 2 +- encodings/runend/src/arrow.rs | 8 +- encodings/zstd/src/zstd_buffers.rs | 6 +- vortex-array/public-api.lock | 2 - vortex-array/src/array/erased.rs | 6 - vortex-array/src/array/mod.rs | 10 -- vortex-array/src/arrow/executor/run_end.rs | 6 +- vortex-array/src/serde.rs | 112 +------------------ vortex-array/src/session/mod.rs | 11 +- vortex-cuda/src/hybrid_dispatch/mod.rs | 3 +- 15 files changed, 42 insertions(+), 153 deletions(-) rename encodings/pco/src/{test.rs => tests.rs} (99%) diff --git a/encodings/alp/src/alp/plugin.rs b/encodings/alp/src/alp/plugin.rs index 3b2c36ebd02..2ac3ac85c54 100644 --- a/encodings/alp/src/alp/plugin.rs +++ b/encodings/alp/src/alp/plugin.rs @@ -133,7 +133,7 @@ mod tests { let array = alp_encoded.as_array(); - let metadata = array.metadata(&SESSION)?.unwrap_or_default(); + let metadata = SESSION.array_serialize(array)?.unwrap(); let children = array.children(); let buffers = array .buffers() @@ -182,7 +182,7 @@ mod tests { let array = alp_encoded.as_array(); - let metadata = array.metadata(&SESSION)?.unwrap_or_default(); + let metadata = SESSION.array_serialize(array)?.unwrap(); let children = array.children(); let buffers = array .buffers() @@ -213,7 +213,7 @@ mod tests { fn primitive_array_returns_error() { let array = PrimitiveArray::from_iter([1.0f64, 2.0, 3.0]).into_array(); - let metadata = array.metadata(&SESSION).unwrap().unwrap_or_default(); + let metadata = SESSION.array_serialize(&array).unwrap().unwrap(); let children = array.children(); let buffers = array .buffers() diff --git a/encodings/bytebool/src/array.rs b/encodings/bytebool/src/array.rs index 28205d973be..e98c5bd656d 100644 --- a/encodings/bytebool/src/array.rs +++ b/encodings/bytebool/src/array.rs @@ -352,7 +352,6 @@ impl From>> for ByteBoolData { mod tests { use vortex_array::ArrayContext; use vortex_array::IntoArray; - use vortex_array::LEGACY_SESSION; use vortex_array::assert_arrays_eq; use vortex_array::serde::SerializeOptions; use vortex_array::serde::SerializedArray; @@ -407,7 +406,7 @@ mod tests { let serialized = array .clone() .into_array() - .serialize(&ctx, &LEGACY_SESSION, &SerializeOptions::default()) + .serialize(&ctx, &session, &SerializeOptions::default()) .unwrap(); let mut concat = ByteBufferMut::empty(); diff --git a/encodings/fastlanes/src/bitpacking/plugin.rs b/encodings/fastlanes/src/bitpacking/plugin.rs index fec101ff895..49511a10748 100644 --- a/encodings/fastlanes/src/bitpacking/plugin.rs +++ b/encodings/fastlanes/src/bitpacking/plugin.rs @@ -133,7 +133,7 @@ mod tests { let array = bitpacked.as_array(); - let metadata = array.metadata(&SESSION)?.unwrap_or_default(); + let metadata = SESSION.array_serialize(array)?.unwrap(); let children = array.children(); let buffers = array .buffers() @@ -182,7 +182,7 @@ mod tests { let array = bitpacked.as_array(); - let metadata = array.metadata(&SESSION)?.unwrap_or_default(); + let metadata = SESSION.array_serialize(array)?.unwrap(); let children = array.children(); let buffers = array .buffers() @@ -212,7 +212,7 @@ mod tests { fn primitive_array_returns_error() -> VortexResult<()> { let array = PrimitiveArray::from_iter([1i32, 2, 3]).into_array(); - let metadata = array.metadata(&SESSION)?.unwrap_or_default(); + let metadata = SESSION.array_serialize(&array)?.unwrap(); let children = array.children(); let buffers = array .buffers() diff --git a/encodings/parquet-variant/src/vtable.rs b/encodings/parquet-variant/src/vtable.rs index 7dc3b3c90b2..75ea8036c9a 100644 --- a/encodings/parquet-variant/src/vtable.rs +++ b/encodings/parquet-variant/src/vtable.rs @@ -244,13 +244,13 @@ mod tests { use vortex_array::IntoArray; use vortex_array::Precision; use vortex_array::arrays::VarBinViewArray; - use vortex_array::arrays::Variant; use vortex_array::arrays::VariantArray; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; use vortex_array::serde::SerializeOptions; use vortex_array::serde::SerializedArray; + use vortex_array::session::ArraySession; use vortex_array::session::ArraySessionExt; use vortex_array::validity::Validity; use vortex_buffer::BitBuffer; @@ -261,11 +261,14 @@ mod tests { use crate::ParquetVariant; use crate::array::ParquetVariantArrayExt; + fn roundtrip(array: ArrayRef) -> ArrayRef { let dtype = array.dtype().clone(); let len = array.len(); - let session = VortexSession::empty().with::(); + let session = VortexSession::empty().with::(); + session.arrays().register(ParquetVariant); + let ctx = ArrayContext::empty(); let serialized = array .serialize(&ctx, &session, &SerializeOptions::default()) @@ -276,8 +279,6 @@ mod tests { concat.extend_from_slice(buf.as_ref()); } let concat = concat.freeze(); - session.arrays().register(ParquetVariant); - session.arrays().register(Variant); let parts = SerializedArray::try_from(concat).unwrap(); parts diff --git a/encodings/pco/src/lib.rs b/encodings/pco/src/lib.rs index 924322c2a7e..fcf9a9397fb 100644 --- a/encodings/pco/src/lib.rs +++ b/encodings/pco/src/lib.rs @@ -5,8 +5,6 @@ mod array; mod compute; mod rules; mod slice; -#[cfg(test)] -mod test; pub use array::*; @@ -35,3 +33,6 @@ pub struct PcoMetadata { #[prost(message, repeated, tag = "2")] pub chunks: Vec, } + +#[cfg(test)] +mod tests; diff --git a/encodings/pco/src/test.rs b/encodings/pco/src/tests.rs similarity index 99% rename from encodings/pco/src/test.rs rename to encodings/pco/src/tests.rs index f674fb7d1bb..d694a0efdbe 100644 --- a/encodings/pco/src/test.rs +++ b/encodings/pco/src/tests.rs @@ -184,7 +184,7 @@ fn test_serde() -> VortexResult<()> { let bytes = pco .serialize( &context, - &LEGACY_SESSION, + &SESSION, &SerializeOptions { offset: 0, include_padding: true, diff --git a/encodings/runend/src/arrow.rs b/encodings/runend/src/arrow.rs index 564c1d06cb2..206a014fa7f 100644 --- a/encodings/runend/src/arrow.rs +++ b/encodings/runend/src/arrow.rs @@ -90,6 +90,7 @@ mod tests { use vortex_array::search_sorted::SearchSorted; use vortex_array::search_sorted::SearchSortedSide; use vortex_array::session::ArraySession; + use vortex_array::session::ArraySessionExt; use vortex_array::validity::Validity; use vortex_buffer::Buffer; use vortex_buffer::buffer; @@ -99,8 +100,11 @@ mod tests { use crate::RunEnd; use crate::ops::find_slice_end_index; - static SESSION: LazyLock = - LazyLock::new(|| VortexSession::empty().with::()); + static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::empty().with::(); + session.arrays().register(RunEnd); + session + }); fn decode_run_array( array: &RunArray, diff --git a/encodings/zstd/src/zstd_buffers.rs b/encodings/zstd/src/zstd_buffers.rs index ffeb70210ae..5d00fd5a6a4 100644 --- a/encodings/zstd/src/zstd_buffers.rs +++ b/encodings/zstd/src/zstd_buffers.rs @@ -59,9 +59,9 @@ impl ZstdBuffers { session: &VortexSession, ) -> VortexResult { let encoding_id = array.encoding_id(); - let metadata = array - .metadata(session)? - .ok_or_else(|| vortex_err!("Array does not support serialization"))?; + let metadata = session + .array_serialize(array)? + .ok_or_else(|| vortex_err!("[ZstdBuffers]: Array does not support serialization"))?; let buffer_handles = array.buffer_handles(); let children = array.children(); diff --git a/vortex-array/public-api.lock b/vortex-array/public-api.lock index 8f26230791a..25736c6df84 100644 --- a/vortex-array/public-api.lock +++ b/vortex-array/public-api.lock @@ -22024,8 +22024,6 @@ pub fn vortex_array::ArrayRef::is_valid(&self, index: usize) -> vortex_error::Vo pub fn vortex_array::ArrayRef::len(&self) -> usize -pub fn vortex_array::ArrayRef::metadata(&self, session: &vortex_session::VortexSession) -> vortex_error::VortexResult>> - pub fn vortex_array::ArrayRef::metadata_fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result pub fn vortex_array::ArrayRef::named_buffers(&self) -> alloc::vec::Vec<(alloc::string::String, vortex_array::buffer::BufferHandle)> diff --git a/vortex-array/src/array/erased.rs b/vortex-array/src/array/erased.rs index cfefe316ca4..f951f9423ae 100644 --- a/vortex-array/src/array/erased.rs +++ b/vortex-array/src/array/erased.rs @@ -15,7 +15,6 @@ use vortex_error::vortex_ensure; use vortex_error::vortex_err; use vortex_error::vortex_panic; use vortex_mask::Mask; -use vortex_session::VortexSession; use crate::AnyCanonical; use crate::Array; @@ -561,11 +560,6 @@ impl ArrayRef { self.0.slot_name(self, idx) } - /// Returns the serialized metadata of the array. - pub fn metadata(&self, session: &VortexSession) -> VortexResult>> { - self.0.metadata(self, session) - } - /// Formats a human-readable metadata description. pub fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { self.0.metadata_fmt(f) diff --git a/vortex-array/src/array/mod.rs b/vortex-array/src/array/mod.rs index fae2d486ab6..eada9e79a0a 100644 --- a/vortex-array/src/array/mod.rs +++ b/vortex-array/src/array/mod.rs @@ -13,7 +13,6 @@ use vortex_error::VortexResult; use vortex_error::vortex_ensure; use vortex_error::vortex_err; use vortex_error::vortex_panic; -use vortex_session::VortexSession; use vortex_session::registry::Id; use crate::ExecutionCtx; @@ -132,10 +131,6 @@ pub(crate) trait DynArray: 'static + private::Sealed + Send + Sync + Debug { /// Returns the name of the slot at the given index. fn slot_name(&self, this: &ArrayRef, idx: usize) -> String; - /// Returns the serialized metadata of the array, or `None` if the array does not - /// support serialization. - fn metadata(&self, this: &ArrayRef, session: &VortexSession) -> VortexResult>>; - /// Formats a human-readable metadata description. fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result; @@ -341,11 +336,6 @@ impl DynArray for ArrayInner { V::slot_name(view, idx) } - fn metadata(&self, this: &ArrayRef, session: &VortexSession) -> VortexResult>> { - let view = unsafe { ArrayView::new_unchecked(this, &self.data) }; - V::serialize(view, session) - } - fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { std::fmt::Display::fmt(&self.data, f) } diff --git a/vortex-array/src/arrow/executor/run_end.rs b/vortex-array/src/arrow/executor/run_end.rs index 69aca62a994..b9424bc866b 100644 --- a/vortex-array/src/arrow/executor/run_end.rs +++ b/vortex-array/src/arrow/executor/run_end.rs @@ -24,6 +24,7 @@ use crate::IntoArray; use crate::arrays::Constant; use crate::arrays::ConstantArray; use crate::arrow::ArrowArrayExecutor; +use crate::session::ArraySessionExt; /// The encoding ID used by `vortex-runend`. We match on this string to avoid a crate dependency. const VORTEX_RUNEND_ID: &str = "vortex.runend"; @@ -79,8 +80,9 @@ fn run_end_to_arrow( ctx: &mut ExecutionCtx, ) -> VortexResult { let length = array.len(); - let metadata_bytes = array - .metadata(ctx.session())? + let metadata_bytes = ctx + .session() + .array_serialize(&array)? .ok_or_else(|| vortex_err!("RunEndArray missing metadata"))?; let metadata = RunEndMetadata::decode(&*metadata_bytes) .map_err(|e| vortex_err!("Failed to decode RunEndMetadata: {e}"))?; diff --git a/vortex-array/src/serde.rs b/vortex-array/src/serde.rs index 946a40518bc..c560aa9fb7e 100644 --- a/vortex-array/src/serde.rs +++ b/vortex-array/src/serde.rs @@ -170,16 +170,6 @@ impl<'a> ArrayNodeFlatBuffer<'a> { session: &'a VortexSession, array: &'a ArrayRef, ) -> VortexResult { - // Depth-first traversal of the array to ensure it supports serialization. - // FIXME(ngates): this serializes the metadata and throws it away! - for child in array.depth_first_traversal() { - if child.metadata(session)?.is_none() { - vortex_bail!( - "Array {} does not support serialization", - child.encoding_id() - ); - } - } let n_buffers_recursive = array.nbuffers_recursive(); if n_buffers_recursive > u16::MAX as usize { vortex_bail!( @@ -210,13 +200,13 @@ impl<'a> ArrayNodeFlatBuffer<'a> { ) })?; - let metadata = self.array.metadata(self.session)?.ok_or_else(|| { + let metadata_bytes = self.session.array_serialize(self.array)?.ok_or_else(|| { vortex_err!( "Array {} does not support serialization", self.array.encoding_id() ) })?; - let metadata = Some(fbb.create_vector(metadata.as_slice())); + let metadata = Some(fbb.create_vector(metadata_bytes.as_slice())); // Assign buffer indices for all child arrays. let nbuffers = u16::try_from(self.array.nbuffers()) @@ -701,101 +691,3 @@ impl TryFrom for SerializedArray { Self::try_from(value.try_to_host_sync()?) } } - -#[cfg(test)] -mod tests { - use std::sync::LazyLock; - - use flatbuffers::FlatBufferBuilder; - use vortex_session::VortexSession; - use vortex_session::registry::ReadContext; - - use super::SerializeOptions; - use super::SerializedArray; - use crate::ArrayContext; - use crate::array::ArrayId; - use crate::dtype::DType; - use crate::dtype::Nullability; - use crate::flatbuffers as fba; - use crate::session::ArraySession; - - static SESSION: LazyLock = LazyLock::new(VortexSession::empty); - - #[test] - fn unknown_array_encoding_allow_unknown() { - let mut fbb = FlatBufferBuilder::new(); - - let child_metadata = fbb.create_vector(&[9u8]); - let child = fba::ArrayNode::create( - &mut fbb, - &fba::ArrayNodeArgs { - encoding: 1, - metadata: Some(child_metadata), - children: None, - buffers: None, - stats: None, - }, - ); - - let children = fbb.create_vector(&[child]); - let metadata = fbb.create_vector(&[1u8, 2, 3]); - let root = fba::ArrayNode::create( - &mut fbb, - &fba::ArrayNodeArgs { - encoding: 0, - metadata: Some(metadata), - children: Some(children), - buffers: None, - stats: None, - }, - ); - let array = fba::Array::create( - &mut fbb, - &fba::ArrayArgs { - root: Some(root), - buffers: None, - }, - ); - fbb.finish_minimal(array); - let (buf, start) = fbb.collapse(); - let tree = vortex_buffer::ByteBuffer::from(buf).slice(start..); - - let ser = SerializedArray::from_array_tree(tree).unwrap(); - let ctx = ReadContext::new([ - ArrayId::new("vortex.test.foreign_array"), - ArrayId::new("vortex.test.foreign_child"), - ]); - let session = VortexSession::empty() - .with::() - .allow_unknown(); - - let decoded = ser - .decode(&DType::Variant(Nullability::Nullable), 5, &ctx, &session) - .unwrap(); - assert_eq!(decoded.encoding_id().as_ref(), "vortex.test.foreign_array"); - assert_eq!(decoded.nchildren(), 1); - assert_eq!( - decoded.nth_child(0).unwrap().encoding_id().as_ref(), - "vortex.test.foreign_child" - ); - assert_eq!(decoded.metadata(&SESSION).unwrap().unwrap(), vec![1, 2, 3]); - assert_eq!( - decoded - .nth_child(0) - .unwrap() - .metadata(&SESSION) - .unwrap() - .unwrap(), - vec![9] - ); - - let serialized = decoded - .serialize( - &ArrayContext::default(), - &SESSION, - &SerializeOptions::default(), - ) - .unwrap(); - assert!(!serialized.is_empty()); - } -} diff --git a/vortex-array/src/session/mod.rs b/vortex-array/src/session/mod.rs index 62ed46cb34c..034a369694f 100644 --- a/vortex-array/src/session/mod.rs +++ b/vortex-array/src/session/mod.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use vortex_error::VortexResult; +use vortex_error::vortex_bail; use vortex_session::Ref; use vortex_session::SessionExt; use vortex_session::registry::Registry; @@ -15,6 +16,7 @@ use crate::arrays::Bool; use crate::arrays::Chunked; use crate::arrays::Constant; use crate::arrays::Decimal; +use crate::arrays::Dict; use crate::arrays::Extension; use crate::arrays::FixedSizeList; use crate::arrays::List; @@ -26,6 +28,7 @@ use crate::arrays::Primitive; use crate::arrays::Struct; use crate::arrays::VarBin; use crate::arrays::VarBinView; +use crate::arrays::Variant; pub type ArrayRegistry = Registry; @@ -68,11 +71,13 @@ impl Default for ArraySession { this.register(ListView); this.register(FixedSizeList); this.register(Struct); + this.register(Variant); this.register(Extension); // Register the utility encodings. this.register(Chunked); this.register(Constant); + this.register(Dict); this.register(List); this.register(Masked); this.register(Patched); @@ -92,8 +97,12 @@ pub trait ArraySessionExt: SessionExt { /// Serialize an array using a plugin from the registry. fn array_serialize(&self, array: &ArrayRef) -> VortexResult>> { let Some(plugin) = self.arrays().registry.find(&array.encoding_id()) else { - return Ok(None); + vortex_bail!( + "Array {} is not registered for serializations", + array.encoding_id() + ); }; + plugin.serialize(array, &self.session()) } } diff --git a/vortex-cuda/src/hybrid_dispatch/mod.rs b/vortex-cuda/src/hybrid_dispatch/mod.rs index 0845a46fdc3..a5ca2560d43 100644 --- a/vortex-cuda/src/hybrid_dispatch/mod.rs +++ b/vortex-cuda/src/hybrid_dispatch/mod.rs @@ -257,8 +257,7 @@ mod tests { 0u32.into(), ) .vortex_expect("for"); - let vals = ZstdBuffers::compress(&vals.into_array(), 3, &VortexSession::empty()) - .vortex_expect("zstd"); + let vals = ZstdBuffers::compress(&vals.into_array(), 3, &session).vortex_expect("zstd"); // codes = FoR(BitPacked) let codes = PrimitiveArray::new( From 26f742602604d71530a576c1f82d56a284d394ba Mon Sep 17 00:00:00 2001 From: Connor Tsui Date: Tue, 14 Apr 2026 15:24:42 -0400 Subject: [PATCH 2/2] update file sizes Signed-off-by: Connor Tsui --- vortex-python/src/io.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vortex-python/src/io.rs b/vortex-python/src/io.rs index edf1061188d..1d1cb63cce7 100644 --- a/vortex-python/src/io.rs +++ b/vortex-python/src/io.rs @@ -280,7 +280,7 @@ impl PyVortexWriteOptions { /// >>> vx.io.VortexWriteOptions.default().write(sprl, "chonky.vortex") /// >>> import os /// >>> os.path.getsize('chonky.vortex') - /// 216004 + /// 216036 /// ``` /// /// Wow, Vortex manages to use about two bytes per integer! So advanced. So tiny. @@ -292,7 +292,7 @@ impl PyVortexWriteOptions { /// ```python /// >>> vx.io.VortexWriteOptions.compact().write(sprl, "tiny.vortex") /// >>> os.path.getsize('tiny.vortex') - /// 55120 + /// 55152 /// ``` /// /// Random numbers are not (usually) composed of random bytes!