Skip to content

Variant Support for Arrow and Parquet [DRAFT] #7404

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 21 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
1657aa4
schema: add initial Variant type as a Canonical Extension Type
Feb 26, 2025
7b098f1
parquet: initial support for LogicalType and ConvertedType for Variant
Feb 26, 2025
2b659c6
schema: enforce required value field in Variant
Feb 26, 2025
7796a1e
parquet: add support for variant extension type conversion
Mar 10, 2025
b47df75
turn variant into primitive type, add roundtrip test
PinkCrow007 Mar 20, 2025
0220e97
test variant roundtrip with multiple columns RecordBatch; refine variant
PinkCrow007 Mar 25, 2025
36a96ab
update variant roundtrip test
PinkCrow007 Mar 25, 2025
a8ba629
add VariantArray and VariantBuilder draft
PinkCrow007 Mar 27, 2025
0eaa7f0
implement VariantArrayReader and encode_variant_array for variant rou…
PinkCrow007 Mar 27, 2025
01a2e70
update comment
PinkCrow007 Mar 27, 2025
f245655
implement get_metadata_length
PinkCrow007 Apr 1, 2025
d81959f
modify comments
PinkCrow007 Apr 3, 2025
816d189
create arrow-variant; implement variant metadata encoding
PinkCrow007 Apr 3, 2025
83d8048
implement sorted_string to metadata; encode value draft
PinkCrow007 Apr 7, 2025
d8d6dae
initial variant encoder and decoder
PinkCrow007 Apr 7, 2025
8de7de5
add json_variant_parquet_roundtrip test; refine variant <-> json
PinkCrow007 Apr 10, 2025
1313697
fix bug
PinkCrow007 Apr 11, 2025
2096395
Merge remote-tracking branch 'upstream/main' into struct_group_type
PinkCrow007 Apr 16, 2025
c1b6bf2
Make Variant an ExtensionType over Struct in Arrow;
PinkCrow007 Apr 17, 2025
626765e
Variant ExtensionType over Struct, GroupType in Parquet
PinkCrow007 Apr 21, 2025
f93c238
minor fix
PinkCrow007 Apr 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ members = [
"arrow-schema",
"arrow-select",
"arrow-string",
"arrow-variant",
"parquet",
"parquet_derive",
"parquet_derive_test",
Expand Down
1 change: 1 addition & 0 deletions arrow-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ all-features = true
[features]
ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
force_validate = []
canonical_extension_types = ["arrow-schema/canonical_extension_types"]

[dev-dependencies]
rand = { version = "0.9", default-features = false, features = ["std", "std_rng", "thread_rng"] }
Expand Down
2 changes: 1 addition & 1 deletion arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1271,4 +1271,4 @@ mod tests {
let expected: Int32Array = vec![1, 2, 3].into_iter().map(Some).collect();
assert_eq!(array, expected);
}
}
}
4 changes: 2 additions & 2 deletions arrow-schema/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ serde = { version = "1.0", default-features = false, features = [
], optional = true }
bitflags = { version = "2.0.0", default-features = false, optional = true }
serde_json = { version = "1.0", optional = true }

base64 = { version = "0.21", optional = true }
[features]
canonical_extension_types = ["dep:serde", "dep:serde_json"]
canonical_extension_types = ["dep:serde", "dep:serde_json", "dep:base64"]
# Enable ffi support
ffi = ["bitflags"]
serde = ["dep:serde"]
Expand Down
12 changes: 12 additions & 0 deletions arrow-schema/src/extension/canonical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ mod uuid;
pub use uuid::Uuid;
mod variable_shape_tensor;
pub use variable_shape_tensor::{VariableShapeTensor, VariableShapeTensorMetadata};
mod variant;
pub use variant::Variant;

use crate::{ArrowError, Field};

Expand Down Expand Up @@ -77,6 +79,9 @@ pub enum CanonicalExtensionType {
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
Bool8(Bool8),

/// The extension type for `Variant`.
Variant(Variant),
}

impl TryFrom<&Field> for CanonicalExtensionType {
Expand All @@ -93,6 +98,7 @@ impl TryFrom<&Field> for CanonicalExtensionType {
Uuid::NAME => value.try_extension_type::<Uuid>().map(Into::into),
Opaque::NAME => value.try_extension_type::<Opaque>().map(Into::into),
Bool8::NAME => value.try_extension_type::<Bool8>().map(Into::into),
Variant::NAME => value.try_extension_type::<Variant>().map(Into::into),
_ => Err(ArrowError::InvalidArgumentError(format!("Unsupported canonical extension type: {name}"))),
},
// Name missing the expected prefix
Expand Down Expand Up @@ -140,3 +146,9 @@ impl From<Bool8> for CanonicalExtensionType {
CanonicalExtensionType::Bool8(value)
}
}

impl From<Variant> for CanonicalExtensionType {
fn from(value: Variant) -> Self {
CanonicalExtensionType::Variant(value)
}
}
238 changes: 238 additions & 0 deletions arrow-schema/src/extension/canonical/variant.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Variant
//!
//! <https://arrow.apache.org/docs/format/CanonicalExtensions.html#variant>

use base64::engine::Engine as _;
use base64::engine::general_purpose::STANDARD;
use crate::{extension::ExtensionType, ArrowError, DataType};

/// The extension type for `Variant`.
///
/// Extension name: `arrow.variant`.
///
/// The storage type of this extension is **Struct containing two binary fields**:
/// - metadata: Binary field containing the variant metadata
/// - value: Binary field containing the serialized variant data
///
/// A Variant is a flexible structure that can store **Primitives, Arrays, or Objects**.
///
/// Both metadata and value fields are required.
///
/// <https://arrow.apache.org/docs/format/CanonicalExtensions.html#variant>
#[derive(Debug, Clone, PartialEq)]
pub struct Variant {
metadata: Vec<u8>, // Required binary metadata
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One concern I have with this approach is that it will require two allocations (and memory copies) to read a Variant value

I am hoping we can use Rust's borrow checker to do this with pointers (well, slices in rust) and no copying -- something like

pub struct Variant<'a> {
    metadata: &'a [u8], // Required binary metadata
    value: &'a [u8],
}

value: Vec<u8>, // Required binary value
}

impl Variant {
/// Creates a new `Variant` with metadata and value.
pub fn new(metadata: Vec<u8>, value: Vec<u8>) -> Self {
Self { metadata, value }
}

/// Creates a Variant representing an empty structure.
pub fn empty() -> Result<Self, ArrowError> {
Err(ArrowError::InvalidArgumentError(
"Variant cannot be empty because metadata and value are required".to_owned(),
))
}

/// Returns the metadata as a byte array.
pub fn metadata(&self) -> &[u8] {
&self.metadata
}

/// Returns the value as an byte array.
pub fn value(&self) -> &[u8] {
&self.value
}

/// Sets the value of the Variant.
pub fn set_value(mut self, value: Vec<u8>) -> Self {
self.value = value;
self
}
}

impl ExtensionType for Variant {
const NAME: &'static str = "arrow.variant";

type Metadata = Vec<u8>;

fn metadata(&self) -> &Self::Metadata {
&self.metadata
}

fn serialize_metadata(&self) -> Option<String> {
Some(STANDARD.encode(&self.metadata)) // Encode metadata as STANDARD string
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding of the extension metadata for a variant column isn't the same as the metadata for each variant value -- I probably don't fully understand the code but I think the extension type metadata is stored for each field where the variant's metadata is stored for each row (basically the word metadata is overloaded 😢 )

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you're right. The ExtensionType metadata and the Variant metadata are not related. Right now the ExtensionType metadata isn’t storing anything meaningful.

}

fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
match metadata {
Some(meta) => STANDARD.decode(meta)
.map_err(|_| ArrowError::InvalidArgumentError("Invalid Variant metadata".to_owned())),
None => Ok(Vec::new()), // Default to empty metadata if None
}
}

fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
match data_type {
DataType::Struct(fields) => {
if fields.len() != 2 {
return Err(ArrowError::InvalidArgumentError(
"Variant struct must have exactly two fields".to_owned(),
));
}

let metadata_field = fields.iter()
.find(|f| f.name() == "metadata")
.ok_or_else(|| ArrowError::InvalidArgumentError(
"Variant struct must have a field named 'metadata'".to_owned(),
))?;

let value_field = fields.iter()
.find(|f| f.name() == "value")
.ok_or_else(|| ArrowError::InvalidArgumentError(
"Variant struct must have a field named 'value'".to_owned(),
))?;

match (metadata_field.data_type(), value_field.data_type()) {
(DataType::Binary, DataType::Binary) |
(DataType::LargeBinary, DataType::LargeBinary) => Ok(()),
_ => Err(ArrowError::InvalidArgumentError(
"Variant struct fields must both be Binary or LargeBinary".to_owned(),
)),
}
}
_ => Err(ArrowError::InvalidArgumentError(format!(
"Variant data type mismatch, expected Struct, found {data_type}"
))),
}
}

fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
let variant = Self { metadata, value: vec![0] };
variant.supports_data_type(data_type)?;
Ok(variant)
}
}

#[cfg(test)]
mod tests {
#[cfg(feature = "canonical_extension_types")]
use crate::extension::CanonicalExtensionType;
use crate::{
extension::{EXTENSION_TYPE_NAME_KEY},
Field, DataType,
};

use super::*;

#[test]
fn variant_metadata_encoding_decoding() {
let metadata = b"variant_metadata".to_vec();
let encoded = STANDARD.encode(&metadata);
let decoded = Variant::deserialize_metadata(Some(&encoded)).unwrap();
assert_eq!(metadata, decoded);
}

#[test]
fn variant_metadata_invalid_decoding() {
let result = Variant::deserialize_metadata(Some("invalid_base64"));
assert!(result.is_err());
}

#[test]
fn variant_metadata_none_decoding() {
let decoded = Variant::deserialize_metadata(None).unwrap();
assert!(decoded.is_empty());
}

#[test]
fn variant_supports_valid_data_types() {
// Test with actual binary data
let metadata = vec![0x01, 0x02, 0x03, 0x04, 0x05];
let value = vec![0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F];
let variant = Variant::new(metadata.clone(), value.clone());

// Test with Binary fields
let struct_type = DataType::Struct(vec![
Field::new("metadata", DataType::Binary, false),
Field::new("value", DataType::Binary, false)
].into());
assert!(variant.supports_data_type(&struct_type).is_ok());

// Test with LargeBinary fields
let struct_type = DataType::Struct(vec![
Field::new("metadata", DataType::LargeBinary, false),
Field::new("value", DataType::LargeBinary, false)
].into());
assert!(variant.supports_data_type(&struct_type).is_ok());

// Test with invalid type
let result = Variant::try_new(&DataType::Utf8, metadata);
assert!(result.is_err());
if let Err(ArrowError::InvalidArgumentError(msg)) = result {
assert!(msg.contains("Variant data type mismatch"));
}
}

#[test]
fn variant_creation_and_access() {
// Test with actual binary data
let metadata = vec![0x01, 0x02, 0x03, 0x04, 0x05];
let value = vec![0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F];
let variant = Variant::new(metadata.clone(), value.clone());
assert_eq!(variant.metadata(), &metadata);
assert_eq!(variant.value(), &value);
}

#[test]
fn variant_field_extension() {
let struct_type = DataType::Struct(vec![
Field::new("metadata", DataType::Binary, false),
Field::new("value", DataType::Binary, false)
].into());

// Test with actual binary data
let metadata = vec![0x01, 0x02, 0x03, 0x04, 0x05];
let value = vec![0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F];
let variant = Variant::new(metadata.clone(), value.clone());

let mut field = Field::new("", struct_type, false);
field.try_with_extension_type(variant.clone()).unwrap();

assert_eq!(
field.metadata().get(EXTENSION_TYPE_NAME_KEY),
Some(&"arrow.variant".to_owned())
);

#[cfg(feature = "canonical_extension_types")]
{
let recovered = field.try_canonical_extension_type().unwrap();
if let CanonicalExtensionType::Variant(recovered_variant) = recovered {
assert_eq!(recovered_variant.metadata(), variant.metadata());
} else {
panic!("Expected Variant type");
}
}
}
}
51 changes: 51 additions & 0 deletions arrow-variant/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

[package]
name = "arrow-variant"
version = { workspace = true }
description = "JSON to Arrow Variant conversion utilities"
homepage = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
keywords = ["arrow"]
include = [
"src/**/*.rs",
"Cargo.toml",
]
edition = { workspace = true }
rust-version = { workspace = true }

[lib]
name = "arrow_variant"
path = "src/lib.rs"

[features]
default = []

[dependencies]
arrow-array = { workspace = true, features = ["canonical_extension_types"] }
arrow-buffer = { workspace = true }
arrow-cast = { workspace = true, optional = true }
arrow-data = { workspace = true }
arrow-schema = { workspace = true, features = ["canonical_extension_types"] }
serde = { version = "1.0", default-features = false }
serde_json = { version = "1.0", default-features = false, features = ["std"] }

[dev-dependencies]
arrow-cast = { workspace = true }
Loading
Loading