diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs index d363b6be9780..ea23403c4b18 100644 --- a/arrow-json/src/reader/list_array.rs +++ b/arrow-json/src/reader/list_array.rs @@ -18,28 +18,33 @@ use crate::reader::tape::{Tape, TapeElement}; use crate::reader::{ArrayDecoder, DecoderContext}; use arrow_array::OffsetSizeTrait; -use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; -use arrow_buffer::buffer::NullBuffer; +use arrow_array::builder::BooleanBufferBuilder; +use arrow_buffer::{Buffer, buffer::NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::marker::PhantomData; -pub struct ListArrayDecoder { +pub type ListArrayDecoder = ListLikeArrayDecoder; +pub type ListViewArrayDecoder = ListLikeArrayDecoder; + +pub struct ListLikeArrayDecoder { data_type: DataType, decoder: Box, phantom: PhantomData, is_nullable: bool, } -impl ListArrayDecoder { +impl ListLikeArrayDecoder { pub fn new( ctx: &DecoderContext, data_type: &DataType, is_nullable: bool, ) -> Result { - let field = match data_type { - DataType::List(f) if !O::IS_LARGE => f, - DataType::LargeList(f) if O::IS_LARGE => f, + let field = match (IS_VIEW, data_type) { + (false, DataType::List(f)) if !O::IS_LARGE => f, + (false, DataType::LargeList(f)) if O::IS_LARGE => f, + (true, DataType::ListView(f)) if !O::IS_LARGE => f, + (true, DataType::LargeListView(f)) if O::IS_LARGE => f, _ => unreachable!(), }; let decoder = ctx.make_decoder(field.data_type(), field.is_nullable())?; @@ -53,11 +58,11 @@ impl ListArrayDecoder { } } -impl ArrayDecoder for ListArrayDecoder { +impl ArrayDecoder for ListLikeArrayDecoder { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { let mut child_pos = Vec::with_capacity(pos.len()); - let mut offsets = BufferBuilder::::new(pos.len() + 1); - offsets.append(O::from_usize(0).unwrap()); + let mut offsets = Vec::with_capacity(pos.len() + 1); + offsets.push(O::from_usize(0).unwrap()); let mut nulls = self .is_nullable @@ -88,18 +93,30 @@ impl ArrayDecoder for ListArrayDecoder { let offset = O::from_usize(child_pos.len()).ok_or_else(|| { ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) })?; - offsets.append(offset) + offsets.push(offset); } let child_data = self.decoder.decode(tape, &child_pos)?; let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish())); - let data = ArrayDataBuilder::new(self.data_type.clone()) + let mut data = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) .nulls(nulls) - .add_buffer(offsets.finish()) .child_data(vec![child_data]); + if IS_VIEW { + let mut sizes = Vec::with_capacity(offsets.len() - 1); + for i in 1..offsets.len() { + sizes.push(offsets[i] - offsets[i - 1]); + } + offsets.pop(); + data = data + .add_buffer(Buffer::from_vec(offsets)) + .add_buffer(Buffer::from_vec(sizes)); + } else { + data = data.add_buffer(Buffer::from_vec(offsets)); + } + // Safety // Validated lengths above Ok(unsafe { data.build_unchecked() }) diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 786cf9212d04..d1d9709135bc 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -153,7 +153,7 @@ pub use schema::*; use crate::reader::boolean_array::BooleanArrayDecoder; use crate::reader::decimal_array::DecimalArrayDecoder; -use crate::reader::list_array::ListArrayDecoder; +use crate::reader::list_array::{ListArrayDecoder, ListViewArrayDecoder}; use crate::reader::map_array::MapArrayDecoder; use crate::reader::null_array::NullArrayDecoder; use crate::reader::primitive_array::PrimitiveArrayDecoder; @@ -790,6 +790,8 @@ fn make_decoder( DataType::LargeUtf8 => Ok(Box::new(StringArrayDecoder::::new(coerce_primitive))), DataType::List(_) => Ok(Box::new(ListArrayDecoder::::new(ctx, data_type, is_nullable)?)), DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::::new(ctx, data_type, is_nullable)?)), + DataType::ListView(_) => Ok(Box::new(ListViewArrayDecoder::::new(ctx, data_type, is_nullable)?)), + DataType::LargeListView(_) => Ok(Box::new(ListViewArrayDecoder::::new(ctx, data_type, is_nullable)?)), DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(ctx, data_type, is_nullable)?)), DataType::Binary => Ok(Box::new(BinaryArrayDecoder::::default())), DataType::LargeBinary => Ok(Box::new(BinaryArrayDecoder::::default())), @@ -813,7 +815,10 @@ mod tests { use std::io::{BufReader, Cursor, Seek}; use arrow_array::cast::AsArray; - use arrow_array::{Array, BooleanArray, Float64Array, ListArray, StringArray, StringViewArray}; + use arrow_array::{ + Array, BooleanArray, Float64Array, GenericListViewArray, ListArray, OffsetSizeTrait, + StringArray, StringViewArray, + }; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_data::ArrayDataBuilder; @@ -2190,6 +2195,81 @@ mod tests { assert_eq!(read, expected); } + fn assert_read_list_view() { + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let data_type = GenericListViewArray::::DATA_TYPE_CONSTRUCTOR(field.clone()); + let schema = Arc::new(Schema::new(vec![Field::new("lv", data_type, true)])); + + let buf = r#" + {"lv": [1, 2, 3]} + {"lv": [4, null]} + {"lv": null} + {"lv": [6]} + {"lv": []} + "#; + + let batches = do_read(buf, 1024, false, false, schema); + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + let col = batch.column(0); + let list_view = col + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!(list_view.len(), 5); + + // Check offsets and sizes + let expected_offsets: Vec = vec![0, 3, 5, 5, 6] + .into_iter() + .map(|v| O::usize_as(v)) + .collect(); + let expected_sizes: Vec = vec![3, 2, 0, 1, 0] + .into_iter() + .map(|v| O::usize_as(v)) + .collect(); + assert_eq!(list_view.value_offsets(), &expected_offsets); + assert_eq!(list_view.value_sizes(), &expected_sizes); + + // Row 0: [1, 2, 3] + assert!(list_view.is_valid(0)); + let vals = list_view.value(0); + let ints = vals.as_primitive::(); + assert_eq!(ints.values(), &[1, 2, 3]); + + // Row 1: [4, null] + assert!(list_view.is_valid(1)); + let vals = list_view.value(1); + let ints = vals.as_primitive::(); + assert_eq!(ints.len(), 2); + assert_eq!(ints.value(0), 4); + assert!(ints.is_null(1)); + + // Row 2: null + assert!(list_view.is_null(2)); + + // Row 3: [6] + assert!(list_view.is_valid(3)); + let vals = list_view.value(3); + let ints = vals.as_primitive::(); + assert_eq!(ints.values(), &[6]); + + // Row 4: [] + assert!(list_view.is_valid(4)); + let vals = list_view.value(4); + assert_eq!(vals.len(), 0); + } + + #[test] + fn test_read_list_view() { + assert_read_list_view::(); + } + + #[test] + fn test_read_large_list_view() { + assert_read_list_view::(); + } + #[test] fn test_skip_empty_lines() { let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs index d7c3fbbe2e34..45055c5a36a8 100644 --- a/arrow-json/src/writer/encoder.rs +++ b/arrow-json/src/writer/encoder.rs @@ -352,15 +352,23 @@ pub fn make_encoder<'a>( } DataType::List(_) => { let array = array.as_list::(); - NullableEncoder::new(Box::new(ListEncoder::try_new(field, array, options)?), array.nulls().cloned()) + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) } DataType::LargeList(_) => { let array = array.as_list::(); - NullableEncoder::new(Box::new(ListEncoder::try_new(field, array, options)?), array.nulls().cloned()) + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) + } + DataType::ListView(_) => { + let array = array.as_list_view::(); + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) + } + DataType::LargeListView(_) => { + let array = array.as_list_view::(); + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) } DataType::FixedSizeList(_, _) => { let array = array.as_fixed_size_list(); - NullableEncoder::new(Box::new(FixedSizeListEncoder::try_new(field, array, options)?), array.nulls().cloned()) + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) } DataType::Dictionary(_, _) => downcast_dictionary_array! { @@ -639,77 +647,30 @@ impl Encoder for BinaryViewEncoder<'_> { } } -struct ListEncoder<'a, O: OffsetSizeTrait> { - offsets: OffsetBuffer, - encoder: NullableEncoder<'a>, -} - -impl<'a, O: OffsetSizeTrait> ListEncoder<'a, O> { - fn try_new( - field: &'a FieldRef, - array: &'a GenericListArray, - options: &'a EncoderOptions, - ) -> Result { - let encoder = make_encoder(field, array.values().as_ref(), options)?; - Ok(Self { - offsets: array.offsets().clone(), - encoder, - }) - } -} - -impl Encoder for ListEncoder<'_, O> { - fn encode(&mut self, idx: usize, out: &mut Vec) { - let end = self.offsets[idx + 1].as_usize(); - let start = self.offsets[idx].as_usize(); - out.push(b'['); - - if self.encoder.has_nulls() { - for idx in start..end { - if idx != start { - out.push(b',') - } - if self.encoder.is_null(idx) { - out.extend_from_slice(b"null"); - } else { - self.encoder.encode(idx, out); - } - } - } else { - for idx in start..end { - if idx != start { - out.push(b',') - } - self.encoder.encode(idx, out); - } - } - out.push(b']'); - } -} - -struct FixedSizeListEncoder<'a> { - value_length: usize, +struct ListLikeEncoder<'a, L: ListLikeArray> { + list_array: &'a L, encoder: NullableEncoder<'a>, } -impl<'a> FixedSizeListEncoder<'a> { +impl<'a, L: ListLikeArray> ListLikeEncoder<'a, L> { fn try_new( field: &'a FieldRef, - array: &'a FixedSizeListArray, + array: &'a L, options: &'a EncoderOptions, ) -> Result { let encoder = make_encoder(field, array.values().as_ref(), options)?; Ok(Self { + list_array: array, encoder, - value_length: array.value_length().as_usize(), }) } } -impl Encoder for FixedSizeListEncoder<'_> { +impl Encoder for ListLikeEncoder<'_, L> { fn encode(&mut self, idx: usize, out: &mut Vec) { - let start = idx * self.value_length; - let end = start + self.value_length; + let range = self.list_array.element_range(idx); + let start = range.start; + let end = range.end; out.push(b'['); if self.encoder.has_nulls() { for idx in start..end { diff --git a/arrow-json/src/writer/mod.rs b/arrow-json/src/writer/mod.rs index 2fac5ab62353..7c4bb737152f 100644 --- a/arrow-json/src/writer/mod.rs +++ b/arrow-json/src/writer/mod.rs @@ -1241,6 +1241,58 @@ mod tests { ); } + fn assert_write_list_view() { + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let data_type = GenericListViewArray::::DATA_TYPE_CONSTRUCTOR(field.clone()); + let schema = Schema::new(vec![Field::new("lv", data_type, true)]); + + // rows: [1, 2, 3], [4, null], null, [6] + let values = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), None, Some(6)]); + let offsets = [0, 3, 0, 5] + .iter() + .map(|&v| O::from_usize(v).unwrap()) + .collect::>(); + let sizes = [3, 2, 0, 1] + .iter() + .map(|&v| O::from_usize(v).unwrap()) + .collect::>(); + let list_view = GenericListViewArray::::try_new( + field, + ScalarBuffer::from(offsets), + ScalarBuffer::from(sizes), + Arc::new(values), + Some(NullBuffer::from_iter([true, true, false, true])), + ) + .unwrap(); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(list_view)]).unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[&batch]).unwrap(); + } + + assert_json_eq( + &buf, + r#"{"lv":[1,2,3]} +{"lv":[4,null]} +{} +{"lv":[6]} +"#, + ); + } + + #[test] + fn write_list_view() { + assert_write_list_view::(); + } + + #[test] + fn write_large_list_view() { + assert_write_list_view::(); + } + fn test_write_for_file(test_file: &str, remove_nulls: bool) { let file = File::open(test_file).unwrap(); let mut reader = BufReader::new(file);