From 70430ab1135f6f2c66f4cdda0b8783e5bd6eb976 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Wed, 18 Feb 2026 21:38:50 -0500 Subject: [PATCH 1/4] Impl ListView writer/reader --- arrow-json/src/reader/list_view_array.rs | 112 +++++++++++++++++++++++ arrow-json/src/reader/mod.rs | 84 ++++++++++++++++- arrow-json/src/writer/encoder.rs | 58 ++++++++++++ arrow-json/src/writer/mod.rs | 52 +++++++++++ 4 files changed, 305 insertions(+), 1 deletion(-) create mode 100644 arrow-json/src/reader/list_view_array.rs diff --git a/arrow-json/src/reader/list_view_array.rs b/arrow-json/src/reader/list_view_array.rs new file mode 100644 index 000000000000..952b105948b9 --- /dev/null +++ b/arrow-json/src/reader/list_view_array.rs @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::reader::tape::{Tape, TapeElement}; +use crate::reader::{ArrayDecoder, DecoderContext}; +use arrow_array::OffsetSizeTrait; +use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; +use arrow_buffer::buffer::NullBuffer; +use arrow_data::{ArrayData, ArrayDataBuilder}; +use arrow_schema::{ArrowError, DataType}; +use std::marker::PhantomData; + +pub struct ListViewArrayDecoder { + data_type: DataType, + decoder: Box, + phantom: PhantomData, + is_nullable: bool, +} + +impl ListViewArrayDecoder { + pub fn new( + ctx: &DecoderContext, + data_type: &DataType, + is_nullable: bool, + ) -> Result { + let field = match data_type { + DataType::ListView(f) if !O::IS_LARGE => f, + DataType::LargeListView(f) if O::IS_LARGE => f, + _ => unreachable!(), + }; + let decoder = ctx.make_decoder(field.data_type(), field.is_nullable())?; + + Ok(Self { + data_type: data_type.clone(), + decoder, + phantom: Default::default(), + is_nullable, + }) + } +} + +impl ArrayDecoder for ListViewArrayDecoder { + fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { + let mut child_pos = Vec::with_capacity(pos.len()); + let mut offsets = BufferBuilder::::new(pos.len()); + let mut sizes = BufferBuilder::::new(pos.len()); + + let mut nulls = self + .is_nullable + .then(|| BooleanBufferBuilder::new(pos.len())); + + for p in pos { + let end_idx = match (tape.get(*p), nulls.as_mut()) { + (TapeElement::StartList(end_idx), None) => end_idx, + (TapeElement::StartList(end_idx), Some(nulls)) => { + nulls.append(true); + end_idx + } + (TapeElement::Null, Some(nulls)) => { + nulls.append(false); + *p + 1 + } + _ => return Err(tape.error(*p, "[")), + }; + + let offset = O::from_usize(child_pos.len()).ok_or_else(|| { + ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) + })?; + offsets.append(offset); + + let start_count = child_pos.len(); + let mut cur_idx = *p + 1; + while cur_idx < end_idx { + child_pos.push(cur_idx); + cur_idx = tape.next(cur_idx, "list value")?; + } + + let size = O::from_usize(child_pos.len() - start_count).ok_or_else(|| { + ArrowError::JsonError(format!("size overflow decoding {}", self.data_type)) + })?; + sizes.append(size); + } + + let child_data = self.decoder.decode(tape, &child_pos)?; + let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish())); + + let data = ArrayDataBuilder::new(self.data_type.clone()) + .len(pos.len()) + .nulls(nulls) + .add_buffer(offsets.finish()) + .add_buffer(sizes.finish()) + .child_data(vec![child_data]); + + // Safety + // Validated lengths above + Ok(unsafe { data.build_unchecked() }) + } +} diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index 786cf9212d04..d189cf0dcc4d 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -154,6 +154,7 @@ pub use schema::*; use crate::reader::boolean_array::BooleanArrayDecoder; use crate::reader::decimal_array::DecimalArrayDecoder; use crate::reader::list_array::ListArrayDecoder; +use crate::reader::list_view_array::ListViewArrayDecoder; use crate::reader::map_array::MapArrayDecoder; use crate::reader::null_array::NullArrayDecoder; use crate::reader::primitive_array::PrimitiveArrayDecoder; @@ -168,6 +169,7 @@ mod binary_array; mod boolean_array; mod decimal_array; mod list_array; +mod list_view_array; mod map_array; mod null_array; mod primitive_array; @@ -790,6 +792,8 @@ fn make_decoder( DataType::LargeUtf8 => Ok(Box::new(StringArrayDecoder::::new(coerce_primitive))), DataType::List(_) => Ok(Box::new(ListArrayDecoder::::new(ctx, data_type, is_nullable)?)), DataType::LargeList(_) => Ok(Box::new(ListArrayDecoder::::new(ctx, data_type, is_nullable)?)), + DataType::ListView(_) => Ok(Box::new(ListViewArrayDecoder::::new(ctx, data_type, is_nullable)?)), + DataType::LargeListView(_) => Ok(Box::new(ListViewArrayDecoder::::new(ctx, data_type, is_nullable)?)), DataType::Struct(_) => Ok(Box::new(StructArrayDecoder::new(ctx, data_type, is_nullable)?)), DataType::Binary => Ok(Box::new(BinaryArrayDecoder::::default())), DataType::LargeBinary => Ok(Box::new(BinaryArrayDecoder::::default())), @@ -813,7 +817,10 @@ mod tests { use std::io::{BufReader, Cursor, Seek}; use arrow_array::cast::AsArray; - use arrow_array::{Array, BooleanArray, Float64Array, ListArray, StringArray, StringViewArray}; + use arrow_array::{ + Array, BooleanArray, Float64Array, GenericListViewArray, ListArray, OffsetSizeTrait, + StringArray, StringViewArray, + }; use arrow_buffer::{ArrowNativeType, Buffer}; use arrow_cast::display::{ArrayFormatter, FormatOptions}; use arrow_data::ArrayDataBuilder; @@ -2190,6 +2197,81 @@ mod tests { assert_eq!(read, expected); } + fn assert_read_list_view() { + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let data_type = GenericListViewArray::::DATA_TYPE_CONSTRUCTOR(field.clone()); + let schema = Arc::new(Schema::new(vec![Field::new("lv", data_type, true)])); + + let buf = r#" + {"lv": [1, 2, 3]} + {"lv": [4, null]} + {"lv": null} + {"lv": [6]} + {"lv": []} + "#; + + let batches = do_read(buf, 1024, false, false, schema); + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + let col = batch.column(0); + let list_view = col + .as_any() + .downcast_ref::>() + .unwrap(); + + assert_eq!(list_view.len(), 5); + + // Check offsets and sizes + let expected_offsets: Vec = vec![0, 3, 5, 5, 6] + .into_iter() + .map(|v| O::usize_as(v)) + .collect(); + let expected_sizes: Vec = vec![3, 2, 0, 1, 0] + .into_iter() + .map(|v| O::usize_as(v)) + .collect(); + assert_eq!(list_view.value_offsets(), &expected_offsets); + assert_eq!(list_view.value_sizes(), &expected_sizes); + + // Row 0: [1, 2, 3] + assert!(list_view.is_valid(0)); + let vals = list_view.value(0); + let ints = vals.as_primitive::(); + assert_eq!(ints.values(), &[1, 2, 3]); + + // Row 1: [4, null] + assert!(list_view.is_valid(1)); + let vals = list_view.value(1); + let ints = vals.as_primitive::(); + assert_eq!(ints.len(), 2); + assert_eq!(ints.value(0), 4); + assert!(ints.is_null(1)); + + // Row 2: null + assert!(list_view.is_null(2)); + + // Row 3: [6] + assert!(list_view.is_valid(3)); + let vals = list_view.value(3); + let ints = vals.as_primitive::(); + assert_eq!(ints.values(), &[6]); + + // Row 4: [] + assert!(list_view.is_valid(4)); + let vals = list_view.value(4); + assert_eq!(vals.len(), 0); + } + + #[test] + fn test_read_list_view() { + assert_read_list_view::(); + } + + #[test] + fn test_read_large_list_view() { + assert_read_list_view::(); + } + #[test] fn test_skip_empty_lines() { let schema = Schema::new(vec![Field::new("a", DataType::Int64, true)]); diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs index d7c3fbbe2e34..18e4f05c214d 100644 --- a/arrow-json/src/writer/encoder.rs +++ b/arrow-json/src/writer/encoder.rs @@ -358,6 +358,14 @@ pub fn make_encoder<'a>( let array = array.as_list::(); NullableEncoder::new(Box::new(ListEncoder::try_new(field, array, options)?), array.nulls().cloned()) } + DataType::ListView(_) => { + let array = array.as_list_view::(); + NullableEncoder::new(Box::new(ListViewEncoder::try_new(field, array, options)?), array.nulls().cloned()) + } + DataType::LargeListView(_) => { + let array = array.as_list_view::(); + NullableEncoder::new(Box::new(ListViewEncoder::try_new(field, array, options)?), array.nulls().cloned()) + } DataType::FixedSizeList(_, _) => { let array = array.as_fixed_size_list(); NullableEncoder::new(Box::new(FixedSizeListEncoder::try_new(field, array, options)?), array.nulls().cloned()) @@ -687,6 +695,56 @@ impl Encoder for ListEncoder<'_, O> { } } +struct ListViewEncoder<'a, O: OffsetSizeTrait> { + offsets: ScalarBuffer, + sizes: ScalarBuffer, + encoder: NullableEncoder<'a>, +} + +impl<'a, O: OffsetSizeTrait> ListViewEncoder<'a, O> { + fn try_new( + field: &'a FieldRef, + array: &'a GenericListViewArray, + options: &'a EncoderOptions, + ) -> Result { + let encoder = make_encoder(field, array.values().as_ref(), options)?; + Ok(Self { + offsets: array.offsets().clone(), + sizes: array.sizes().clone(), + encoder, + }) + } +} + +impl Encoder for ListViewEncoder<'_, O> { + fn encode(&mut self, idx: usize, out: &mut Vec) { + let start = self.offsets[idx].as_usize(); + let end = start + self.sizes[idx].as_usize(); + out.push(b'['); + + if self.encoder.has_nulls() { + for idx in start..end { + if idx != start { + out.push(b',') + } + if self.encoder.is_null(idx) { + out.extend_from_slice(b"null"); + } else { + self.encoder.encode(idx, out); + } + } + } else { + for idx in start..end { + if idx != start { + out.push(b',') + } + self.encoder.encode(idx, out); + } + } + out.push(b']'); + } +} + struct FixedSizeListEncoder<'a> { value_length: usize, encoder: NullableEncoder<'a>, diff --git a/arrow-json/src/writer/mod.rs b/arrow-json/src/writer/mod.rs index 2fac5ab62353..7c4bb737152f 100644 --- a/arrow-json/src/writer/mod.rs +++ b/arrow-json/src/writer/mod.rs @@ -1241,6 +1241,58 @@ mod tests { ); } + fn assert_write_list_view() { + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let data_type = GenericListViewArray::::DATA_TYPE_CONSTRUCTOR(field.clone()); + let schema = Schema::new(vec![Field::new("lv", data_type, true)]); + + // rows: [1, 2, 3], [4, null], null, [6] + let values = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), None, Some(6)]); + let offsets = [0, 3, 0, 5] + .iter() + .map(|&v| O::from_usize(v).unwrap()) + .collect::>(); + let sizes = [3, 2, 0, 1] + .iter() + .map(|&v| O::from_usize(v).unwrap()) + .collect::>(); + let list_view = GenericListViewArray::::try_new( + field, + ScalarBuffer::from(offsets), + ScalarBuffer::from(sizes), + Arc::new(values), + Some(NullBuffer::from_iter([true, true, false, true])), + ) + .unwrap(); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(list_view)]).unwrap(); + + let mut buf = Vec::new(); + { + let mut writer = LineDelimitedWriter::new(&mut buf); + writer.write_batches(&[&batch]).unwrap(); + } + + assert_json_eq( + &buf, + r#"{"lv":[1,2,3]} +{"lv":[4,null]} +{} +{"lv":[6]} +"#, + ); + } + + #[test] + fn write_list_view() { + assert_write_list_view::(); + } + + #[test] + fn write_large_list_view() { + assert_write_list_view::(); + } + fn test_write_for_file(test_file: &str, remove_nulls: bool) { let file = File::open(test_file).unwrap(); let mut reader = BufReader::new(file); From 4ad72f3be416c0960e5bf4b10f626a0690eddd50 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Wed, 18 Feb 2026 21:46:09 -0500 Subject: [PATCH 2/4] Refactor using ListLikeArray --- arrow-json/src/writer/encoder.rs | 125 ++++--------------------------- 1 file changed, 14 insertions(+), 111 deletions(-) diff --git a/arrow-json/src/writer/encoder.rs b/arrow-json/src/writer/encoder.rs index 18e4f05c214d..45055c5a36a8 100644 --- a/arrow-json/src/writer/encoder.rs +++ b/arrow-json/src/writer/encoder.rs @@ -352,23 +352,23 @@ pub fn make_encoder<'a>( } DataType::List(_) => { let array = array.as_list::(); - NullableEncoder::new(Box::new(ListEncoder::try_new(field, array, options)?), array.nulls().cloned()) + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) } DataType::LargeList(_) => { let array = array.as_list::(); - NullableEncoder::new(Box::new(ListEncoder::try_new(field, array, options)?), array.nulls().cloned()) + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) } DataType::ListView(_) => { let array = array.as_list_view::(); - NullableEncoder::new(Box::new(ListViewEncoder::try_new(field, array, options)?), array.nulls().cloned()) + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) } DataType::LargeListView(_) => { let array = array.as_list_view::(); - NullableEncoder::new(Box::new(ListViewEncoder::try_new(field, array, options)?), array.nulls().cloned()) + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) } DataType::FixedSizeList(_, _) => { let array = array.as_fixed_size_list(); - NullableEncoder::new(Box::new(FixedSizeListEncoder::try_new(field, array, options)?), array.nulls().cloned()) + NullableEncoder::new(Box::new(ListLikeEncoder::try_new(field, array, options)?), array.nulls().cloned()) } DataType::Dictionary(_, _) => downcast_dictionary_array! { @@ -647,127 +647,30 @@ impl Encoder for BinaryViewEncoder<'_> { } } -struct ListEncoder<'a, O: OffsetSizeTrait> { - offsets: OffsetBuffer, +struct ListLikeEncoder<'a, L: ListLikeArray> { + list_array: &'a L, encoder: NullableEncoder<'a>, } -impl<'a, O: OffsetSizeTrait> ListEncoder<'a, O> { +impl<'a, L: ListLikeArray> ListLikeEncoder<'a, L> { fn try_new( field: &'a FieldRef, - array: &'a GenericListArray, - options: &'a EncoderOptions, - ) -> Result { - let encoder = make_encoder(field, array.values().as_ref(), options)?; - Ok(Self { - offsets: array.offsets().clone(), - encoder, - }) - } -} - -impl Encoder for ListEncoder<'_, O> { - fn encode(&mut self, idx: usize, out: &mut Vec) { - let end = self.offsets[idx + 1].as_usize(); - let start = self.offsets[idx].as_usize(); - out.push(b'['); - - if self.encoder.has_nulls() { - for idx in start..end { - if idx != start { - out.push(b',') - } - if self.encoder.is_null(idx) { - out.extend_from_slice(b"null"); - } else { - self.encoder.encode(idx, out); - } - } - } else { - for idx in start..end { - if idx != start { - out.push(b',') - } - self.encoder.encode(idx, out); - } - } - out.push(b']'); - } -} - -struct ListViewEncoder<'a, O: OffsetSizeTrait> { - offsets: ScalarBuffer, - sizes: ScalarBuffer, - encoder: NullableEncoder<'a>, -} - -impl<'a, O: OffsetSizeTrait> ListViewEncoder<'a, O> { - fn try_new( - field: &'a FieldRef, - array: &'a GenericListViewArray, - options: &'a EncoderOptions, - ) -> Result { - let encoder = make_encoder(field, array.values().as_ref(), options)?; - Ok(Self { - offsets: array.offsets().clone(), - sizes: array.sizes().clone(), - encoder, - }) - } -} - -impl Encoder for ListViewEncoder<'_, O> { - fn encode(&mut self, idx: usize, out: &mut Vec) { - let start = self.offsets[idx].as_usize(); - let end = start + self.sizes[idx].as_usize(); - out.push(b'['); - - if self.encoder.has_nulls() { - for idx in start..end { - if idx != start { - out.push(b',') - } - if self.encoder.is_null(idx) { - out.extend_from_slice(b"null"); - } else { - self.encoder.encode(idx, out); - } - } - } else { - for idx in start..end { - if idx != start { - out.push(b',') - } - self.encoder.encode(idx, out); - } - } - out.push(b']'); - } -} - -struct FixedSizeListEncoder<'a> { - value_length: usize, - encoder: NullableEncoder<'a>, -} - -impl<'a> FixedSizeListEncoder<'a> { - fn try_new( - field: &'a FieldRef, - array: &'a FixedSizeListArray, + array: &'a L, options: &'a EncoderOptions, ) -> Result { let encoder = make_encoder(field, array.values().as_ref(), options)?; Ok(Self { + list_array: array, encoder, - value_length: array.value_length().as_usize(), }) } } -impl Encoder for FixedSizeListEncoder<'_> { +impl Encoder for ListLikeEncoder<'_, L> { fn encode(&mut self, idx: usize, out: &mut Vec) { - let start = idx * self.value_length; - let end = start + self.value_length; + let range = self.list_array.element_range(idx); + let start = range.start; + let end = range.end; out.push(b'['); if self.encoder.has_nulls() { for idx in start..end { From ddb4d60710fd690558d65f9388a2cb4fca3f89a9 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Sun, 1 Mar 2026 20:22:36 -0500 Subject: [PATCH 3/4] Refactor decoder --- arrow-json/src/reader/list_array.rs | 53 ++++++++--- arrow-json/src/reader/list_view_array.rs | 112 ----------------------- arrow-json/src/reader/mod.rs | 4 +- 3 files changed, 41 insertions(+), 128 deletions(-) delete mode 100644 arrow-json/src/reader/list_view_array.rs diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs index d363b6be9780..dc45baa62420 100644 --- a/arrow-json/src/reader/list_array.rs +++ b/arrow-json/src/reader/list_array.rs @@ -24,22 +24,27 @@ use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::marker::PhantomData; -pub struct ListArrayDecoder { +pub type ListArrayDecoder = ListLikeArrayDecoder; +pub type ListViewArrayDecoder = ListLikeArrayDecoder; + +pub struct ListLikeArrayDecoder { data_type: DataType, decoder: Box, phantom: PhantomData, is_nullable: bool, } -impl ListArrayDecoder { +impl ListLikeArrayDecoder { pub fn new( ctx: &DecoderContext, data_type: &DataType, is_nullable: bool, ) -> Result { - let field = match data_type { - DataType::List(f) if !O::IS_LARGE => f, - DataType::LargeList(f) if O::IS_LARGE => f, + let field = match (IS_VIEW, data_type) { + (false, DataType::List(f)) if !O::IS_LARGE => f, + (false, DataType::LargeList(f)) if O::IS_LARGE => f, + (true, DataType::ListView(f)) if !O::IS_LARGE => f, + (true, DataType::LargeListView(f)) if O::IS_LARGE => f, _ => unreachable!(), }; let decoder = ctx.make_decoder(field.data_type(), field.is_nullable())?; @@ -53,11 +58,14 @@ impl ListArrayDecoder { } } -impl ArrayDecoder for ListArrayDecoder { +impl ArrayDecoder for ListLikeArrayDecoder { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { let mut child_pos = Vec::with_capacity(pos.len()); - let mut offsets = BufferBuilder::::new(pos.len() + 1); - offsets.append(O::from_usize(0).unwrap()); + let mut offsets = BufferBuilder::::new(pos.len() + usize::from(!IS_VIEW)); + if !IS_VIEW { + offsets.append(O::from_usize(0).unwrap()); + } + let mut sizes = IS_VIEW.then(|| BufferBuilder::::new(pos.len())); let mut nulls = self .is_nullable @@ -77,6 +85,14 @@ impl ArrayDecoder for ListArrayDecoder { _ => return Err(tape.error(*p, "[")), }; + let start_idx = child_pos.len(); + if IS_VIEW { + let offset = O::from_usize(start_idx).ok_or_else(|| { + ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) + })?; + offsets.append(offset); + } + let mut cur_idx = *p + 1; while cur_idx < end_idx { child_pos.push(cur_idx); @@ -85,21 +101,32 @@ impl ArrayDecoder for ListArrayDecoder { cur_idx = tape.next(cur_idx, "list value")?; } - let offset = O::from_usize(child_pos.len()).ok_or_else(|| { - ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) - })?; - offsets.append(offset) + if IS_VIEW { + let size = O::from_usize(child_pos.len() - start_idx).ok_or_else(|| { + ArrowError::JsonError(format!("size overflow decoding {}", self.data_type)) + })?; + sizes.as_mut().unwrap().append(size); + } else { + let offset = O::from_usize(child_pos.len()).ok_or_else(|| { + ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) + })?; + offsets.append(offset); + } } let child_data = self.decoder.decode(tape, &child_pos)?; let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish())); - let data = ArrayDataBuilder::new(self.data_type.clone()) + let mut data = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) .nulls(nulls) .add_buffer(offsets.finish()) .child_data(vec![child_data]); + if let Some(mut sizes) = sizes { + data = data.add_buffer(sizes.finish()); + } + // Safety // Validated lengths above Ok(unsafe { data.build_unchecked() }) diff --git a/arrow-json/src/reader/list_view_array.rs b/arrow-json/src/reader/list_view_array.rs deleted file mode 100644 index 952b105948b9..000000000000 --- a/arrow-json/src/reader/list_view_array.rs +++ /dev/null @@ -1,112 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::reader::tape::{Tape, TapeElement}; -use crate::reader::{ArrayDecoder, DecoderContext}; -use arrow_array::OffsetSizeTrait; -use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; -use arrow_buffer::buffer::NullBuffer; -use arrow_data::{ArrayData, ArrayDataBuilder}; -use arrow_schema::{ArrowError, DataType}; -use std::marker::PhantomData; - -pub struct ListViewArrayDecoder { - data_type: DataType, - decoder: Box, - phantom: PhantomData, - is_nullable: bool, -} - -impl ListViewArrayDecoder { - pub fn new( - ctx: &DecoderContext, - data_type: &DataType, - is_nullable: bool, - ) -> Result { - let field = match data_type { - DataType::ListView(f) if !O::IS_LARGE => f, - DataType::LargeListView(f) if O::IS_LARGE => f, - _ => unreachable!(), - }; - let decoder = ctx.make_decoder(field.data_type(), field.is_nullable())?; - - Ok(Self { - data_type: data_type.clone(), - decoder, - phantom: Default::default(), - is_nullable, - }) - } -} - -impl ArrayDecoder for ListViewArrayDecoder { - fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { - let mut child_pos = Vec::with_capacity(pos.len()); - let mut offsets = BufferBuilder::::new(pos.len()); - let mut sizes = BufferBuilder::::new(pos.len()); - - let mut nulls = self - .is_nullable - .then(|| BooleanBufferBuilder::new(pos.len())); - - for p in pos { - let end_idx = match (tape.get(*p), nulls.as_mut()) { - (TapeElement::StartList(end_idx), None) => end_idx, - (TapeElement::StartList(end_idx), Some(nulls)) => { - nulls.append(true); - end_idx - } - (TapeElement::Null, Some(nulls)) => { - nulls.append(false); - *p + 1 - } - _ => return Err(tape.error(*p, "[")), - }; - - let offset = O::from_usize(child_pos.len()).ok_or_else(|| { - ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) - })?; - offsets.append(offset); - - let start_count = child_pos.len(); - let mut cur_idx = *p + 1; - while cur_idx < end_idx { - child_pos.push(cur_idx); - cur_idx = tape.next(cur_idx, "list value")?; - } - - let size = O::from_usize(child_pos.len() - start_count).ok_or_else(|| { - ArrowError::JsonError(format!("size overflow decoding {}", self.data_type)) - })?; - sizes.append(size); - } - - let child_data = self.decoder.decode(tape, &child_pos)?; - let nulls = nulls.as_mut().map(|x| NullBuffer::new(x.finish())); - - let data = ArrayDataBuilder::new(self.data_type.clone()) - .len(pos.len()) - .nulls(nulls) - .add_buffer(offsets.finish()) - .add_buffer(sizes.finish()) - .child_data(vec![child_data]); - - // Safety - // Validated lengths above - Ok(unsafe { data.build_unchecked() }) - } -} diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs index d189cf0dcc4d..d1d9709135bc 100644 --- a/arrow-json/src/reader/mod.rs +++ b/arrow-json/src/reader/mod.rs @@ -153,8 +153,7 @@ pub use schema::*; use crate::reader::boolean_array::BooleanArrayDecoder; use crate::reader::decimal_array::DecimalArrayDecoder; -use crate::reader::list_array::ListArrayDecoder; -use crate::reader::list_view_array::ListViewArrayDecoder; +use crate::reader::list_array::{ListArrayDecoder, ListViewArrayDecoder}; use crate::reader::map_array::MapArrayDecoder; use crate::reader::null_array::NullArrayDecoder; use crate::reader::primitive_array::PrimitiveArrayDecoder; @@ -169,7 +168,6 @@ mod binary_array; mod boolean_array; mod decimal_array; mod list_array; -mod list_view_array; mod map_array; mod null_array; mod primitive_array; From 285baaa60136019ad59ec8bcbbd1929154f259c7 Mon Sep 17 00:00:00 2001 From: Liam Bao Date: Sun, 1 Mar 2026 20:57:05 -0500 Subject: [PATCH 4/4] Move IS_VIEW check out of the loop --- arrow-json/src/reader/list_array.rs | 48 ++++++++++++----------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs index dc45baa62420..ea23403c4b18 100644 --- a/arrow-json/src/reader/list_array.rs +++ b/arrow-json/src/reader/list_array.rs @@ -18,8 +18,8 @@ use crate::reader::tape::{Tape, TapeElement}; use crate::reader::{ArrayDecoder, DecoderContext}; use arrow_array::OffsetSizeTrait; -use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder}; -use arrow_buffer::buffer::NullBuffer; +use arrow_array::builder::BooleanBufferBuilder; +use arrow_buffer::{Buffer, buffer::NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{ArrowError, DataType}; use std::marker::PhantomData; @@ -61,11 +61,8 @@ impl ListLikeArrayDecoder { impl ArrayDecoder for ListLikeArrayDecoder { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { let mut child_pos = Vec::with_capacity(pos.len()); - let mut offsets = BufferBuilder::::new(pos.len() + usize::from(!IS_VIEW)); - if !IS_VIEW { - offsets.append(O::from_usize(0).unwrap()); - } - let mut sizes = IS_VIEW.then(|| BufferBuilder::::new(pos.len())); + let mut offsets = Vec::with_capacity(pos.len() + 1); + offsets.push(O::from_usize(0).unwrap()); let mut nulls = self .is_nullable @@ -85,14 +82,6 @@ impl ArrayDecoder for ListLikeArrayDeco _ => return Err(tape.error(*p, "[")), }; - let start_idx = child_pos.len(); - if IS_VIEW { - let offset = O::from_usize(start_idx).ok_or_else(|| { - ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) - })?; - offsets.append(offset); - } - let mut cur_idx = *p + 1; while cur_idx < end_idx { child_pos.push(cur_idx); @@ -101,17 +90,10 @@ impl ArrayDecoder for ListLikeArrayDeco cur_idx = tape.next(cur_idx, "list value")?; } - if IS_VIEW { - let size = O::from_usize(child_pos.len() - start_idx).ok_or_else(|| { - ArrowError::JsonError(format!("size overflow decoding {}", self.data_type)) - })?; - sizes.as_mut().unwrap().append(size); - } else { - let offset = O::from_usize(child_pos.len()).ok_or_else(|| { - ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) - })?; - offsets.append(offset); - } + let offset = O::from_usize(child_pos.len()).ok_or_else(|| { + ArrowError::JsonError(format!("offset overflow decoding {}", self.data_type)) + })?; + offsets.push(offset); } let child_data = self.decoder.decode(tape, &child_pos)?; @@ -120,11 +102,19 @@ impl ArrayDecoder for ListLikeArrayDeco let mut data = ArrayDataBuilder::new(self.data_type.clone()) .len(pos.len()) .nulls(nulls) - .add_buffer(offsets.finish()) .child_data(vec![child_data]); - if let Some(mut sizes) = sizes { - data = data.add_buffer(sizes.finish()); + if IS_VIEW { + let mut sizes = Vec::with_capacity(offsets.len() - 1); + for i in 1..offsets.len() { + sizes.push(offsets[i] - offsets[i - 1]); + } + offsets.pop(); + data = data + .add_buffer(Buffer::from_vec(offsets)) + .add_buffer(Buffer::from_vec(sizes)); + } else { + data = data.add_buffer(Buffer::from_vec(offsets)); } // Safety