diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 5b313913a7ef..9eb59865177c 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -33,14 +33,18 @@ use std::sync::Arc; use super::ByteArrayType; -/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays. +/// [Variable-size Binary View Layout]: An array of variable length bytes views. +/// +/// This array type is used to store variable length byte data (e.g. Strings, Binary) +/// and has efficient operations such as `take`, `filter`, and comparison. /// /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout /// -/// This is different from [`GenericByteArray`] as it stores both an offset and -/// length meaning that take / filter operations can be implemented without -/// copying the underlying data. In addition, it stores an inlined prefix which -/// can be used to speed up comparisons. +/// This is different from [`GenericByteArray`], which also stores variable +/// length byte data, as it represents strings with an offset and length. `take` +/// and `filter` like operations are implemented by manipulating the "views" +/// (`u128`) without modifying the bytes. Each view also stores an inlined +/// prefix which speed up comparisons. /// /// # See Also /// @@ -50,11 +54,18 @@ use super::ByteArrayType; /// /// [`ByteView`]: arrow_data::ByteView /// -/// # Notes +/// # Use the [`eq`] kernel to compare the logical content. +/// +/// Comparing two `GenericByteViewArray` using PartialEq compares by structure +/// (the `u128`s) and contents of the buffers, not by logical content. As there +/// are many different buffer layouts to represent the same data (e.g. different +/// offsets, different buffer sizes, etc) two arrays with the same data may not +/// compare equal. +/// +/// To compare the logical content of two `GenericByteViewArray`s, use the [`eq`] +/// kernel. /// -/// Comparing two `GenericByteViewArray` using PartialEq compares by structure, -/// not by value. as there are many different buffer layouts to represent the -/// same data (e.g. different offsets, different buffer sizes, etc). +/// [`eq`]: https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html /// /// # Layout: "views" and buffers /// @@ -86,6 +97,52 @@ use super::ByteArrayType; /// view and the entire string is stored in one of the buffers. See [`ByteView`] /// to access the fields of the these views. /// +/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely +/// the easiest and fastest way to work with this data. However, it is possible +/// to access the views and buffers directly for more control. +/// +/// For example +/// +/// ```rust +/// # use arrow_array::StringViewArray; +/// # use arrow_array::Array; +/// use arrow_data::ByteView; +/// let array = StringViewArray::from(vec![ +/// "hello", +/// "this string is longer than 12 bytes", +/// "this string is also longer than 12 bytes" +/// ]); +/// +/// // ** Examine the first view (short string) ** +/// assert!(array.is_valid(0)); // Check for nulls +/// let short_view: u128 = array.views()[0]; // "hello" +/// // get length of the string +/// let len = short_view as u32; +/// assert_eq!(len, 5); // strings less than 12 bytes are stored in the view +/// // SAFETY: `view` is a valid view +/// let value = unsafe { +/// StringViewArray::inline_value(&short_view, len as usize) +/// }; +/// assert_eq!(value, b"hello"); +/// +/// // ** Examine the third view (long string) ** +/// assert!(array.is_valid(12)); // Check for nulls +/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes" +/// let len = long_view as u32; +/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer +/// let view = ByteView::from(long_view); // use ByteView to access the fields +/// assert_eq!(view.length, 40); +/// assert_eq!(view.buffer_index, 0); +/// assert_eq!(view.offset, 35); // data starts after the first long string +/// // Views for long strings store a 4 byte prefix +/// let prefix = view.prefix.to_le_bytes(); +/// assert_eq!(&prefix, b"this"); +/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly) +/// assert_eq!(value, "this string is also longer than 12 bytes"); +/// ``` +/// +/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html +/// /// Unlike [`GenericByteArray`], there are no constraints on the offsets other /// than they must point into a valid buffer. However, they can be out of order, /// non continuous and overlapping. @@ -694,6 +751,8 @@ where /// A [`GenericByteViewArray`] of `[u8]` /// +/// See [`GenericByteViewArray`] for format and layout details. +/// /// # Example /// ``` /// use arrow_array::BinaryViewArray; @@ -733,6 +792,8 @@ impl From>> for BinaryViewArray { /// A [`GenericByteViewArray`] that stores utf8 data /// +/// See [`GenericByteViewArray`] for format and layout details. +/// /// # Example /// ``` /// use arrow_array::StringViewArray;