Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 28 additions & 26 deletions arrow-arith/src/arity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@
use arrow_array::builder::BufferBuilder;
use arrow_array::*;
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::ArrowNativeType;
use arrow_buffer::{Buffer, MutableBuffer};
use arrow_data::ArrayData;
use arrow_schema::ArrowError;

Expand Down Expand Up @@ -124,13 +122,13 @@ where

let nulls = NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref());

let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r));
// JUSTIFICATION
// Benefit
// ~60% speedup
// Soundness
// `values` is an iterator with a known size from a PrimitiveArray
let buffer = unsafe { Buffer::from_trusted_len_iter(values) };
let values = a
.values()
.into_iter()
.zip(b.values())
.map(|(l, r)| op(*l, *r));

let buffer: Vec<_> = values.collect();
Ok(PrimitiveArray::new(buffer.into(), nulls))
}

Expand Down Expand Up @@ -251,14 +249,16 @@ where
///
/// Return an error if the arrays have different lengths or
/// the operation is under erroneous
pub fn try_binary<A: ArrayAccessor, B: ArrayAccessor, F, O>(
a: A,
b: B,
pub fn try_binary<A, B, F, O>(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this an API change?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question 🤔, I believe it isn't?
I think it was bounded by primitive arrays (via ArrowPrimitiveType already?
I.e. you can not use it for a binary or boolean arrays.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a breaking change, you could use this with any array provided the output was primitive

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm yeah, you are right!
Are we willing to make this change given that no test seems to break and other methods seem to be on PrimitiveArray instead (unary, unary_mut, try_unary, try_unary_mut, binary, etc. are on Primitive array)?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be a non-trivial functionality regression, even if in arrow-rs we aren't exploiting it - I could see this being useful for processing primitive dictionaries, for example.

Copy link
Copy Markdown
Contributor Author

@Dandandan Dandandan May 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW: I checked for DataFusion, here it is also only used (1 usage) for a primitive array.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we go back to the previous definition? I suspect you had a good reason for doing it this way, but I don't understand what it is

If we need to retain backward compatibility and want to release this before July, I recommend adding a new method called try_binary_primitive and add a link in the documentation for try_binary saying it is faster for primitive arrays to use that new function

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can do that, but it seems not possible to have a safe api without regressing (because we need TrustedLen )

a: &PrimitiveArray<A>,
b: &PrimitiveArray<B>,
op: F,
) -> Result<PrimitiveArray<O>, ArrowError>
where
A: ArrowPrimitiveType,
B: ArrowPrimitiveType,
O: ArrowPrimitiveType,
F: Fn(A::Item, B::Item) -> Result<O::Native, ArrowError>,
F: Fn(A::Native, B::Native) -> Result<O::Native, ArrowError>,
{
if a.len() != b.len() {
return Err(ArrowError::ComputeError(
Expand All @@ -271,7 +271,7 @@ where
let len = a.len();

if a.null_count() == 0 && b.null_count() == 0 {
try_binary_no_nulls(len, a, b, op)
try_binary_no_nulls(a, b, op)
} else {
let nulls =
NullBuffer::union(a.logical_nulls().as_ref(), b.logical_nulls().as_ref()).unwrap();
Expand Down Expand Up @@ -369,23 +369,25 @@ fn create_union_null_buffer(

/// This intentional inline(never) attribute helps LLVM optimize the loop.
#[inline(never)]
fn try_binary_no_nulls<A: ArrayAccessor, B: ArrayAccessor, F, O>(
len: usize,
a: A,
b: B,
fn try_binary_no_nulls<A, B, F, O>(
a: &PrimitiveArray<A>,
b: &PrimitiveArray<B>,
op: F,
) -> Result<PrimitiveArray<O>, ArrowError>
where
A: ArrowPrimitiveType,
B: ArrowPrimitiveType,
O: ArrowPrimitiveType,
F: Fn(A::Item, B::Item) -> Result<O::Native, ArrowError>,
B: ArrowPrimitiveType,
F: Fn(A::Native, B::Native) -> Result<O::Native, ArrowError>,
{
let mut buffer = MutableBuffer::new(len * O::Native::get_byte_width());
for idx in 0..len {
unsafe {
buffer.push_unchecked(op(a.value_unchecked(idx), b.value_unchecked(idx))?);
};
}
Ok(PrimitiveArray::new(buffer.into(), None))
let new_values = a
.values()
.into_iter()
.zip(b.values().into_iter())
.map(|(l, r)| op(*l, *r))
.collect::<Result<Vec<_>, ArrowError>>()?;
Ok(PrimitiveArray::new(new_values.into(), None))
}

/// This intentional inline(never) attribute helps LLVM optimize the loop.
Expand Down
23 changes: 7 additions & 16 deletions arrow-array/src/array/primitive_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -729,10 +729,8 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {

/// Creates a PrimitiveArray based on a constant value with `count` elements
pub fn from_value(value: T::Native, count: usize) -> Self {
unsafe {
let val_buf = Buffer::from_trusted_len_iter((0..count).map(|_| value));
Self::new(val_buf.into(), None)
}
let val_buf: Vec<_> = vec![value; count];
Self::new(val_buf.into(), None)
}

/// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
Expand Down Expand Up @@ -827,13 +825,8 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
F: Fn(T::Native) -> O::Native,
{
let nulls = self.nulls().cloned();
let values = self.values().iter().map(|v| op(*v));
// JUSTIFICATION
// Benefit
// ~60% speedup
// Soundness
// `values` is an iterator with a known size because arrays are sized.
let buffer = unsafe { Buffer::from_trusted_len_iter(values) };
let values = self.values().into_iter().map(|v| op(*v));
let buffer: Vec<_> = values.collect();
PrimitiveArray::new(buffer.into(), nulls)
}

Expand Down Expand Up @@ -1035,12 +1028,10 @@ impl<T: ArrowPrimitiveType> PrimitiveArray<T> {
F: FnMut(U::Item) -> T::Native,
{
let nulls = left.logical_nulls();
let buffer = unsafe {
let buffer: Vec<_> = (0..left.len())
// SAFETY: i in range 0..left.len()
let iter = (0..left.len()).map(|i| op(left.value_unchecked(i)));
// SAFETY: upper bound is trusted because `iter` is over a range
Buffer::from_trusted_len_iter(iter)
};
.map(|i| op(unsafe { left.value_unchecked(i) }))
Comment thread
Dandandan marked this conversation as resolved.
.collect();

PrimitiveArray::new(buffer.into(), nulls)
}
Expand Down
6 changes: 4 additions & 2 deletions arrow-buffer/src/buffer/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,13 @@ where
F: FnMut(u64) -> u64,
{
// reserve capacity and set length so we can get a typed view of u64 chunks
let mut result =
MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
let mut result = MutableBuffer::new(ceil(len_in_bits, 8));

let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);

// SAFETY: `MutableBuffer::set_len` is sound because it is initalized right after
unsafe { result.set_len(left_chunks.iter().len() * 8) };

let result_chunks = result.typed_data_mut::<u64>().iter_mut();

result_chunks
Expand Down
Loading