From b7452bc4a341790b5c14653e0dfb29cca5d4018b Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Tue, 28 Oct 2025 15:37:42 +0400 Subject: [PATCH 1/9] Fix spark_bit_count impl for signed int types --- .../spark/src/function/bitwise/bit_count.rs | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index ba44d3bc0a95..ff098cdeb917 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -93,25 +93,25 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { DataType::Int8 => { let result: Int32Array = value_array .as_primitive::() - .unary(|v| v.count_ones() as i32); + .unary(|v| bit_count(v.into())); Ok(Arc::new(result)) } DataType::Int16 => { let result: Int32Array = value_array .as_primitive::() - .unary(|v| v.count_ones() as i32); + .unary(|v| bit_count(v.into())); Ok(Arc::new(result)) } DataType::Int32 => { let result: Int32Array = value_array .as_primitive::() - .unary(|v| v.count_ones() as i32); + .unary(|v| bit_count(v.into())); Ok(Arc::new(result)) } DataType::Int64 => { let result: Int32Array = value_array .as_primitive::() - .unary(|v| v.count_ones() as i32); + .unary(|v| bit_count(v.into())); Ok(Arc::new(result)) } DataType::UInt8 => { @@ -147,6 +147,18 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { } } +// Here’s the equivalent Rust implementation of the bitCount function (similar to Apache Spark's bitCount for LongType) +fn bit_count(i: i64) -> i32 { + let mut u = i as u64; + u = u - ((u >> 1) & 0x5555555555555555); + u = (u & 0x3333333333333333) + ((u >> 2) & 0x3333333333333333); + u = (u + (u >> 4)) & 0x0f0f0f0f0f0f0f0f; + u = u + (u >> 8); + u = u + (u >> 16); + u = u + (u >> 32); + (u as i32) & 0x7f +} + #[cfg(test)] mod tests { use super::*; @@ -192,7 +204,7 @@ mod tests { assert_eq!(arr.value(2), 2); assert_eq!(arr.value(3), 3); 
assert_eq!(arr.value(4), 4); - assert_eq!(arr.value(5), 8); + assert_eq!(arr.value(5), 64); } #[test] @@ -207,7 +219,7 @@ mod tests { assert_eq!(arr.value(1), 1); assert_eq!(arr.value(2), 8); assert_eq!(arr.value(3), 10); - assert_eq!(arr.value(4), 16); + assert_eq!(arr.value(4), 64); } #[test] @@ -222,7 +234,7 @@ mod tests { assert_eq!(arr.value(1), 1); // 0b00000000000000000000000000000001 = 1 assert_eq!(arr.value(2), 8); // 0b00000000000000000000000011111111 = 8 assert_eq!(arr.value(3), 10); // 0b00000000000000000000001111111111 = 10 - assert_eq!(arr.value(4), 32); // -1 in two's complement = all 32 bits set + assert_eq!(arr.value(4), 64); // -1 is sign-extended to i64 before counting, so all 64 bits are set } #[test] From c7900b43037da0f26f55548a3cc22b8489bb2c75 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Tue, 28 Oct 2025 15:50:42 +0400 Subject: [PATCH 2/9] Fix spark_bit_count impl for signed int types --- datafusion/spark/src/function/bitwise/bit_count.rs | 2 +- .../test_files/spark/bitwise/bit_count.slt | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index ff098cdeb917..01f071204a89 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -90,7 +90,7 @@ impl ScalarUDFImpl for SparkBitCount { fn spark_bit_count(value_array: &[ArrayRef]) -> Result { let value_array = value_array[0].as_ref(); match value_array.data_type() { - DataType::Int8 => { + DataType::Int8 | DataType::Boolean => { let result: Int32Array = value_array .as_primitive::() .unary(|v| bit_count(v.into())); diff --git a/datafusion/sqllogictest/test_files/spark/bitwise/bit_count.slt b/datafusion/sqllogictest/test_files/spark/bitwise/bit_count.slt index 2a75c7648d40..216d99025171 100644 --- a/datafusion/sqllogictest/test_files/spark/bitwise/bit_count.slt +++ 
b/datafusion/sqllogictest/test_files/spark/bitwise/bit_count.slt @@ -59,17 +59,17 @@ SELECT bit_count(1023::int); query I SELECT bit_count(-1::int); ---- -32 +64 query I SELECT bit_count(-2::int); ---- -31 +63 query I SELECT bit_count(-3::int); ---- -31 +63 # Tests with different integer types query I @@ -85,7 +85,7 @@ SELECT bit_count(arrow_cast(15, 'Int8')); query I SELECT bit_count(arrow_cast(-1, 'Int8')); ---- -8 +64 query I SELECT bit_count(arrow_cast(0, 'Int16')); @@ -100,7 +100,7 @@ SELECT bit_count(arrow_cast(255, 'Int16')); query I SELECT bit_count(arrow_cast(-1, 'Int16')); ---- -16 +64 query I SELECT bit_count(arrow_cast(0, 'Int64')); @@ -214,7 +214,7 @@ SELECT bit_count(arrow_cast(2147483647, 'Int32')); query I SELECT bit_count(arrow_cast(-2147483648, 'Int32')); ---- -1 +33 query I SELECT bit_count(arrow_cast(9223372036854775807, 'Int64')); From a9ded85f70f58bc910fe0e58a3cf3ad7a3195022 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Tue, 28 Oct 2025 16:30:20 +0400 Subject: [PATCH 3/9] Fix clippy warnings --- datafusion/spark/src/function/bitwise/bit_count.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index 01f071204a89..5bc34f10ac8d 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -111,7 +111,7 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { DataType::Int64 => { let result: Int32Array = value_array .as_primitive::() - .unary(|v| bit_count(v.into())); + .unary(|v| bit_count(v)); Ok(Arc::new(result)) } DataType::UInt8 => { From bb3a91680fa8a0fcc14c1fbe616d8f87d6e8b265 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Tue, 28 Oct 2025 16:42:41 +0400 Subject: [PATCH 4/9] Fix clippy warnings --- datafusion/spark/src/function/bitwise/bit_count.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index 5bc34f10ac8d..a9fa62a5ac00 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -111,7 +111,7 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { DataType::Int64 => { let result: Int32Array = value_array .as_primitive::() - .unary(|v| bit_count(v)); + .unary(bit_count); Ok(Arc::new(result)) } DataType::UInt8 => { From a11bf7b570c488a84f0dc6345207f6fb6c22e4b9 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Tue, 28 Oct 2025 16:52:19 +0400 Subject: [PATCH 5/9] Fix fmt --- datafusion/spark/src/function/bitwise/bit_count.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index a9fa62a5ac00..9af07c7f5c76 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -109,9 +109,8 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { Ok(Arc::new(result)) } DataType::Int64 => { - let result: Int32Array = value_array - .as_primitive::() - .unary(bit_count); + let result: Int32Array = + value_array.as_primitive::().unary(bit_count); Ok(Arc::new(result)) } DataType::UInt8 => { From 08d1a6fba68f375875666a28cd4e69faca0d932b Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 30 Oct 2025 20:33:08 +0400 Subject: [PATCH 6/9] Fix PR issues --- .../spark/src/function/bitwise/bit_count.rs | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index 9af07c7f5c76..dcda24ea2663 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -23,6 +23,7 @@ use arrow::datatypes::{ DataType, Int16Type, 
Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; +use datafusion_common::cast::as_boolean_array; use datafusion_common::{plan_err, Result}; use datafusion_expr::{ ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, @@ -46,6 +47,7 @@ impl SparkBitCount { Self { signature: Signature::one_of( vec![ + TypeSignature::Exact(vec![DataType::Boolean]), TypeSignature::Exact(vec![DataType::Int8]), TypeSignature::Exact(vec![DataType::Int16]), TypeSignature::Exact(vec![DataType::Int32]), @@ -90,7 +92,14 @@ impl ScalarUDFImpl for SparkBitCount { fn spark_bit_count(value_array: &[ArrayRef]) -> Result { let value_array = value_array[0].as_ref(); match value_array.data_type() { - DataType::Int8 | DataType::Boolean => { + DataType::Boolean => { + let result: Int32Array = as_boolean_array(value_array)? + .iter() + .map(|x| x.map(|y| bit_count(y.into()))) + .collect(); + Ok(Arc::new(result)) + } + DataType::Int8 => { let result: Int32Array = value_array .as_primitive::() .unary(|v| bit_count(v.into())); @@ -147,6 +156,7 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { } // Here’s the equivalent Rust implementation of the bitCount function (similar to Apache Spark's bitCount for LongType) +// https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala#L243 fn bit_count(i: i64) -> i32 { let mut u = i as u64; u = u - ((u >> 1) & 0x5555555555555555); @@ -162,8 +172,8 @@ fn bit_count(i: i64) -> i32 { mod tests { use super::*; use arrow::array::{ - Array, Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, UInt32Array, - UInt64Array, UInt8Array, + Array, BooleanArray, Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, + UInt32Array, UInt64Array, UInt8Array, }; use arrow::datatypes::Int32Type; @@ -206,6 +216,17 @@ mod tests { assert_eq!(arr.value(5), 64); } + #[test] + fn test_bit_count_boolean() { + // Test bit_count on 
BooleanArray + let result = + spark_bit_count(&[Arc::new(BooleanArray::from(vec![true, false]))]).unwrap(); + + let arr = result.as_primitive::(); + assert_eq!(arr.value(0), 1); + assert_eq!(arr.value(1), 0); + } + #[test] fn test_bit_count_int16() { // Test bit_count on Int16Array From 94b1eb1181d24b7692ca0c54f875e842405942ce Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 30 Oct 2025 20:36:22 +0400 Subject: [PATCH 7/9] Fix PR issues --- datafusion/spark/src/function/bitwise/bit_count.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index dcda24ea2663..6369e1fac9d8 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -95,7 +95,7 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { DataType::Boolean => { let result: Int32Array = as_boolean_array(value_array)? .iter() - .map(|x| x.map(|y| bit_count(y.into()))) + .map(|x| x.map(|y| y as i32)) .collect(); Ok(Arc::new(result)) } From e1f61d37804335b2947c1ab6fdfcbb0f7497de8b Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 31 Oct 2025 20:43:46 +0400 Subject: [PATCH 8/9] Fix PR issues --- datafusion/spark/src/function/bitwise/bit_count.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index 6369e1fac9d8..5afd686b80a1 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -156,7 +156,8 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { } // Here’s the equivalent Rust implementation of the bitCount function (similar to Apache Spark's bitCount for LongType) -// https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala#L243 +// 
Spark: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala#L243 +// Java impl: https://github.com/openjdk/jdk/blob/master/src/java.base/share/classes/java/lang/Long.java#L1584 fn bit_count(i: i64) -> i32 { let mut u = i as u64; u = u - ((u >> 1) & 0x5555555555555555); From 111b6eee7d6a2d1b98301b275af81b0987bece1f Mon Sep 17 00:00:00 2001 From: Jeffrey Vo Date: Sat, 1 Nov 2025 13:56:20 +1100 Subject: [PATCH 9/9] Apply suggestion from @Jefffrey --- datafusion/spark/src/function/bitwise/bit_count.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/spark/src/function/bitwise/bit_count.rs b/datafusion/spark/src/function/bitwise/bit_count.rs index 5afd686b80a1..4b414b57cb77 100644 --- a/datafusion/spark/src/function/bitwise/bit_count.rs +++ b/datafusion/spark/src/function/bitwise/bit_count.rs @@ -156,8 +156,8 @@ fn spark_bit_count(value_array: &[ArrayRef]) -> Result { } // Here’s the equivalent Rust implementation of the bitCount function (similar to Apache Spark's bitCount for LongType) -// Spark: https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala#L243 -// Java impl: https://github.com/openjdk/jdk/blob/master/src/java.base/share/classes/java/lang/Long.java#L1584 +// Spark: https://github.com/apache/spark/blob/ac717dd7aec665de578d7c6b0070e8fcdde3cea9/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/bitwiseExpressions.scala#L243 +// Java impl: https://github.com/openjdk/jdk/blob/d226023643f90027a8980d161ec6d423887ae3ce/src/java.base/share/classes/java/lang/Long.java#L1584 fn bit_count(i: i64) -> i32 { let mut u = i as u64; u = u - ((u >> 1) & 0x5555555555555555);