From 0ee60cfce399e6be0dc7def025e0ed802b382e6f Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Sat, 19 Apr 2025 10:57:27 +0800 Subject: [PATCH 1/3] Add support for creating random Decimal128 and Decimal256 arrays --- arrow/src/util/data_gen.rs | 75 +++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 42a0798f5540..9f71c47869df 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -20,7 +20,10 @@ use std::sync::Arc; use rand::{ - distr::uniform::{SampleRange, SampleUniform}, + distr::{ + uniform::{SampleRange, SampleUniform}, + Distribution, StandardUniform, + }, Rng, }; @@ -55,6 +58,14 @@ pub fn create_random_batch( /// Create a random [ArrayRef] from a [DataType] with a length, /// null density and true density (for [BooleanArray]). +/// +/// # Arguments +/// +/// * `field` - The field containing the data type for which to create a random array +/// * `size` - The number of elements in the generated array +/// * `null_density` - The approximate fraction of null values in the resulting array (0.0 to 1.0) +/// * `true_density` - The approximate fraction of true values in boolean arrays (0.0 to 1.0) +/// pub fn create_random_array( field: &Field, size: usize, @@ -215,6 +226,8 @@ pub fn create_random_array( crate::compute::cast(&v, d)? 
} Map(_, _) => create_random_map_array(field, size, null_density, true_density)?, + Decimal128(_, _) => create_random_decimal_array(field, size, null_density)?, + Decimal256(_, _) => create_random_decimal_array(field, size, null_density)?, other => { return Err(ArrowError::NotYetImplemented(format!( "Generating random arrays not yet implemented for {other:?}" @@ -223,6 +236,47 @@ pub fn create_random_array( }) } +#[inline] +fn create_random_decimal_array(field: &Field, size: usize, null_density: f32) -> Result<ArrayRef> { + let mut rng = seedable_rng(); + + match field.data_type() { + DataType::Decimal128(precision, scale) => { + let values = (0..size) + .map(|_| { + if rng.random::<f32>() < null_density { + None + } else { + Some(rng.random::<i128>()) + } + }) + .collect::<Vec<_>>(); + Ok(Arc::new( + Decimal128Array::from(values).with_precision_and_scale(*precision, *scale)?, + )) + } + DataType::Decimal256(precision, scale) => { + let values = (0..size) + .map(|_| { + if rng.random::<f32>() < null_density { + None + } else { + Some(i256::from_parts(rng.random::<u128>(), rng.random::<i128>())) + } + }) + .collect::<Vec<_>>(); + Ok(Arc::new( + Decimal256Array::from(values).with_precision_and_scale(*precision, *scale)?, + )) + } + _ => { + return Err(ArrowError::InvalidArgumentError(format!( + "Cannot create decimal array for field {field:?}" + ))) + } + } +} + #[inline] fn create_random_list_array( field: &Field, @@ -745,4 +799,23 @@ mod tests { assert_eq!(array.as_map().keys().data_type(), &DataType::Utf8); assert_eq!(array.as_map().values().data_type(), &DataType::Utf8); } + + #[test] + fn test_create_decimal_array() { + let size = 10; + let fields = vec![ + Field::new("a", DataType::Decimal128(10, 2), true), + Field::new("b", DataType::Decimal256(10, 2), true), + ]; + let schema = Schema::new(fields); + let schema_ref = Arc::new(schema); + let batch = create_random_batch(schema_ref.clone(), size, 0.35, 0.7).unwrap(); + + + assert_eq!(batch.schema(), schema_ref); + assert_eq!(batch.num_columns(), 
schema_ref.fields().len()); + for array in batch.columns() { + assert_eq!(array.len(), size); + } + } } From ff744efa0853ed0ceb951dd10bd7e93aa23303b1 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Sat, 19 Apr 2025 10:59:50 +0800 Subject: [PATCH 2/3] chore --- arrow/src/util/data_gen.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 9f71c47869df..30a61a064805 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -804,14 +804,13 @@ mod tests { fn test_create_decimal_array() { let size = 10; let fields = vec![ - Field::new("a", DataType::Decimal128(10, 2), true), - Field::new("b", DataType::Decimal256(10, 2), true), + Field::new("a", DataType::Decimal128(10, -2), true), + Field::new("b", DataType::Decimal256(10, -2), true), ]; let schema = Schema::new(fields); let schema_ref = Arc::new(schema); let batch = create_random_batch(schema_ref.clone(), size, 0.35, 0.7).unwrap(); - assert_eq!(batch.schema(), schema_ref); assert_eq!(batch.num_columns(), schema_ref.fields().len()); for array in batch.columns() { From cc6b594b00f4bdd9eab5c82caaf110dbb2d11031 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Sat, 19 Apr 2025 11:05:07 +0800 Subject: [PATCH 3/3] chore: clippy --- arrow/src/util/data_gen.rs | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 30a61a064805..7ea05811d55b 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -20,10 +20,7 @@ use std::sync::Arc; use rand::{ - distr::{ - uniform::{SampleRange, SampleUniform}, - Distribution, StandardUniform, - }, + distr::uniform::{SampleRange, SampleUniform}, Rng, }; @@ -269,11 +266,9 @@ fn create_random_decimal_array(field: &Field, size: usize, null_density: f32) -> Decimal256Array::from(values).with_precision_and_scale(*precision, *scale)?, )) } - _ => { - return Err(ArrowError::InvalidArgumentError(format!( 
- "Cannot create decimal array for field {field:?}" - ))) - } + _ => Err(ArrowError::InvalidArgumentError(format!( + "Cannot create decimal array for field {field:?}" + ))), } }