From a33c48b752f56e77f9d6a590a261a1cca6fc47fe Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sat, 1 Nov 2025 15:55:11 +0900 Subject: [PATCH 01/25] WIP --- rust/sedona-functions/src/lib.rs | 1 + rust/sedona-functions/src/register.rs | 1 + rust/sedona-functions/src/st_dump.rs | 163 ++++++++++++++++++++++++++ 3 files changed, 165 insertions(+) create mode 100644 rust/sedona-functions/src/st_dump.rs diff --git a/rust/sedona-functions/src/lib.rs b/rust/sedona-functions/src/lib.rs index 2a81319e..4d94bdce 100644 --- a/rust/sedona-functions/src/lib.rs +++ b/rust/sedona-functions/src/lib.rs @@ -31,6 +31,7 @@ mod st_buffer; mod st_centroid; mod st_collect; mod st_dimension; +mod st_dump; mod st_dwithin; pub mod st_envelope; pub mod st_envelope_aggr; diff --git a/rust/sedona-functions/src/register.rs b/rust/sedona-functions/src/register.rs index dd541443..4e605409 100644 --- a/rust/sedona-functions/src/register.rs +++ b/rust/sedona-functions/src/register.rs @@ -88,6 +88,7 @@ pub fn default_function_set() -> FunctionSet { crate::st_point::st_point_udf, crate::st_pointn::st_pointn_udf, crate::st_points::st_points_udf, + crate::st_dump::st_dump_udf, crate::st_points::st_npoints_udf, crate::st_pointzm::st_pointz_udf, crate::st_pointzm::st_pointm_udf, diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs new file mode 100644 index 00000000..44192b4b --- /dev/null +++ b/rust/sedona-functions/src/st_dump.rs @@ -0,0 +1,163 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +use arrow_array::builder::{BinaryBuilder, ListBuilder}; +use arrow_schema::DataType; +use datafusion_common::error::Result; +use datafusion_expr::{ + scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, +}; +use geo_traits::{CoordTrait, GeometryTrait, PointTrait}; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_geometry::{ + error::SedonaGeometryError, + wkb_factory::{write_wkb_coord_trait, write_wkb_point_header, WKB_MIN_PROBABLE_BYTES}, +}; +use sedona_schema::{ + datatypes::{SedonaType, WKB_GEOMETRY}, + matchers::ArgMatcher, +}; +use std::{io::Write, sync::Arc}; + +use crate::executor::WkbExecutor; + +/// ST_Dump() scalar UDF +/// +/// Native implementation to get all the points of a geometry as MULTIPOINT +pub fn st_dump_udf() -> SedonaScalarUDF { + SedonaScalarUDF::new( + "st_dump", + vec![Arc::new(STDump)], + Volatility::Immutable, + Some(st_dump_doc()), + ) +} + +fn st_dump_doc() -> Documentation { + Documentation::builder( + DOC_SECTION_OTHER, + "Extracts the components of a geometry.", + "ST_Dump (geom: Geometry)", + ) + .with_argument("geom", "geometry: Input geometry") + .with_sql_example("SELECT ST_Dump(ST_GeomFromWKT('MULTIPOINT (0 1, 2 3, 4 5)'))") + .build() +} + +#[derive(Debug)] +struct STDump; + +impl SedonaScalarKernel for STDump { + fn return_type(&self, args: &[SedonaType]) -> Result> { + let matcher = ArgMatcher::new(vec![ArgMatcher::is_geometry()], WKB_GEOMETRY); + let geom_type = matcher.match_args(args)?; + + get_geom_list_type(geom_type.as_ref()) + } + + fn invoke_batch( + &self, + arg_types: &[SedonaType], + args: &[ColumnarValue], + ) -> Result { + let executor = WkbExecutor::new(arg_types, args); + + let mut builder = ListBuilder::with_capacity( + // TODO: what's the decent default value for the capacity? + BinaryBuilder::with_capacity(8, WKB_MIN_PROBABLE_BYTES * 8), + executor.num_iterations(), + ); + + executor.execute_wkb_void(|maybe_wkb| { + if let Some(wkb) = maybe_wkb { + let geom_builder = builder.values(); + + match wkb.as_type() { + geo_traits::GeometryType::Point(point) => match point.coord() { + Some(coord) => { + write_wkb_point_from_coord(geom_builder, coord).unwrap(); + geom_builder.append_value([]); + } + None => geom_builder.append_null(), + }, + _ => todo!(), + } + + geom_builder.finish(); + builder.append(true); + } else { + builder.append_null(); + } + + Ok(()) + })?; + + executor.finish(Arc::new(builder.finish())) + } +} + +fn get_geom_list_type( + geom_type: Option<&SedonaType>, +) -> std::result::Result, datafusion_common::DataFusionError> { + match geom_type { + Some(SedonaType::Wkb(edges, crs) | SedonaType::WkbView(edges, crs)) => { + let geom_type = SedonaType::Wkb(edges.clone(), crs.clone()); + let geom_list_type = + DataType::List(Arc::new(geom_type.to_storage_field("geom", true)?)); + Ok(Some(SedonaType::Arrow(geom_list_type))) + } + _ => Ok(None), + } +} + +fn write_wkb_point_from_coord( + buf: &mut impl Write, + coord: impl CoordTrait, +) -> Result<(), SedonaGeometryError> { + write_wkb_point_header(buf, coord.dim())?; + write_wkb_coord_trait(buf, &coord) +} + +#[cfg(test)] +mod tests { + use datafusion_expr::ScalarUDF; + use rstest::rstest; + use sedona_schema::datatypes::WKB_VIEW_GEOMETRY; + use sedona_testing::{ + compare::assert_array_equal, create::create_array, testers::ScalarUdfTester, + }; + + use super::*; + + #[test] + fn udf_metadata() { + let st_dump_udf: ScalarUDF = st_dump_udf().into(); + assert_eq!(st_dump_udf.name(), "st_dump"); + assert!(st_dump_udf.documentation().is_some()); + } + + #[rstest] + fn udf(#[values(WKB_GEOMETRY, WKB_VIEW_GEOMETRY)] sedona_type: SedonaType) { + let tester = ScalarUdfTester::new(st_dump_udf().into(), vec![sedona_type.clone()]); + + let input = create_array(&[Some("POINT (1 2)")], &sedona_type); + + let expected = create_array(&[Some("POINT (1 2)")], &WKB_GEOMETRY); + + let result = tester.invoke_array(input.clone()).unwrap(); + assert_array_equal(&result, &expected); + } +} From 1a66e6c57602a2a0fc292928d0dccbaf91cb82fc Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sat, 1 Nov 2025 16:30:28 +0900 Subject: [PATCH 02/25] Fix clippy warning --- rust/sedona-functions/src/st_dump.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 44192b4b..e7f0d358 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -114,7 +114,7 @@ fn get_geom_list_type( ) -> std::result::Result, datafusion_common::DataFusionError> { match geom_type { Some(SedonaType::Wkb(edges, crs) | SedonaType::WkbView(edges, crs)) => { - let geom_type = SedonaType::Wkb(edges.clone(), crs.clone()); + let geom_type = SedonaType::Wkb(*edges, crs.clone()); let geom_list_type = DataType::List(Arc::new(geom_type.to_storage_field("geom", true)?)); Ok(Some(SedonaType::Arrow(geom_list_type))) From d400c3e686f937108f130552182b78787d395def Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 01:32:46 +0900 Subject: [PATCH 03/25] Apply suggestions from code review Co-authored-by: Dewey Dunnington --- rust/sedona-functions/src/st_dump.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 44192b4b..529b61eb 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -62,7 +62,8 @@ struct STDump; impl SedonaScalarKernel for STDump { fn return_type(&self, args: &[SedonaType]) -> Result> { - let matcher = ArgMatcher::new(vec![ArgMatcher::is_geometry()], WKB_GEOMETRY); + let matcher = ArgMatcher::new(vec![ArgMatcher::is_geometry()], get_geom_list_type()?); + let geom_type = matcher.match_args(args)?; get_geom_list_type(geom_type.as_ref()) @@ -116,7 +117,8 @@ fn get_geom_list_type( Some(SedonaType::Wkb(edges, crs) | SedonaType::WkbView(edges, crs)) => { let geom_type = SedonaType::Wkb(edges.clone(), crs.clone()); let geom_list_type = - DataType::List(Arc::new(geom_type.to_storage_field("geom", true)?)); + DataType::List(Arc::new(geom_type.to_storage_field("item", true)?)); + Ok(Some(SedonaType::Arrow(geom_list_type))) } _ => Ok(None), From 8f404436e89d3f94280032fa9c0df7f916404a86 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 01:34:09 +0900 Subject: [PATCH 04/25] Use executor.num_iterations() for capacity --- rust/sedona-functions/src/st_dump.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index f337a8a3..13adb5ea 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -77,8 +77,10 @@ impl SedonaScalarKernel for STDump { let executor = WkbExecutor::new(arg_types, args); let mut builder = ListBuilder::with_capacity( - // TODO: what's the decent default value for the capacity? - BinaryBuilder::with_capacity(8, WKB_MIN_PROBABLE_BYTES * 8), + BinaryBuilder::with_capacity( + executor.num_iterations(), + WKB_MIN_PROBABLE_BYTES * executor.num_iterations(), + ), executor.num_iterations(), ); From a78fbe244312d81918bd71e0de687b7a2d90fd23 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 08:35:43 +0900 Subject: [PATCH 05/25] Return { path: [int], geom: GEOMETRY } --- rust/sedona-functions/src/st_dump.rs | 90 ++++++++++++++++++---------- 1 file changed, 57 insertions(+), 33 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 13adb5ea..21aade88 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -14,8 +14,8 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -use arrow_array::builder::{BinaryBuilder, ListBuilder}; -use arrow_schema::DataType; +use arrow_array::builder::{BinaryBuilder, Int64Builder, ListBuilder, StructBuilder}; +use arrow_schema::{DataType, Field, Fields}; use datafusion_common::error::Result; use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, @@ -62,11 +62,8 @@ struct STDump; impl SedonaScalarKernel for STDump { fn return_type(&self, args: &[SedonaType]) -> Result> { - let matcher = ArgMatcher::new(vec![ArgMatcher::is_geometry()], get_geom_list_type()?); - - let geom_type = matcher.match_args(args)?; - - get_geom_list_type(geom_type.as_ref()) + let matcher = ArgMatcher::new(vec![ArgMatcher::is_geometry()], geometry_dump_type()); + matcher.match_args(args) } fn invoke_batch( @@ -76,30 +73,55 @@ impl SedonaScalarKernel for STDump { ) -> Result { let executor = WkbExecutor::new(arg_types, args); - let mut builder = ListBuilder::with_capacity( - BinaryBuilder::with_capacity( - executor.num_iterations(), - WKB_MIN_PROBABLE_BYTES * executor.num_iterations(), - ), - executor.num_iterations(), + let num_iter = executor.num_iterations(); + let path_builder = + ListBuilder::with_capacity(Int64Builder::with_capacity(num_iter), num_iter); + let geom_builder = + BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); + let struct_builder = StructBuilder::new( + geometry_dump_fields(), + vec![Box::new(path_builder), Box::new(geom_builder)], ); + let mut builder = + ListBuilder::with_capacity(struct_builder, WKB_MIN_PROBABLE_BYTES * num_iter); executor.execute_wkb_void(|maybe_wkb| { if let Some(wkb) = maybe_wkb { - let geom_builder = builder.values(); + let struct_builder = builder.values(); + // Test: This should add { path: [1], geom: POINT } for a POINT geometry match wkb.as_type() { geo_traits::GeometryType::Point(point) => match point.coord() { Some(coord) => { - write_wkb_point_from_coord(geom_builder, coord).unwrap(); - geom_builder.append_value([]); + // TODO: struct_builder cannot borrow more than once. But this is too lengthy to be inlined here. + + // Write path + { + let path_array_builder = struct_builder + .field_builder::>(0) + .unwrap(); + let path_builder = path_array_builder.values(); + path_builder.append_value(1); + path_array_builder.append(true); + } + + // Write geom + { + let geom_builder = + struct_builder.field_builder::(1).unwrap(); + + write_wkb_point_from_coord(geom_builder, coord).unwrap(); + geom_builder.append_value([]); + } + + struct_builder.append(true); } - None => geom_builder.append_null(), + None => struct_builder.append_null(), }, _ => todo!(), } - geom_builder.finish(); + struct_builder.finish(); builder.append(true); } else { builder.append_null(); @@ -112,19 +134,21 @@ impl SedonaScalarKernel for STDump { } } -fn get_geom_list_type( - geom_type: Option<&SedonaType>, -) -> std::result::Result, datafusion_common::DataFusionError> { - match geom_type { - Some(SedonaType::Wkb(edges, crs) | SedonaType::WkbView(edges, crs)) => { - let geom_type = SedonaType::Wkb(*edges, crs.clone()); - let geom_list_type = - DataType::List(Arc::new(geom_type.to_storage_field("item", true)?)); - - Ok(Some(SedonaType::Arrow(geom_list_type))) - } - _ => Ok(None), - } +fn geometry_dump_fields() -> Fields { + let path = Field::new( + "path", + DataType::List(Field::new("item", DataType::Int64, true).into()), + true, + ); + let geom = WKB_GEOMETRY.to_storage_field("geom", true).unwrap(); + vec![path, geom].into() +} + +fn geometry_dump_type() -> SedonaType { + let fields = geometry_dump_fields(); + let struct_type = DataType::Struct(fields); + + SedonaType::Arrow(struct_type) } fn write_wkb_point_from_coord( @@ -159,9 +183,9 @@ mod tests { let input = create_array(&[Some("POINT (1 2)")], &sedona_type); - let expected = create_array(&[Some("POINT (1 2)")], &WKB_GEOMETRY); + // let expected = create_array(&[Some("POINT (1 2)")], &WKB_GEOMETRY); let result = tester.invoke_array(input.clone()).unwrap(); - assert_array_equal(&result, &expected); + // assert_array_equal(&result, &expected); } } From ea7954135b9ac4a7f7abc27abdb0e9226442c370 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 08:44:01 +0900 Subject: [PATCH 06/25] Fix return type --- rust/sedona-functions/src/st_dump.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 21aade88..932469cb 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -148,7 +148,7 @@ fn geometry_dump_type() -> SedonaType { let fields = geometry_dump_fields(); let struct_type = DataType::Struct(fields); - SedonaType::Arrow(struct_type) + SedonaType::Arrow(DataType::List(Field::new("item", struct_type, true).into())) } fn write_wkb_point_from_coord( From 2e99f43529f62a341e33da13d86aad6798fdb6d1 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 09:18:48 +0900 Subject: [PATCH 07/25] Fix bug and add test --- rust/sedona-functions/src/st_dump.rs | 41 +++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 932469cb..4e0841b5 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -121,7 +121,6 @@ impl SedonaScalarKernel for STDump { _ => todo!(), } - struct_builder.finish(); builder.append(true); } else { builder.append_null(); @@ -161,11 +160,14 @@ fn write_wkb_point_from_coord( #[cfg(test)] mod tests { + use arrow_array::{Array, Int64Array, ListArray, StructArray}; use datafusion_expr::ScalarUDF; use rstest::rstest; use sedona_schema::datatypes::WKB_VIEW_GEOMETRY; use sedona_testing::{ - compare::assert_array_equal, create::create_array, testers::ScalarUdfTester, + compare::assert_array_equal, + create::{create_array, create_scalar}, + testers::ScalarUdfTester, }; use super::*; @@ -183,9 +185,40 @@ mod tests { let input = create_array(&[Some("POINT (1 2)")], &sedona_type); - // let expected = create_array(&[Some("POINT (1 2)")], &WKB_GEOMETRY); + let expected_geom = create_scalar(Some("POINT (1 2)"), &WKB_GEOMETRY); let result = tester.invoke_array(input.clone()).unwrap(); - // assert_array_equal(&result, &expected); + let list_array = result + .as_ref() + .as_any() + .downcast_ref::() + .expect("result should be a ListArray"); + assert_eq!(list_array.len(), 1); + + let dumped = list_array.value(0); + let dumped = dumped + .as_ref() + .as_any() + .downcast_ref::() + .expect("list elements should be StructArray"); + + let path_array = dumped.column(0); + let path_array = path_array + .as_ref() + .as_any() + .downcast_ref::() + .expect("path should be a ListArray"); + let path_values_array = path_array.value(0); + let path_values = path_values_array + .as_ref() + .as_any() + .downcast_ref::() + .expect("path values should be Int64Array"); + assert_eq!(path_values.len(), 1); + assert_eq!(path_values.value(0), 1); + + let geom_array = dumped.column(1); + let expected_geom_array = expected_geom.to_array_of_size(1).unwrap(); + assert_array_equal(geom_array, &expected_geom_array); } } From f68e55335274a9d0f67638273e452b1d54282f10 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 09:35:51 +0900 Subject: [PATCH 08/25] Add test util --- rust/sedona-functions/src/st_dump.rs | 72 ++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 4e0841b5..27943681 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -160,14 +160,12 @@ fn write_wkb_point_from_coord( #[cfg(test)] mod tests { - use arrow_array::{Array, Int64Array, ListArray, StructArray}; + use arrow_array::{Array, ArrayRef, Int64Array, ListArray, StructArray}; use datafusion_expr::ScalarUDF; use rstest::rstest; use sedona_schema::datatypes::WKB_VIEW_GEOMETRY; use sedona_testing::{ - compare::assert_array_equal, - create::{create_array, create_scalar}, - testers::ScalarUdfTester, + compare::assert_array_equal, create::create_array, testers::ScalarUdfTester, }; use super::*; @@ -184,41 +182,73 @@ mod tests { let tester = ScalarUdfTester::new(st_dump_udf().into(), vec![sedona_type.clone()]); let input = create_array(&[Some("POINT (1 2)")], &sedona_type); + let result = tester.invoke_array(input).unwrap(); + let expected: &[(&[i64], Option<&str>)] = &[(&[1], Some("POINT (1 2)"))]; + assert_dump_row(&result, 0, expected); - let expected_geom = create_scalar(Some("POINT (1 2)"), &WKB_GEOMETRY); + let null_input = create_array(&[None], &sedona_type); + let result = tester.invoke_array(null_input).unwrap(); + assert_dump_row_null(&result, 0); + } - let result = tester.invoke_array(input.clone()).unwrap(); + fn assert_dump_row(result: &ArrayRef, row: usize, expected: &[(&[i64], Option<&str>)]) { let list_array = result .as_ref() .as_any() .downcast_ref::() .expect("result should be a ListArray"); - assert_eq!(list_array.len(), 1); - - let dumped = list_array.value(0); + assert!( + !list_array.is_null(row), + "row {row} should not be null in dump result" + ); + let dumped = list_array.value(row); let dumped = dumped .as_ref() .as_any() .downcast_ref::() .expect("list elements should be StructArray"); + assert_eq!(dumped.len(), expected.len()); - let path_array = dumped.column(0); - let path_array = path_array + let path_array = dumped + .column(0) .as_ref() .as_any() .downcast_ref::() .expect("path should be a ListArray"); - let path_values_array = path_array.value(0); - let path_values = path_values_array + assert_eq!(path_array.len(), expected.len()); + for (i, (expected_path, _)) in expected.iter().enumerate() { + let path_array_value = path_array.value(i); + let path_values = path_array_value + .as_ref() + .as_any() + .downcast_ref::() + .expect("path values should be Int64Array"); + assert_eq!( + path_values.len(), + expected_path.len(), + "unexpected path length at index {i}" + ); + for (j, expected_value) in expected_path.iter().enumerate() { + assert_eq!( + path_values.value(j), + *expected_value, + "unexpected path value at index {i}:{j}" + ); + } + } + + let expected_geom_values: Vec> = + expected.iter().map(|(_, geom)| *geom).collect(); + let expected_geom_array = create_array(&expected_geom_values, &WKB_GEOMETRY); + assert_array_equal(dumped.column(1), &expected_geom_array); + } + + fn assert_dump_row_null(result: &ArrayRef, row: usize) { + let list_array = result .as_ref() .as_any() - .downcast_ref::() - .expect("path values should be Int64Array"); - assert_eq!(path_values.len(), 1); - assert_eq!(path_values.value(0), 1); - - let geom_array = dumped.column(1); - let expected_geom_array = expected_geom.to_array_of_size(1).unwrap(); - assert_array_equal(geom_array, &expected_geom_array); + .downcast_ref::() + .expect("result should be a ListArray"); + assert!(list_array.is_null(row), "row {row} should be null"); } } From fd0bacf5d25f82c98c206617152b291c163c0e6a Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 10:25:18 +0900 Subject: [PATCH 09/25] Wrap with builder --- rust/sedona-functions/src/st_dump.rs | 135 ++++++++++++++++++--------- 1 file changed, 92 insertions(+), 43 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 27943681..8e143fc6 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -14,9 +14,12 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -use arrow_array::builder::{BinaryBuilder, Int64Builder, ListBuilder, StructBuilder}; +use arrow_array::{ + builder::{BinaryBuilder, Int64Builder, ListBuilder, StructBuilder}, + ListArray, +}; use arrow_schema::{DataType, Field, Fields}; -use datafusion_common::error::Result; +use datafusion_common::error::{DataFusionError, Result}; use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, }; @@ -60,6 +63,82 @@ fn st_dump_doc() -> Documentation { #[derive(Debug)] struct STDump; +struct STDumpBuilder { + builder: ListBuilder, +} + +impl STDumpBuilder { + fn new(num_iter: usize) -> Self { + let path_builder = + ListBuilder::with_capacity(Int64Builder::with_capacity(num_iter), num_iter); + let geom_builder = + BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); + let struct_builder = StructBuilder::new( + geometry_dump_fields(), + vec![Box::new(path_builder), Box::new(geom_builder)], + ); + let builder = ListBuilder::with_capacity(struct_builder, WKB_MIN_PROBABLE_BYTES * num_iter); + + Self { builder } + } + + fn append(&mut self, is_valid: bool) { + self.builder.append(is_valid); + } + + fn append_null(&mut self) { + self.builder.append_null(); + } + + fn struct_builder<'a>(&'a mut self) -> STDumpStructBuilder<'a> { + STDumpStructBuilder { + struct_builder: self.builder.values(), + } + } + + fn finish(&mut self) -> ListArray { + self.builder.finish() + } +} + +struct STDumpStructBuilder<'a> { + struct_builder: &'a mut StructBuilder, +} + +impl<'a> STDumpStructBuilder<'a> { + fn append( + &mut self, + path: &[i64], + coord: impl CoordTrait, // TODO: acceept Geometry here + ) -> std::result::Result<(), DataFusionError> { + let path_builder = self + .struct_builder + .field_builder::>(0) + .expect("path field exists"); + let values_builder = path_builder.values(); + for value in path { + values_builder.append_value(*value); + } + path_builder.append(true); + + let geom_builder = self + .struct_builder + .field_builder::(1) + .expect("geom field exists"); + write_wkb_point_from_coord(geom_builder, coord) + .map_err(|err| DataFusionError::External(Box::new(err)))?; + geom_builder.append_value([]); + + self.struct_builder.append(true); + + Ok(()) + } + + fn append_null(&mut self) { + self.struct_builder.append_null(); + } +} + impl SedonaScalarKernel for STDump { fn return_type(&self, args: &[SedonaType]) -> Result> { let matcher = ArgMatcher::new(vec![ArgMatcher::is_geometry()], geometry_dump_type()); @@ -73,52 +152,22 @@ impl SedonaScalarKernel for STDump { ) -> Result { let executor = WkbExecutor::new(arg_types, args); - let num_iter = executor.num_iterations(); - let path_builder = - ListBuilder::with_capacity(Int64Builder::with_capacity(num_iter), num_iter); - let geom_builder = - BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); - let struct_builder = StructBuilder::new( - geometry_dump_fields(), - vec![Box::new(path_builder), Box::new(geom_builder)], - ); - let mut builder = - ListBuilder::with_capacity(struct_builder, WKB_MIN_PROBABLE_BYTES * num_iter); + let mut builder = STDumpBuilder::new(executor.num_iterations()); executor.execute_wkb_void(|maybe_wkb| { if let Some(wkb) = maybe_wkb { - let struct_builder = builder.values(); - - // Test: This should add { path: [1], geom: POINT } for a POINT geometry - match wkb.as_type() { - geo_traits::GeometryType::Point(point) => match point.coord() { - Some(coord) => { - // TODO: struct_builder cannot borrow more than once. But this is too lengthy to be inlined here. - - // Write path - { - let path_array_builder = struct_builder - .field_builder::>(0) - .unwrap(); - let path_builder = path_array_builder.values(); - path_builder.append_value(1); - path_array_builder.append(true); + let mut struct_builder = builder.struct_builder(); + { + match wkb.as_type() { + geo_traits::GeometryType::Point(point) => { + if let Some(coord) = point.coord() { + struct_builder.append(&[1], coord)?; + } else { + struct_builder.append_null(); } - - // Write geom - { - let geom_builder = - struct_builder.field_builder::(1).unwrap(); - - write_wkb_point_from_coord(geom_builder, coord).unwrap(); - geom_builder.append_value([]); - } - - struct_builder.append(true); } - None => struct_builder.append_null(), - }, - _ => todo!(), + _ => todo!(), + } } builder.append(true); From be35ff99a1c9cfa8d7dbe0ca6cecf842423e8297 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 21:44:26 +0900 Subject: [PATCH 10/25] Implement --- rust/sedona-functions/src/st_dump.rs | 153 +++++++++++++++++++-------- 1 file changed, 110 insertions(+), 43 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 8e143fc6..0eb38f3c 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -19,16 +19,16 @@ use arrow_array::{ ListArray, }; use arrow_schema::{DataType, Field, Fields}; -use datafusion_common::error::{DataFusionError, Result}; +use datafusion_common::error::Result; use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, }; -use geo_traits::{CoordTrait, GeometryTrait, PointTrait}; -use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; -use sedona_geometry::{ - error::SedonaGeometryError, - wkb_factory::{write_wkb_coord_trait, write_wkb_point_header, WKB_MIN_PROBABLE_BYTES}, +use geo_traits::{ + GeometryCollectionTrait, GeometryTrait, GeometryType, MultiLineStringTrait, MultiPointTrait, + MultiPolygonTrait, }; +use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; +use sedona_geometry::wkb_factory::WKB_MIN_PROBABLE_BYTES; use sedona_schema::{ datatypes::{SedonaType, WKB_GEOMETRY}, matchers::ArgMatcher, @@ -101,6 +101,13 @@ impl STDumpBuilder { } } +enum SingleWkb<'a> { + Raw(&'a [u8]), + Point(&'a wkb::reader::Point<'a>), + LineString(&'a wkb::reader::LineString<'a>), + Polygon(&'a wkb::reader::Polygon<'a>), +} + struct STDumpStructBuilder<'a> { struct_builder: &'a mut StructBuilder, } @@ -108,35 +115,49 @@ struct STDumpStructBuilder<'a> { impl<'a> STDumpStructBuilder<'a> { fn append( &mut self, - path: &[i64], - coord: impl CoordTrait, // TODO: acceept Geometry here - ) -> std::result::Result<(), DataFusionError> { + parent_path: &[i64], + cur_index: Option, + wkb: SingleWkb<'_>, + ) -> Result<()> { let path_builder = self .struct_builder .field_builder::>(0) - .expect("path field exists"); - let values_builder = path_builder.values(); - for value in path { - values_builder.append_value(*value); + .unwrap(); + + let path_array_builder = path_builder.values(); + path_array_builder.append_slice(parent_path); + if let Some(cur_index) = cur_index { + path_array_builder.append_value(cur_index); } path_builder.append(true); let geom_builder = self .struct_builder .field_builder::(1) - .expect("geom field exists"); - write_wkb_point_from_coord(geom_builder, coord) - .map_err(|err| DataFusionError::External(Box::new(err)))?; + .unwrap(); + + match wkb { + SingleWkb::Raw(wkb) => { + geom_builder.write_all(wkb)?; + } + SingleWkb::Point(point) => { + wkb::writer::write_point(geom_builder, &point, &Default::default()).unwrap() + } + SingleWkb::LineString(line_string) => { + wkb::writer::write_line_string(geom_builder, &line_string, &Default::default()) + .unwrap() + } + SingleWkb::Polygon(polygon) => { + wkb::writer::write_polygon(geom_builder, &polygon, &Default::default()).unwrap() + } + } + geom_builder.append_value([]); self.struct_builder.append(true); Ok(()) } - - fn append_null(&mut self) { - self.struct_builder.append_null(); - } } impl SedonaScalarKernel for STDump { @@ -157,18 +178,9 @@ impl SedonaScalarKernel for STDump { executor.execute_wkb_void(|maybe_wkb| { if let Some(wkb) = maybe_wkb { let mut struct_builder = builder.struct_builder(); - { - match wkb.as_type() { - geo_traits::GeometryType::Point(point) => { - if let Some(coord) = point.coord() { - struct_builder.append(&[1], coord)?; - } else { - struct_builder.append_null(); - } - } - _ => todo!(), - } - } + + let mut cur_path: Vec = Vec::new(); + append_struct(&mut struct_builder, &wkb, &mut cur_path)?; builder.append(true); } else { @@ -182,6 +194,55 @@ impl SedonaScalarKernel for STDump { } } +fn append_struct( + struct_builder: &mut STDumpStructBuilder<'_>, + wkb: &wkb::reader::Wkb<'_>, + parent_path: &mut Vec, +) -> Result<()> { + match wkb.as_type() { + GeometryType::Point(_) | GeometryType::LineString(_) | GeometryType::Polygon(_) => { + struct_builder.append(&parent_path, None, SingleWkb::Raw(wkb.buf()))?; + } + GeometryType::MultiPoint(multi_point) => { + for (index, point) in multi_point.points().enumerate() { + struct_builder.append( + &parent_path, + Some((index + 1) as _), + SingleWkb::Point(&point), + )?; + } + } + GeometryType::MultiLineString(multi_line_string) => { + for (index, line_string) in multi_line_string.line_strings().enumerate() { + struct_builder.append( + &parent_path, + Some((index + 1) as _), + SingleWkb::LineString(line_string), + )?; + } + } + GeometryType::MultiPolygon(multi_polygon) => { + for (index, polygon) in multi_polygon.polygons().enumerate() { + struct_builder.append( + &parent_path, + Some((index + 1) as _), + SingleWkb::Polygon(polygon), + )?; + } + } + GeometryType::GeometryCollection(geometry_collection) => { + for (index, geometry) in geometry_collection.geometries().enumerate() { + let mut path = parent_path.clone(); + path.push((index + 1) as _); + append_struct(struct_builder, geometry, &mut path)?; + } + } + _ => todo!(), + } + + Ok(()) +} + fn geometry_dump_fields() -> Fields { let path = Field::new( "path", @@ -199,14 +260,6 @@ fn geometry_dump_type() -> SedonaType { SedonaType::Arrow(DataType::List(Field::new("item", struct_type, true).into())) } -fn write_wkb_point_from_coord( - buf: &mut impl Write, - coord: impl CoordTrait, -) -> Result<(), SedonaGeometryError> { - write_wkb_point_header(buf, coord.dim())?; - write_wkb_coord_trait(buf, &coord) -} - #[cfg(test)] mod tests { use arrow_array::{Array, ArrayRef, Int64Array, ListArray, StructArray}; @@ -230,10 +283,24 @@ mod tests { fn udf(#[values(WKB_GEOMETRY, WKB_VIEW_GEOMETRY)] sedona_type: SedonaType) { let tester = ScalarUdfTester::new(st_dump_udf().into(), vec![sedona_type.clone()]); - let input = create_array(&[Some("POINT (1 2)")], &sedona_type); + let input = create_array( + &[ + Some("POINT (1 2)"), + Some("LINESTRING (1 1, 2 2)"), + Some("POLYGON ((1 1, 2 2, 2 1, 1 1))"), + // Some("MULTIPOINT (1 1, 2 2)"), + ], + &sedona_type, + ); let result = tester.invoke_array(input).unwrap(); - let expected: &[(&[i64], Option<&str>)] = &[(&[1], Some("POINT (1 2)"))]; - assert_dump_row(&result, 0, expected); + assert_dump_row(&result, 0, &[(&[], Some("POINT (1 2)"))]); + assert_dump_row(&result, 1, &[(&[], Some("LINESTRING (1 1, 2 2)"))]); + assert_dump_row(&result, 2, &[(&[], Some("POLYGON ((1 1, 2 2, 2 1, 1 1))"))]); + // assert_dump_row( + // &result, + // 3, + // &[(&[1], Some("POINT (1 1)")), (&[2], Some("POINT (2 2)"))], + // ); let null_input = create_array(&[None], &sedona_type); let result = tester.invoke_array(null_input).unwrap(); From 28c1cf77dc9b39a4beeaed679e0ca862d10e28cc Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 21:57:44 +0900 Subject: [PATCH 11/25] Do not use raw --- rust/sedona-functions/src/st_dump.rs | 72 +++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 0eb38f3c..54973232 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -33,7 +33,7 @@ use sedona_schema::{ datatypes::{SedonaType, WKB_GEOMETRY}, matchers::ArgMatcher, }; -use std::{io::Write, sync::Arc}; +use std::sync::Arc; use crate::executor::WkbExecutor; @@ -102,7 +102,6 @@ impl STDumpBuilder { } enum SingleWkb<'a> { - Raw(&'a [u8]), Point(&'a wkb::reader::Point<'a>), LineString(&'a wkb::reader::LineString<'a>), Polygon(&'a wkb::reader::Polygon<'a>), @@ -137,9 +136,6 @@ impl<'a> STDumpStructBuilder<'a> { .unwrap(); match wkb { - SingleWkb::Raw(wkb) => { - geom_builder.write_all(wkb)?; - } SingleWkb::Point(point) => { wkb::writer::write_point(geom_builder, &point, &Default::default()).unwrap() } @@ -200,8 +196,14 @@ fn append_struct( parent_path: &mut Vec, ) -> Result<()> { match wkb.as_type() { - GeometryType::Point(_) | GeometryType::LineString(_) | GeometryType::Polygon(_) => { - struct_builder.append(&parent_path, None, SingleWkb::Raw(wkb.buf()))?; + GeometryType::Point(point) => { + struct_builder.append(parent_path, None, SingleWkb::Point(point))?; + } + GeometryType::LineString(line_string) => { + struct_builder.append(parent_path, None, SingleWkb::LineString(line_string))?; + } + GeometryType::Polygon(polygon) => { + struct_builder.append(parent_path, None, SingleWkb::Polygon(polygon))?; } GeometryType::MultiPoint(multi_point) => { for (index, point) in multi_point.points().enumerate() { @@ -288,7 +290,11 @@ mod tests { Some("POINT (1 2)"), Some("LINESTRING (1 1, 2 2)"), Some("POLYGON ((1 1, 2 2, 2 1, 1 1))"), - // Some("MULTIPOINT (1 1, 2 2)"), + Some("MULTIPOINT (1 1, 2 2)"), + Some("MULTILINESTRING ((1 1, 2 2), EMPTY, (3 3, 4 4))"), + Some("MULTIPOLYGON (((1 1, 2 2, 2 1, 1 1)), EMPTY, ((3 3, 4 4, 4 3, 3 3)))"), + Some("GEOMETRYCOLLECTION (POINT (1 2), MULTILINESTRING ((1 1, 2 2), EMPTY, (3 3, 4 4)), LINESTRING (1 1, 2 2))"), + Some("GEOMETRYCOLLECTION (POINT (1 2), GEOMETRYCOLLECTION (MULTILINESTRING ((1 1, 2 2), EMPTY, (3 3, 4 4)), LINESTRING (1 1, 2 2)))"), ], &sedona_type, ); @@ -296,11 +302,51 @@ mod tests { assert_dump_row(&result, 0, &[(&[], Some("POINT (1 2)"))]); assert_dump_row(&result, 1, &[(&[], Some("LINESTRING (1 1, 2 2)"))]); assert_dump_row(&result, 2, &[(&[], Some("POLYGON ((1 1, 2 2, 2 1, 1 1))"))]); - // assert_dump_row( - // &result, - // 3, - // &[(&[1], Some("POINT (1 1)")), (&[2], Some("POINT (2 2)"))], - // ); + assert_dump_row( + &result, + 3, + &[(&[1], Some("POINT (1 1)")), (&[2], Some("POINT (2 2)"))], + ); + assert_dump_row( + &result, + 4, + &[ + (&[1], Some("LINESTRING (1 1, 2 2)")), + (&[2], Some("LINESTRING EMPTY")), + (&[3], Some("LINESTRING (3 3, 4 4)")), + ], + ); + assert_dump_row( + &result, + 5, + &[ + (&[1], Some("POLYGON ((1 1, 2 2, 2 1, 1 1))")), + (&[2], Some("POLYGON EMPTY")), + (&[3], Some("POLYGON ((3 3, 4 4, 4 3, 3 3)))")), + ], + ); + assert_dump_row( + &result, + 6, + &[ + (&[1], Some("POINT (1 2)")), + (&[2, 1], Some("LINESTRING (1 1, 2 2)")), + (&[2, 2], Some("LINESTRING EMPTY")), + (&[2, 3], Some("LINESTRING (3 3, 4 4)")), + (&[3], Some("LINESTRING (1 1, 2 2)")), + ], + ); + assert_dump_row( + &result, + 7, + &[ + (&[1], Some("POINT (1 2)")), + (&[2, 1, 1], Some("LINESTRING (1 1, 2 2)")), + (&[2, 1, 2], Some("LINESTRING EMPTY")), + (&[2, 1, 3], Some("LINESTRING (3 3, 4 4)")), + (&[2, 2], Some("LINESTRING (1 1, 2 2)")), + ], + ); let null_input = create_array(&[None], &sedona_type); let result = tester.invoke_array(null_input).unwrap(); From ec25bf57690cca00aa35f1a397fe372bfcedb319 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 22:28:33 +0900 Subject: [PATCH 12/25] Tweak --- rust/sedona-functions/src/st_dump.rs | 92 +++++++++++++++------------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 54973232..fa44c4f5 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -27,6 +27,7 @@ use geo_traits::{ GeometryCollectionTrait, GeometryTrait, GeometryType, MultiLineStringTrait, MultiPointTrait, MultiPolygonTrait, }; +use sedona_common::sedona_internal_err; use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF}; use sedona_geometry::wkb_factory::WKB_MIN_PROBABLE_BYTES; use sedona_schema::{ @@ -63,55 +64,26 @@ fn st_dump_doc() -> Documentation { #[derive(Debug)] struct STDump; -struct STDumpBuilder { - builder: ListBuilder, -} - -impl STDumpBuilder { - fn new(num_iter: usize) -> Self { - let path_builder = - ListBuilder::with_capacity(Int64Builder::with_capacity(num_iter), num_iter); - let geom_builder = - BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); - let struct_builder = StructBuilder::new( - geometry_dump_fields(), - vec![Box::new(path_builder), Box::new(geom_builder)], - ); - let builder = ListBuilder::with_capacity(struct_builder, WKB_MIN_PROBABLE_BYTES * num_iter); - - Self { builder } - } - - fn append(&mut self, is_valid: bool) { - self.builder.append(is_valid); - } - - fn append_null(&mut self) { - self.builder.append_null(); - } - - fn struct_builder<'a>(&'a mut self) -> STDumpStructBuilder<'a> { - STDumpStructBuilder { - struct_builder: self.builder.values(), - } - } - - fn finish(&mut self) -> ListArray { - self.builder.finish() - } -} - +// This enum is solely for passing the subset of wkb geometry to STDumpStructBuilder. +// Maybe we can pass the underlying raw WKB bytes directly, but this just works for now. enum SingleWkb<'a> { Point(&'a wkb::reader::Point<'a>), LineString(&'a wkb::reader::LineString<'a>), Polygon(&'a wkb::reader::Polygon<'a>), } +// A builder for a single struct of { path: [i64], geom: POINT | LINESTRING | POLYGON } struct STDumpStructBuilder<'a> { struct_builder: &'a mut StructBuilder, } +// A builder for a list of the structs +struct STDumpBuilder { + builder: ListBuilder, +} + impl<'a> STDumpStructBuilder<'a> { + // This appends both path and geom at once. fn append( &mut self, parent_path: &[i64], @@ -135,17 +107,19 @@ impl<'a> STDumpStructBuilder<'a> { .field_builder::(1) .unwrap(); - match wkb { + let write_result = match wkb { SingleWkb::Point(point) => { - wkb::writer::write_point(geom_builder, &point, &Default::default()).unwrap() + wkb::writer::write_point(geom_builder, &point, &Default::default()) } SingleWkb::LineString(line_string) => { wkb::writer::write_line_string(geom_builder, &line_string, &Default::default()) - .unwrap() } SingleWkb::Polygon(polygon) => { - wkb::writer::write_polygon(geom_builder, &polygon, &Default::default()).unwrap() + wkb::writer::write_polygon(geom_builder, &polygon, &Default::default()) } + }; + if let Err(e) = write_result { + return sedona_internal_err!("Failed to write WKB: {e}"); } geom_builder.append_value([]); @@ -156,6 +130,40 @@ impl<'a> STDumpStructBuilder<'a> { } } +impl STDumpBuilder { + fn new(num_iter: usize) -> Self { + let path_builder = + ListBuilder::with_capacity(Int64Builder::with_capacity(num_iter), num_iter); + let geom_builder = + BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); + let struct_builder = StructBuilder::new( + geometry_dump_fields(), + vec![Box::new(path_builder), Box::new(geom_builder)], + ); + let builder = ListBuilder::with_capacity(struct_builder, WKB_MIN_PROBABLE_BYTES * num_iter); + + Self { builder } + } + + fn append(&mut self, is_valid: bool) { + self.builder.append(is_valid); + } + + fn append_null(&mut self) { + self.builder.append_null(); + } + + fn struct_builder<'a>(&'a mut self) -> STDumpStructBuilder<'a> { + STDumpStructBuilder { + struct_builder: self.builder.values(), + } + } + + fn finish(&mut self) -> ListArray { + self.builder.finish() + } +} + impl SedonaScalarKernel for STDump { fn return_type(&self, args: &[SedonaType]) -> Result> { let matcher = ArgMatcher::new(vec![ArgMatcher::is_geometry()], geometry_dump_type()); From c9187a930481839bfb0a3219957bf6190eba4215 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Sun, 2 Nov 2025 22:30:48 +0900 Subject: [PATCH 13/25] Reject unsupported types --- rust/sedona-functions/src/st_dump.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index fa44c4f5..356a9c1e 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -247,7 +247,7 @@ fn append_struct( append_struct(struct_builder, geometry, &mut path)?; } } - _ => todo!(), + _ => return sedona_internal_err!("Invalid geometry type"), } Ok(()) From df817b6c3bb7d844682920ef4841c1ba28643353 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Mon, 3 Nov 2025 01:19:37 +0900 Subject: [PATCH 14/25] Fix clippy warning --- rust/sedona-functions/src/st_dump.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 356a9c1e..ea74f7d0 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -201,7 +201,7 @@ impl SedonaScalarKernel for STDump { fn append_struct( struct_builder: &mut STDumpStructBuilder<'_>, wkb: &wkb::reader::Wkb<'_>, - parent_path: &mut Vec, + parent_path: &mut [i64], ) -> Result<()> { match wkb.as_type() { GeometryType::Point(point) => { @@ -216,7 +216,7 @@ fn append_struct( GeometryType::MultiPoint(multi_point) => { for (index, point) in multi_point.points().enumerate() { struct_builder.append( - &parent_path, + parent_path, Some((index + 1) as _), SingleWkb::Point(&point), )?; @@ -225,7 +225,7 @@ fn append_struct( GeometryType::MultiLineString(multi_line_string) => { for (index, line_string) in multi_line_string.line_strings().enumerate() { struct_builder.append( - &parent_path, + parent_path, Some((index + 1) as _), SingleWkb::LineString(line_string), )?; @@ -234,7 +234,7 @@ fn append_struct( GeometryType::MultiPolygon(multi_polygon) => { for (index, polygon) in multi_polygon.polygons().enumerate() { struct_builder.append( - &parent_path, + parent_path, Some((index + 1) as _), SingleWkb::Polygon(polygon), )?; @@ -242,7 +242,7 @@ fn append_struct( } GeometryType::GeometryCollection(geometry_collection) => { for (index, geometry) in geometry_collection.geometries().enumerate() { - let mut path = parent_path.clone(); + let mut path = parent_path.to_vec(); path.push((index + 1) as _); append_struct(struct_builder, geometry, &mut path)?; } From 709da97a633af5525c4292c7aee4267203f1f39f Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Mon, 3 Nov 2025 11:43:52 +0900 Subject: [PATCH 15/25] Use u32 instead of i64 --- rust/sedona-functions/src/st_dump.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index ea74f7d0..1e688ae5 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. use arrow_array::{ - builder::{BinaryBuilder, Int64Builder, ListBuilder, StructBuilder}, + builder::{BinaryBuilder, ListBuilder, StructBuilder, UInt32Builder}, ListArray, }; use arrow_schema::{DataType, Field, Fields}; @@ -72,7 +72,7 @@ enum SingleWkb<'a> { Polygon(&'a wkb::reader::Polygon<'a>), } -// A builder for a single struct of { path: [i64], geom: POINT | LINESTRING | POLYGON } +// A builder for a single struct of { path: [u32], geom: POINT | LINESTRING | POLYGON } struct STDumpStructBuilder<'a> { struct_builder: &'a mut StructBuilder, } @@ -86,13 +86,13 @@ impl<'a> STDumpStructBuilder<'a> { // This appends both path and geom at once. fn append( &mut self, - parent_path: &[i64], - cur_index: Option, + parent_path: &[u32], + cur_index: Option, wkb: SingleWkb<'_>, ) -> Result<()> { let path_builder = self .struct_builder - .field_builder::>(0) + .field_builder::>(0) .unwrap(); let path_array_builder = path_builder.values(); @@ -133,7 +133,7 @@ impl<'a> STDumpStructBuilder<'a> { impl STDumpBuilder { fn new(num_iter: usize) -> Self { let path_builder = - ListBuilder::with_capacity(Int64Builder::with_capacity(num_iter), num_iter); + ListBuilder::with_capacity(UInt32Builder::with_capacity(num_iter), num_iter); let geom_builder = BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); let struct_builder = StructBuilder::new( @@ -183,7 +183,7 @@ impl SedonaScalarKernel for STDump { if let Some(wkb) = maybe_wkb { let mut struct_builder = builder.struct_builder(); - let mut cur_path: Vec = Vec::new(); + let mut cur_path: Vec = Vec::new(); append_struct(&mut struct_builder, &wkb, &mut cur_path)?; builder.append(true); @@ -201,7 +201,7 @@ impl SedonaScalarKernel for STDump { fn append_struct( struct_builder: &mut STDumpStructBuilder<'_>, wkb: &wkb::reader::Wkb<'_>, - parent_path: &mut [i64], + parent_path: &mut [u32], ) -> Result<()> { match wkb.as_type() { GeometryType::Point(point) => { @@ -256,7 +256,7 @@ fn append_struct( fn geometry_dump_fields() -> Fields { let path = Field::new( "path", - DataType::List(Field::new("item", DataType::Int64, true).into()), + DataType::List(Field::new("item", DataType::UInt32, true).into()), true, ); let geom = WKB_GEOMETRY.to_storage_field("geom", true).unwrap(); @@ -272,7 +272,7 @@ fn geometry_dump_type() -> SedonaType { #[cfg(test)] mod tests { - use arrow_array::{Array, ArrayRef, Int64Array, ListArray, StructArray}; + use arrow_array::{Array, ArrayRef, ListArray, StructArray, UInt32Array}; use datafusion_expr::ScalarUDF; use rstest::rstest; use sedona_schema::datatypes::WKB_VIEW_GEOMETRY; @@ -361,7 +361,7 @@ mod tests { assert_dump_row_null(&result, 0); } - fn assert_dump_row(result: &ArrayRef, row: usize, expected: &[(&[i64], Option<&str>)]) { + fn assert_dump_row(result: &ArrayRef, row: usize, expected: &[(&[u32], Option<&str>)]) { let list_array = result .as_ref() .as_any() @@ -391,8 +391,8 @@ mod tests { let path_values = path_array_value .as_ref() .as_any() - .downcast_ref::() - .expect("path values should be Int64Array"); + .downcast_ref::() + .expect("path values should be UInt32Array"); assert_eq!( path_values.len(), expected_path.len(), From 485064c58dae4a231147d522fb3ea18c78a118e7 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Mon, 3 Nov 2025 12:47:38 +0900 Subject: [PATCH 16/25] Add Python test --- .../tests/functions/test_functions.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/python/sedonadb/tests/functions/test_functions.py b/python/sedonadb/tests/functions/test_functions.py index 1420d6b5..3bc1bef2 100644 --- a/python/sedonadb/tests/functions/test_functions.py +++ b/python/sedonadb/tests/functions/test_functions.py @@ -633,6 +633,31 @@ def test_st_dimension(eng, geom, expected): eng = eng.create_or_skip() eng.assert_query_result(f"SELECT ST_Dimension({geom_or_null(geom)})", expected) +@pytest.mark.parametrize("eng", [SedonaDB, PostGIS]) +def test_st_dump(eng): + wkt = "MULTIPOINT (0 1, 2 3)" + expected = [ + { + "path": [1], + "geom": shapely.from_wkt("POINT (0 1)").wkb, + }, + { + "path": [2], + "geom": shapely.from_wkt("POINT (1 2)").wkb, + }, + ] + + if eng == PostGIS: + eng = eng.create_or_skip() + result = eng.execute_and_collect(f"SELECT ST_Dump({geom_or_null(wkt)})") + else: + eng = eng.create_or_skip() + result = eng.execute_and_collect(f"SELECT unnest(ST_Dump({geom_or_null(wkt)}))") + df = eng.result_to_pandas(result) + actual = df.iloc[0] + + for item in actual: + assert list(item.keys()) == ["path", "geom"] @pytest.mark.parametrize("eng", [SedonaDB, PostGIS]) @pytest.mark.parametrize( From a1a90498e9da6b256470fbe19666ffb9c8e313a2 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Mon, 3 Nov 2025 12:58:41 +0900 Subject: [PATCH 17/25] Reuse Vec to avoid allocation --- rust/sedona-functions/src/st_dump.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 1e688ae5..372bd289 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -178,12 +178,14 @@ impl SedonaScalarKernel for STDump { let executor = WkbExecutor::new(arg_types, args); let mut builder = STDumpBuilder::new(executor.num_iterations()); + let mut cur_path: Vec = Vec::new(); executor.execute_wkb_void(|maybe_wkb| { if let Some(wkb) = maybe_wkb { + cur_path.clear(); + let mut struct_builder = builder.struct_builder(); - let mut cur_path: Vec = Vec::new(); append_struct(&mut struct_builder, &wkb, &mut cur_path)?; builder.append(true); @@ -201,7 +203,7 @@ impl SedonaScalarKernel for STDump { fn append_struct( struct_builder: &mut STDumpStructBuilder<'_>, wkb: &wkb::reader::Wkb<'_>, - parent_path: &mut [u32], + parent_path: &mut Vec, ) -> Result<()> { match wkb.as_type() { GeometryType::Point(point) => { @@ -241,11 +243,13 @@ fn append_struct( } } GeometryType::GeometryCollection(geometry_collection) => { - for (index, geometry) in geometry_collection.geometries().enumerate() { - let mut path = parent_path.to_vec(); - path.push((index + 1) as _); - append_struct(struct_builder, geometry, &mut path)?; + parent_path.push(0); + + for geometry in geometry_collection.geometries() { + parent_path.last_mut().map(|x| *x += 1); + append_struct(struct_builder, geometry, parent_path)?; } + parent_path.pop(); } _ => return sedona_internal_err!("Invalid geometry type"), } From 955baf6ff79436c5cc5d917dd3b8590511451825 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Mon, 3 Nov 2025 13:09:54 +0900 Subject: [PATCH 18/25] Tweak --- rust/sedona-functions/src/st_dump.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 372bd289..f5e90c84 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -203,6 +203,7 @@ impl SedonaScalarKernel for STDump { fn append_struct( struct_builder: &mut STDumpStructBuilder<'_>, wkb: &wkb::reader::Wkb<'_>, + // Note: in order to avoid allocation, this needs to be &mut Vec, not &mut []. parent_path: &mut Vec, ) -> Result<()> { match wkb.as_type() { @@ -243,13 +244,14 @@ fn append_struct( } } GeometryType::GeometryCollection(geometry_collection) => { - parent_path.push(0); + parent_path.push(0); // add an index for the next nexted level for geometry in geometry_collection.geometries() { - parent_path.last_mut().map(|x| *x += 1); + parent_path.last_mut().map(|x| *x += 1); // increment the index append_struct(struct_builder, geometry, parent_path)?; } - parent_path.pop(); + + parent_path.truncate(parent_path.len() - 1); // clear the index before returning to the upper level } _ => return sedona_internal_err!("Invalid geometry type"), } From 1f4818cd5959c70b289d8fb8621634f609852ef4 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Tue, 4 Nov 2025 01:58:55 +0900 Subject: [PATCH 19/25] Add Python test --- .../tests/functions/test_functions.py | 143 ++++++++++++++++-- 1 file changed, 127 insertions(+), 16 deletions(-) diff --git a/python/sedonadb/tests/functions/test_functions.py b/python/sedonadb/tests/functions/test_functions.py index 3bc1bef2..c108cfba 100644 --- a/python/sedonadb/tests/functions/test_functions.py +++ b/python/sedonadb/tests/functions/test_functions.py @@ -633,31 +633,142 @@ def test_st_dimension(eng, geom, expected): eng = eng.create_or_skip() eng.assert_query_result(f"SELECT ST_Dimension({geom_or_null(geom)})", expected) + @pytest.mark.parametrize("eng", [SedonaDB, PostGIS]) def test_st_dump(eng): - wkt = "MULTIPOINT (0 1, 2 3)" - expected = [ + is_postgis = eng == PostGIS + eng = eng.create_or_skip() + + cases = [ + {"input": "POINT (1 2)", "expected": [{"path": [], "geom": "POINT (1 2)"}]}, + { + "input": "LINESTRING (1 1, 2 2)", + "expected": [{"path": [], "geom": "LINESTRING (1 1, 2 2)"}], + }, + { + "input": "POLYGON ((1 1, 2 2, 2 1, 1 1))", + "expected": [{"path": [], "geom": "POLYGON ((1 1, 2 2, 2 1, 1 1))"}], + }, { - "path": [1], - "geom": shapely.from_wkt("POINT (0 1)").wkb, + "input": "MULTIPOINT (0 1, 1 2)", + "expected": [ + { + "path": [1], + "geom": "POINT (0 1)", + }, + { + "path": [2], + "geom": "POINT (1 2)", + }, + ], }, { - "path": [2], - "geom": shapely.from_wkt("POINT (1 2)").wkb, + "input": "MULTILINESTRING ((1 1, 2 2), EMPTY, (3 3, 4 4))", + "expected": [ + { + "path": [1], + "geom": "LINESTRING (1 1, 2 2)", + }, + { + "path": [2], + "geom": "LINESTRING EMPTY", + }, + { + "path": [3], + "geom": "LINESTRING (3 3, 4 4)", + }, + ], + }, + { + "input": "MULTIPOLYGON (((1 1, 2 2, 2 1, 1 1)), EMPTY, ((3 3, 4 4, 4 3, 3 3)))", + "expected": [ + { + "path": [1], + "geom": "POLYGON ((1 1, 2 2, 2 1, 1 1))", + }, + { + "path": [2], + "geom": "POLYGON EMPTY", + }, + { + "path": [3], + "geom": "POLYGON ((3 3, 4 4, 4 3, 3 3))", + }, + ], + }, + { + "input": "GEOMETRYCOLLECTION (POINT (1 2), MULTILINESTRING ((1 1, 2 2), EMPTY, (3 3, 4 4)), LINESTRING (1 1, 2 2))", + "expected": [ + { + "path": [1], + "geom": "POINT (1 2)", + }, + { + "path": [2, 1], + "geom": "LINESTRING (1 1, 2 2)", + }, + { + "path": [2, 2], + "geom": "LINESTRING EMPTY", + }, + { + "path": [2, 3], + "geom": "LINESTRING (3 3, 4 4)", + }, + { + "path": [3], + "geom": "LINESTRING (1 1, 2 2)", + }, + ], + }, + { + "input": "GEOMETRYCOLLECTION (POINT (1 2), GEOMETRYCOLLECTION (MULTILINESTRING ((1 1, 2 2), EMPTY, (3 3, 4 4)), LINESTRING (1 1, 2 2)))", + "expected": [ + { + "path": [1], + "geom": "POINT (1 2)", + }, + { + "path": [2, 1, 1], + "geom": "LINESTRING (1 1, 2 2)", + }, + { + "path": [2, 1, 2], + "geom": "LINESTRING EMPTY", + }, + { + "path": [2, 1, 3], + "geom": "LINESTRING (3 3, 4 4)", + }, + { + "path": [2, 2], + "geom": "LINESTRING (1 1, 2 2)", + }, + ], }, ] - if eng == PostGIS: - eng = eng.create_or_skip() - result = eng.execute_and_collect(f"SELECT ST_Dump({geom_or_null(wkt)})") - else: - eng = eng.create_or_skip() - result = eng.execute_and_collect(f"SELECT unnest(ST_Dump({geom_or_null(wkt)}))") - df = eng.result_to_pandas(result) - actual = df.iloc[0] + for case in cases: + if is_postgis: + result = eng.execute_and_collect( + f"SELECT ST_Dump({geom_or_null(case['input'])})" + ) + else: + result = eng.execute_and_collect( + f"SELECT unnest(ST_Dump({geom_or_null(case['input'])}))" + ) + df = eng.result_to_pandas(result) + + for i in df.index: + actual = df.iat[i, 0] + expected = case["expected"][i] + assert list(actual.keys()) == ["path", "geom"] + if actual["path"].size == 0: + assert len(expected["path"]) == 0 + else: + actual["path"] == expected["path"] + assert actual["geom"] == shapely.from_wkt(expected["geom"]).wkb - for item in actual: - assert list(item.keys()) == ["path", "geom"] @pytest.mark.parametrize("eng", [SedonaDB, PostGIS]) @pytest.mark.parametrize( From 6c29d3d488255905eb48fc8c747b254c6e3fb5d3 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Tue, 4 Nov 2025 21:29:03 +0900 Subject: [PATCH 20/25] Add benchmark --- benchmarks/test_functions.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/benchmarks/test_functions.py b/benchmarks/test_functions.py index 3e2ae04d..8ec46e38 100644 --- a/benchmarks/test_functions.py +++ b/benchmarks/test_functions.py @@ -113,6 +113,24 @@ def queries(): benchmark(queries) + @pytest.mark.parametrize( + "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread] + ) + @pytest.mark.parametrize( + "table", + [ + "collections_simple", + "collections_complex", + ], + ) + def test_st_dump(self, benchmark, eng, table): + eng = self._get_eng(eng) + + def queries(): + eng.execute_and_collect(f"SELECT ST_Dump(geom1) from {table}") + + benchmark(queries) + @pytest.mark.parametrize( "eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread] ) From c16a2947760b9c1b1caede815be336370efbe18e Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Tue, 4 Nov 2025 22:10:01 +0900 Subject: [PATCH 21/25] Fix clippy and typo --- rust/sedona-functions/src/st_dump.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index f5e90c84..5bf35145 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -244,10 +244,13 @@ fn append_struct( } } GeometryType::GeometryCollection(geometry_collection) => { - parent_path.push(0); // add an index for the next nexted level + parent_path.push(0); // add an index for the next nested level for geometry in geometry_collection.geometries() { - parent_path.last_mut().map(|x| *x += 1); // increment the index + // increment the index + if let Some(index) = parent_path.last_mut() { + *index += 1; + } append_struct(struct_builder, geometry, parent_path)?; } From 8936ddddbf514d3ef6a01b287d9572d9ac8c2871 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Wed, 5 Nov 2025 08:36:13 +0900 Subject: [PATCH 22/25] Remove STDumpStructBuilder --- rust/sedona-functions/src/st_dump.rs | 258 ++++++++++++++------------- 1 file changed, 134 insertions(+), 124 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 5bf35145..9479834d 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. use arrow_array::{ - builder::{BinaryBuilder, ListBuilder, StructBuilder, UInt32Builder}, - ListArray, + builder::{BinaryBuilder, OffsetBufferBuilder, UInt32Builder}, + ListArray, StructArray, }; use arrow_schema::{DataType, Field, Fields}; use datafusion_common::error::Result; @@ -72,95 +72,172 @@ enum SingleWkb<'a> { Polygon(&'a wkb::reader::Polygon<'a>), } -// A builder for a single struct of { path: [u32], geom: POINT | LINESTRING | POLYGON } -struct STDumpStructBuilder<'a> { - struct_builder: &'a mut StructBuilder, -} - // A builder for a list of the structs struct STDumpBuilder { - builder: ListBuilder, + path_array_builder: UInt32Builder, + path_array_offsets_builder: OffsetBufferBuilder, + geom_builder: BinaryBuilder, + struct_offsets_builder: OffsetBufferBuilder, } -impl<'a> STDumpStructBuilder<'a> { +impl STDumpBuilder { + fn new(num_iter: usize) -> Self { + let path_array_builder = UInt32Builder::with_capacity(num_iter); + let path_array_offsets_builder = OffsetBufferBuilder::new(num_iter); + let geom_builder = + BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); + let struct_offsets_builder = OffsetBufferBuilder::new(num_iter); + + Self { + path_array_builder, + path_array_offsets_builder, + geom_builder, + struct_offsets_builder, + } + } + // This appends both path and geom at once. - fn append( + fn append_inner( &mut self, parent_path: &[u32], cur_index: Option, wkb: SingleWkb<'_>, ) -> Result<()> { - let path_builder = self - .struct_builder - .field_builder::>(0) - .unwrap(); - - let path_array_builder = path_builder.values(); - path_array_builder.append_slice(parent_path); + self.path_array_builder.append_slice(parent_path); if let Some(cur_index) = cur_index { - path_array_builder.append_value(cur_index); + self.path_array_builder.append_value(cur_index); + self.path_array_offsets_builder + .push_length(parent_path.len() + 1); + } else { + self.path_array_offsets_builder + .push_length(parent_path.len()); } - path_builder.append(true); - - let geom_builder = self - .struct_builder - .field_builder::(1) - .unwrap(); let write_result = match wkb { SingleWkb::Point(point) => { - wkb::writer::write_point(geom_builder, &point, &Default::default()) - } - SingleWkb::LineString(line_string) => { - wkb::writer::write_line_string(geom_builder, &line_string, &Default::default()) + wkb::writer::write_point(&mut self.geom_builder, &point, &Default::default()) } + SingleWkb::LineString(line_string) => wkb::writer::write_line_string( + &mut self.geom_builder, + &line_string, + &Default::default(), + ), SingleWkb::Polygon(polygon) => { - wkb::writer::write_polygon(geom_builder, &polygon, &Default::default()) + wkb::writer::write_polygon(&mut self.geom_builder, &polygon, &Default::default()) } }; if let Err(e) = write_result { return sedona_internal_err!("Failed to write WKB: {e}"); } - geom_builder.append_value([]); - - self.struct_builder.append(true); + self.geom_builder.append_value([]); Ok(()) } -} -impl STDumpBuilder { - fn new(num_iter: usize) -> Self { - let path_builder = - ListBuilder::with_capacity(UInt32Builder::with_capacity(num_iter), num_iter); - let geom_builder = - BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); - let struct_builder = StructBuilder::new( - geometry_dump_fields(), - vec![Box::new(path_builder), Box::new(geom_builder)], - ); - let builder = ListBuilder::with_capacity(struct_builder, WKB_MIN_PROBABLE_BYTES * num_iter); + fn append( + &mut self, + // Note: in order to avoid allocation, this needs to be &mut Vec, not &mut []. + parent_path: &mut Vec, + wkb: &wkb::reader::Wkb<'_>, + ) -> Result { + match wkb.as_type() { + GeometryType::Point(point) => { + self.append_inner(parent_path, None, SingleWkb::Point(point))?; + Ok(1) + } + GeometryType::LineString(line_string) => { + self.append_inner(parent_path, None, SingleWkb::LineString(line_string))?; + Ok(1) + } + GeometryType::Polygon(polygon) => { + self.append_inner(parent_path, None, SingleWkb::Polygon(polygon))?; + Ok(1) + } + GeometryType::MultiPoint(multi_point) => { + for (index, point) in multi_point.points().enumerate() { + self.append_inner( + parent_path, + Some((index + 1) as _), + SingleWkb::Point(&point), + )?; + } + Ok(multi_point.num_points() as _) + } + GeometryType::MultiLineString(multi_line_string) => { + for (index, line_string) in multi_line_string.line_strings().enumerate() { + self.append_inner( + parent_path, + Some((index + 1) as _), + SingleWkb::LineString(line_string), + )?; + } + Ok(multi_line_string.num_line_strings() as _) + } + GeometryType::MultiPolygon(multi_polygon) => { + for (index, polygon) in multi_polygon.polygons().enumerate() { + self.append_inner( + parent_path, + Some((index + 1) as _), + SingleWkb::Polygon(polygon), + )?; + } + Ok(multi_polygon.num_polygons() as _) + } + GeometryType::GeometryCollection(geometry_collection) => { + let mut num_geometries: i32 = 0; - Self { builder } - } + parent_path.push(0); // add an index for the next nested level + + for geometry in geometry_collection.geometries() { + // increment the index + if let Some(index) = parent_path.last_mut() { + *index += 1; + } + num_geometries += self.append(parent_path, geometry)?; + } - fn append(&mut self, is_valid: bool) { - self.builder.append(is_valid); + parent_path.truncate(parent_path.len() - 1); // clear the index before returning to the upper level + + Ok(num_geometries) + } + _ => return sedona_internal_err!("Invalid geometry type"), + } } fn append_null(&mut self) { - self.builder.append_null(); + self.path_array_builder.append_null(); + self.path_array_offsets_builder.push_length(1); + self.geom_builder.append_null(); + self.struct_offsets_builder.push_length(1); } - fn struct_builder<'a>(&'a mut self) -> STDumpStructBuilder<'a> { - STDumpStructBuilder { - struct_builder: self.builder.values(), - } - } + fn finish(mut self) -> ListArray { + let path_array = Arc::new(self.path_array_builder.finish()); + let path_offsets = self.path_array_offsets_builder.finish(); + let geom_array = self.geom_builder.finish(); + + let path_field = Arc::new(Field::new("item", DataType::UInt32, true)); + let path_list = ListArray::new(path_field, path_offsets, path_array, None); + + let fields = Fields::from(vec![ + Field::new( + "path", + DataType::List(Arc::new(Field::new("item", DataType::UInt32, true))), + true, + ), + Field::new("geom", DataType::Binary, true), + ]); + let struct_array = StructArray::new( + fields.clone(), + vec![Arc::new(path_list), Arc::new(geom_array)], + None, + ); + let struct_offsets = self.struct_offsets_builder.finish(); + let struct_field = Arc::new(Field::new("item", DataType::Struct(fields), true)); + let path_list = ListArray::new(struct_field, struct_offsets, Arc::new(struct_array), None); - fn finish(&mut self) -> ListArray { - self.builder.finish() + path_list } } @@ -183,12 +260,7 @@ impl SedonaScalarKernel for STDump { executor.execute_wkb_void(|maybe_wkb| { if let Some(wkb) = maybe_wkb { cur_path.clear(); - - let mut struct_builder = builder.struct_builder(); - - append_struct(&mut struct_builder, &wkb, &mut cur_path)?; - - builder.append(true); + builder.append(&mut cur_path, &wkb)?; } else { builder.append_null(); } @@ -200,68 +272,6 @@ impl SedonaScalarKernel for STDump { } } -fn append_struct( - struct_builder: &mut STDumpStructBuilder<'_>, - wkb: &wkb::reader::Wkb<'_>, - // Note: in order to avoid allocation, this needs to be &mut Vec, not &mut []. - parent_path: &mut Vec, -) -> Result<()> { - match wkb.as_type() { - GeometryType::Point(point) => { - struct_builder.append(parent_path, None, SingleWkb::Point(point))?; - } - GeometryType::LineString(line_string) => { - struct_builder.append(parent_path, None, SingleWkb::LineString(line_string))?; - } - GeometryType::Polygon(polygon) => { - struct_builder.append(parent_path, None, SingleWkb::Polygon(polygon))?; - } - GeometryType::MultiPoint(multi_point) => { - for (index, point) in multi_point.points().enumerate() { - struct_builder.append( - parent_path, - Some((index + 1) as _), - SingleWkb::Point(&point), - )?; - } - } - GeometryType::MultiLineString(multi_line_string) => { - for (index, line_string) in multi_line_string.line_strings().enumerate() { - struct_builder.append( - parent_path, - Some((index + 1) as _), - SingleWkb::LineString(line_string), - )?; - } - } - GeometryType::MultiPolygon(multi_polygon) => { - for (index, polygon) in multi_polygon.polygons().enumerate() { - struct_builder.append( - parent_path, - Some((index + 1) as _), - SingleWkb::Polygon(polygon), - )?; - } - } - GeometryType::GeometryCollection(geometry_collection) => { - parent_path.push(0); // add an index for the next nested level - - for geometry in geometry_collection.geometries() { - // increment the index - if let Some(index) = parent_path.last_mut() { - *index += 1; - } - append_struct(struct_builder, geometry, parent_path)?; - } - - parent_path.truncate(parent_path.len() - 1); // clear the index before returning to the upper level - } - _ => return sedona_internal_err!("Invalid geometry type"), - } - - Ok(()) -} - fn geometry_dump_fields() -> Fields { let path = Field::new( "path", From ff760babda00bf3167eee74561e0a782892871c5 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Wed, 5 Nov 2025 23:45:59 +0900 Subject: [PATCH 23/25] Fix clippy warning --- rust/sedona-functions/src/st_dump.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 9479834d..358973f3 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -201,7 +201,7 @@ impl STDumpBuilder { Ok(num_geometries) } - _ => return sedona_internal_err!("Invalid geometry type"), + _ => sedona_internal_err!("Invalid geometry type"), } } @@ -235,9 +235,7 @@ impl STDumpBuilder { ); let struct_offsets = self.struct_offsets_builder.finish(); let struct_field = Arc::new(Field::new("item", DataType::Struct(fields), true)); - let path_list = ListArray::new(struct_field, struct_offsets, Arc::new(struct_array), None); - - path_list + ListArray::new(struct_field, struct_offsets, Arc::new(struct_array), None) } } From 07874ac25bdb9e46b0ff27452363e4a9abfdce59 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Thu, 6 Nov 2025 00:13:12 +0900 Subject: [PATCH 24/25] Use NullBuffer --- rust/sedona-functions/src/st_dump.rs | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 358973f3..709ad5c1 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. use arrow_array::{ - builder::{BinaryBuilder, OffsetBufferBuilder, UInt32Builder}, + builder::{BinaryBuilder, NullBufferBuilder, OffsetBufferBuilder, UInt32Builder}, ListArray, StructArray, }; use arrow_schema::{DataType, Field, Fields}; @@ -78,6 +78,7 @@ struct STDumpBuilder { path_array_offsets_builder: OffsetBufferBuilder, geom_builder: BinaryBuilder, struct_offsets_builder: OffsetBufferBuilder, + null_builder: NullBufferBuilder, } impl STDumpBuilder { @@ -87,12 +88,14 @@ impl STDumpBuilder { let geom_builder = BinaryBuilder::with_capacity(num_iter, WKB_MIN_PROBABLE_BYTES * num_iter); let struct_offsets_builder = OffsetBufferBuilder::new(num_iter); + let null_builder = NullBufferBuilder::new(num_iter); Self { path_array_builder, path_array_offsets_builder, geom_builder, struct_offsets_builder, + null_builder, } } @@ -206,10 +209,8 @@ impl STDumpBuilder { } fn append_null(&mut self) { - self.path_array_builder.append_null(); - self.path_array_offsets_builder.push_length(1); + self.path_array_offsets_builder.push_length(0); self.geom_builder.append_null(); - self.struct_offsets_builder.push_length(1); } fn finish(mut self) -> ListArray { @@ -228,14 +229,16 @@ impl STDumpBuilder { ), Field::new("geom", DataType::Binary, true), ]); - let struct_array = StructArray::new( + let struct_array = StructArray::try_new( fields.clone(), vec![Arc::new(path_list), Arc::new(geom_array)], None, - ); + ) + .unwrap(); let struct_offsets = self.struct_offsets_builder.finish(); let struct_field = Arc::new(Field::new("item", DataType::Struct(fields), true)); - ListArray::new(struct_field, struct_offsets, Arc::new(struct_array), None) + let nulls = self.null_builder.finish(); + ListArray::new(struct_field, struct_offsets, Arc::new(struct_array), nulls) } } @@ -258,9 +261,15 @@ impl SedonaScalarKernel for STDump { executor.execute_wkb_void(|maybe_wkb| { if let Some(wkb) = maybe_wkb { cur_path.clear(); - builder.append(&mut cur_path, &wkb)?; + let num_geometries = builder.append(&mut cur_path, &wkb)?; + builder.null_builder.append(true); + builder + .struct_offsets_builder + .push_length(num_geometries as usize); } else { builder.append_null(); + builder.struct_offsets_builder.push_length(1); + builder.null_builder.append(false); } Ok(()) From 6de1952075b485609b16bd340d2c535405701494 Mon Sep 17 00:00:00 2001 From: Hiroaki Yutani Date: Thu, 6 Nov 2025 00:51:31 +0900 Subject: [PATCH 25/25] Clean up --- rust/sedona-functions/src/st_dump.rs | 73 ++++++++++++---------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/rust/sedona-functions/src/st_dump.rs b/rust/sedona-functions/src/st_dump.rs index 709ad5c1..ca1d7127 100644 --- a/rust/sedona-functions/src/st_dump.rs +++ b/rust/sedona-functions/src/st_dump.rs @@ -79,6 +79,7 @@ struct STDumpBuilder { geom_builder: BinaryBuilder, struct_offsets_builder: OffsetBufferBuilder, null_builder: NullBufferBuilder, + parent_path: Vec, } impl STDumpBuilder { @@ -96,24 +97,20 @@ impl STDumpBuilder { geom_builder, struct_offsets_builder, null_builder, + parent_path: Vec::new(), // Reusable buffer to avoid allocation per row } } // This appends both path and geom at once. - fn append_inner( - &mut self, - parent_path: &[u32], - cur_index: Option, - wkb: SingleWkb<'_>, - ) -> Result<()> { - self.path_array_builder.append_slice(parent_path); + fn append_single_struct(&mut self, cur_index: Option, wkb: SingleWkb<'_>) -> Result<()> { + self.path_array_builder.append_slice(&self.parent_path); if let Some(cur_index) = cur_index { self.path_array_builder.append_value(cur_index); self.path_array_offsets_builder - .push_length(parent_path.len() + 1); + .push_length(self.parent_path.len() + 1); } else { self.path_array_offsets_builder - .push_length(parent_path.len()); + .push_length(self.parent_path.len()); } let write_result = match wkb { @@ -138,39 +135,29 @@ impl STDumpBuilder { Ok(()) } - fn append( - &mut self, - // Note: in order to avoid allocation, this needs to be &mut Vec, not &mut []. - parent_path: &mut Vec, - wkb: &wkb::reader::Wkb<'_>, - ) -> Result { + fn append_structs(&mut self, wkb: &wkb::reader::Wkb<'_>) -> Result { match wkb.as_type() { GeometryType::Point(point) => { - self.append_inner(parent_path, None, SingleWkb::Point(point))?; + self.append_single_struct(None, SingleWkb::Point(point))?; Ok(1) } GeometryType::LineString(line_string) => { - self.append_inner(parent_path, None, SingleWkb::LineString(line_string))?; + self.append_single_struct(None, SingleWkb::LineString(line_string))?; Ok(1) } GeometryType::Polygon(polygon) => { - self.append_inner(parent_path, None, SingleWkb::Polygon(polygon))?; + self.append_single_struct(None, SingleWkb::Polygon(polygon))?; Ok(1) } GeometryType::MultiPoint(multi_point) => { for (index, point) in multi_point.points().enumerate() { - self.append_inner( - parent_path, - Some((index + 1) as _), - SingleWkb::Point(&point), - )?; + self.append_single_struct(Some((index + 1) as _), SingleWkb::Point(&point))?; } Ok(multi_point.num_points() as _) } GeometryType::MultiLineString(multi_line_string) => { for (index, line_string) in multi_line_string.line_strings().enumerate() { - self.append_inner( - parent_path, + self.append_single_struct( Some((index + 1) as _), SingleWkb::LineString(line_string), )?; @@ -179,28 +166,24 @@ impl STDumpBuilder { } GeometryType::MultiPolygon(multi_polygon) => { for (index, polygon) in multi_polygon.polygons().enumerate() { - self.append_inner( - parent_path, - Some((index + 1) as _), - SingleWkb::Polygon(polygon), - )?; + self.append_single_struct(Some((index + 1) as _), SingleWkb::Polygon(polygon))?; } Ok(multi_polygon.num_polygons() as _) } GeometryType::GeometryCollection(geometry_collection) => { let mut num_geometries: i32 = 0; - parent_path.push(0); // add an index for the next nested level + self.parent_path.push(0); // add an index for the next nested level for geometry in geometry_collection.geometries() { // increment the index - if let Some(index) = parent_path.last_mut() { + if let Some(index) = self.parent_path.last_mut() { *index += 1; } - num_geometries += self.append(parent_path, geometry)?; + num_geometries += self.append_structs(geometry)?; } - parent_path.truncate(parent_path.len() - 1); // clear the index before returning to the upper level + self.parent_path.truncate(self.parent_path.len() - 1); // clear the index before returning to the upper level Ok(num_geometries) } @@ -208,9 +191,21 @@ impl STDumpBuilder { } } + fn append(&mut self, wkb: &wkb::reader::Wkb<'_>) -> Result<()> { + self.parent_path.clear(); + + let num_geometries = self.append_structs(wkb)?; + self.null_builder.append(true); + self.struct_offsets_builder + .push_length(num_geometries as usize); + Ok(()) + } + fn append_null(&mut self) { self.path_array_offsets_builder.push_length(0); self.geom_builder.append_null(); + self.struct_offsets_builder.push_length(1); + self.null_builder.append(false); } fn finish(mut self) -> ListArray { @@ -256,20 +251,12 @@ impl SedonaScalarKernel for STDump { let executor = WkbExecutor::new(arg_types, args); let mut builder = STDumpBuilder::new(executor.num_iterations()); - let mut cur_path: Vec = Vec::new(); executor.execute_wkb_void(|maybe_wkb| { if let Some(wkb) = maybe_wkb { - cur_path.clear(); - let num_geometries = builder.append(&mut cur_path, &wkb)?; - builder.null_builder.append(true); - builder - .struct_offsets_builder - .push_length(num_geometries as usize); + builder.append(&wkb)?; } else { builder.append_null(); - builder.struct_offsets_builder.push_length(1); - builder.null_builder.append(false); } Ok(())