From 9da1d71b30e43e389ded220796180cc7c7bfd411 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 29 Apr 2026 13:43:51 -0400 Subject: [PATCH 1/9] add vortex-geo crate and WKB extension type Signed-off-by: Andrew Duffy --- Cargo.lock | 46 +++++++++++++++ Cargo.toml | 6 ++ vortex-geo/Cargo.toml | 28 +++++++++ vortex-geo/README.md | 7 +++ vortex-geo/public-api.lock | 101 ++++++++++++++++++++++++++++++++ vortex-geo/src/extension/mod.rs | 45 ++++++++++++++ vortex-geo/src/extension/wkb.rs | 98 +++++++++++++++++++++++++++++++ vortex-geo/src/lib.rs | 95 ++++++++++++++++++++++++++++++ 8 files changed, 426 insertions(+) create mode 100644 vortex-geo/Cargo.toml create mode 100644 vortex-geo/README.md create mode 100644 vortex-geo/public-api.lock create mode 100644 vortex-geo/src/extension/mod.rs create mode 100644 vortex-geo/src/extension/wkb.rs create mode 100644 vortex-geo/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 0f7a33f3d14..10eecea8901 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4176,6 +4176,26 @@ dependencies = [ "version_check", ] +[[package]] +name = "geo-traits" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7c353d12a704ccfab1ba8bfb1a7fe6cb18b665bf89d37f4f7890edcd260206" +dependencies = [ + "geo-types", +] + +[[package]] +name = "geo-types" +version = "0.7.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94776032c45f950d30a13af6113c2ad5625316c9abfbccee4dd5a6695f8fe0f5" +dependencies = [ + "approx", + "num-traits", + "serde", +] + [[package]] name = "get_dir" version = "0.5.0" @@ -6292,6 +6312,7 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ + "proc-macro-crate", "proc-macro2", "quote", "syn 2.0.117", @@ -10608,6 +10629,19 @@ dependencies = [ "vortex-utils", ] +[[package]] +name = "vortex-geo" +version = "0.1.0" +dependencies = [ + "geo-traits", + "geo-types", + "prost 0.14.3", + "vortex-array", + "vortex-error", + "vortex-session", + "wkb", +] + [[package]] name = "vortex-io" version = "0.1.0" @@ -11726,6 +11760,18 @@ dependencies = [ "wasmparser", ] +[[package]] +name = "wkb" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a120b336c7ad17749026d50427c23d838ecb50cd64aaea6254b5030152f890a9" +dependencies = [ + "byteorder", + "geo-traits", + "num_enum", + "thiserror 1.0.69", +] + [[package]] name = "writeable" version = "0.6.3" diff --git a/Cargo.toml b/Cargo.toml index 35179561e14..48dcfa2e6c5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -60,6 +60,7 @@ members = [ "benchmarks/duckdb-bench", "benchmarks/random-access-bench", "benchmarks/vector-search-bench", + "vortex-geo", ] exclude = ["java/testfiles", "wasm-test"] resolver = "2" @@ -151,6 +152,9 @@ flatbuffers = "25.2.10" fsst-rs = "0.5.5" futures = { version = "0.3.31", default-features = false } fuzzy-matcher = "0.3" +geoarrow = "0.8.0" +geo-traits = "0.3.0" +geo-types = "0.7.19" get_dir = "0.5.0" glob = "0.3.2" goldenfile = "1" @@ -256,6 +260,7 @@ tracing-subscriber = "0.3" url = "2.5.7" uuid = { version = "1.21", features = ["js"] } wasm-bindgen-futures = "0.4.54" +wkb = "0.9.2" xshell = "0.2.6" zigzag = "0.1.0" zip = "8.0.0" @@ -279,6 +284,7 @@ vortex-fastlanes = { version = "0.1.0", path = "./encodings/fastlanes", default- vortex-file = { version = "0.1.0", path = "./vortex-file", default-features = false } vortex-flatbuffers = { version = "0.1.0", path = "./vortex-flatbuffers", default-features = false } vortex-fsst = { version = "0.1.0", path = "./encodings/fsst", default-features = false } +vortex-geo = { version = "0.1.0", path = "./vortex-geo", default-features = false } vortex-io = { version = "0.1.0", path = "./vortex-io", default-features = false } vortex-ipc = { version = "0.1.0", path = "./vortex-ipc", default-features = false } vortex-layout = { version = "0.1.0", path = "./vortex-layout", default-features = false } diff --git a/vortex-geo/Cargo.toml b/vortex-geo/Cargo.toml new file mode 100644 index 00000000000..8033cb8ea31 --- /dev/null +++ b/vortex-geo/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "vortex-geo" +description = "Geospatial encodings and layouts for Vortex files" +authors.workspace = true +categories.workspace = true +edition.workspace = true +homepage.workspace = true +include.workspace = true +keywords.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true + +[dependencies] +prost = { workspace = true } +vortex-array = { workspace = true } +vortex-error = { workspace = true } +vortex-session = { workspace = true } +wkb = { workspace = true } + +[dev-dependencies] +geo-traits = { workspace = true } +geo-types = { workspace = true } + +[lints] +workspace = true diff --git a/vortex-geo/README.md b/vortex-geo/README.md new file mode 100644 index 00000000000..139a49323a3 --- /dev/null +++ b/vortex-geo/README.md @@ -0,0 +1,7 @@ +# vortex-geo + +Geospatial data types and layouts for Vortex. This crate is the Vortex-equivalent of the +[geoarrow](https://github.com/geoarrow/geoarrow-rs) series of crates. + +You can import this crate into your application to add support for reading/write geospatial Vector +data as part of Vortex files. \ No newline at end of file diff --git a/vortex-geo/public-api.lock b/vortex-geo/public-api.lock new file mode 100644 index 00000000000..6e46eecea88 --- /dev/null +++ b/vortex-geo/public-api.lock @@ -0,0 +1,101 @@ +pub mod vortex_geo + +pub mod vortex_geo::extension + +pub struct vortex_geo::extension::GeoMetadata + +pub vortex_geo::extension::GeoMetadata::crs: alloc::string::String + +impl core::clone::Clone for vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::GeoMetadata::clone(&self) -> vortex_geo::extension::GeoMetadata + +impl core::cmp::Eq for vortex_geo::extension::GeoMetadata + +impl core::cmp::PartialEq for vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::GeoMetadata::eq(&self, other: &vortex_geo::extension::GeoMetadata) -> bool + +impl core::default::Default for vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::GeoMetadata::default() -> Self + +impl core::fmt::Debug for vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::GeoMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::GeoMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::GeoMetadata::hash<__H: core::hash::Hasher>(&self, state: &mut __H) + +impl core::marker::StructuralPartialEq for vortex_geo::extension::GeoMetadata + +impl prost::message::Message for vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::GeoMetadata::clear(&mut self) + +pub fn vortex_geo::extension::GeoMetadata::encoded_len(&self) -> usize + +pub struct vortex_geo::extension::WellKnownBinary + +impl core::clone::Clone for vortex_geo::extension::WellKnownBinary + +pub fn vortex_geo::extension::WellKnownBinary::clone(&self) -> vortex_geo::extension::WellKnownBinary + +impl core::cmp::Eq for vortex_geo::extension::WellKnownBinary + +impl core::cmp::PartialEq for vortex_geo::extension::WellKnownBinary + +pub fn vortex_geo::extension::WellKnownBinary::eq(&self, other: &vortex_geo::extension::WellKnownBinary) -> bool + +impl core::default::Default for vortex_geo::extension::WellKnownBinary + +pub fn vortex_geo::extension::WellKnownBinary::default() -> vortex_geo::extension::WellKnownBinary + +impl core::fmt::Debug for vortex_geo::extension::WellKnownBinary + +pub fn vortex_geo::extension::WellKnownBinary::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::hash::Hash for vortex_geo::extension::WellKnownBinary + +pub fn vortex_geo::extension::WellKnownBinary::hash<__H: core::hash::Hasher>(&self, state: &mut __H) + +impl core::marker::StructuralPartialEq for vortex_geo::extension::WellKnownBinary + +impl vortex_array::dtype::extension::vtable::ExtVTable for vortex_geo::extension::WellKnownBinary + +pub type vortex_geo::extension::WellKnownBinary::Metadata = vortex_geo::extension::GeoMetadata + +pub type vortex_geo::extension::WellKnownBinary::NativeValue<'a> = vortex_geo::extension::Wkb<'a> + +pub fn vortex_geo::extension::WellKnownBinary::deserialize_metadata(&self, metadata: &[u8]) -> vortex_error::VortexResult + +pub fn vortex_geo::extension::WellKnownBinary::id(&self) -> vortex_array::dtype::extension::ExtId + +pub fn vortex_geo::extension::WellKnownBinary::serialize_metadata(&self, metadata: &Self::Metadata) -> vortex_error::VortexResult> + +pub fn vortex_geo::extension::WellKnownBinary::unpack_native<'a>(_ext_dtype: &'a vortex_array::dtype::extension::typed::ExtDType, storage_value: &'a vortex_array::scalar::scalar_value::ScalarValue) -> vortex_error::VortexResult + +pub fn vortex_geo::extension::WellKnownBinary::validate_dtype(ext_dtype: &vortex_array::dtype::extension::typed::ExtDType) -> vortex_error::VortexResult<()> + +pub struct vortex_geo::extension::Wkb<'a>(_) + +impl<'a> vortex_geo::extension::Wkb<'a> + +pub fn vortex_geo::extension::Wkb<'a>::try_from_bytes(bytes: &'a [u8]) -> vortex_error::VortexResult + +impl<'a> core::fmt::Display for vortex_geo::extension::Wkb<'a> + +pub fn vortex_geo::extension::Wkb<'a>::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl<'a> core::ops::deref::Deref for vortex_geo::extension::Wkb<'a> + +pub type vortex_geo::extension::Wkb<'a>::Target = wkb::reader::geometry::Wkb<'a> + +pub fn vortex_geo::extension::Wkb<'a>::deref(&self) -> &wkb::reader::geometry::Wkb<'a> + +pub fn vortex_geo::initialize(session: &vortex_session::VortexSession) diff --git a/vortex-geo/src/extension/mod.rs b/vortex-geo/src/extension/mod.rs new file mode 100644 index 00000000000..b66285b498c --- /dev/null +++ b/vortex-geo/src/extension/mod.rs @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod wkb; + +use std::fmt::Display; + +pub use wkb::*; + +/// Extension metadata that is common to all the geospatial extension types. +/// +/// Currently, this is just the coordinate reference system (CRS). +/// We may wish to add a second field for edges interpretation in the future similar to +/// the GeoArrow standard. +#[derive(Clone, PartialEq, Eq, Hash, prost::Message)] +pub struct GeoMetadata { + #[prost(string, tag = "1")] + pub crs: String, +} + +impl Display for GeoMetadata { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Geometry(crs={})", self.crs) + } +} + +#[cfg(test)] +mod tests { + use prost::Message; + + use crate::extension::GeoMetadata; + + #[test] + fn test_metadata() { + let meta = GeoMetadata { + crs: "EPSG:4326".to_string(), + }; + + assert_eq!(meta.to_string(), "Geometry(crs=EPSG:4326)"); + // round trip + let bytes = meta.encode_to_vec(); + let decoded = GeoMetadata::decode(bytes.as_slice()).unwrap(); + assert_eq!(decoded, meta); + } +} diff --git a/vortex-geo/src/extension/wkb.rs b/vortex-geo/src/extension/wkb.rs new file mode 100644 index 00000000000..850dc26ad3b --- /dev/null +++ b/vortex-geo/src/extension/wkb.rs @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Display; +use std::ops::Deref; + +use prost::Message; +use vortex_array::dtype::extension::ExtDType; +use vortex_array::dtype::extension::ExtId; +use vortex_array::dtype::extension::ExtVTable; +use vortex_array::scalar::ScalarValue; +use vortex_error::VortexResult; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use wkb::reader::GeometryType; + +use crate::extension::GeoMetadata; + +#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] +pub struct WellKnownBinary; + +/// A reference to a value of well-known binary encoded geometry data. +/// +/// Interpreting this value is dependent on the [geospatial metadata][GeoMetadata] in the extension +/// schema for the array where this scalar is taken from. +pub struct Wkb<'a>(wkb::reader::Wkb<'a>); + +impl<'a> Wkb<'a> { + /// Attempt to decode a well-known binary value from a byte slice. + /// + /// This will not cause any data allocations or copies, but it will perform a one-pass + /// validation on the structure of the WKB. + pub fn try_from_bytes(bytes: &'a [u8]) -> VortexResult { + wkb::reader::Wkb::try_new(bytes) + .map_err(|e| vortex_err!("failed parsing WKB: {e}")) + .map(Wkb) + } +} + +impl<'a> Display for Wkb<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // TODO(aduffy): make this more useful + let geometry_kind = match self.0.geometry_type() { + GeometryType::Point => "point", + GeometryType::LineString => "linestring", + GeometryType::Polygon => "polygon", + GeometryType::MultiPoint => "multipoint", + GeometryType::MultiLineString => "multilinestring", + GeometryType::MultiPolygon => "multipolygon", + GeometryType::GeometryCollection => "geometrycollection", + _ => "unknown", + }; + write!(f, "WKB({geometry_kind})") + } +} + +impl<'a> Deref for Wkb<'a> { + type Target = wkb::reader::Wkb<'a>; + + fn deref(&self) -> &wkb::reader::Wkb<'a> { + &self.0 + } +} + +impl ExtVTable for WellKnownBinary { + type Metadata = GeoMetadata; + + type NativeValue<'a> = Wkb<'a>; + + fn id(&self) -> ExtId { + ExtId::new_static("geo.wkb") + } + + fn serialize_metadata(&self, metadata: &Self::Metadata) -> VortexResult> { + Ok(metadata.encode_to_vec()) + } + + fn deserialize_metadata(&self, metadata: &[u8]) -> VortexResult { + Ok(GeoMetadata::decode(metadata)?) + } + + fn validate_dtype(ext_dtype: &ExtDType) -> VortexResult<()> { + vortex_ensure!( + ext_dtype.storage_dtype().is_binary(), + "geo.wkb must have binary storage type, was {}", + ext_dtype.storage_dtype() + ); + + Ok(()) + } + + fn unpack_native<'a>( + _ext_dtype: &'a ExtDType, + storage_value: &'a ScalarValue, + ) -> VortexResult> { + Wkb::try_from_bytes(storage_value.as_binary().as_slice()) + } +} diff --git a/vortex-geo/src/lib.rs b/vortex-geo/src/lib.rs new file mode 100644 index 00000000000..dde4277e4fe --- /dev/null +++ b/vortex-geo/src/lib.rs @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::dtype::session::DTypeSessionExt; +use vortex_session::VortexSession; + +use crate::extension::WellKnownBinary; + +pub mod extension; +/// Set up a session with support for geospatial extension types, encodings and layouts. +pub fn initialize(session: &VortexSession) { + // register geospatial extension types + session.dtypes().register(WellKnownBinary); +} + +#[cfg(test)] +mod tests { + use std::sync::LazyLock; + + use geo_traits::to_geo::ToGeoGeometry; + use geo_types::Coord; + use geo_types::Geometry; + use geo_types::LineString; + use geo_types::Polygon; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::ExtensionArray; + use vortex_array::arrays::varbin::builder::VarBinBuilder; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::dtype::extension::ExtDType; + use vortex_array::dtype::extension::ExtVTable; + use vortex_array::session::ArraySession; + use vortex_error::VortexResult; + use vortex_error::vortex_err; + use vortex_session::VortexSession; + use wkb::writer::WriteOptions; + + use crate::extension::GeoMetadata; + use crate::extension::WellKnownBinary; + + static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::empty().with::(); + crate::initialize(&session); + session + }); + + #[test] + fn test_array() -> VortexResult<()> { + let mut execution_ctx = SESSION.create_execution_ctx(); + + let mut buf = Vec::new(); + + let mut builder = VarBinBuilder::::with_capacity(3); + + let polygon = Geometry::Polygon(Polygon::new( + LineString::new(vec![ + Coord::zero(), + Coord { x: 100.0, y: 0.0 }, + Coord { x: 100.0, y: 100.0 }, + Coord { x: 0.0, y: 100.0 }, + Coord::zero(), + ]), + vec![], + )); + + // We should always prefer to write little-endian, which is the default option. + wkb::writer::write_geometry(&mut buf, &polygon, &WriteOptions::default()) + .map_err(|e| vortex_err!("writing WKB failed: {e}"))?; + + // Push same polygon 3 times + builder.append_value(&buf); + builder.append_value(&buf); + builder.append_value(&buf); + + let dtype = ExtDType::::try_new( + GeoMetadata { + crs: "EPSG:4326".to_string(), + }, + DType::Binary(Nullability::NonNullable), + )?; + + let array = builder.finish(DType::Binary(Nullability::NonNullable)); + let array = ExtensionArray::new(dtype.clone().erased(), array.into_array()).into_array(); + + for idx in 0..3 { + let geom = array.execute_scalar(idx, &mut execution_ctx)?; + let wkb = WellKnownBinary::unpack_native(&dtype, geom.value().unwrap())?; + + assert_eq!(wkb.to_geometry(), polygon); + } + + Ok(()) + } +} From 260f992d9f5bca6398e0506edec72a417a34ca0d Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 30 Apr 2026 11:32:32 -0400 Subject: [PATCH 2/9] export to DuckDB Signed-off-by: Andrew Duffy --- Cargo.lock | 3 + Cargo.toml | 2 +- vortex-duckdb/Cargo.toml | 3 + .../cpp/include/duckdb_vx/logical_type.h | 4 + vortex-duckdb/cpp/logical_type.cpp | 9 ++ vortex-duckdb/src/convert/dtype.rs | 123 +++++++++++++----- vortex-duckdb/src/datasource.rs | 11 +- vortex-duckdb/src/duckdb/logical_type.rs | 35 +++++ vortex-duckdb/src/duckdb/vector.rs | 8 +- .../src/e2e_test/vortex_scan_test.rs | 53 ++++++++ vortex-duckdb/src/exporter/geo.rs | 23 ++++ vortex-duckdb/src/exporter/mod.rs | 14 +- vortex-duckdb/src/exporter/validity.rs | 2 + vortex-duckdb/src/lib.rs | 7 +- vortex-geo/src/extension/mod.rs | 11 +- vortex-geo/src/extension/wkb.rs | 46 ++++++- vortex-geo/src/lib.rs | 2 +- 17 files changed, 301 insertions(+), 55 deletions(-) create mode 100644 vortex-duckdb/src/exporter/geo.rs diff --git a/Cargo.lock b/Cargo.lock index 10eecea8901..06d24d29e7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10456,6 +10456,7 @@ dependencies = [ "cc", "custom-labels", "futures", + "geo-types", "itertools 0.14.0", "jiff", "kanal", @@ -10470,9 +10471,11 @@ dependencies = [ "url", "vortex", "vortex-array", + "vortex-geo", "vortex-runend", "vortex-sequence", "vortex-utils", + "wkb", "zip", ] diff --git a/Cargo.toml b/Cargo.toml index 48dcfa2e6c5..00b5d680717 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -152,9 +152,9 @@ flatbuffers = "25.2.10" fsst-rs = "0.5.5" futures = { version = "0.3.31", default-features = false } fuzzy-matcher = "0.3" -geoarrow = "0.8.0" geo-traits = "0.3.0" geo-types = "0.7.19" +geoarrow = "0.8.0" get_dir = "0.5.0" glob = "0.3.2" goldenfile = "1" diff --git a/vortex-duckdb/Cargo.toml b/vortex-duckdb/Cargo.toml index 2d095c1c3e5..8ae27bd990f 100644 --- a/vortex-duckdb/Cargo.toml +++ b/vortex-duckdb/Cargo.toml @@ -38,16 +38,19 @@ paste = { workspace = true } tracing = { workspace = true } url = { workspace = true } vortex = { workspace = true, features = ["files", "tokio", "object_store"] } +vortex-geo = { workspace = true } vortex-utils = { workspace = true, features = ["dashmap"] } [dev-dependencies] anyhow = { workspace = true } +geo-types = { workspace = true } jiff = { workspace = true } rstest = { workspace = true } tempfile = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } vortex-runend = { workspace = true } vortex-sequence = { workspace = true } +wkb = { workspace = true } [lints] workspace = true diff --git a/vortex-duckdb/cpp/include/duckdb_vx/logical_type.h b/vortex-duckdb/cpp/include/duckdb_vx/logical_type.h index 2dcc91350ba..760e4d43679 100644 --- a/vortex-duckdb/cpp/include/duckdb_vx/logical_type.h +++ b/vortex-duckdb/cpp/include/duckdb_vx/logical_type.h @@ -16,6 +16,10 @@ extern "C" { char *duckdb_vx_logical_type_stringify(duckdb_logical_type ty); duckdb_logical_type duckdb_vx_logical_type_copy(duckdb_logical_type ty); +/// Creates a GEOMETRY logical type with the given CRS (Coordinate Reference System). +/// `crs` must be a NUL-terminated UTF-8 string. Pass an empty string for no CRS. +duckdb_logical_type duckdb_vx_create_geometry(const char *crs); + #ifdef __cplusplus /* End C ABI */ } #endif diff --git a/vortex-duckdb/cpp/logical_type.cpp b/vortex-duckdb/cpp/logical_type.cpp index eb3ac4f3768..3c3a72996e4 100644 --- a/vortex-duckdb/cpp/logical_type.cpp +++ b/vortex-duckdb/cpp/logical_type.cpp @@ -8,6 +8,7 @@ DUCKDB_INCLUDES_BEGIN #include "duckdb/common/types.hpp" DUCKDB_INCLUDES_END #include +#include duckdb_logical_type duckdb_vx_logical_type_copy(duckdb_logical_type ty) { D_ASSERT(ty); @@ -23,3 +24,11 @@ char *duckdb_vx_logical_type_stringify(duckdb_logical_type c_type) { memcpy(result, str.c_str(), str.size() + 1); return result; } + +duckdb_logical_type duckdb_vx_create_geometry(const char *crs) { + D_ASSERT(crs); + auto geom = + (*crs == '\0') ? duckdb::LogicalType::GEOMETRY() : duckdb::LogicalType::GEOMETRY(std::string(crs)); + auto copy = duckdb::make_uniq(std::move(geom)); + return reinterpret_cast(copy.release()); +} diff --git a/vortex-duckdb/src/convert/dtype.rs b/vortex-duckdb/src/convert/dtype.rs index 2b7c656be64..996a6920de7 100644 --- a/vortex-duckdb/src/convert/dtype.rs +++ b/vortex-duckdb/src/convert/dtype.rs @@ -31,6 +31,7 @@ use std::ffi::CString; use std::sync::Arc; +use vortex::array::dtype::extension::ExtDType; use vortex::dtype::DType; use vortex::dtype::DecimalDType; use vortex::dtype::FieldName; @@ -57,6 +58,8 @@ use vortex::extension::datetime::TemporalMetadata; use vortex::extension::datetime::Time; use vortex::extension::datetime::TimeUnit; use vortex::extension::datetime::Timestamp; +use vortex_geo::extension::GeoMetadata; +use vortex_geo::extension::WellKnownBinary; use crate::cpp::DUCKDB_TYPE; use crate::duckdb::LogicalType; @@ -160,6 +163,16 @@ impl FromLogicalType for DType { .collect::>()?, nullability, ), + DUCKDB_TYPE::DUCKDB_TYPE_GEOMETRY => { + let crs = logical_type.geometry_crs().map(|crs| crs.to_string()); + DType::Extension( + ExtDType::::try_new( + GeoMetadata { crs }, + DType::Binary(nullability), + )? + .erased(), + ) + } DUCKDB_TYPE::DUCKDB_TYPE_TIME_TZ => todo!(), DUCKDB_TYPE::DUCKDB_TYPE_INTERVAL => todo!(), DUCKDB_TYPE::DUCKDB_TYPE_ENUM => todo!(), @@ -171,7 +184,6 @@ impl FromLogicalType for DType { DUCKDB_TYPE::DUCKDB_TYPE_BIGNUM => todo!(), DUCKDB_TYPE::DUCKDB_TYPE_STRING_LITERAL => todo!(), DUCKDB_TYPE::DUCKDB_TYPE_INTEGER_LITERAL => todo!(), - DUCKDB_TYPE::DUCKDB_TYPE_GEOMETRY => todo!(), }) } } @@ -241,40 +253,17 @@ impl TryFrom<&DType> for LogicalType { vortex_bail!("Vortex Variant array aren't supported in DuckDB") } DType::Extension(ext_dtype) => { - let Some(temporal) = ext_dtype.metadata_opt::() else { - vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id()); - }; - - match temporal { - TemporalMetadata::Timestamp(unit, None) => match unit { - TimeUnit::Nanoseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP_NS, - TimeUnit::Microseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP, - TimeUnit::Milliseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP_MS, - TimeUnit::Seconds => DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP_S, - _ => vortex_bail!("Invalid TimeUnit {} for timestamp", unit), - }, - TemporalMetadata::Timestamp(unit, Some(tz)) => { - if tz.as_ref() != "UTC" { - vortex_bail!("Invalid timezone for timestamp_tz {tz}, must be UTC"); - } - if unit != &TimeUnit::Microseconds { - vortex_bail!( - "Invalid TimeUnit {} for timestamp_tz, must be Microseconds", - unit - ); - } - DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP_TZ - } - TemporalMetadata::Date(unit) => match unit { - TimeUnit::Days => DUCKDB_TYPE::DUCKDB_TYPE_DATE, - _ => vortex_bail!("Invalid TimeUnit {} for date", unit), - }, - TemporalMetadata::Time(unit) => match unit { - TimeUnit::Microseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIME, - TimeUnit::Nanoseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIME_NS, - _ => vortex_bail!("Invalid TimeUnit {} for time", unit), - }, + // Handle first-party extension types that have DuckDB equivalents. + if let Some(temporal) = ext_dtype.metadata_opt::() { + return temporal_to_duckdb(temporal); } + + if let Some(wkb) = ext_dtype.metadata_opt::() { + let crs = wkb.crs.as_ref(); + return LogicalType::geometry_type(crs.map(|crs| crs.as_str())); + } + + vortex_bail!("Unsupported extension type \"{}\"", ext_dtype.id()); } }; @@ -282,6 +271,41 @@ impl TryFrom<&DType> for LogicalType { } } +fn temporal_to_duckdb(temporal: TemporalMetadata) -> VortexResult { + let duckdb_type = match temporal { + TemporalMetadata::Timestamp(unit, None) => match unit { + TimeUnit::Nanoseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP_NS, + TimeUnit::Microseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP, + TimeUnit::Milliseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP_MS, + TimeUnit::Seconds => DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP_S, + _ => vortex_bail!("Invalid TimeUnit {} for timestamp", unit), + }, + TemporalMetadata::Timestamp(unit, Some(tz)) => { + if tz.as_ref() != "UTC" { + vortex_bail!("Invalid timezone for timestamp_tz {tz}, must be UTC"); + } + if unit != &TimeUnit::Microseconds { + vortex_bail!( + "Invalid TimeUnit {} for timestamp_tz, must be Microseconds", + unit + ); + } + DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP_TZ + } + TemporalMetadata::Date(unit) => match unit { + TimeUnit::Days => DUCKDB_TYPE::DUCKDB_TYPE_DATE, + _ => vortex_bail!("Invalid TimeUnit {} for date", unit), + }, + TemporalMetadata::Time(unit) => match unit { + TimeUnit::Microseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIME, + TimeUnit::Nanoseconds => DUCKDB_TYPE::DUCKDB_TYPE_TIME_NS, + _ => vortex_bail!("Invalid TimeUnit {} for time", unit), + }, + }; + + Ok(LogicalType::new(duckdb_type)) +} + impl TryFrom for LogicalType { type Error = VortexError; @@ -352,7 +376,10 @@ mod tests { use vortex::extension::datetime::Time; use vortex::extension::datetime::Timestamp; use vortex::scalar::ScalarValue; + use vortex_geo::extension::GeoMetadata; + use vortex_geo::extension::WellKnownBinary; + use crate::convert::dtype::FromLogicalType; use crate::cpp; use crate::duckdb::LogicalType; @@ -575,6 +602,34 @@ mod tests { assert!(LogicalType::try_from(&dtype).is_err()); } + #[test] + fn test_geometry_roundtrip() -> VortexResult<()> { + let vortex_geometry = DType::Extension( + ExtDType::::try_new( + GeoMetadata { + crs: Some("EPSG:4326".to_string()), + }, + DType::Binary(Nullability::NonNullable), + )? + .erased(), + ); + + let duckdb_geometry = LogicalType::try_from(&vortex_geometry)?; + assert_eq!( + duckdb_geometry.as_type_id(), + cpp::DUCKDB_TYPE::DUCKDB_TYPE_GEOMETRY + ); + assert_eq!( + duckdb_geometry.geometry_crs().map(|crs| crs.to_string()), + Some("EPSG:4326".to_string()) + ); + + let original = DType::from_logical_type(&duckdb_geometry, Nullability::NonNullable)?; + assert_eq!(original, vortex_geometry); + + Ok(()) + } + #[test] fn test_unsupported_extension_type() { #[derive(Clone, Debug, Default, PartialEq, Eq, Hash)] diff --git a/vortex-duckdb/src/datasource.rs b/vortex-duckdb/src/datasource.rs index 1f07155e1f6..cad9fc60a09 100644 --- a/vortex-duckdb/src/datasource.rs +++ b/vortex-duckdb/src/datasource.rs @@ -179,18 +179,17 @@ fn progress(bytes_read: &AtomicU64, bytes_total: &AtomicU64) -> f64 { impl ColumnStatistics { fn from(stats: &ColumnStatisticsAggregate, dtype: DType) -> Self { - let min = stats.min.as_ref().map(|value| { - let value = value.clone(); - Scalar::try_new(dtype.clone(), Some(value)) + let min = stats.min.as_ref().and_then(|value| { + Scalar::try_new(dtype.clone(), Some(value.clone())) .vortex_expect("scalar dtype and value are incompatible") .try_to_duckdb_scalar() - .vortex_expect("can't convert Scalar to duckdb Value") + .ok() }); - let max = stats.max.as_ref().map(|value| { + let max = stats.max.as_ref().and_then(|value| { Scalar::try_new(dtype.clone(), Some(value.clone())) .vortex_expect("scalar dtype and value are incompatible") .try_to_duckdb_scalar() - .vortex_expect("can't convert Scalar to duckdb Value") + .ok() }); let max_string_length = stats diff --git a/vortex-duckdb/src/duckdb/logical_type.rs b/vortex-duckdb/src/duckdb/logical_type.rs index 06f17222a34..523b46d0586 100644 --- a/vortex-duckdb/src/duckdb/logical_type.rs +++ b/vortex-duckdb/src/duckdb/logical_type.rs @@ -170,6 +170,20 @@ impl LogicalType { pub fn date() -> Self { Self::new(DUCKDB_TYPE::DUCKDB_TYPE_DATE) } + + /// Creates a DuckDB GEOMETRY logical type with the given CRS (Coordinate Reference System). + /// + /// Pass `None` for a GEOMETRY with no associated CRS. + pub fn geometry_type(crs: Option<&str>) -> VortexResult { + let Ok(crs) = CString::new(crs.unwrap_or("")) else { + vortex_bail!("CRS must not contain NUL bytes"); + }; + let ptr = unsafe { duckdb_vx_create_geometry(crs.as_ptr()) }; + if ptr.is_null() { + vortex_bail!("Failed to create GEOMETRY logical type"); + } + Ok(unsafe { Self::own(ptr) }) + } } impl LogicalTypeRef { @@ -190,6 +204,17 @@ impl LogicalTypeRef { matches!(self.as_type_id(), DUCKDB_TYPE::DUCKDB_TYPE_DECIMAL) } + pub fn geometry_crs(&self) -> Option { + unsafe { + let c_string = duckdb_geometry_type_get_crs(self.as_ptr()); + if c_string.is_null() { + None + } else { + Some(DDBString::own(c_string)) + } + } + } + pub fn array_child_type(&self) -> LogicalType { unsafe { LogicalType::own(duckdb_array_type_child_type(self.as_ptr())) } } @@ -524,6 +549,16 @@ mod tests { } } + #[test] + fn test_create_geometry_type() -> VortexResult<()> { + let no_crs = LogicalType::geometry_type(None)?; + assert_eq!(no_crs.as_type_id(), DUCKDB_TYPE::DUCKDB_TYPE_GEOMETRY); + + let with_crs = LogicalType::geometry_type(Some("EPSG:4326"))?; + assert_eq!(with_crs.as_type_id(), DUCKDB_TYPE::DUCKDB_TYPE_GEOMETRY); + Ok(()) + } + #[test] fn test_clone_union_logical_type() { let str_type = LogicalType::new(DUCKDB_TYPE::DUCKDB_TYPE_VARCHAR); diff --git a/vortex-duckdb/src/duckdb/vector.rs b/vortex-duckdb/src/duckdb/vector.rs index ac652bed43f..d4343ce6fc6 100644 --- a/vortex-duckdb/src/duckdb/vector.rs +++ b/vortex-duckdb/src/duckdb/vector.rs @@ -147,18 +147,18 @@ impl VectorRef { !is_valid } - pub unsafe fn set_vector_buffer(&self, buffer: &VectorBufferRef) { + pub unsafe fn set_vector_buffer(&mut self, buffer: &VectorBufferRef) { unsafe { cpp::duckdb_vx_vector_set_vector_data_buffer(self.as_ptr(), buffer.as_ptr()) } } - pub fn add_string_vector_buffer(&self, buffer: &VectorBufferRef) { + pub fn add_string_vector_buffer(&mut self, buffer: &VectorBufferRef) { unsafe { cpp::duckdb_vx_string_vector_add_vector_data_buffer(self.as_ptr(), buffer.as_ptr()) } } /// Sets the data pointer for the vector. This is the start of the values array in the vector. - pub unsafe fn set_data_ptr(&self, ptr: *mut T) { + pub unsafe fn set_data_ptr(&mut self, ptr: *mut T) { unsafe { cpp::duckdb_vx_vector_set_data_ptr(self.as_ptr(), ptr as *mut c_void) } } @@ -171,7 +171,7 @@ impl VectorRef { /// The data pointer must point to a valid `u64` array with at least /// `u64_offset + capacity.div_ceil(64)` elements. pub(crate) unsafe fn set_validity_data( - &self, + &mut self, u64_offset: usize, capacity: usize, zero_copy: &ValidityData, diff --git a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs index 8e65d26ed6f..a078f963714 100644 --- a/vortex-duckdb/src/e2e_test/vortex_scan_test.rs +++ b/vortex-duckdb/src/e2e_test/vortex_scan_test.rs @@ -11,6 +11,8 @@ use std::slice; use std::str::FromStr; use anyhow::Result; +use geo_types::LineString; +use geo_types::Polygon; use jiff::Span; use jiff::Timestamp; use jiff::Zoned; @@ -38,8 +40,15 @@ use vortex::file::WriteOptionsSessionExt; use vortex::io::runtime::BlockingRuntime; use vortex::scalar::PValue; use vortex::scalar::Scalar; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::varbin::builder::VarBinBuilder; +use vortex_array::dtype::DType; +use vortex_array::dtype::extension::ExtDType; +use vortex_geo::extension::GeoMetadata; +use vortex_geo::extension::WellKnownBinary; use vortex_runend::RunEnd; use vortex_sequence::Sequence; +use wkb::writer::WriteOptions; use crate::RUNTIME; use crate::SESSION; @@ -993,3 +1002,47 @@ fn test_vortex_encodings_roundtrip() { let fixed_child_values = fixed_child.as_slice_with_len::(10); // 10 total child elements assert_eq!(fixed_child_values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); } + +#[test] +fn test_geometry() { + let file = RUNTIME.block_on(async { + let rect10 = Polygon::new( + LineString::from_iter([[0., 0.], [10., 0.], [10., 10.], [0., 10.], [0., 0.]]), + vec![], + ); + let mut wkb_binary: Vec = Vec::new(); + wkb::writer::write_polygon(&mut wkb_binary, &rect10, &WriteOptions::default()) + .expect("serializing WKB"); + let mut geometry = VarBinBuilder::::with_capacity(10); + for _ in 0..10 { + geometry.append_value(wkb_binary.as_slice()); + } + let geometry = geometry.finish(DType::Binary(Nullability::NonNullable)); + + let geometry = ExtensionArray::new( + ExtDType::::try_new( + GeoMetadata { + crs: Some("EPSG:32600".to_string()), + }, + geometry.dtype().clone(), + ) + .expect("making extension array") + .erased(), + geometry.into_array(), + ) + .into_array(); + + write_single_column_vortex_file("geometry", geometry).await + }); + + let conn = database_connection(); + conn.query("INSTALL spatial; LOAD spatial;").unwrap(); + let file_path = file.path().to_string_lossy(); + let result = conn + .query(&format!("SELECT SUM(ST_Area(geometry)) FROM '{file_path}'")) + .unwrap(); + let chunk = result.into_iter().next().unwrap(); + let vec = chunk.get_vector(0); + let area = vec.as_slice_with_len::(chunk.len().as_())[0]; + assert_eq!(area, 1000.0); +} diff --git a/vortex-duckdb/src/exporter/geo.rs b/vortex-duckdb/src/exporter/geo.rs new file mode 100644 index 00000000000..a88a3263507 --- /dev/null +++ b/vortex-duckdb/src/exporter/geo.rs @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex::array::Canonical; +use vortex::array::ExecutionCtx; +use vortex::error::VortexResult; +use vortex_geo::extension::WellKnownBinaryData; + +use crate::exporter::ColumnExporter; + +/// Create a new exporter for geospatial data stored in one of the supported spatial formats. +pub(crate) fn new_wkb_exporter( + array: WellKnownBinaryData, + ctx: &mut ExecutionCtx, +) -> VortexResult> { + // Execute the WKB child into binary + let values = array + .wkb_values() + .clone() + .execute::(ctx)? + .into_varbinview(); + crate::exporter::varbinview::new_exporter(values, ctx) +} diff --git a/vortex-duckdb/src/exporter/mod.rs b/vortex-duckdb/src/exporter/mod.rs index 517776f5521..a57f7ef2939 100644 --- a/vortex-duckdb/src/exporter/mod.rs +++ b/vortex-duckdb/src/exporter/mod.rs @@ -8,6 +8,7 @@ mod constant; mod decimal; mod dict; mod fixed_size_list; +mod geo; mod list; mod list_view; mod primitive; @@ -29,13 +30,17 @@ use vortex::array::arrays::Dict; use vortex::array::arrays::List; use vortex::array::arrays::StructArray; use vortex::array::arrays::TemporalArray; +use vortex::array::arrays::extension::ExtensionArrayExt; use vortex::array::arrays::struct_::StructArrayExt; +use vortex::array::extension::datetime::AnyTemporal; use vortex::buffer::BitChunks; use vortex::encodings::runend::RunEnd; use vortex::encodings::sequence::Sequence; use vortex::error::VortexExpect; use vortex::error::VortexResult; use vortex::error::vortex_bail; +use vortex_geo::extension::WellKnownBinary; +use vortex_geo::extension::WellKnownBinaryData; use crate::duckdb::DataChunkRef; use crate::duckdb::VectorRef; @@ -216,9 +221,14 @@ fn new_array_exporter_with_flatten( Canonical::FixedSizeList(array) => fixed_size_list::new_exporter(array, cache, ctx), Canonical::Struct(array) => struct_::new_exporter(array, cache, ctx), Canonical::Extension(ext) => { - if let Ok(temporal_array) = TemporalArray::try_from(ext) { - return temporal::new_exporter(temporal_array, ctx); + if ext.ext_dtype().is::() { + return temporal::new_exporter(TemporalArray::try_from(ext)?, ctx); } + + if ext.ext_dtype().is::() { + return geo::new_wkb_exporter(WellKnownBinaryData::try_from(ext)?, ctx); + } + vortex_bail!("no non-temporal extension exporter") } Canonical::Variant(_) => { diff --git a/vortex-duckdb/src/exporter/validity.rs b/vortex-duckdb/src/exporter/validity.rs index d1fbf8123cb..429699e9f55 100644 --- a/vortex-duckdb/src/exporter/validity.rs +++ b/vortex-duckdb/src/exporter/validity.rs @@ -10,6 +10,8 @@ use crate::duckdb::VectorBuffer; use crate::duckdb::VectorRef; use crate::exporter::ColumnExporter; +/// A [`ColumnExporter`] that wraps another exporter with a validity +/// export, allowing you to write data using something else here. struct ValidityExporter { mask: Mask, /// If the mask's bit buffer is u64-aligned with no sub-byte offset, diff --git a/vortex-duckdb/src/lib.rs b/vortex-duckdb/src/lib.rs index 413a71c611e..fe46f4e0c11 100644 --- a/vortex-duckdb/src/lib.rs +++ b/vortex-duckdb/src/lib.rs @@ -42,8 +42,11 @@ mod e2e_test; // A global runtime for Vortex operations within DuckDB. static RUNTIME: LazyLock = LazyLock::new(CurrentThreadRuntime::new); -static SESSION: LazyLock = - LazyLock::new(|| VortexSession::default().with_handle(RUNTIME.handle())); +static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::default().with_handle(RUNTIME.handle()); + vortex_geo::initialize(&session); + session +}); /// Initialize the Vortex extension by registering the extension functions. /// Note: This also registers extension options. If you want to register options diff --git a/vortex-geo/src/extension/mod.rs b/vortex-geo/src/extension/mod.rs index b66285b498c..f08b76ae83d 100644 --- a/vortex-geo/src/extension/mod.rs +++ b/vortex-geo/src/extension/mod.rs @@ -14,13 +14,16 @@ pub use wkb::*; /// the GeoArrow standard. #[derive(Clone, PartialEq, Eq, Hash, prost::Message)] pub struct GeoMetadata { - #[prost(string, tag = "1")] - pub crs: String, + #[prost(optional, string, tag = "1")] + pub crs: Option, } impl Display for GeoMetadata { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Geometry(crs={})", self.crs) + match self.crs.as_ref() { + Some(crs) => write!(f, "Geometry(crs={crs})"), + None => write!(f, "Geometry(unreferenced)"), + } } } @@ -33,7 +36,7 @@ mod tests { #[test] fn test_metadata() { let meta = GeoMetadata { - crs: "EPSG:4326".to_string(), + crs: Some("EPSG:4326".to_string()), }; assert_eq!(meta.to_string(), "Geometry(crs=EPSG:4326)"); diff --git a/vortex-geo/src/extension/wkb.rs b/vortex-geo/src/extension/wkb.rs index 850dc26ad3b..5c66b8bed81 100644 --- a/vortex-geo/src/extension/wkb.rs +++ b/vortex-geo/src/extension/wkb.rs @@ -5,17 +5,61 @@ use std::fmt::Display; use std::ops::Deref; use prost::Message; +use vortex_array::ArrayRef; +use vortex_array::arrays::ExtensionArray; +use vortex_array::arrays::extension::ExtensionArrayExt; use vortex_array::dtype::extension::ExtDType; use vortex_array::dtype::extension::ExtId; use vortex_array::dtype::extension::ExtVTable; use vortex_array::scalar::ScalarValue; +use vortex_error::VortexError; use vortex_error::VortexResult; +use vortex_error::vortex_bail; use vortex_error::vortex_ensure; use vortex_error::vortex_err; use wkb::reader::GeometryType; use crate::extension::GeoMetadata; +/// A typed handle to an [`ExtensionArray`] that contains WKB-encoded data. +/// +/// You can construct this safely using `WellKnownBinaryData::try_from(ExtensionArray)`. +#[derive(Debug, Clone)] +pub struct WellKnownBinaryData { + ext: ExtensionArray, +} + +impl WellKnownBinaryData { + /// A reference to the array that holds the Well-Known Binary scalar values. + pub fn wkb_values(&self) -> &ArrayRef { + self.ext.storage_array() + } + + /// A reference to the [geospatial metadata][GeoMetadata]. + pub fn geo_metadata(&self) -> &GeoMetadata { + self.ext + .dtype() + .as_extension() + .metadata::() + } +} + +impl TryFrom for WellKnownBinaryData { + type Error = VortexError; + + fn try_from(ext: ExtensionArray) -> Result { + if !ext.ext_dtype().is::() { + vortex_bail!("array extension dtype {} is not a WKB", ext.ext_dtype()); + } + + Ok(Self { ext }) + } +} + +/// An [extension type][ExtVTable] for OGC Well-known Binary (WKB) data format. +/// +/// This is one of the most common formats for sharing of geometry data between analytic systems, +/// used by DuckDB, PostGIS and GeoParquet. #[derive(Debug, Clone, Default, PartialEq, Eq, Hash)] pub struct WellKnownBinary; @@ -39,7 +83,6 @@ impl<'a> Wkb<'a> { impl<'a> Display for Wkb<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // TODO(aduffy): make this more useful let geometry_kind = match self.0.geometry_type() { GeometryType::Point => "point", GeometryType::LineString => "linestring", @@ -50,6 +93,7 @@ impl<'a> Display for Wkb<'a> { GeometryType::GeometryCollection => "geometrycollection", _ => "unknown", }; + // TODO(aduffy): make this more useful write!(f, "WKB({geometry_kind})") } } diff --git a/vortex-geo/src/lib.rs b/vortex-geo/src/lib.rs index dde4277e4fe..a36a4fdde6b 100644 --- a/vortex-geo/src/lib.rs +++ b/vortex-geo/src/lib.rs @@ -75,7 +75,7 @@ mod tests { let dtype = ExtDType::::try_new( GeoMetadata { - crs: "EPSG:4326".to_string(), + crs: Some("EPSG:4326".to_string()), }, DType::Binary(Nullability::NonNullable), )?; From e064113f20977beaf5322b32cefc4527677b9f06 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 30 Apr 2026 17:06:37 -0400 Subject: [PATCH 3/9] fix lockfiles Signed-off-by: Andrew Duffy --- vortex-geo/public-api.lock | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/vortex-geo/public-api.lock b/vortex-geo/public-api.lock index 6e46eecea88..6d4f22fc34e 100644 --- a/vortex-geo/public-api.lock +++ b/vortex-geo/public-api.lock @@ -4,7 +4,11 @@ pub mod vortex_geo::extension pub struct vortex_geo::extension::GeoMetadata -pub vortex_geo::extension::GeoMetadata::crs: alloc::string::String +pub vortex_geo::extension::GeoMetadata::crs: core::option::Option + +impl vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::GeoMetadata::crs(&self) -> &str impl core::clone::Clone for vortex_geo::extension::GeoMetadata @@ -82,6 +86,28 @@ pub fn vortex_geo::extension::WellKnownBinary::unpack_native<'a>(_ext_dtype: &'a pub fn vortex_geo::extension::WellKnownBinary::validate_dtype(ext_dtype: &vortex_array::dtype::extension::typed::ExtDType) -> vortex_error::VortexResult<()> +pub struct vortex_geo::extension::WellKnownBinaryData + +impl vortex_geo::extension::WellKnownBinaryData + +pub fn vortex_geo::extension::WellKnownBinaryData::geo_metadata(&self) -> &vortex_geo::extension::GeoMetadata + +pub fn vortex_geo::extension::WellKnownBinaryData::wkb_values(&self) -> &vortex_array::array::erased::ArrayRef + +impl core::clone::Clone for vortex_geo::extension::WellKnownBinaryData + +pub fn vortex_geo::extension::WellKnownBinaryData::clone(&self) -> vortex_geo::extension::WellKnownBinaryData + +impl core::convert::TryFrom> for vortex_geo::extension::WellKnownBinaryData + +pub type vortex_geo::extension::WellKnownBinaryData::Error = vortex_error::VortexError + +pub fn vortex_geo::extension::WellKnownBinaryData::try_from(ext: vortex_array::arrays::extension::vtable::ExtensionArray) -> core::result::Result + +impl core::fmt::Debug for vortex_geo::extension::WellKnownBinaryData + +pub fn vortex_geo::extension::WellKnownBinaryData::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result + pub struct vortex_geo::extension::Wkb<'a>(_) impl<'a> vortex_geo::extension::Wkb<'a> From a4dd175dbb6f34e512d0d9a60740c2b2a481ffef Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Thu, 30 Apr 2026 17:51:36 -0400 Subject: [PATCH 4/9] support duckdb -> vortex WKB Signed-off-by: Andrew Duffy --- vortex-duckdb/src/convert/vector.rs | 95 ++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 3 deletions(-) diff --git a/vortex-duckdb/src/convert/vector.rs b/vortex-duckdb/src/convert/vector.rs index a9a58c752c4..03bd45c0074 100644 --- a/vortex-duckdb/src/convert/vector.rs +++ b/vortex-duckdb/src/convert/vector.rs @@ -8,6 +8,7 @@ use vortex::array::ArrayRef; use vortex::array::IntoArray; use vortex::array::arrays::BoolArray; use vortex::array::arrays::DecimalArray; +use vortex::array::arrays::ExtensionArray; use vortex::array::arrays::FixedSizeListArray; use vortex::array::arrays::ListViewArray; use vortex::array::arrays::PrimitiveArray; @@ -15,6 +16,7 @@ use vortex::array::arrays::StructArray; use vortex::array::arrays::TemporalArray; use vortex::array::builders::ArrayBuilder; use vortex::array::builders::VarBinViewBuilder; +use vortex::array::dtype::extension::ExtDType; use vortex::array::validity::Validity; use vortex::buffer::BitBuffer; use vortex::buffer::Buffer; @@ -30,6 +32,8 @@ use vortex::error::VortexResult; use vortex::error::vortex_bail; use vortex::extension::datetime::TimeUnit; use vortex::mask::Mask; +use vortex_geo::extension::GeoMetadata; +use vortex_geo::extension::WellKnownBinary; use crate::cpp::DUCKDB_TYPE; use crate::cpp::duckdb_date; @@ -208,8 +212,8 @@ fn process_duckdb_lists( /// Converts flat vector to a vortex array pub fn flat_vector_to_vortex(vector: &VectorRef, len: usize) -> VortexResult { - let type_id = vector.logical_type().as_type_id(); - match type_id { + let logical_type = vector.logical_type(); + match logical_type.as_type_id() { DUCKDB_TYPE::DUCKDB_TYPE_TIMESTAMP => { let arr = vector_mapped(vector, len, |duckdb_timestamp { micros }| *micros); Ok(TemporalArray::new_timestamp(arr, TimeUnit::Microseconds, None).into_array()) @@ -255,6 +259,18 @@ pub fn flat_vector_to_vortex(vector: &VectorRef, len: usize) -> VortexResult { + let wkb_values = + vector_as_string_blob(vector, len, DType::Binary(Nullability::Nullable)); + let crs = logical_type.geometry_crs().map(|crs| crs.to_string()); + let wkb_type = ExtDType::::try_new( + GeoMetadata { crs }, + DType::Binary(Nullability::Nullable), + )? + .erased(); + + Ok(ExtensionArray::try_new(wkb_type, wkb_values.into_array())?.into_array()) + } DUCKDB_TYPE::DUCKDB_TYPE_BOOLEAN => { let data = vector.as_slice_with_len::(len); @@ -345,7 +361,7 @@ pub fn flat_vector_to_vortex(vector: &VectorRef, len: usize) -> VortexResult unimplemented!("missing impl for {type_id:?}"), + type_id => unimplemented!("missing impl for {type_id:?}"), } } @@ -375,17 +391,24 @@ pub fn data_chunk_to_vortex( mod tests { use std::ffi::CString; + use geo_types::point; use vortex::array::LEGACY_SESSION; use vortex::array::VortexSessionExecute; use vortex::array::arrays::BoolArray; + use vortex::array::arrays::Extension; + use vortex::array::arrays::VarBinViewArray; use vortex::array::arrays::fixed_size_list::FixedSizeListArrayExt; use vortex::array::arrays::listview::ListViewArrayExt; use vortex::array::arrays::struct_::StructArrayExt; use vortex::array::assert_arrays_eq; use vortex::error::VortexExpect; use vortex::mask::Mask; + use vortex_geo::extension::WellKnownBinaryData; + use wkb::writer::WriteOptions; + use wkb::writer::write_point; use super::*; + use crate::cpp; use crate::cpp::DUCKDB_TYPE; use crate::duckdb::LogicalType; use crate::duckdb::Vector; @@ -963,4 +986,70 @@ mod tests { Mask::from_indices(3, vec![0, 2]) ); } + + #[test] + fn test_geometry_vector_conversion() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + + let mut wkb_a: Vec = Vec::new(); + write_point( + &mut wkb_a, + &point!(x: 1.0_f64, y: 2.0_f64), + &WriteOptions::default(), + ) + .map_err(|e| vortex::error::vortex_err!("writing WKB point: {e}"))?; + let mut wkb_b: Vec = Vec::new(); + write_point( + &mut wkb_b, + &point!(x: 3.5_f64, y: -4.25_f64), + &WriteOptions::default(), + ) + .map_err(|e| vortex::error::vortex_err!("writing WKB point: {e}"))?; + + let len = 3; + let logical_type = LogicalType::geometry_type(Some("EPSG:4326"))?; + let mut vector = Vector::with_capacity(&logical_type, len); + + // WKB contains embedded null bytes, so use the length-aware assignment. + unsafe { + cpp::duckdb_vector_assign_string_element_len( + vector.as_ptr(), + 0, + wkb_a.as_ptr().cast(), + wkb_a.len() as _, + ); + cpp::duckdb_vector_assign_string_element_len( + vector.as_ptr(), + 2, + wkb_b.as_ptr().cast(), + wkb_b.len() as _, + ); + } + + // SAFETY: Vector was created with this length. + let validity_slice = unsafe { vector.ensure_validity_bitslice(len) }; + validity_slice.set(1, false); + + let result = flat_vector_to_vortex(&vector, len)?; + let extension = result + .as_opt::() + .vortex_expect("expected ExtensionArray") + .into_owned(); + let wkb_data = WellKnownBinaryData::try_from(extension)?; + + assert_eq!(wkb_data.geo_metadata().crs.as_deref(), Some("EPSG:4326")); + + let storage = wkb_data + .wkb_values() + .clone() + .execute::(&mut ctx)?; + let expected = VarBinViewArray::from_iter_nullable_bin([ + Some(wkb_a.as_slice()), + None, + Some(wkb_b.as_slice()), + ]); + assert_arrays_eq!(storage, expected); + + Ok(()) + } } From ba4f8678c619122822c34986ccfb999fb7d30755 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 1 May 2026 12:01:55 -0400 Subject: [PATCH 5/9] fix lockfiles Signed-off-by: Andrew Duffy --- vortex-geo/public-api.lock | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vortex-geo/public-api.lock b/vortex-geo/public-api.lock index 6d4f22fc34e..3d399aace98 100644 --- a/vortex-geo/public-api.lock +++ b/vortex-geo/public-api.lock @@ -18,7 +18,7 @@ impl core::cmp::Eq for vortex_geo::extension::GeoMetadata impl core::cmp::PartialEq for vortex_geo::extension::GeoMetadata -pub fn vortex_geo::extension::GeoMetadata::eq(&self, other: &vortex_geo::extension::GeoMetadata) -> bool +pub fn vortex_geo::extension::GeoMetadata::eq(&self, &vortex_geo::extension::GeoMetadata) -> bool impl core::default::Default for vortex_geo::extension::GeoMetadata @@ -26,15 +26,15 @@ pub fn vortex_geo::extension::GeoMetadata::default() -> Self impl core::fmt::Debug for vortex_geo::extension::GeoMetadata -pub fn vortex_geo::extension::GeoMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_geo::extension::GeoMetadata::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result impl core::fmt::Display for vortex_geo::extension::GeoMetadata -pub fn vortex_geo::extension::GeoMetadata::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_geo::extension::GeoMetadata::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result impl core::hash::Hash for vortex_geo::extension::GeoMetadata -pub fn vortex_geo::extension::GeoMetadata::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +pub fn vortex_geo::extension::GeoMetadata::hash<__H: core::hash::Hasher>(&self, &mut __H) impl core::marker::StructuralPartialEq for vortex_geo::extension::GeoMetadata @@ -54,7 +54,7 @@ impl core::cmp::Eq for vortex_geo::extension::WellKnownBinary impl core::cmp::PartialEq for vortex_geo::extension::WellKnownBinary -pub fn vortex_geo::extension::WellKnownBinary::eq(&self, other: &vortex_geo::extension::WellKnownBinary) -> bool +pub fn vortex_geo::extension::WellKnownBinary::eq(&self, &vortex_geo::extension::WellKnownBinary) -> bool impl core::default::Default for vortex_geo::extension::WellKnownBinary @@ -62,11 +62,11 @@ pub fn vortex_geo::extension::WellKnownBinary::default() -> vortex_geo::extensio impl core::fmt::Debug for vortex_geo::extension::WellKnownBinary -pub fn vortex_geo::extension::WellKnownBinary::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_geo::extension::WellKnownBinary::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result impl core::hash::Hash for vortex_geo::extension::WellKnownBinary -pub fn vortex_geo::extension::WellKnownBinary::hash<__H: core::hash::Hasher>(&self, state: &mut __H) +pub fn vortex_geo::extension::WellKnownBinary::hash<__H: core::hash::Hasher>(&self, &mut __H) impl core::marker::StructuralPartialEq for vortex_geo::extension::WellKnownBinary @@ -76,15 +76,15 @@ pub type vortex_geo::extension::WellKnownBinary::Metadata = vortex_geo::extensio pub type vortex_geo::extension::WellKnownBinary::NativeValue<'a> = vortex_geo::extension::Wkb<'a> -pub fn vortex_geo::extension::WellKnownBinary::deserialize_metadata(&self, metadata: &[u8]) -> vortex_error::VortexResult +pub fn vortex_geo::extension::WellKnownBinary::deserialize_metadata(&self, &[u8]) -> vortex_error::VortexResult pub fn vortex_geo::extension::WellKnownBinary::id(&self) -> vortex_array::dtype::extension::ExtId -pub fn vortex_geo::extension::WellKnownBinary::serialize_metadata(&self, metadata: &Self::Metadata) -> vortex_error::VortexResult> +pub fn vortex_geo::extension::WellKnownBinary::serialize_metadata(&self, &Self::Metadata) -> vortex_error::VortexResult> -pub fn vortex_geo::extension::WellKnownBinary::unpack_native<'a>(_ext_dtype: &'a vortex_array::dtype::extension::typed::ExtDType, storage_value: &'a vortex_array::scalar::scalar_value::ScalarValue) -> vortex_error::VortexResult +pub fn vortex_geo::extension::WellKnownBinary::unpack_native<'a>(&'a vortex_array::dtype::extension::typed::ExtDType, &'a vortex_array::scalar::scalar_value::ScalarValue) -> vortex_error::VortexResult -pub fn vortex_geo::extension::WellKnownBinary::validate_dtype(ext_dtype: &vortex_array::dtype::extension::typed::ExtDType) -> vortex_error::VortexResult<()> +pub fn vortex_geo::extension::WellKnownBinary::validate_dtype(&vortex_array::dtype::extension::typed::ExtDType) -> vortex_error::VortexResult<()> pub struct vortex_geo::extension::WellKnownBinaryData @@ -102,21 +102,21 @@ impl core::convert::TryFrom core::result::Result +pub fn vortex_geo::extension::WellKnownBinaryData::try_from(vortex_array::arrays::extension::vtable::ExtensionArray) -> core::result::Result impl core::fmt::Debug for vortex_geo::extension::WellKnownBinaryData -pub fn vortex_geo::extension::WellKnownBinaryData::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_geo::extension::WellKnownBinaryData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result pub struct vortex_geo::extension::Wkb<'a>(_) impl<'a> vortex_geo::extension::Wkb<'a> -pub fn vortex_geo::extension::Wkb<'a>::try_from_bytes(bytes: &'a [u8]) -> vortex_error::VortexResult +pub fn vortex_geo::extension::Wkb<'a>::try_from_bytes(&'a [u8]) -> vortex_error::VortexResult impl<'a> core::fmt::Display for vortex_geo::extension::Wkb<'a> -pub fn vortex_geo::extension::Wkb<'a>::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub fn vortex_geo::extension::Wkb<'a>::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result impl<'a> core::ops::deref::Deref for vortex_geo::extension::Wkb<'a> @@ -124,4 +124,4 @@ pub type vortex_geo::extension::Wkb<'a>::Target = wkb::reader::geometry::Wkb<'a> pub fn vortex_geo::extension::Wkb<'a>::deref(&self) -> &wkb::reader::geometry::Wkb<'a> -pub fn vortex_geo::initialize(session: &vortex_session::VortexSession) +pub fn vortex_geo::initialize(&vortex_session::VortexSession) From d5c2e7b038c8b06ea8f44ede7dab078e13b5fa94 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 1 May 2026 12:33:08 -0400 Subject: [PATCH 6/9] revert change Signed-off-by: Andrew Duffy --- vortex-duckdb/src/datasource.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vortex-duckdb/src/datasource.rs b/vortex-duckdb/src/datasource.rs index cad9fc60a09..1f07155e1f6 100644 --- a/vortex-duckdb/src/datasource.rs +++ b/vortex-duckdb/src/datasource.rs @@ -179,17 +179,18 @@ fn progress(bytes_read: &AtomicU64, bytes_total: &AtomicU64) -> f64 { impl ColumnStatistics { fn from(stats: &ColumnStatisticsAggregate, dtype: DType) -> Self { - let min = stats.min.as_ref().and_then(|value| { - Scalar::try_new(dtype.clone(), Some(value.clone())) + let min = stats.min.as_ref().map(|value| { + let value = value.clone(); + Scalar::try_new(dtype.clone(), Some(value)) .vortex_expect("scalar dtype and value are incompatible") .try_to_duckdb_scalar() - .ok() + .vortex_expect("can't convert Scalar to duckdb Value") }); - let max = stats.max.as_ref().and_then(|value| { + let max = stats.max.as_ref().map(|value| { Scalar::try_new(dtype.clone(), Some(value.clone())) .vortex_expect("scalar dtype and value are incompatible") .try_to_duckdb_scalar() - .ok() + .vortex_expect("can't convert Scalar to duckdb Value") }); let max_string_length = stats From a40f548c612c0f6758e52dcb4b8d71dbd9c27909 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 1 May 2026 14:41:26 -0400 Subject: [PATCH 7/9] fix handling of scalars Signed-off-by: Andrew Duffy --- vortex-duckdb/cpp/include/duckdb_vx/value.h | 19 ++++ vortex-duckdb/cpp/value.cpp | 21 ++++ vortex-duckdb/src/convert/scalar.rs | 112 +++++++++++++++++++- vortex-duckdb/src/duckdb/value.rs | 58 ++++++++-- 4 files changed, 202 insertions(+), 8 deletions(-) diff --git a/vortex-duckdb/cpp/include/duckdb_vx/value.h b/vortex-duckdb/cpp/include/duckdb_vx/value.h index 34c38b50c11..061674f9bfd 100644 --- a/vortex-duckdb/cpp/include/duckdb_vx/value.h +++ b/vortex-duckdb/cpp/include/duckdb_vx/value.h @@ -3,6 +3,12 @@ #pragma once +#include "duckdb_vx/duckdb_diagnostics.h" + +DUCKDB_INCLUDES_BEGIN +#include "duckdb.h" +DUCKDB_INCLUDES_END + #ifdef __cplusplus /* If compiled as C++, use C ABI */ extern "C" { #endif @@ -10,6 +16,19 @@ extern "C" { // Create a null value with a reference to a logical type. duckdb_value duckdb_vx_value_create_null(duckdb_logical_type ty); +/// Creates a GEOMETRY value containing the given WKB bytes and CRS. +/// +/// `wkb` points to `len` bytes of well-known-binary geometry data; the bytes are not validated. +/// `crs` must be a NUL-terminated UTF-8 string; pass NULL or an empty string for no CRS. +duckdb_value duckdb_vx_value_create_geometry(const uint8_t *wkb, idx_t len, const char *crs); + +/// Extracts the raw WKB bytes from a GEOMETRY value as a duckdb_blob. +/// +/// This bypasses the GEOMETRY -> BLOB default cast (which would require the spatial extension to +/// be loaded). The returned `data` pointer must be freed with `duckdb_free`. The caller must +/// ensure `value` is a non-null GEOMETRY value, otherwise behavior is undefined. +duckdb_blob duckdb_vx_value_get_geometry(duckdb_value value); + #ifdef __cplusplus /* End C ABI */ } #endif diff --git a/vortex-duckdb/cpp/value.cpp b/vortex-duckdb/cpp/value.cpp index 61e96131aff..a190d3f5bfe 100644 --- a/vortex-duckdb/cpp/value.cpp +++ b/vortex-duckdb/cpp/value.cpp @@ -4,6 +4,7 @@ #include "duckdb_vx/duckdb_diagnostics.h" DUCKDB_INCLUDES_BEGIN +#include "duckdb/common/types/geometry_crs.hpp" #include "duckdb/common/types/value.hpp" DUCKDB_INCLUDES_END @@ -14,3 +15,23 @@ extern "C" duckdb_value duckdb_vx_value_create_null(duckdb_logical_type ty) { auto value = duckdb::make_uniq(*logical_type); return reinterpret_cast(value.release()); } + +extern "C" duckdb_value duckdb_vx_value_create_geometry(const uint8_t *wkb, idx_t len, const char *crs) { + const auto bytes = reinterpret_cast(wkb); + auto value = (crs == nullptr || *crs == '\0') + ? duckdb::Value::GEOMETRY(bytes, len) + : duckdb::Value::GEOMETRY(bytes, len, duckdb::CoordinateReferenceSystem(std::string(crs))); + auto owned = duckdb::make_uniq(std::move(value)); + return reinterpret_cast(owned.release()); +} + +extern "C" duckdb_blob duckdb_vx_value_get_geometry(duckdb_value value) { + const auto val = reinterpret_cast(value); + const auto &str = duckdb::StringValue::Get(*val); + const auto size = str.size(); + auto buf = reinterpret_cast(duckdb_malloc(size)); + if (size > 0) { + memcpy(buf, str.c_str(), size); + } + return {buf, size}; +} diff --git a/vortex-duckdb/src/convert/scalar.rs b/vortex-duckdb/src/convert/scalar.rs index 8a9da564127..a26ad2f3819 100644 --- a/vortex-duckdb/src/convert/scalar.rs +++ b/vortex-duckdb/src/convert/scalar.rs @@ -47,6 +47,7 @@ use vortex::scalar::PrimitiveScalar; use vortex::scalar::Scalar; use vortex::scalar::ScalarValue; use vortex::scalar::Utf8Scalar; +use vortex_geo::extension::WellKnownBinary; use crate::convert::dtype::FromLogicalType; use crate::duckdb::LogicalType; @@ -169,9 +170,22 @@ impl ToDuckDBScalar for BinaryScalar<'_> { } impl ToDuckDBScalar for ExtScalar<'_> { - /// Converts an extension scalar (primarily temporal types) to a DuckDB value. + /// Converts an extension scalar (temporal types or `WellKnownBinary` geometries) to a DuckDB + /// value. fn try_to_duckdb_scalar(&self) -> VortexResult { let logical_type = LogicalType::try_from(&DType::Extension(self.ext_dtype().clone()))?; + + if let Some(wkb) = self.ext_dtype().metadata_opt::() { + let storage = self.to_storage_scalar(); + let binary = storage + .as_binary_opt() + .ok_or_else(|| vortex_err!("WellKnownBinary storage must be a binary scalar"))?; + return Ok(match binary.value() { + Some(bytes) => Value::new_geometry(bytes.as_slice(), wkb.crs.as_deref())?, + None => Value::null(&logical_type), + }); + } + let Some(temporal) = self.ext_dtype().metadata_opt::() else { vortex_bail!("Cannot convert non-temporal extension scalar to duckdb value"); }; @@ -266,7 +280,14 @@ impl<'a> TryFrom<&'a ValueRef> for Scalar { ExtractedValue::Float(v) => Ok(Scalar::primitive(v, Nullable)), ExtractedValue::Double(v) => Ok(Scalar::primitive(v, Nullable)), ExtractedValue::Varchar(s) => Ok(Scalar::utf8(s, Nullable)), - ExtractedValue::Blob(b) => Ok(Scalar::binary(b, Nullable)), + ExtractedValue::Blob(b) => match &dtype { + DType::Binary(_) => Ok(Scalar::binary(b, Nullable)), + DType::Extension(ext) if ext.is::() => Ok(Scalar::extension_ref( + ext.clone(), + Scalar::binary(b, Nullable), + )), + _ => vortex_bail!("Cannot convert DuckDB blob to Vortex scalar of dtype {dtype}"), + }, ExtractedValue::Date(days) => Ok(Scalar::extension::( TimeUnit::Days, Scalar::try_new( @@ -350,11 +371,19 @@ impl<'a> TryFrom<&'a ValueRef> for Scalar { #[cfg(test)] mod tests { + use rstest::rstest; + use vortex::dtype::DType; + use vortex::dtype::Nullability; + use vortex::dtype::extension::ExtDType; use vortex::extension::datetime::Timestamp; use vortex::extension::datetime::TimestampOptions; use vortex::scalar::Scalar; + use vortex_geo::extension::GeoMetadata; + use vortex_geo::extension::WellKnownBinary; use crate::convert::ToDuckDBScalar; + use crate::cpp::DUCKDB_TYPE; + use crate::duckdb::Value; #[test] fn test_scalar_round_trip() { @@ -413,4 +442,83 @@ mod tests { assert_eq!(original_scalar, roundtrip_scalar); } } + + /// Sample WKB bytes for `POINT(1 2)` little-endian. + fn sample_wkb() -> Vec { + vec![ + 0x01, // little-endian + 0x01, 0x00, 0x00, 0x00, // type = 1 (Point) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + ] + } + + fn wkb_scalar(crs: Option<&str>, bytes: &[u8]) -> Scalar { + Scalar::extension::( + GeoMetadata { + crs: crs.map(str::to_string), + }, + Scalar::binary(bytes.to_vec(), Nullability::Nullable), + ) + } + + #[rstest] + #[case::with_crs(Some("EPSG:4326"))] + #[case::no_crs(None)] + #[case::empty_crs(Some(""))] + fn test_geometry_value_extract_round_trip(#[case] crs: Option<&str>) { + let bytes = sample_wkb(); + let value = Value::new_geometry(&bytes, crs).unwrap(); + + // The constructed value must be a GEOMETRY logical type. + assert_eq!( + value.logical_type().as_type_id(), + DUCKDB_TYPE::DUCKDB_TYPE_GEOMETRY + ); + + // Extract back: bytes round-trip exactly. + let scalar: Scalar = (&*value).try_into().unwrap(); + let ext = scalar.as_extension(); + let storage = ext.to_storage_scalar(); + let storage_binary = storage.as_binary(); + assert_eq!(storage_binary.value().unwrap().as_slice(), bytes.as_slice()); + + // The extension dtype should be `WellKnownBinary` and CRS should round-trip, + // with the documented quirk that `Some("")` collapses to `None` through DuckDB. + let metadata = ext.ext_dtype().metadata::(); + match crs { + Some("") | None => assert_eq!(metadata.crs, None), + Some(s) => assert_eq!(metadata.crs.as_deref(), Some(s)), + } + } + + #[test] + fn test_geometry_to_duckdb_scalar_round_trip() { + let bytes = sample_wkb(); + let original = wkb_scalar(Some("EPSG:4326"), &bytes); + + let duckdb_value = original.try_to_duckdb_scalar().unwrap(); + let roundtrip: Scalar = duckdb_value.try_into().unwrap(); + + assert_eq!(original, roundtrip); + } + + #[test] + fn test_null_geometry_to_duckdb_scalar() { + let dtype = ExtDType::::try_new( + GeoMetadata { + crs: Some("EPSG:4326".to_string()), + }, + DType::Binary(Nullability::Nullable), + ) + .unwrap() + .erased(); + let original = Scalar::null(DType::Extension(dtype)); + + let duckdb_value = original.try_to_duckdb_scalar().unwrap(); + let roundtrip: Scalar = duckdb_value.try_into().unwrap(); + + assert!(roundtrip.is_null()); + assert_eq!(roundtrip.dtype(), original.dtype()); + } } diff --git a/vortex-duckdb/src/duckdb/value.rs b/vortex-duckdb/src/duckdb/value.rs index 752f1ccca70..fa849932967 100644 --- a/vortex-duckdb/src/duckdb/value.rs +++ b/vortex-duckdb/src/duckdb/value.rs @@ -2,9 +2,11 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use std::ffi::CStr; +use std::ffi::CString; use std::fmt::Debug; use std::fmt::Display; use std::fmt::Formatter; +use std::ptr; use num_traits::AsPrimitive; use vortex::buffer::BufferString; @@ -12,6 +14,7 @@ use vortex::buffer::ByteBuffer; use vortex::dtype::NativeDType; use vortex::error::VortexError; use vortex::error::VortexExpect; +use vortex::error::VortexResult; use vortex::error::vortex_err; use vortex::error::vortex_panic; @@ -97,12 +100,15 @@ impl ValueRef { ExtractedValue::Varchar(string) } DUCKDB_TYPE::DUCKDB_TYPE_BLOB => { - let blob = unsafe { cpp::duckdb_get_blob(self.as_ptr()) }; - let slice = - unsafe { std::slice::from_raw_parts(blob.data.cast::(), blob.size.as_()) }; - let bytes = ByteBuffer::copy_from(slice); - unsafe { cpp::duckdb_free(blob.data) }; - ExtractedValue::Blob(bytes) + ExtractedValue::Blob(unsafe { take_blob(cpp::duckdb_get_blob(self.as_ptr())) }) + } + DUCKDB_TYPE::DUCKDB_TYPE_GEOMETRY => { + // GEOMETRY values are WKB blobs in DuckDB; we read the bytes directly via a shim + // that bypasses the GEOMETRY -> BLOB default cast (which requires spatial loaded). + // CRS lives on the logical type and is not part of the extracted bytes. + ExtractedValue::Blob(unsafe { + take_blob(cpp::duckdb_vx_value_get_geometry(self.as_ptr())) + }) } DUCKDB_TYPE::DUCKDB_TYPE_DATE => { ExtractedValue::Date(unsafe { cpp::duckdb_get_date(self.as_ptr()).days }) @@ -250,6 +256,24 @@ impl Value { pub fn new_date(days: i32) -> Self { unsafe { Self::own(cpp::duckdb_create_date(cpp::duckdb_date { days })) } } + + /// Creates a DuckDB GEOMETRY value from WKB bytes and an optional CRS. + /// + /// Pass `None` (or an empty string) for `crs` to create a geometry value with no CRS. + pub fn new_geometry(wkb: &[u8], crs: Option<&str>) -> VortexResult { + let crs_c = crs + .map(CString::new) + .transpose() + .map_err(|_| vortex_err!("CRS must not contain NUL bytes"))?; + let crs_ptr = crs_c.as_ref().map_or(ptr::null(), |c| c.as_ptr()); + Ok(unsafe { + Self::own(cpp::duckdb_vx_value_create_geometry( + wkb.as_ptr(), + wkb.len() as idx_t, + crs_ptr, + )) + }) + } } impl Display for ValueRef { @@ -267,6 +291,28 @@ impl Display for Value { } } +/// Copies the bytes of a `duckdb_blob` into a `ByteBuffer` and frees the underlying allocation. +/// +/// # Safety +/// +/// `blob` must be a freshly returned value from a DuckDB allocator (`duckdb_malloc` or one of the +/// `duckdb_get_*` accessors that allocate). +unsafe fn take_blob(blob: cpp::duckdb_blob) -> ByteBuffer { + let size: usize = blob.size.as_(); + // `slice::from_raw_parts` requires a non-null, aligned pointer even for zero-length slices, + // and `duckdb_malloc(0)` is permitted to return null. Skip the slice construction in that case. + let bytes = if size == 0 { + ByteBuffer::empty() + } else { + let slice = unsafe { std::slice::from_raw_parts(blob.data.cast::(), size) }; + ByteBuffer::copy_from(slice) + }; + if !blob.data.is_null() { + unsafe { cpp::duckdb_free(blob.data) }; + } + bytes +} + #[inline] pub fn i128_from_parts(high: i64, low: u64) -> i128 { ((high as i128) << 64) | (low as i128) From b2eff8f66eb9f16453259e344b908d33c6c0dea6 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 1 May 2026 14:45:07 -0400 Subject: [PATCH 8/9] fix comment Signed-off-by: Andrew Duffy --- vortex-duckdb/src/exporter/geo.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-duckdb/src/exporter/geo.rs b/vortex-duckdb/src/exporter/geo.rs index a88a3263507..f49e6890830 100644 --- a/vortex-duckdb/src/exporter/geo.rs +++ b/vortex-duckdb/src/exporter/geo.rs @@ -8,7 +8,7 @@ use vortex_geo::extension::WellKnownBinaryData; use crate::exporter::ColumnExporter; -/// Create a new exporter for geospatial data stored in one of the supported spatial formats. +/// Create a new exporter for geospatial data stored as Well-Known Binary (WKB) format. pub(crate) fn new_wkb_exporter( array: WellKnownBinaryData, ctx: &mut ExecutionCtx, From f4995539d0ed7c80f6db8f0bdefcd027efdf3e48 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 1 May 2026 14:50:34 -0400 Subject: [PATCH 9/9] formatting Signed-off-by: Andrew Duffy --- vortex-duckdb/cpp/value.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vortex-duckdb/cpp/value.cpp b/vortex-duckdb/cpp/value.cpp index a190d3f5bfe..a0cd16574c8 100644 --- a/vortex-duckdb/cpp/value.cpp +++ b/vortex-duckdb/cpp/value.cpp @@ -18,9 +18,10 @@ extern "C" duckdb_value duckdb_vx_value_create_null(duckdb_logical_type ty) { extern "C" duckdb_value duckdb_vx_value_create_geometry(const uint8_t *wkb, idx_t len, const char *crs) { const auto bytes = reinterpret_cast(wkb); - auto value = (crs == nullptr || *crs == '\0') - ? duckdb::Value::GEOMETRY(bytes, len) - : duckdb::Value::GEOMETRY(bytes, len, duckdb::CoordinateReferenceSystem(std::string(crs))); + auto value = + (crs == nullptr || *crs == '\0') + ? duckdb::Value::GEOMETRY(bytes, len) + : duckdb::Value::GEOMETRY(bytes, len, duckdb::CoordinateReferenceSystem(std::string(crs))); auto owned = duckdb::make_uniq(std::move(value)); return reinterpret_cast(owned.release()); }