From 81d490ab0eb4c30d1125c3dc82082366cd34430c Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 23 Oct 2024 13:10:32 +0300 Subject: [PATCH] Add option for casting geometry data to WKB --- libtiledbsoma/src/soma/soma_array.h | 2 +- .../src/soma/soma_geometry_dataframe.cc | 234 ++++++++++++++++++ .../src/soma/soma_geometry_dataframe.h | 35 ++- .../test/unit_soma_geometry_dataframe.cc | 222 +++++++++++++++++ 4 files changed, 491 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 1840b8ffbe..7cc6d0e086 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -503,7 +503,7 @@ class SOMAArray : public SOMAObject { * @param arrow_schema * @param arrow_array */ - void set_array_data( + virtual void set_array_data( std::unique_ptr arrow_schema, std::unique_ptr arrow_array); diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc index d54be5f3dd..3bbdfa24ae 100644 --- a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc @@ -31,9 +31,13 @@ */ #include "soma_geometry_dataframe.h" +#include "../geometry/geometry.h" +#include "../geometry/operators/envelope.h" +#include "../geometry/operators/io/write.h" #include "../utils/util.h" #include +#include namespace tiledbsoma { using namespace tiledb; @@ -122,4 +126,234 @@ uint64_t SOMAGeometryDataFrame::count() { return this->nnz(); } +void SOMAGeometryDataFrame::set_array_data( + std::unique_ptr arrow_schema, + std::unique_ptr arrow_array) { + std::vector spatial_axes = this->spatial_column_names(); + + for (auto i = 0; i < arrow_schema->n_children; ++i) { + /** + * If `soma_geometry` conforms to specific formats automatically convert + * to WKB and create additional index columns for spatial axes. + * + * If the `soma_geometry` array is a WKB binary users are expected to + * provide the additional index columns for spatial axes. + */ + + if (strcmp(arrow_schema->children[i]->name, "soma_geometry") == 0 && + strcmp(arrow_schema->children[i]->format, "+l") == 0) { + std::string_view type_metadata; + + if (ArrowMetadataHasKey( + arrow_schema->children[i]->metadata, + ArrowCharView("geometry_type"))) { + ArrowStringView out; + NANOARROW_THROW_NOT_OK(ArrowMetadataGetValue( + arrow_schema->children[i]->metadata, + ArrowCharView("geometry_type"), + &out)); + + type_metadata = std::string_view(out.data, out.size_bytes); + } + + ArrowTable casted_data; + if (type_metadata == "polygon_ring") { + casted_data = _reconstruct_geometry_data_table( + ArrowTable(std::move(arrow_array), std::move(arrow_schema)), + _cast_polygon_vertex_list_to_wkb(arrow_array->children[i])); + } else { + throw std::runtime_error("Unknown geometry type"); + } + + SOMAArray::set_array_data( + std::move(casted_data.second), std::move(casted_data.first)); + return; + } + } + + SOMAArray::set_array_data(std::move(arrow_schema), std::move(arrow_array)); +} + +//=================================================================== +//= private non-static +//=================================================================== + +std::vector SOMAGeometryDataFrame::_cast_polygon_vertex_list_to_wkb( + ArrowArray* array) { + // Initialize a vector to hold all the Arrow tables containing the + // transformed geometry data + ArrowError error; + std::vector spatial_axes = this->spatial_column_names(); + std::vector tables; + tables.push_back(ArrowTable( + std::make_unique(ArrowArray{}), + std::make_unique(ArrowSchema{}))); + + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType( + tables.front().first.get(), ArrowType::NANOARROW_TYPE_LARGE_BINARY)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + tables.front().second.get(), ArrowType::NANOARROW_TYPE_LARGE_BINARY)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(tables.front().second.get(), "soma_geometry")); + + for (auto axis : spatial_axes) { + // Min spatial axis + tables.push_back(ArrowTable( + std::make_unique(ArrowArray{}), + std::make_unique(ArrowSchema{}))); + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType( + tables.back().first.get(), ArrowType::NANOARROW_TYPE_DOUBLE)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + tables.back().second.get(), ArrowType::NANOARROW_TYPE_DOUBLE)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName( + tables.back().second.get(), + (SOMAGeometryDataFrame::dimension_prefix + axis + "__min") + .c_str())); + + // Max spatial axis + tables.push_back(ArrowTable( + std::make_unique(), std::make_unique())); + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType( + tables.back().first.get(), ArrowType::NANOARROW_TYPE_DOUBLE)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + tables.back().second.get(), ArrowType::NANOARROW_TYPE_DOUBLE)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName( + tables.back().second.get(), + (SOMAGeometryDataFrame::dimension_prefix + axis + "__max") + .c_str())); + } + + // Large list of doubles + const uint32_t* offset = static_cast(array->buffers[1]); + const double_t* data = static_cast( + array->children[0]->buffers[1]); + + size_t wkb_buffer_size = 0; + std::vector geometries; + + for (int64_t index = 0; index < array->length; ++index) { + int64_t stop_index = index < array->length - 1 ? + offset[index + 1] : + array->children[0]->length; + + std::vector ring; + for (int64_t j = offset[index]; j < stop_index; j += 2) { + ring.push_back(geometry::BasePoint(data[j], data[j + 1])); + } + + geometries.push_back( + geometry::GenericGeometry(geometry::Polygon(std::move(ring)))); + wkb_buffer_size += wkb_size(geometries.back()); + } + + NANOARROW_THROW_NOT_OK( + ArrowArrayReserve(tables.front().first.get(), wkb_buffer_size)); + NANOARROW_THROW_NOT_OK( + ArrowArrayStartAppending(tables.front().first.get())); + for (size_t i = 1; i < tables.size(); ++i) { + NANOARROW_THROW_NOT_OK( + ArrowArrayReserve(tables[i].first.get(), array->length)); + NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tables[i].first.get())); + } + + for (auto& geometry : geometries) { + geometry::BinaryBuffer wkb = geometry::to_wkb(geometry); + geometry::Envelope envelope = geometry::envelope(geometry); + + ArrowBufferView wkb_view; + wkb_view.data.data = wkb.data(); + wkb_view.size_bytes = (int64_t)wkb.size(); + + NANOARROW_THROW_NOT_OK( + ArrowArrayAppendBytes(tables.front().first.get(), wkb_view)); + + for (size_t i = 0; i < spatial_axes.size(); ++i) { + NANOARROW_THROW_NOT_OK(ArrowArrayAppendDouble( + tables[2 * i + 1].first.get(), envelope.range.at(i).first)); + NANOARROW_THROW_NOT_OK(ArrowArrayAppendDouble( + tables[2 * i + 2].first.get(), envelope.range.at(i).second)); + } + } + + for (size_t i = 0; i < tables.size(); ++i) { + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuildingDefault(tables[i].first.get(), &error)); + } + + return tables; +} + +ArrowTable SOMAGeometryDataFrame::_reconstruct_geometry_data_table( + ArrowTable original_data, std::vector wkb_data) { + std::vector spatial_axes = this->spatial_column_names(); + std::unordered_set unique_column_names; + std::unique_ptr arrow_schema = std::make_unique( + ArrowSchema{}); + std::unique_ptr arrow_array = std::make_unique( + ArrowArray{}); + + for (int64_t i = 0; i < original_data.second->n_children; ++i) { + unique_column_names.insert(original_data.second->children[i]->name); + } + for (int64_t i = 0; i < wkb_data.size(); ++i) { + unique_column_names.insert(wkb_data[i].second->name); + } + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + arrow_schema.get(), ArrowType::NANOARROW_TYPE_STRUCT)); + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateChildren( + arrow_schema.get(), unique_column_names.size())); + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType( + arrow_array.get(), ArrowType::NANOARROW_TYPE_STRUCT)); + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren( + arrow_array.get(), unique_column_names.size())); + + // First add the wkb data columns so that already existing columns in the + // original data except `soma_geometry` can overwrite the generated columns. + + for (int64_t i = 0; i < wkb_data.size(); ++i) { + ArrowSchemaMove(wkb_data[i].second.get(), arrow_schema->children[i]); + ArrowArrayMove(wkb_data[i].first.get(), arrow_array->children[i]); + } + + int64_t index = wkb_data.size(); + for (int64_t i = 0; i < original_data.second->n_children; ++i) { + if (strcmp(original_data.second->children[i]->name, "soma_geometry") == + 0) { + continue; + } + + bool replaced = false; + for (int64_t j = 0; j < wkb_data.size(); ++j) { + if (strcmp( + arrow_schema->children[j]->name, + original_data.second->children[i]->name) == 0) { + arrow_schema->children[j]->release(arrow_schema->children[j]); + arrow_array->children[j]->release(arrow_array->children[j]); + + ArrowSchemaMove( + original_data.second->children[i], + arrow_schema->children[j]); + ArrowArrayMove( + original_data.first->children[i], arrow_array->children[j]); + + replaced = true; + break; + } + } + + if (!replaced) { + ArrowSchemaMove( + original_data.second->children[i], + arrow_schema->children[index]); + ArrowArrayMove( + original_data.first->children[i], arrow_array->children[index]); + + ++index; + } + } + + return ArrowTable(std::move(arrow_array), std::move(arrow_schema)); +} + } // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.h b/libtiledbsoma/src/soma/soma_geometry_dataframe.h index 5bfedd4648..608fa95782 100644 --- a/libtiledbsoma/src/soma/soma_geometry_dataframe.h +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.h @@ -34,6 +34,7 @@ #define SOMA_GEOMETRY_DATAFRAME #include +#include #include "soma_array.h" @@ -173,7 +174,39 @@ class SOMAGeometryDataFrame : virtual public SOMAArray { * @return int64_t */ uint64_t count(); + + void set_array_data( + std::unique_ptr arrow_schema, + std::unique_ptr arrow_array) override; + + private: + //=================================================================== + //= private static + //=================================================================== + + const std::string dimension_prefix = "tiledb__internal__"; + + //=================================================================== + //= private non-static + //=================================================================== + + /** + * @brief Cast an array containing the outer rings of polygons to an Arrow + * array holding the WKB encoded polygons and generate the additional index + * column arrays based on the spatial axes. + */ + std::vector _cast_polygon_vertex_list_to_wkb(ArrowArray* array); + + /** + * @brief Create a new ArrowTable by merging the generated WKB and spatial + * index arrays and the original data. + * + * @remark Generated columns have predefined names. Any generated column + * with name already present in the original data will be skipped. + */ + ArrowTable _reconstruct_geometry_data_table( + ArrowTable original_data, std::vector wkb_data); }; } // namespace tiledbsoma -#endif // SOMA_GEOMETRY_DATAFRAME \ No newline at end of file +#endif // SOMA_GEOMETRY_DATAFRAME diff --git a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc index bfde07be64..466af03b47 100644 --- a/libtiledbsoma/test/unit_soma_geometry_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_geometry_dataframe.cc @@ -141,6 +141,228 @@ TEST_CASE("SOMAGeometryDataFrame: basic", "[SOMAGeometryDataFrame]") { REQUIRE(soma_geometry->nnz() == 0); soma_geometry->close(); + auto soma_object = SOMAObject::open(uri, OpenMode::read, ctx); + REQUIRE(soma_object->uri() == uri); + REQUIRE(soma_object->type() == "SOMAGeometryDataFrame"); + soma_object->close(); + } +} + +TEST_CASE("SOMAGeometryDataFrame: Roundtrip", "[SOMAGeometryDataFrame]") { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { + auto ctx = std::make_shared(); + std::string uri{"mem://unit-test-geometry-roundtrip"}; + PlatformConfig platform_config{}; + + std::vector dim_infos( + {helper::DimInfo( + {.name = "soma_joinid", + .tiledb_datatype = TILEDB_INT64, + .dim_max = SOMA_JOINID_DIM_MAX, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain}), + helper::DimInfo( + {.name = "soma_geometry", + .tiledb_datatype = TILEDB_GEOM_WKB, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain})}); + + std::vector spatial_dim_infos( + {helper::DimInfo( + {.name = "x", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 200, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain}), + helper::DimInfo( + {.name = "y", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain})}); + + std::vector attr_infos({helper::AttrInfo( + {.name = "quality", .tiledb_datatype = TILEDB_FLOAT64})}); + + auto [schema, index_columns] = + helper::create_arrow_schema_and_index_columns( + dim_infos, attr_infos); + auto spatial_columns = helper::create_column_index_info( + spatial_dim_infos); + + SOMAGeometryDataFrame::create( + uri, + std::move(schema), + ArrowTable( + std::move(index_columns.first), + std::move(index_columns.second)), + ArrowTable( + std::move(spatial_columns.first), + std::move(spatial_columns.second)), + ctx, + platform_config, + std::nullopt); + + // Create table of data for writing + std::unique_ptr + data_schema = std::make_unique(ArrowSchema{}); + std::unique_ptr data_array = std::make_unique( + ArrowArray{}); + + nanoarrow::UniqueBuffer metadata_buffer; + ArrowMetadataBuilderInit(metadata_buffer.get(), nullptr); + ArrowMetadataBuilderAppend( + metadata_buffer.get(), + ArrowCharView("geometry_type"), + ArrowCharView("polygon_ring")); + + ArrowSchemaInitFromType(data_schema.get(), NANOARROW_TYPE_STRUCT); + ArrowSchemaAllocateChildren(data_schema.get(), 3); + ArrowSchemaInitFromType(data_schema->children[0], NANOARROW_TYPE_LIST); + ArrowSchemaSetMetadata( + data_schema->children[0], + std::string( + (char*)metadata_buffer->data, metadata_buffer->size_bytes) + .c_str()); + ArrowSchemaSetType( + data_schema->children[0]->children[0], NANOARROW_TYPE_DOUBLE); + ArrowSchemaSetName(data_schema->children[0], "soma_geometry"); + ArrowSchemaInitFromType(data_schema->children[1], NANOARROW_TYPE_INT64); + ArrowSchemaSetName(data_schema->children[1], "soma_joinid"); + ArrowSchemaInitFromType( + data_schema->children[2], NANOARROW_TYPE_DOUBLE); + ArrowSchemaSetName(data_schema->children[2], "quality"); + + ArrowArrayInitFromType(data_array.get(), NANOARROW_TYPE_STRUCT); + ArrowArrayAllocateChildren(data_array.get(), 3); + ArrowArrayInitFromType(data_array->children[0], NANOARROW_TYPE_LIST); + ArrowArrayInitFromType(data_array->children[1], NANOARROW_TYPE_INT64); + ArrowArrayInitFromType(data_array->children[2], NANOARROW_TYPE_DOUBLE); + ArrowArrayAllocateChildren(data_array->children[0], 1); + ArrowArrayInitFromType( + data_array->children[0]->children[0], NANOARROW_TYPE_DOUBLE); + ArrowArrayStartAppending(data_array->children[0]); + ArrowArrayStartAppending(data_array->children[0]->children[0]); + ArrowArrayStartAppending(data_array->children[1]); + ArrowArrayStartAppending(data_array->children[2]); + + geometry::GenericGeometry polygon = geometry::Polygon( + std::vector( + {geometry::BasePoint(0, 0), + geometry::BasePoint(1, 0), + geometry::BasePoint(0, 1)})); + NANOARROW_THROW_NOT_OK(ArrowBufferAppendUInt32( + ArrowArrayBuffer(data_array->children[0], 1), 0)); + data_array->children[0]->length = 1; + NANOARROW_THROW_NOT_OK( + ArrowArrayAppendDouble(data_array->children[0]->children[0], 0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayAppendDouble(data_array->children[0]->children[0], 0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayAppendDouble(data_array->children[0]->children[0], 1)); + NANOARROW_THROW_NOT_OK( + ArrowArrayAppendDouble(data_array->children[0]->children[0], 0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayAppendDouble(data_array->children[0]->children[0], 0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayAppendDouble(data_array->children[0]->children[0], 1)); + NANOARROW_THROW_NOT_OK(ArrowArrayAppendInt(data_array->children[1], 1)); + NANOARROW_THROW_NOT_OK( + ArrowArrayAppendDouble(data_array->children[2], 63)); + + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuildingDefault(data_array->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault( + data_array->children[0]->children[0], nullptr)); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuildingDefault(data_array->children[1], nullptr)); + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuildingDefault(data_array->children[2], nullptr)); + + // Write to point cloud. + auto soma_geometry = SOMAGeometryDataFrame::open( + uri, + OpenMode::write, + ctx, + {}, // column_names + ResultOrder::automatic, + std::nullopt); + + soma_geometry->set_array_data( + std::move(data_schema), std::move(data_array)); + soma_geometry->write(); + soma_geometry->close(); + + // Read back the data. + soma_geometry = SOMAGeometryDataFrame::open( + uri, + OpenMode::read, + ctx, + {}, // column_names, + ResultOrder::automatic, + std::nullopt); + + while (auto batch = soma_geometry->read_next()) { + auto arrbuf = batch.value(); + auto d0span = arrbuf->at(dim_infos[0].name)->data(); + auto d1span = arrbuf + ->at( + "tiledb__internal__" + + spatial_dim_infos[0].name + "__min") + ->data(); + auto d2span = arrbuf + ->at( + "tiledb__internal__" + + spatial_dim_infos[0].name + "__max") + ->data(); + auto d3span = arrbuf + ->at( + "tiledb__internal__" + + spatial_dim_infos[1].name + "__min") + ->data(); + auto d4span = arrbuf + ->at( + "tiledb__internal__" + + spatial_dim_infos[1].name + "__max") + ->data(); + auto wkbs = arrbuf->at(dim_infos[1].name)->binaries(); + auto a0span = arrbuf->at(attr_infos[0].name)->data(); + CHECK( + std::vector({1}) == + std::vector(d0span.begin(), d0span.end())); + CHECK( + std::vector({0}) == + std::vector(d1span.begin(), d1span.end())); + CHECK( + std::vector({1}) == + std::vector(d2span.begin(), d2span.end())); + CHECK( + std::vector({0}) == + std::vector(d3span.begin(), d3span.end())); + CHECK( + std::vector({1}) == + std::vector(d4span.begin(), d4span.end())); + auto a = geometry::to_wkb(polygon); + for (int i = 0; i < wkbs[0].size(); ++i) { + CHECK(a[i] == (unsigned char)wkbs[0][i]); + } + CHECK( + std::vector({63}) == + std::vector(a0span.begin(), a0span.end())); + } + soma_geometry->close(); + auto soma_object = SOMAObject::open(uri, OpenMode::read, ctx); REQUIRE(soma_object->uri() == uri); REQUIRE(soma_object->type() == "SOMAGeometryDataFrame");