Skip to content

Commit

Permalink
Add option for casting geometry data to WKB
Browse files Browse the repository at this point in the history
  • Loading branch information
XanthosXanthopoulos committed Oct 23, 2024
1 parent 02bfa69 commit 3c5065c
Show file tree
Hide file tree
Showing 4 changed files with 491 additions and 2 deletions.
2 changes: 1 addition & 1 deletion libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ class SOMAArray : public SOMAObject {
* @param arrow_schema
* @param arrow_array
*/
void set_array_data(
virtual void set_array_data(
std::unique_ptr<ArrowSchema> arrow_schema,
std::unique_ptr<ArrowArray> arrow_array);

Expand Down
234 changes: 234 additions & 0 deletions libtiledbsoma/src/soma/soma_geometry_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,13 @@
*/

#include "soma_geometry_dataframe.h"
#include "../geometry/geometry.h"
#include "../geometry/operators/envelope.h"
#include "../geometry/operators/io/write.h"
#include "../utils/util.h"

#include <regex>
#include <unordered_set>

namespace tiledbsoma {
using namespace tiledb;
Expand Down Expand Up @@ -122,4 +126,234 @@ uint64_t SOMAGeometryDataFrame::count() {
return this->nnz();
}

void SOMAGeometryDataFrame::set_array_data(
std::unique_ptr<ArrowSchema> arrow_schema,
std::unique_ptr<ArrowArray> arrow_array) {
std::vector<std::string> spatial_axes = this->spatial_column_names();

for (auto i = 0; i < arrow_schema->n_children; ++i) {
/**
* If `soma_geometry` conforms to specific formats automatically convert
* to WKB and create additional index columns for spatial axes.
*
* If the `soma_geometry` array is a WKB binary users are expected to
* provide the additional index columns for spatial axes.
*/

if (strcmp(arrow_schema->children[i]->name, "soma_geometry") == 0 &&
strcmp(arrow_schema->children[i]->format, "+l") == 0) {
std::string_view type_metadata;

if (ArrowMetadataHasKey(
arrow_schema->children[i]->metadata,
ArrowCharView("geometry_type"))) {
ArrowStringView out;
NANOARROW_THROW_NOT_OK(ArrowMetadataGetValue(
arrow_schema->children[i]->metadata,
ArrowCharView("geometry_type"),
&out));

type_metadata = std::string_view(out.data, out.size_bytes);
}

ArrowTable casted_data;
if (type_metadata == "polygon_ring") {
casted_data = _reconstruct_geometry_data_table(
ArrowTable(std::move(arrow_array), std::move(arrow_schema)),
_cast_polygon_vertex_list_to_wkb(arrow_array->children[i]));
} else {
throw std::runtime_error("Unknown geometry type");
}

SOMAArray::set_array_data(
std::move(casted_data.second), std::move(casted_data.first));
return;
}
}

SOMAArray::set_array_data(std::move(arrow_schema), std::move(arrow_array));
}

//===================================================================
//= private non-static
//===================================================================

/**
 * Cast a list array of polygon outer rings to WKB and compute the per-axis
 * bounding-box columns.
 *
 * Each list element is read as a flat sequence of doubles holding interleaved
 * coordinate pairs (`x0, y0, x1, y1, ...`).
 *
 * @param array Arrow list array (format `+l`, 32-bit offsets) whose single
 * child holds the coordinates as doubles.
 * @return One ArrowTable per generated column: first the `soma_geometry`
 * large-binary WKB column, then for every spatial axis a `<prefix><axis>__min`
 * and a `<prefix><axis>__max` double column, in that order.
 * @throws std::runtime_error if a ring has a negative extent or an odd number
 * of coordinates.
 *
 * NOTE(review): the validity bitmap of `array` is not consulted, so null list
 * entries are treated like any other entry — confirm that callers never pass
 * nulls.
 */
std::vector<ArrowTable> SOMAGeometryDataFrame::_cast_polygon_vertex_list_to_wkb(
    ArrowArray* array) {
    ArrowError error;
    const std::vector<std::string> spatial_axes = this->spatial_column_names();

    // Holds all the Arrow tables containing the transformed geometry data.
    std::vector<ArrowTable> tables;
    tables.reserve(1 + 2 * spatial_axes.size());

    // Append an empty, initialized column of the given type and name.
    auto add_column = [&tables](ArrowType type, const std::string& name) {
        tables.push_back(ArrowTable(
            std::make_unique<ArrowArray>(ArrowArray{}),
            std::make_unique<ArrowSchema>(ArrowSchema{})));
        NANOARROW_THROW_NOT_OK(
            ArrowArrayInitFromType(tables.back().first.get(), type));
        NANOARROW_THROW_NOT_OK(
            ArrowSchemaInitFromType(tables.back().second.get(), type));
        NANOARROW_THROW_NOT_OK(
            ArrowSchemaSetName(tables.back().second.get(), name.c_str()));
    };

    add_column(ArrowType::NANOARROW_TYPE_LARGE_BINARY, "soma_geometry");
    for (const auto& axis : spatial_axes) {
        add_column(
            ArrowType::NANOARROW_TYPE_DOUBLE,
            SOMAGeometryDataFrame::dimension_prefix + axis + "__min");
        add_column(
            ArrowType::NANOARROW_TYPE_DOUBLE,
            SOMAGeometryDataFrame::dimension_prefix + axis + "__max");
    }

    // List of doubles with 32-bit offsets ("+l"). The offsets buffer holds
    // `length + 1` entries, so `offsets[index + 1]` is valid for every
    // element including the last one. Logical offsets of both the list and
    // its child are honoured so sliced arrays are read correctly.
    const ArrowArray* vertices = array->children[0];
    const int32_t* offsets = static_cast<const int32_t*>(array->buffers[1]) +
                             array->offset;
    const double* data = static_cast<const double*>(vertices->buffers[1]) +
                         vertices->offset;

    size_t wkb_buffer_size = 0;
    std::vector<geometry::GenericGeometry> geometries;
    geometries.reserve(static_cast<size_t>(array->length));

    for (int64_t index = 0; index < array->length; ++index) {
        const int64_t start_index = offsets[index];
        const int64_t stop_index = offsets[index + 1];

        // Coordinates are consumed in pairs; an odd count would make the
        // loop below read one double past the end of the ring.
        if (stop_index < start_index ||
            (stop_index - start_index) % 2 != 0) {
            throw std::runtime_error(
                "Polygon ring must contain an even number of coordinates");
        }

        std::vector<geometry::BasePoint> ring;
        ring.reserve(static_cast<size_t>((stop_index - start_index) / 2));
        for (int64_t j = start_index; j < stop_index; j += 2) {
            ring.push_back(geometry::BasePoint(data[j], data[j + 1]));
        }

        geometries.push_back(
            geometry::GenericGeometry(geometry::Polygon(std::move(ring))));
        wkb_buffer_size += wkb_size(geometries.back());
    }

    NANOARROW_THROW_NOT_OK(
        ArrowArrayReserve(tables.front().first.get(), wkb_buffer_size));
    NANOARROW_THROW_NOT_OK(
        ArrowArrayStartAppending(tables.front().first.get()));
    for (size_t i = 1; i < tables.size(); ++i) {
        NANOARROW_THROW_NOT_OK(
            ArrowArrayReserve(tables[i].first.get(), array->length));
        NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tables[i].first.get()));
    }

    for (auto& geometry : geometries) {
        geometry::BinaryBuffer wkb = geometry::to_wkb(geometry);
        geometry::Envelope envelope = geometry::envelope(geometry);

        ArrowBufferView wkb_view;
        wkb_view.data.data = wkb.data();
        wkb_view.size_bytes = static_cast<int64_t>(wkb.size());

        NANOARROW_THROW_NOT_OK(
            ArrowArrayAppendBytes(tables.front().first.get(), wkb_view));

        // tables[0] is the WKB column; axis `i` owns tables[2i+1] (min) and
        // tables[2i+2] (max).
        for (size_t i = 0; i < spatial_axes.size(); ++i) {
            NANOARROW_THROW_NOT_OK(ArrowArrayAppendDouble(
                tables[2 * i + 1].first.get(), envelope.range.at(i).first));
            NANOARROW_THROW_NOT_OK(ArrowArrayAppendDouble(
                tables[2 * i + 2].first.get(), envelope.range.at(i).second));
        }
    }

    for (auto& table : tables) {
        NANOARROW_THROW_NOT_OK(
            ArrowArrayFinishBuildingDefault(table.first.get(), &error));
    }

    return tables;
}

/**
 * Merge the generated WKB / spatial index columns with the user supplied
 * table into a new struct-typed ArrowTable.
 *
 * The generated columns occupy the first `wkb_data.size()` child slots. Every
 * original column except `soma_geometry` is then moved in: if its name matches
 * a generated column it replaces that column in place, otherwise it is
 * appended after the generated ones.
 *
 * @param original_data Table supplied by the caller; its children are moved
 * out of it.
 * @param wkb_data Generated columns; their contents are moved out.
 * @return The merged table as an (array, schema) pair.
 *
 * NOTE(review): the parent struct of `original_data` and its list-typed
 * `soma_geometry` child are never released in this function — confirm who
 * owns them, otherwise they leak.
 */
ArrowTable SOMAGeometryDataFrame::_reconstruct_geometry_data_table(
    ArrowTable original_data, std::vector<ArrowTable> wkb_data) {
    const int64_t generated_count = static_cast<int64_t>(wkb_data.size());
    const int64_t original_count = original_data.second->n_children;

    // The number of output columns is the number of distinct column names.
    std::unordered_set<std::string> unique_column_names;
    for (int64_t i = 0; i < original_count; ++i) {
        unique_column_names.insert(original_data.second->children[i]->name);
    }
    for (const auto& generated : wkb_data) {
        unique_column_names.insert(generated.second->name);
    }
    const int64_t column_count = static_cast<int64_t>(
        unique_column_names.size());

    std::unique_ptr<ArrowSchema> arrow_schema = std::make_unique<ArrowSchema>(
        ArrowSchema{});
    std::unique_ptr<ArrowArray> arrow_array = std::make_unique<ArrowArray>(
        ArrowArray{});

    NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(
        arrow_schema.get(), ArrowType::NANOARROW_TYPE_STRUCT));
    NANOARROW_THROW_NOT_OK(
        ArrowSchemaAllocateChildren(arrow_schema.get(), column_count));
    NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(
        arrow_array.get(), ArrowType::NANOARROW_TYPE_STRUCT));
    NANOARROW_THROW_NOT_OK(
        ArrowArrayAllocateChildren(arrow_array.get(), column_count));

    // ArrowArrayInitFromType leaves the struct with length 0; carry over the
    // row count of the table being replaced so the parent agrees with its
    // children.
    arrow_array->length = original_data.first->length;

    // First add the wkb data columns so that already existing columns in the
    // original data except `soma_geometry` can overwrite the generated
    // columns.
    for (int64_t i = 0; i < generated_count; ++i) {
        ArrowSchemaMove(wkb_data[i].second.get(), arrow_schema->children[i]);
        ArrowArrayMove(wkb_data[i].first.get(), arrow_array->children[i]);
    }

    int64_t next_free = generated_count;
    for (int64_t i = 0; i < original_count; ++i) {
        ArrowSchema* source_schema = original_data.second->children[i];
        ArrowArray* source_array = original_data.first->children[i];

        // The list-typed geometry column is superseded by the WKB column.
        if (strcmp(source_schema->name, "soma_geometry") == 0) {
            continue;
        }

        // Look for a generated column with the same name.
        int64_t target = -1;
        for (int64_t j = 0; j < generated_count; ++j) {
            if (strcmp(arrow_schema->children[j]->name, source_schema->name) ==
                0) {
                target = j;
                break;
            }
        }

        if (target >= 0) {
            // User supplied column wins: drop the generated one first.
            arrow_schema->children[target]->release(
                arrow_schema->children[target]);
            arrow_array->children[target]->release(
                arrow_array->children[target]);
        } else {
            target = next_free++;
        }

        ArrowSchemaMove(source_schema, arrow_schema->children[target]);
        ArrowArrayMove(source_array, arrow_array->children[target]);
    }

    return ArrowTable(std::move(arrow_array), std::move(arrow_schema));
}

} // namespace tiledbsoma
35 changes: 34 additions & 1 deletion libtiledbsoma/src/soma/soma_geometry_dataframe.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#define SOMA_GEOMETRY_DATAFRAME

#include <filesystem>
#include <vector>

#include "soma_array.h"

Expand Down Expand Up @@ -173,7 +174,39 @@ class SOMAGeometryDataFrame : virtual public SOMAArray {
* @return int64_t
*/
uint64_t count();

/**
 * @brief Set the Arrow table to write, converting `soma_geometry` to WKB
 * when needed.
 *
 * When the table has a `soma_geometry` child with Arrow format `+l` whose
 * `geometry_type` metadata is `polygon_ring`, the column is cast to WKB and
 * the spatial index columns are generated before the table is forwarded to
 * `SOMAArray::set_array_data`. A `+l` `soma_geometry` column with any other
 * (or no) `geometry_type` raises `std::runtime_error`. In every other case
 * the table is forwarded unchanged.
 *
 * @param arrow_schema Schema of the table to write
 * @param arrow_array Data of the table to write
 */
void set_array_data(
std::unique_ptr<ArrowSchema> arrow_schema,
std::unique_ptr<ArrowArray> arrow_array) override;

private:
//===================================================================
//= private static
//===================================================================

// Prefix of the generated spatial index column names
// (`<prefix><axis>__min` / `<prefix><axis>__max`). Declared static so it is
// shared by all instances instead of being stored in every object.
inline static const std::string dimension_prefix = "tiledb__internal__";

//===================================================================
//= private non-static
//===================================================================

/**
 * @brief Cast an array containing the outer rings of polygons to an Arrow
 * array holding the WKB encoded polygons and generate the additional index
 * column arrays based on the spatial axes.
 *
 * @param array List array whose child holds the ring coordinates as doubles,
 * read as interleaved pairs (two consecutive values per vertex).
 * @return One table per generated column: the `soma_geometry` large-binary
 * WKB column first, followed by a `__min` and a `__max` double column for
 * each spatial axis, named `dimension_prefix + axis + suffix`.
 */
std::vector<ArrowTable> _cast_polygon_vertex_list_to_wkb(ArrowArray* array);

/**
 * @brief Create a new ArrowTable by merging the generated WKB and spatial
 * index arrays and the original data.
 *
 * @remark Generated columns have predefined names. A generated column whose
 * name is already present in the original data is replaced by the original
 * column; the only exception is `soma_geometry`, where the generated WKB
 * column is always kept and the original column is dropped.
 *
 * @param original_data Table supplied by the caller; its columns are moved
 * into the result.
 * @param wkb_data Generated columns; they are placed first in the result.
 * @return The merged table.
 */
ArrowTable _reconstruct_geometry_data_table(
ArrowTable original_data, std::vector<ArrowTable> wkb_data);
};
} // namespace tiledbsoma

#endif // SOMA_GEOMETRY_DATAFRAME
#endif // SOMA_GEOMETRY_DATAFRAME
Loading

0 comments on commit 3c5065c

Please sign in to comment.