From cb2c7efcdd2d659e9d8181c4fa43bc4b0e1b2788 Mon Sep 17 00:00:00 2001 From: Spencer Seale Date: Tue, 3 Sep 2024 10:38:20 -0700 Subject: [PATCH 01/11] fix doc str to proper method ref (#2942) --- apis/python/src/tiledbsoma/_experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apis/python/src/tiledbsoma/_experiment.py b/apis/python/src/tiledbsoma/_experiment.py index 27961b18a6..fc0e7eea3b 100644 --- a/apis/python/src/tiledbsoma/_experiment.py +++ b/apis/python/src/tiledbsoma/_experiment.py @@ -51,7 +51,7 @@ class Experiment( # type: ignore[misc] # __eq__ false positive ... obs_df = exp.obs ... ... # the primary use case is to run queries on the experiment data. - ... q = exp.query( + ... q = exp.axis_query( ... "mtdna", ... obs_query=tiledbsoma.AxisQuery(value_filter="tissue == 'lung'"), ... var_query=tiledbsoma.AxisQuery(coords=(slice(50, 100),)), From 9ce8ea092980bc54bef470c902d99eff6fce798c Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 3 Sep 2024 13:39:32 -0400 Subject: [PATCH 02/11] [c++] Unit-test `resize` for `SparseNDArray` and `DenseNDArray` (#2947) --- libtiledbsoma/test/unit_soma_dense_ndarray.cc | 17 ++++-- .../test/unit_soma_sparse_ndarray.cc | 54 ++++++++++++++----- 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index cbfd4783f4..7fa792d928 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -34,7 +34,7 @@ #define DIM_MAX 1000 -TEST_CASE("SOMADenseNDArray: basic") { +TEST_CASE("SOMADenseNDArray: basic", "[SOMADenseNDArray]") { int64_t dim_max = 1000; auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal @@ -98,7 +98,7 @@ TEST_CASE("SOMADenseNDArray: basic") { REQUIRE(schema->domain().has_dimension(dim_name)); REQUIRE(soma_dense->ndim() == 1); - // Once we have support for current domain in dense arrays + // TODO: Once we have support for current domain in dense arrays // if (use_current_domain) { // REQUIRE(soma_dense->shape() == std::vector{dim_max + // 1}); @@ -129,11 +129,20 @@ TEST_CASE("SOMADenseNDArray: basic") { REQUIRE(a0 == std::vector(a0span.begin(), a0span.end())); } soma_dense->close(); + + soma_dense = SOMADenseNDArray::open(uri, OpenMode::read, ctx); + auto new_shape = std::vector({DIM_MAX * 2}); + // * When use_current_domain is false: can't resize what has not + // been sized. + // * When use_current_domain is true: TODO: current domain not + // supported for dense arrays yet (see above). + REQUIRE_THROWS(soma_dense->resize(new_shape)); + soma_dense->close(); } } } -TEST_CASE("SOMADenseNDArray: platform_config") { +TEST_CASE("SOMADenseNDArray: platform_config", "[SOMADenseNDArray]") { int64_t dim_max = 1000; auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal @@ -196,7 +205,7 @@ TEST_CASE("SOMADenseNDArray: platform_config") { } } -TEST_CASE("SOMADenseNDArray: metadata") { +TEST_CASE("SOMADenseNDArray: metadata", "[SOMADenseNDArray]") { int64_t dim_max = 1000; auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index added6b5b4..ce9a943f42 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -31,11 +31,11 @@ */ #include "common.h" -#define DIM_MAX 1000 -TEST_CASE("SOMASparseNDArray: basic") { +TEST_CASE("SOMASparseNDArray: basic", "[SOMASparseNDArray]") { int64_t dim_max = 1000; - auto use_current_domain = GENERATE(false, true); + // auto use_current_domain = GENERATE(false, true); + auto use_current_domain = GENERATE(true); // TODO this could be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be // replaced with std::format. @@ -45,6 +45,7 @@ TEST_CASE("SOMASparseNDArray: basic") { auto ctx = std::make_shared(); std::string uri = "mem://unit-test-sparse-ndarray-basic"; std::string dim_name = "soma_dim_0"; + std::string attr_name = "soma_data"; tiledb_datatype_t tiledb_datatype = TILEDB_INT64; std::string arrow_format = helper::to_arrow_format(tiledb_datatype); @@ -79,7 +80,7 @@ TEST_CASE("SOMASparseNDArray: basic") { REQUIRE(soma_sparse->is_sparse() == true); REQUIRE(soma_sparse->soma_data_type() == arrow_format); auto schema = soma_sparse->tiledb_schema(); - REQUIRE(schema->has_attribute("soma_data")); + REQUIRE(schema->has_attribute(attr_name)); REQUIRE(schema->array_type() == TILEDB_SPARSE); REQUIRE(schema->domain().has_dimension(dim_name)); REQUIRE(soma_sparse->ndim() == 1); @@ -100,8 +101,8 @@ TEST_CASE("SOMASparseNDArray: basic") { std::vector a0(10, 1); soma_sparse->open(OpenMode::write); - soma_sparse->set_column_data("soma_data", a0.size(), a0.data()); soma_sparse->set_column_data(dim_name, d0.size(), d0.data()); + soma_sparse->set_column_data(attr_name, a0.size(), a0.data()); soma_sparse->write(); soma_sparse->close(); @@ -109,21 +110,50 @@ TEST_CASE("SOMASparseNDArray: basic") { while (auto batch = soma_sparse->read_next()) { auto arrbuf = batch.value(); auto d0span = arrbuf->at(dim_name)->data(); - auto a0span = arrbuf->at("soma_data")->data(); + auto a0span = arrbuf->at(attr_name)->data(); REQUIRE(d0 == std::vector(d0span.begin(), d0span.end())); REQUIRE(a0 == std::vector(a0span.begin(), a0span.end())); } - // TODO on a subsequent PR if use_current_domain: - // * test write out of bounds, including assertion of exception type - // * test resize - // * test write within new bounds + std::vector d0b({dim_max, dim_max + 1}); + std::vector a0b({30, 40}); + soma_sparse->close(); + // Try out-of-bounds write before resize. + // * Without current domain support: this should throw since it's + // outside the (immutable) doqain. + // * With current domain support: this should throw since it's outside + // the (mutable) current domain. + soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); + soma_sparse->set_column_data(dim_name, d0b.size(), d0b.data()); + soma_sparse->set_column_data(attr_name, a0b.size(), a0b.data()); + REQUIRE_THROWS(soma_sparse->write()); soma_sparse->close(); + + soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); + auto new_shape = std::vector({dim_max * 2}); + if (!use_current_domain) { + // Without current-domain support: this should throw since + // one cannot resize what has not been sized. + REQUIRE_THROWS(soma_sparse->resize(new_shape)); + } else { + soma_sparse->resize(new_shape); + } + soma_sparse->close(); + + // Try out-of-bounds write after resize. + if (use_current_domain) { + soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); + soma_sparse->set_column_data(dim_name, d0b.size(), d0b.data()); + soma_sparse->set_column_data(attr_name, a0b.size(), a0b.data()); + // Implicitly checking for no throw + soma_sparse->write(); + soma_sparse->close(); + } } } -TEST_CASE("SOMASparseNDArray: platform_config") { +TEST_CASE("SOMASparseNDArray: platform_config", "[SOMASparseNDArray]") { int64_t dim_max = 1000; auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal @@ -171,7 +201,7 @@ TEST_CASE("SOMASparseNDArray: platform_config") { } } -TEST_CASE("SOMASparseNDArray: metadata") { +TEST_CASE("SOMASparseNDArray: metadata", "[SOMASparseNDArray]") { int64_t dim_max = 1000; auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal From 37b6e500c06349a75bbffcf066badcd868b4ac88 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 3 Sep 2024 13:39:52 -0400 Subject: [PATCH 03/11] [c++] Make a Catch2 test fixture for dataframes (#2945) * [c++] Split out a test fixture for dataframes * fix a failure case --- libtiledbsoma/test/unit_soma_dataframe.cc | 352 +++++++++++++--------- 1 file changed, 212 insertions(+), 140 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index bd75350caa..f56a68cb94 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -32,56 +32,187 @@ #include "common.h" -#define DIM_MAX 1000 +// This is a keystroke-reduction fixture for some similar unit-test cases For +// convenience there are dims/attrs of type int64, uint32, and string. (Feel +// free to add more types.) The main value-adds of this fixture are (a) simple +// keystroke-reduction; (b) you get to pick which ones are the dim(s) and which +// are the attr(s). +struct VariouslyIndexedDataFrameFixture { + std::shared_ptr ctx_; + std::string uri_; + bool use_current_domain_; + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Using Catch2's TEST_CASE_METHOD we can't pass constructor args. + // This is a call-after-construction method. + void set_up(std::shared_ptr ctx, std::string uri) { + ctx_ = ctx; + uri_ = uri; + } -TEST_CASE("SOMADataFrame: basic") { - int64_t dim_max = 1000; - auto use_current_domain = GENERATE(false, true); - // TODO this could be formatted with fmt::format which is part of internal - // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be - // replaced with std::format. - std::ostringstream section; - section << "- use_current_domain=" << use_current_domain; - SECTION(section.str()) { - auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-dataframe-basic"; - std::string dim_name = "d0"; - std::string attr_name = "a0"; - tiledb_datatype_t tiledb_datatype = TILEDB_INT64; - std::string arrow_format = helper::to_arrow_format(tiledb_datatype); + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Helpers for setting up dim/attr configs and data + static const inline int64_t i64_dim_max = 100; + static const inline int64_t u32_dim_max = 10000; + static const inline int64_t str_dim_max = 0; // not used for string dims + + static const inline std::string i64_name = "soma_joinid"; + static const inline std::string u32_name = "myuint32"; + static const inline std::string str_name = "mystring"; + + tiledb_datatype_t i64_datatype = TILEDB_INT64; + tiledb_datatype_t u32_datatype = TILEDB_UINT32; + tiledb_datatype_t str_datatype = TILEDB_STRING_ASCII; + + std::string i64_arrow_format = helper::to_arrow_format(i64_datatype); + std::string u32_arrow_format = helper::to_arrow_format(u32_datatype); + std::string attr_1_arrow_format = helper::to_arrow_format(str_datatype); + + helper::DimInfo i64_dim_info(bool use_current_domain) { + return helper::DimInfo( + {.name = i64_name, + .tiledb_datatype = i64_datatype, + .dim_max = i64_dim_max, + .use_current_domain = use_current_domain}); + } + helper::DimInfo u32_dim_info(bool use_current_domain) { + return helper::DimInfo( + {.name = u32_name, + .tiledb_datatype = u32_datatype, + .dim_max = u32_dim_max, + .use_current_domain = use_current_domain}); + } + helper::DimInfo str_dim_info(bool use_current_domain) { + return helper::DimInfo( + {.name = str_name, + .tiledb_datatype = str_datatype, + .dim_max = str_dim_max, + .use_current_domain = use_current_domain}); + } - REQUIRE(!SOMADataFrame::exists(uri, ctx)); + helper::AttrInfo i64_attr_info(std::string name = i64_name) { + return helper::AttrInfo( + {.name = name, .tiledb_datatype = i64_datatype}); + } + helper::AttrInfo u32_attr_info() { + return helper::AttrInfo( + {.name = u32_name, .tiledb_datatype = u32_datatype}); + } + helper::AttrInfo str_attr_info() { + return helper::AttrInfo( + {.name = str_name, .tiledb_datatype = str_datatype}); + } - std::vector dim_infos( - {{.name = dim_name, - .tiledb_datatype = tiledb_datatype, - .dim_max = dim_max, - .use_current_domain = use_current_domain}}); + std::vector make_i64_data() { + return std::vector({1, 2}); + } + std::vector make_u32_data() { + return std::vector({1234, 5678}); + } + std::vector make_str_data() { + return std::vector({"apple", "bat"}); + } - std::vector attr_infos( - {{.name = attr_name, .tiledb_datatype = tiledb_datatype}}); + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Helper methods for create/open/write/etc. + void create( + const std::vector& dim_infos, + const std::vector& attr_infos) { auto [schema, index_columns] = helper::create_arrow_schema_and_index_columns( dim_infos, attr_infos); + SOMADataFrame::create( + uri_, + std::move(schema), + ArrowTable( + std::move(index_columns.first), + std::move(index_columns.second)), + ctx_); + } + void create( + const std::vector& dim_infos, + const std::vector& attr_infos, + const PlatformConfig& platform_config, + std::optional timestamp_range = std::nullopt) { + auto [schema, index_columns] = + helper::create_arrow_schema_and_index_columns( + dim_infos, attr_infos); SOMADataFrame::create( - uri, + uri_, std::move(schema), ArrowTable( std::move(index_columns.first), std::move(index_columns.second)), - ctx); + ctx_, + platform_config, + timestamp_range); + } - REQUIRE(SOMADataFrame::exists(uri, ctx)); - REQUIRE(!SOMASparseNDArray::exists(uri, ctx)); - REQUIRE(!SOMADenseNDArray::exists(uri, ctx)); + std::unique_ptr open( + OpenMode mode, + ResultOrder result_order = ResultOrder::automatic, + std::optional timestamp_range = std::nullopt) { + return SOMADataFrame::open( + uri_, + mode, + ctx_, + {}, // column_names + result_order, + timestamp_range); + } - auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); - REQUIRE(soma_dataframe->uri() == uri); - REQUIRE(soma_dataframe->ctx() == ctx); + void write_generic_data() { + // No arguments -- for now. + // In a subsequent PR we'll vary writing in-bounds vs out of bounds. + auto soma_dataframe = SOMADataFrame::open(uri_, OpenMode::write, ctx_); + + auto i64_data = make_i64_data(); + auto u32_data = make_u32_data(); + auto str_data = make_str_data(); + + soma_dataframe->set_column_data( + i64_name, i64_data.size(), i64_data.data()); + soma_dataframe->set_column_data( + str_name, str_data.size(), str_data.data()); + soma_dataframe->set_column_data( + u32_name, u32_data.size(), u32_data.data()); + soma_dataframe->write(); + + soma_dataframe->close(); + } +}; + +TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { + set_up( + std::make_shared(), "mem://unit-test-dataframe-basic"); + + std::vector dim_infos( + {i64_dim_info(use_current_domain)}); + std::vector attr_infos({u32_attr_info()}); + + REQUIRE(!SOMADataFrame::exists(uri_, ctx_)); + + create(dim_infos, attr_infos); + + REQUIRE(SOMADataFrame::exists(uri_, ctx_)); + REQUIRE(!SOMASparseNDArray::exists(uri_, ctx_)); + REQUIRE(!SOMADenseNDArray::exists(uri_, ctx_)); + + auto soma_dataframe = open(OpenMode::read); + REQUIRE(soma_dataframe->uri() == uri_); + REQUIRE(soma_dataframe->ctx() == ctx_); REQUIRE(soma_dataframe->type() == "SOMADataFrame"); - std::vector expected_index_column_names = {dim_name}; + std::vector expected_index_column_names = { + dim_infos[0].name}; REQUIRE( soma_dataframe->index_column_names() == expected_index_column_names); @@ -93,30 +224,33 @@ TEST_CASE("SOMADataFrame: basic") { d0[j] = j; std::vector a0(10, 1); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); - soma_dataframe->set_column_data(attr_name, a0.size(), a0.data()); - soma_dataframe->set_column_data(dim_name, d0.size(), d0.data()); + soma_dataframe = open(OpenMode::write); + soma_dataframe->set_column_data( + dim_infos[0].name, d0.size(), d0.data()); + soma_dataframe->set_column_data( + attr_infos[0].name, a0.size(), a0.data()); soma_dataframe->write(); soma_dataframe->close(); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); + soma_dataframe = open(OpenMode::read); while (auto batch = soma_dataframe->read_next()) { auto arrbuf = batch.value(); - auto d0span = arrbuf->at(dim_name)->data(); - auto a0span = arrbuf->at(attr_name)->data(); + auto d0span = arrbuf->at(dim_infos[0].name)->data(); + auto a0span = arrbuf->at(attr_infos[0].name)->data(); REQUIRE(d0 == std::vector(d0span.begin(), d0span.end())); REQUIRE(a0 == std::vector(a0span.begin(), a0span.end())); } soma_dataframe->close(); - auto soma_object = SOMAObject::open(uri, OpenMode::read, ctx); - REQUIRE(soma_object->uri() == uri); + auto soma_object = SOMAObject::open(uri_, OpenMode::read, ctx_); + REQUIRE(soma_object->uri() == uri_); REQUIRE(soma_object->type() == "SOMADataFrame"); soma_object->close(); } } -TEST_CASE("SOMADataFrame: platform_config") { +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, "SOMADataFrame: platform_config") { std::pair filter = GENERATE( std::make_pair( R"({"name": "GZIP", "COMPRESSION_LEVEL": 3})", TILEDB_FILTER_GZIP), @@ -158,26 +292,20 @@ TEST_CASE("SOMADataFrame: platform_config") { std::make_pair(R"("BYTESHUFFLE")", TILEDB_FILTER_BYTESHUFFLE), std::make_pair(R"("NOOP")", TILEDB_FILTER_NONE)); - // TODO this use to be formatted with fmt::format which is part of internal + // TODO this used to be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be // replaced with std::format. std::ostringstream section; section << "- filter=" << filter.first; SECTION(section.str()) { - int64_t dim_max = 1000; auto use_current_domain = GENERATE(false, true); - // TODO this could be formatted with fmt::format which is part of - // internal header spd/log/fmt/fmt.h and should not be used. In C++20, - // this can be replaced with std::format. std::ostringstream section2; section2 << "- use_current_domain=" << use_current_domain; SECTION(section2.str()) { - auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-dataframe-platform-config"; - std::string dim_name = "d0"; - std::string attr_name = "a0"; - tiledb_datatype_t tiledb_datatype = TILEDB_INT64; + set_up( + std::make_shared(), + "mem://unit-test-dataframe-platform-config"); PlatformConfig platform_config; platform_config.dataframe_dim_zstd_level = 6; @@ -189,28 +317,14 @@ TEST_CASE("SOMADataFrame: platform_config") { } std::vector dim_infos( - {{.name = dim_name, - .tiledb_datatype = tiledb_datatype, - .dim_max = dim_max, - .use_current_domain = use_current_domain}}); - - std::vector attr_infos( - {{.name = attr_name, .tiledb_datatype = tiledb_datatype}}); - - auto [schema, index_columns] = - helper::create_arrow_schema_and_index_columns( - dim_infos, attr_infos); - - SOMADataFrame::create( - uri, - std::move(schema), - ArrowTable( - std::move(index_columns.first), - std::move(index_columns.second)), - ctx, - platform_config); - - auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); + {i64_dim_info(use_current_domain)}); + std::vector attr_infos({i64_attr_info("a0")}); + + REQUIRE(!SOMADataFrame::exists(uri_, ctx_)); + + create(dim_infos, attr_infos, platform_config); + + auto soma_dataframe = open(OpenMode::read); auto sch = soma_dataframe->tiledb_schema(); REQUIRE( sch->offsets_filter_list().filter(0).filter_type() == @@ -221,7 +335,7 @@ TEST_CASE("SOMADataFrame: platform_config") { filter.second); auto dim_filter = sch->domain() - .dimension(dim_name) + .dimension(dim_infos[0].name) .filter_list() .filter(0); REQUIRE(dim_filter.filter_type() == TILEDB_FILTER_ZSTD); @@ -230,7 +344,7 @@ TEST_CASE("SOMADataFrame: platform_config") { if (filter.second != TILEDB_FILTER_WEBP) { REQUIRE( - sch->attribute(attr_name) + sch->attribute(attr_infos[0].name) .filter_list() .filter(0) .filter_type() == filter.second); @@ -240,8 +354,7 @@ TEST_CASE("SOMADataFrame: platform_config") { } } -TEST_CASE("SOMADataFrame: metadata") { - int64_t dim_max = 1000; +TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") { auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be @@ -249,43 +362,18 @@ TEST_CASE("SOMADataFrame: metadata") { std::ostringstream section; section << "- use_current_domain=" << use_current_domain; SECTION(section.str()) { - auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-collection"; - std::string dim_name = "d0"; - std::string attr_name = "a0"; - tiledb_datatype_t tiledb_datatype = TILEDB_INT64; - std::string arrow_format = helper::to_arrow_format(tiledb_datatype); + set_up(std::make_shared(), "mem://unit-test-collection"); std::vector dim_infos( - {{.name = dim_name, - .tiledb_datatype = tiledb_datatype, - .dim_max = dim_max, - .use_current_domain = use_current_domain}}); + {i64_dim_info(use_current_domain)}); + std::vector attr_infos({u32_attr_info()}); - std::vector attr_infos( - {{.name = attr_name, .tiledb_datatype = tiledb_datatype}}); + REQUIRE(!SOMADataFrame::exists(uri_, ctx_)); - auto [schema, index_columns] = - helper::create_arrow_schema_and_index_columns( - dim_infos, attr_infos); + create(dim_infos, attr_infos, PlatformConfig(), TimestampRange(0, 2)); - SOMADataFrame::create( - uri, - std::move(schema), - ArrowTable( - std::move(index_columns.first), - std::move(index_columns.second)), - ctx, - PlatformConfig(), - TimestampRange(0, 2)); - - auto soma_dataframe = SOMADataFrame::open( - uri, - OpenMode::write, - ctx, - {}, - ResultOrder::automatic, - TimestampRange(1, 1)); + auto soma_dataframe = open( + OpenMode::write, ResultOrder::automatic, TimestampRange(1, 1)); int32_t val = 100; soma_dataframe->set_metadata("md", TILEDB_INT32, 1, &val); @@ -336,54 +424,38 @@ TEST_CASE("SOMADataFrame: metadata") { } } -TEST_CASE("SOMADataFrame: bounds-checking") { +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, "SOMADataFrame: bounds-checking") { bool use_current_domain = true; int old_max = 100; int new_max = 200; - auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-bounds-checking"; - std::string dim_name = "d0"; - std::string attr_name = "a0"; - tiledb_datatype_t tiledb_datatype = TILEDB_INT64; - std::string arrow_format = helper::to_arrow_format(tiledb_datatype); - - std::vector dim_infos( - {{.name = dim_name, - .tiledb_datatype = tiledb_datatype, - .dim_max = old_max, - .use_current_domain = use_current_domain}}); + set_up(std::make_shared(), "mem://unit-test-bounds-checking"); - std::vector attr_infos( - {{.name = attr_name, .tiledb_datatype = tiledb_datatype}}); + std::vector dim_infos({i64_dim_info(use_current_domain)}); + std::vector attr_infos({u32_attr_info()}); - auto [schema, index_columns] = - helper::create_arrow_schema_and_index_columns(dim_infos, attr_infos); + REQUIRE(!SOMADataFrame::exists(uri_, ctx_)); - SOMADataFrame::create( - uri, - std::move(schema), - ArrowTable( - std::move(index_columns.first), std::move(index_columns.second)), - ctx); + create(dim_infos, attr_infos); - auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); + auto soma_dataframe = open(OpenMode::write); std::vector d0({old_max + 1, old_max + 2}); std::vector a0({1.5, 2.5}); - soma_dataframe->set_column_data(dim_name, d0.size(), d0.data()); - soma_dataframe->set_column_data(attr_name, a0.size(), a0.data()); + soma_dataframe->set_column_data(dim_infos[0].name, d0.size(), d0.data()); + soma_dataframe->set_column_data(attr_infos[0].name, a0.size(), a0.data()); // Writing outside the current domain should fail REQUIRE_THROWS(soma_dataframe->write()); soma_dataframe->close(); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); + soma_dataframe = open(OpenMode::write); soma_dataframe->resize(std::vector({new_max})); soma_dataframe->close(); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); - soma_dataframe->set_column_data(dim_name, d0.size(), d0.data()); - soma_dataframe->set_column_data(attr_name, a0.size(), a0.data()); + soma_dataframe = open(OpenMode::write); + soma_dataframe->set_column_data(dim_infos[0].name, d0.size(), d0.data()); + soma_dataframe->set_column_data(attr_infos[0].name, a0.size(), a0.data()); // Writing after resize should succeed soma_dataframe->write(); From ded1e4247c57d1725161743006008c2376ff8de2 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 3 Sep 2024 13:42:21 -0400 Subject: [PATCH 04/11] [c++] Unit-test variant-indexed dataframes (#2944) * [c++] Split out a test fixture for dataframes * fix a failure case * [c++] Unit-test variant-indexed dataframes [WIP] * neaten * code-review feedback [skip_ci] --- libtiledbsoma/src/soma/soma_array.h | 7 + libtiledbsoma/src/utils/arrow_adapter.cc | 5 +- libtiledbsoma/test/common.cc | 2 +- libtiledbsoma/test/unit_soma_dataframe.cc | 194 ++++++++++++++++++++++ 4 files changed, 204 insertions(+), 4 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 8e87f3b04a..790c8594bf 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -796,6 +796,13 @@ class SOMAArray : public SOMAObject { */ std::optional timestamp(); + /** + * Exposed for testing purposes. + */ + CurrentDomain get_current_domain() { + return _get_current_domain(); + } + private: //=================================================================== //= private non-static diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 5a450e3e26..f456b94647 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -868,9 +868,8 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( // nullptr) // // Fortunately, these are ASCII dims and we can range - // these accordingly. These are minimum and maximum - // values, avoiding the extremes 0x00 and 0xff. - ndrect.set_range(col_name, "\x01", "\xfe"); + // these accordingly. + ndrect.set_range(col_name, "", "\xff"); } else { const void* buff = index_column_array->children[i] ->buffers[1]; diff --git a/libtiledbsoma/test/common.cc b/libtiledbsoma/test/common.cc index f8a3b5cdaa..3fdecfc76f 100644 --- a/libtiledbsoma/test/common.cc +++ b/libtiledbsoma/test/common.cc @@ -237,7 +237,7 @@ static std::unique_ptr _create_index_cols_info_array( std::memcpy((void*)dim_array->buffers[1], vsrc, nbytes); } else { // domain small; current_domain feature not being used - int64_t dom[] = {0, info.dim_max, 1}; + uint32_t dom[] = {0, (uint32_t)info.dim_max, 1}; void* vsrc = (void*)&dom[0]; std::memcpy((void*)dim_array->buffers[1], vsrc, nbytes); } diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index f56a68cb94..540020cb91 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -461,3 +461,197 @@ TEST_CASE_METHOD( soma_dataframe->close(); } + +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32") { + auto use_current_domain = GENERATE(false, true); + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { + set_up( + std::make_shared(), + "mem://unit-test-variant-indexed-dataframe-1"); + + std::vector dim_infos( + {i64_dim_info(use_current_domain)}); + std::vector attr_infos( + {str_attr_info(), u32_attr_info()}); + + // Create + create(dim_infos, attr_infos); + + // Check current domain + auto soma_dataframe = open(OpenMode::read); + + CurrentDomain current_domain = soma_dataframe->get_current_domain(); + if (!use_current_domain) { + REQUIRE(current_domain.is_empty()); + } else { + REQUIRE(!current_domain.is_empty()); + REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE); + NDRectangle ndrect = current_domain.ndrectangle(); + + std::array i64_range = ndrect.range( + dim_infos[0].name); + REQUIRE(i64_range[0] == (int64_t)0); + REQUIRE(i64_range[1] == (int64_t)dim_infos[0].dim_max); + } + + soma_dataframe->close(); + + write_generic_data(); + } +} + +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str") { + auto use_current_domain = GENERATE(false, true); + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { + set_up( + std::make_shared(), + "mem://unit-test-variant-indexed-dataframe-2"); + + std::vector dim_infos( + {i64_dim_info(use_current_domain), + u32_dim_info(use_current_domain)}); + std::vector attr_infos({str_attr_info()}); + + // Create + create(dim_infos, attr_infos); + + // Check current domain + auto soma_dataframe = open(OpenMode::read); + + CurrentDomain current_domain = soma_dataframe->get_current_domain(); + if (!use_current_domain) { + REQUIRE(current_domain.is_empty()); + } else { + REQUIRE(!current_domain.is_empty()); + REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE); + NDRectangle ndrect = current_domain.ndrectangle(); + + std::array i64_range = ndrect.range( + dim_infos[0].name); + REQUIRE(i64_range[0] == (int64_t)0); + REQUIRE(i64_range[1] == (int64_t)dim_infos[0].dim_max); + + std::array u32_range = ndrect.range( + dim_infos[0].name); + REQUIRE(u32_range[0] == (uint32_t)0); + REQUIRE(u32_range[1] == (uint32_t)dim_infos[0].dim_max); + } + + soma_dataframe->close(); + + // Write + write_generic_data(); + } +} + +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32") { + auto use_current_domain = GENERATE(false, true); + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { + set_up( + std::make_shared(), + "mem://unit-test-variant-indexed-dataframe-3"); + + std::vector dim_infos( + {i64_dim_info(use_current_domain), + str_dim_info(use_current_domain)}); + std::vector attr_infos({u32_attr_info()}); + + // Create + create(dim_infos, attr_infos); + + // Check current domain + auto soma_dataframe = open(OpenMode::read); + + CurrentDomain current_domain = soma_dataframe->get_current_domain(); + if (!use_current_domain) { + REQUIRE(current_domain.is_empty()); + } else { + REQUIRE(!current_domain.is_empty()); + REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE); + NDRectangle ndrect = current_domain.ndrectangle(); + + std::array i64_range = ndrect.range( + dim_infos[0].name); + REQUIRE(i64_range[0] == (int64_t)0); + REQUIRE(i64_range[1] == (int64_t)dim_infos[0].dim_max); + + std::array str_range = ndrect.range( + dim_infos[1].name); + // Can we write empty strings in this range? + REQUIRE(str_range[0] <= ""); + REQUIRE(str_range[1] >= ""); + // Can we write ASCII values in this range? + REQUIRE(str_range[0] < " "); + REQUIRE(str_range[1] > "~"); + } + + soma_dataframe->close(); + + // Write + write_generic_data(); + } +} + +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid") { + auto use_current_domain = GENERATE(false, true); + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { + set_up( + std::make_shared(), + "mem://unit-test-variant-indexed-dataframe-4"); + + std::vector dim_infos( + {str_dim_info(use_current_domain), + u32_dim_info(use_current_domain)}); + std::vector attr_infos({i64_attr_info()}); + + // Create + create(dim_infos, attr_infos); + + // Check current domain + auto soma_dataframe = open(OpenMode::read); + + CurrentDomain current_domain = soma_dataframe->get_current_domain(); + if (!use_current_domain) { + REQUIRE(current_domain.is_empty()); + } else { + REQUIRE(!current_domain.is_empty()); + REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE); + NDRectangle ndrect = current_domain.ndrectangle(); + + std::array str_range = ndrect.range( + dim_infos[0].name); + // Can we write empty strings in this range? + REQUIRE(str_range[0] <= ""); + REQUIRE(str_range[1] >= ""); + // Can we write ASCII values in this range? + REQUIRE(str_range[0] < " "); + REQUIRE(str_range[1] > "~"); + + std::array u32_range = ndrect.range( + dim_infos[1].name); + REQUIRE(u32_range[0] == (uint32_t)0); + REQUIRE(u32_range[1] == (uint32_t)dim_infos[1].dim_max); + } + + soma_dataframe->close(); + + // Write + write_generic_data(); + } +} From f5ae258c2abb72c1ccac540ba4407e8359594d93 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 3 Sep 2024 15:21:41 -0400 Subject: [PATCH 05/11] [c++] Performant `DataFrame.shape` (#2916) * [c++] Split out a test fixture for dataframes * fix a failure case * [c++] Unit-test variant-indexed dataframes [WIP] * neaten * [c++] Performant DataFrame.shape * Update libtiledbsoma/src/soma/soma_array.h Co-authored-by: nguyenv * Update libtiledbsoma/src/soma/soma_array.h Co-authored-by: nguyenv --------- Co-authored-by: nguyenv --- libtiledbsoma/src/soma/soma_array.cc | 24 +++++++ libtiledbsoma/src/soma/soma_array.h | 8 +++ libtiledbsoma/src/soma/soma_dataframe.cc | 6 ++ libtiledbsoma/src/soma/soma_dataframe.h | 17 +++++ libtiledbsoma/test/unit_soma_dataframe.cc | 70 ++++++++++++++++--- .../test/unit_soma_sparse_ndarray.cc | 9 ++- 6 files changed, 121 insertions(+), 13 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index ff55a360f0..092af65253 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -1191,6 +1191,30 @@ std::vector SOMAArray::maxshape() { return _tiledb_domain(); } +std::optional SOMAArray::_shape_slot_if_soma_joinid_dim() { + const std::string dim_name = "soma_joinid"; + + if (!arr_->schema().domain().has_dimension(dim_name)) { + return std::nullopt; + } + + auto current_domain = _get_current_domain(); + if (current_domain.is_empty()) { + return std::nullopt; + } + + auto t = current_domain.type(); + if (t != TILEDB_NDRECTANGLE) { + throw TileDBSOMAError("current_domain type is not NDRECTANGLE"); + } + + NDRectangle ndrect = current_domain.ndrectangle(); + + auto range = ndrect.range(dim_name); + auto max = range[1] + 1; + return std::optional(max); +} + std::vector SOMAArray::_tiledb_domain() { std::vector result; auto dimensions = mq_->schema()->domain().dimensions(); diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 790c8594bf..686fac73e6 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -803,6 +803,14 @@ class SOMAArray : public SOMAObject { return _get_current_domain(); } + protected: + // For use nominally by SOMADataFrame. This could be moved in its entirety + // to SOMADataFrame, but it would entail moving several SOMAArray attributes + // from private to protected, which has knock-on effects on the order of + // constructor initializers, etc.: in total it's simplest to place this + // here and have SOMADataFrame invoke it. + std::optional _shape_slot_if_soma_joinid_dim(); + private: //=================================================================== //= private non-static diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index b25d4aab21..431b843c08 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -94,4 +94,10 @@ uint64_t SOMADataFrame::count() { return this->nnz(); } +std::vector SOMADataFrame::shape() { + std::optional attempt = _shape_slot_if_soma_joinid_dim(); + int64_t max = attempt.has_value() ? attempt.value() : this->nnz(); + return std::vector({max}); +} + } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index bc353649d0..6bf50ac97a 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -163,7 +163,24 @@ class SOMADataFrame : public SOMAArray { * @return int64_t */ uint64_t count(); + + /** + * For DataFrame with default indexing, namely, a single int64_t + * soma_joinid, returns the same as SOMAArray. For DataFrame with + * soma_joinid being a dim along with other dims (optional behavior), return + * the slot along that dim. For DataFrame with soma_joinid being an attr, + * not a dim at all, returns nnz(). + * + * Note that the SOMA spec for SOMADataFrame mandates a .domain() accessor, + * which is distinct, and type-polymorphic. This shape accessor exists + * because people can and do call .shape() on SOMA DataFrames, and we have + * to keep letting them do that. + * + * @return int64_t + */ + std::vector shape(); }; + } // namespace tiledbsoma #endif // SOMA_DATAFRAME diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 540020cb91..025dc6e8bf 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -184,7 +184,10 @@ struct VariouslyIndexedDataFrameFixture { } }; -TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") { +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: basic", + "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be @@ -250,7 +253,9 @@ TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") { } TEST_CASE_METHOD( - VariouslyIndexedDataFrameFixture, "SOMADataFrame: platform_config") { + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: platform_config", + "[SOMADataFrame]") { std::pair filter = GENERATE( std::make_pair( R"({"name": "GZIP", "COMPRESSION_LEVEL": 3})", TILEDB_FILTER_GZIP), @@ -354,7 +359,10 @@ TEST_CASE_METHOD( } } -TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") { +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: metadata", + "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be @@ -425,7 +433,9 @@ TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") { } TEST_CASE_METHOD( - VariouslyIndexedDataFrameFixture, "SOMADataFrame: bounds-checking") { + VariouslyIndexedDataFrameFixture, + "SOMADataFrame: bounds-checking", + "[SOMADataFrame]") { bool use_current_domain = true; int old_max = 100; int new_max = 200; @@ -464,7 +474,8 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32") { + "SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32", + "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; @@ -498,15 +509,26 @@ TEST_CASE_METHOD( REQUIRE(i64_range[1] == (int64_t)dim_infos[0].dim_max); } + // Check shape before write + int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); write_generic_data(); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str") { + "SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str", + "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; @@ -545,16 +567,27 @@ TEST_CASE_METHOD( REQUIRE(u32_range[1] == (uint32_t)dim_infos[0].dim_max); } + // Check shape before write + int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); // Write write_generic_data(); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32") { + "SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32", + "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; @@ -597,16 +630,27 @@ TEST_CASE_METHOD( REQUIRE(str_range[1] > "~"); } + // Check shape before write + int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); // Write write_generic_data(); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid") { + "SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid", + "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; @@ -649,9 +693,19 @@ TEST_CASE_METHOD( REQUIRE(u32_range[1] == (uint32_t)dim_infos[1].dim_max); } + // Check shape before write + int64_t expect = 0; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); // Write write_generic_data(); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = 2; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); } } diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index ce9a943f42..947ddcc2d6 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -86,11 +86,10 @@ TEST_CASE("SOMASparseNDArray: basic", "[SOMASparseNDArray]") { REQUIRE(soma_sparse->ndim() == 1); REQUIRE(soma_sparse->nnz() == 0); - if (use_current_domain) { - REQUIRE(soma_sparse->shape() == std::vector{dim_max + 1}); - } else { - REQUIRE( - soma_sparse->maxshape() == std::vector{dim_max + 1}); + auto expect = std::vector({dim_max + 1}); + REQUIRE(soma_sparse->shape() == expect); + if (!use_current_domain) { + REQUIRE(soma_sparse->maxshape() == expect); } soma_sparse->close(); From 7983df0730f26771f356ef143bcf4c05feaf115d Mon Sep 17 00:00:00 2001 From: John Kerl Date: Tue, 3 Sep 2024 16:07:42 -0400 Subject: [PATCH 06/11] [c++] Resize for variant-indexed `DataFrame` (#2917) * [c++] Split out a test fixture for dataframes * fix a failure case * [c++] Unit-test variant-indexed dataframes [WIP] * neaten * [c++] Performant DataFrame.shape * [c++] Resize for variant-indexed `DataFrame` --- libtiledbsoma/src/soma/soma_array.cc | 42 ++++ libtiledbsoma/src/soma/soma_array.h | 16 ++ libtiledbsoma/test/unit_soma_dataframe.cc | 230 +++++++++++++++++++--- 3 files changed, 261 insertions(+), 27 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 092af65253..6322fbba1f 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -1277,6 +1277,11 @@ void SOMAArray::resize(const std::vector& newshape) { "[SOMAArray::resize] array must be opened in write mode"); } + if (_get_current_domain().is_empty()) { + throw TileDBSOMAError( + "[SOMAArray::resize] array must already be sized"); + } + auto tctx = ctx_->tiledb_ctx(); ArraySchema schema = arr_->schema(); Domain domain = schema.domain(); @@ -1307,6 +1312,43 @@ void SOMAArray::resize(const std::vector& newshape) { schema_evolution.array_evolve(uri_); } +void SOMAArray::resize_soma_joinid_if_dim( + const std::vector& newshape) { + if (mq_->query_type() != TILEDB_WRITE) { + throw TileDBSOMAError( + "[SOMAArray::resize] array must be opened in write mode"); + } + + ArraySchema schema = arr_->schema(); + Domain domain = schema.domain(); + unsigned ndim = domain.ndim(); + if (newshape.size() != 1) { + throw TileDBSOMAError(fmt::format( + "[SOMAArray::resize]: newshape has dimension count {}; needed 1", + newshape.size(), + ndim)); + } + + auto tctx = ctx_->tiledb_ctx(); + CurrentDomain old_current_domain = ArraySchemaExperimental::current_domain( + *tctx, schema); + NDRectangle ndrect = old_current_domain.ndrectangle(); + + CurrentDomain new_current_domain(*tctx); + ArraySchemaEvolution schema_evolution(*tctx); + + for (unsigned i = 0; i < ndim; i++) { + if (domain.dimension(i).name() == "soma_joinid") { + ndrect.set_range( + domain.dimension(i).name(), 0, newshape[0] - 1); + } + } + + new_current_domain.set_ndrectangle(ndrect); + schema_evolution.expand_current_domain(new_current_domain); + schema_evolution.array_evolve(uri_); +} + uint64_t SOMAArray::ndim() const { return tiledb_schema()->domain().ndim(); } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 686fac73e6..849f37346d 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -619,6 +619,22 @@ class SOMAArray : public SOMAObject { */ void resize(const std::vector& newshape); + /** + * @brief Increases the tiledbsoma shape up to at most the maxshape, + * resizing the soma_joinid dimension if it is a dimension. + * + * While SOMA SparseNDArray and DenseNDArray, along with default-indexed + * DataFrame, have int64_t dims, non-default-indexed DataFrame objects need + * not: it is only required that they have a dim _or_ an attr called + * soma_joinid. If soma_joinid is one of the dims, it will be resized while + * the others will be preserved. If soma_joinid is not one of the dims, + * nothing will be changed, as nothing _needs_ to be changed. + * + * @return Throws if the requested shape exceeds the array's create-time + * maxshape. Throws if the array does not have current-domain support. + */ + void resize_soma_joinid_if_dim(const std::vector& newshape); + /** * @brief Get the number of dimensions. * diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 025dc6e8bf..dbe3abe1e9 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -32,6 +32,9 @@ #include "common.h" +const int64_t SOMA_JOINID_DIM_MAX = 100; +const int64_t SOMA_JOINID_RESIZE_DIM_MAX = 200; + // This is a keystroke-reduction fixture for some similar unit-test cases For // convenience there are dims/attrs of type int64, uint32, and string. (Feel // free to add more types.) The main value-adds of this fixture are (a) simple @@ -52,7 +55,7 @@ struct VariouslyIndexedDataFrameFixture { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Helpers for setting up dim/attr configs and data - static const inline int64_t i64_dim_max = 100; + static const inline int64_t i64_dim_max = SOMA_JOINID_DIM_MAX; static const inline int64_t u32_dim_max = 10000; static const inline int64_t str_dim_max = 0; // not used for string dims @@ -163,14 +166,12 @@ struct VariouslyIndexedDataFrameFixture { timestamp_range); } - void write_generic_data() { - // No arguments -- for now. - // In a subsequent PR we'll vary writing in-bounds vs out of bounds. + void write_sjid_u32_str_data_from(int64_t sjid_base) { auto soma_dataframe = SOMADataFrame::open(uri_, OpenMode::write, ctx_); - auto i64_data = make_i64_data(); - auto u32_data = make_u32_data(); - auto str_data = make_str_data(); + auto i64_data = std::vector({sjid_base + 1, sjid_base + 2}); + auto u32_data = std::vector({1234, 5678}); + auto str_data = std::vector({"apple", "bat"}); soma_dataframe->set_column_data( i64_name, i64_data.size(), i64_data.data()); @@ -437,8 +438,8 @@ TEST_CASE_METHOD( "SOMADataFrame: bounds-checking", "[SOMADataFrame]") { bool use_current_domain = true; - int old_max = 100; - int new_max = 200; + int old_max = SOMA_JOINID_DIM_MAX; + int new_max = SOMA_JOINID_RESIZE_DIM_MAX; set_up(std::make_shared(), "mem://unit-test-bounds-checking"); @@ -460,7 +461,7 @@ TEST_CASE_METHOD( soma_dataframe->close(); soma_dataframe = open(OpenMode::write); - soma_dataframe->resize(std::vector({new_max})); + soma_dataframe->resize_soma_joinid_if_dim(std::vector({new_max})); soma_dataframe->close(); soma_dataframe = open(OpenMode::write); @@ -474,15 +475,16 @@ TEST_CASE_METHOD( TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid attr:str,u32", + "SOMADataFrame: variant-indexed dataframe dim-sjid attr-str-u32", "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; SECTION(section.str()) { + std::string suffix = use_current_domain ? "true" : "false"; set_up( std::make_shared(), - "mem://unit-test-variant-indexed-dataframe-1"); + "mem://unit-test-variant-indexed-dataframe-1-" + suffix); std::vector dim_infos( {i64_dim_info(use_current_domain)}); @@ -515,27 +517,67 @@ TEST_CASE_METHOD( soma_dataframe->close(); - write_generic_data(); + write_sjid_u32_str_data_from(0); - // Check shape after write - soma_dataframe = open(OpenMode::read); - expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); - soma_dataframe->close(); + // Resize + auto new_shape = std::vector({SOMA_JOINID_RESIZE_DIM_MAX}); + + if (!use_current_domain) { + // Domain is already set. The domain (not current domain but domain) + // is immutable. All we can do is check for: + // * throw on write beyond domain + // * throw on an attempt to resize. + REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX)); + + soma_dataframe = open(OpenMode::write); + // Array not resizeable if it has not already been sized + REQUIRE_THROWS( + soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + soma_dataframe->close(); + + } else { + // Expect throw on write beyond current domain before resize + REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX)); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = dim_infos[0].dim_max + 1; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); + + soma_dataframe = open(OpenMode::read); + REQUIRE_THROWS( + soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + soma_dataframe->close(); + + soma_dataframe = open(OpenMode::write); + soma_dataframe->resize_soma_joinid_if_dim(new_shape); + soma_dataframe->close(); + + // Check shape after resize + soma_dataframe = open(OpenMode::read); + expect = SOMA_JOINID_RESIZE_DIM_MAX; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); + + // Implicitly we expect no throw + write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX); + } } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid,u32 attr:str", + "SOMADataFrame: variant-indexed dataframe dim-sjid-u32 attr-str", "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; SECTION(section.str()) { + std::string suffix = use_current_domain ? "true" : "false"; set_up( std::make_shared(), - "mem://unit-test-variant-indexed-dataframe-2"); + "mem://unit-test-variant-indexed-dataframe-2-" + suffix); std::vector dim_infos( {i64_dim_info(use_current_domain), @@ -574,27 +616,73 @@ TEST_CASE_METHOD( soma_dataframe->close(); // Write - write_generic_data(); + write_sjid_u32_str_data_from(0); // Check shape after write soma_dataframe = open(OpenMode::read); expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2; REQUIRE(soma_dataframe->shape() == std::vector({expect})); soma_dataframe->close(); + + // Resize + auto new_shape = std::vector({SOMA_JOINID_RESIZE_DIM_MAX}); + + if (!use_current_domain) { + // Domain is already set. The domain (not current domain but domain) + // is immutable. All we can do is check for: + // * throw on write beyond domain + // * throw on an attempt to resize. + REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX)); + + soma_dataframe = open(OpenMode::write); + // Array not resizeable if it has not already been sized + REQUIRE_THROWS( + soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + soma_dataframe->close(); + + } else { + // Expect throw on write beyond current domain before resize + REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX)); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = dim_infos[0].dim_max + 1; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); + + soma_dataframe = open(OpenMode::read); + REQUIRE_THROWS( + soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + soma_dataframe->close(); + + soma_dataframe = open(OpenMode::write); + soma_dataframe->resize_soma_joinid_if_dim(new_shape); + soma_dataframe->close(); + + // Check shape after resize + soma_dataframe = open(OpenMode::read); + expect = SOMA_JOINID_RESIZE_DIM_MAX; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); + + // Implicitly we expect no throw + write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX); + } } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:sjid,str attr:u32", + "SOMADataFrame: variant-indexed dataframe dim-sjid-str attr-u32", "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; SECTION(section.str()) { + std::string suffix = use_current_domain ? "true" : "false"; set_up( std::make_shared(), - "mem://unit-test-variant-indexed-dataframe-3"); + "mem://unit-test-variant-indexed-dataframe-3-" + suffix); std::vector dim_infos( {i64_dim_info(use_current_domain), @@ -637,27 +725,73 @@ TEST_CASE_METHOD( soma_dataframe->close(); // Write - write_generic_data(); + write_sjid_u32_str_data_from(0); // Check shape after write soma_dataframe = open(OpenMode::read); expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2; REQUIRE(soma_dataframe->shape() == std::vector({expect})); soma_dataframe->close(); + + // Resize + auto new_shape = std::vector({SOMA_JOINID_RESIZE_DIM_MAX}); + + if (!use_current_domain) { + // Domain is already set. The domain (not current domain but domain) + // is immutable. All we can do is check for: + // * throw on write beyond domain + // * throw on an attempt to resize. + REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX)); + + soma_dataframe = open(OpenMode::write); + // Array not resizeable if it has not already been sized + REQUIRE_THROWS( + soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + soma_dataframe->close(); + + } else { + // Expect throw on write beyond current domain before resize + REQUIRE_THROWS(write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX)); + + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = dim_infos[0].dim_max + 1; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); + + soma_dataframe = open(OpenMode::read); + REQUIRE_THROWS( + soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + soma_dataframe->close(); + + soma_dataframe = open(OpenMode::write); + soma_dataframe->resize_soma_joinid_if_dim(new_shape); + soma_dataframe->close(); + + // Check shape after resize + soma_dataframe = open(OpenMode::read); + expect = SOMA_JOINID_RESIZE_DIM_MAX; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); + + // Implicitly we expect no throw + write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX); + } } } TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, - "SOMADataFrame: variant-indexed dataframe dim:str,u32 attr:sjid", + "SOMADataFrame: variant-indexed dataframe dim-str-u32 attr-sjid", "[SOMADataFrame]") { auto use_current_domain = GENERATE(false, true); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; SECTION(section.str()) { + std::string suffix = use_current_domain ? "true" : "false"; set_up( std::make_shared(), - "mem://unit-test-variant-indexed-dataframe-4"); + "mem://unit-test-variant-indexed-dataframe-4-" + suffix); std::vector dim_infos( {str_dim_info(use_current_domain), @@ -700,12 +834,54 @@ TEST_CASE_METHOD( soma_dataframe->close(); // Write - write_generic_data(); + write_sjid_u32_str_data_from(0); // Check shape after write soma_dataframe = open(OpenMode::read); expect = 2; REQUIRE(soma_dataframe->shape() == std::vector({expect})); soma_dataframe->close(); + + // Resize + auto new_shape = std::vector({SOMA_JOINID_RESIZE_DIM_MAX}); + + if (!use_current_domain) { + // Domain is already set. The domain (not current domain but domain) + // is immutable. All we can do is check for: + // * throw on write beyond domain -- except here, soma_joinid is not + // a dim, so no throw + // * throw on an attempt to resize. + + soma_dataframe = open(OpenMode::write); + // Array not resizeable if it has not already been sized + REQUIRE_THROWS( + soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + soma_dataframe->close(); + + } else { + // Check shape after write + soma_dataframe = open(OpenMode::read); + expect = 2; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); + + soma_dataframe = open(OpenMode::read); + REQUIRE_THROWS( + soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + soma_dataframe->close(); + + soma_dataframe = open(OpenMode::write); + soma_dataframe->resize_soma_joinid_if_dim(new_shape); + soma_dataframe->close(); + + // Check shape after resize -- noting soma_joinid is not a dim here + soma_dataframe = open(OpenMode::read); + expect = 2; + REQUIRE(soma_dataframe->shape() == std::vector({expect})); + soma_dataframe->close(); + + // Implicitly we expect no throw + write_sjid_u32_str_data_from(SOMA_JOINID_DIM_MAX); + } } } From 86cd52cbf9feb541e7cfd2bfecc2aaed170ec441 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 4 Sep 2024 15:14:06 -0400 Subject: [PATCH 07/11] [c++] Implement `upgrade_shape` for `SparseNDArray` and `DenseNDArray` (#2948) * [c++] Unit-test `resize` for `SparseNDArray` and `DenseNDArray` * [c++] Implement `upgrade_shape` [skip ci] * make format * fix merge typo * clang-format 14, not 17 --- libtiledbsoma/src/soma/soma_array.cc | 53 ++++++++++++++++--- libtiledbsoma/src/soma/soma_array.h | 25 +++++++++ .../test/unit_soma_sparse_ndarray.cc | 27 +++++++--- 3 files changed, 92 insertions(+), 13 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 6322fbba1f..d9818d6497 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -1216,6 +1216,9 @@ std::optional SOMAArray::_shape_slot_if_soma_joinid_dim() { } std::vector SOMAArray::_tiledb_domain() { + // Variant-indexed dataframes must use a separate path + _assert_dims_are_int64(); + std::vector result; auto dimensions = mq_->schema()->domain().dimensions(); @@ -1243,6 +1246,9 @@ std::vector SOMAArray::_tiledb_domain() { } std::vector SOMAArray::_tiledb_current_domain() { + // Variant-indexed dataframes must use a separate path + _assert_dims_are_int64(); + std::vector result; auto current_domain = tiledb::ArraySchemaExperimental::current_domain( @@ -1262,9 +1268,6 @@ std::vector SOMAArray::_tiledb_current_domain() { NDRectangle ndrect = current_domain.ndrectangle(); for (auto dimension_name : dimension_names()) { - // TODO: non-int64 types for SOMADataFrame extra dims. - // This simply needs to be integrated with switch statements as in the - // legacy code below. auto range = ndrect.range(dimension_name); result.push_back(range[1] + 1); } @@ -1272,11 +1275,31 @@ std::vector SOMAArray::_tiledb_current_domain() { } void SOMAArray::resize(const std::vector& newshape) { + if (_get_current_domain().is_empty()) { + throw TileDBSOMAError( + "[SOMAArray::resize] array must already have a shape"); + } + _set_current_domain_from_shape(newshape); +} + +void SOMAArray::upgrade_shape(const std::vector& newshape) { + if (!_get_current_domain().is_empty()) { + throw TileDBSOMAError( + "[SOMAArray::resize] array must not already have a shape"); + } + _set_current_domain_from_shape(newshape); +} + +void SOMAArray::_set_current_domain_from_shape( + const std::vector& newshape) { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError( "[SOMAArray::resize] array must be opened in write mode"); } + // Variant-indexed dataframes must use a separate path + _assert_dims_are_int64(); + if (_get_current_domain().is_empty()) { throw TileDBSOMAError( "[SOMAArray::resize] array must already be sized"); @@ -1290,9 +1313,6 @@ void SOMAArray::resize(const std::vector& newshape) { NDRectangle ndrect(*tctx, domain); - // TODO: non-int64 for DataFrame when it has extra index dims. - // This will be via a resize-helper. - unsigned n = domain.ndim(); if ((unsigned)newshape.size() != n) { throw TileDBSOMAError(fmt::format( @@ -1312,6 +1332,24 @@ void SOMAArray::resize(const std::vector& newshape) { schema_evolution.array_evolve(uri_); } +bool SOMAArray::_dims_are_int64() { + ArraySchema schema = arr_->schema(); + Domain domain = schema.domain(); + for (auto dimension : domain.dimensions()) { + if (dimension.type() != TILEDB_INT64) { + return false; + } + } + return true; +} + +void SOMAArray::_assert_dims_are_int64() { + if (!_dims_are_int64()) { + throw TileDBSOMAError( + "[SOMAArray] internal coding error: expected all dims to be int64"); + } +} + void SOMAArray::resize_soma_joinid_if_dim( const std::vector& newshape) { if (mq_->query_type() != TILEDB_WRITE) { @@ -1324,7 +1362,8 @@ void SOMAArray::resize_soma_joinid_if_dim( unsigned ndim = domain.ndim(); if (newshape.size() != 1) { throw TileDBSOMAError(fmt::format( - "[SOMAArray::resize]: newshape has dimension count {}; needed 1", + "[SOMAArray::resize]: newshape has dimension count {}; needed " + "1", newshape.size(), ndim)); } diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 849f37346d..4d08e59a47 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -619,6 +619,14 @@ class SOMAArray : public SOMAObject { */ void resize(const std::vector& newshape); + /** + * @brief Given an old-style array without current domain, sets its + * current domain. This is applicable only to arrays having all dims + * of int64 type. Namely, all SparseNDArray/DenseNDArray, and + * default-indexed DataFrame. + */ + void upgrade_shape(const std::vector& newshape); + /** * @brief Increases the tiledbsoma shape up to at most the maxshape, * resizing the soma_joinid dimension if it is a dimension. @@ -853,6 +861,23 @@ class SOMAArray : public SOMAObject { *ctx_->tiledb_ctx(), arr_->schema()); } + /** + * Helper method for resize and upgrade_shape. + */ + void _set_current_domain_from_shape(const std::vector& newshape); + + /** + * While SparseNDArray, DenseNDArray, and default-indexed DataFrame + * have int64 dims, variant-indexed DataFrames do not. This helper + * lets us pre-check any attempts to treat dims as if they were int64. + */ + bool _dims_are_int64(); + + /** + * Same, but throws. + */ + void _assert_dims_are_int64(); + /** * With old shape: core domain mapped to tiledbsoma shape; core current * domain did not exist. diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 947ddcc2d6..b1eaa52a93 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -129,19 +129,34 @@ TEST_CASE("SOMASparseNDArray: basic", "[SOMASparseNDArray]") { REQUIRE_THROWS(soma_sparse->write()); soma_sparse->close(); - soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); - auto new_shape = std::vector({dim_max * 2}); if (!use_current_domain) { + auto new_shape = std::vector({dim_max}); + + soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); // Without current-domain support: this should throw since // one cannot resize what has not been sized. REQUIRE_THROWS(soma_sparse->resize(new_shape)); + // Now set the shape + soma_sparse->upgrade_shape(new_shape); + // Should not fail since we're setting it to what it already is. + soma_sparse->resize(new_shape); + soma_sparse->close(); + + soma_sparse = SOMASparseNDArray::open(uri, OpenMode::read, ctx); + REQUIRE(soma_sparse->shape() == new_shape); + soma_sparse->close(); + } else { + auto new_shape = std::vector({dim_max * 2}); + + soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); + // Should throw since this already has a shape (core current + // domain). + REQUIRE_THROWS(soma_sparse->upgrade_shape(new_shape)); soma_sparse->resize(new_shape); - } - soma_sparse->close(); + soma_sparse->close(); - // Try out-of-bounds write after resize. - if (use_current_domain) { + // Try out-of-bounds write after resize. soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); soma_sparse->set_column_data(dim_name, d0b.size(), d0b.data()); soma_sparse->set_column_data(attr_name, a0b.size(), a0b.data()); From 8c4890c8665c74313f3a4c86ef392f014e1add47 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 4 Sep 2024 17:04:13 -0400 Subject: [PATCH 08/11] [c++] Clarify dataframe-shaping test/access points (#2951) * [c++] Clarify dataframe-shaping test/access points * code-review feedback * code-review feedback * misc. name-neatens/re-orderings between soma_array.h/cc * more on code-review feedback --- libtiledbsoma/src/soma/soma_array.cc | 426 ++++++++++++---------- libtiledbsoma/src/soma/soma_array.h | 297 +++++++-------- libtiledbsoma/src/soma/soma_dataframe.cc | 10 +- libtiledbsoma/src/soma/soma_dataframe.h | 23 +- libtiledbsoma/test/unit_soma_dataframe.cc | 122 ++++--- 5 files changed, 467 insertions(+), 411 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index d9818d6497..97e0f2196b 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -1030,6 +1030,19 @@ void SOMAArray::_cast_bit_to_uint8( } } +uint64_t SOMAArray::ndim() const { + return tiledb_schema()->domain().ndim(); +} + +std::vector SOMAArray::dimension_names() const { + std::vector result; + auto dimensions = tiledb_schema()->domain().dimensions(); + for (const auto& dim : dimensions) { + result.push_back(dim.name()); + } + return result; +} + void SOMAArray::write(bool sort_coords) { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError("[SOMAArray] array must be opened in write mode"); @@ -1049,6 +1062,116 @@ void SOMAArray::consolidate_and_vacuum(std::vector modes) { } } +std::map SOMAArray::get_attr_to_enum_mapping() { + std::map result; + for (uint32_t i = 0; i < arr_->schema().attribute_num(); ++i) { + auto attr = arr_->schema().attribute(i); + if (attr_has_enum(attr.name())) { + auto enmr_label = *get_enum_label_on_attr(attr.name()); + auto enmr = ArrayExperimental::get_enumeration( + *ctx_->tiledb_ctx(), *arr_, enmr_label); + result.insert({attr.name(), enmr}); + } + } + return result; +} + +std::optional SOMAArray::get_enum_label_on_attr( + std::string attr_name) { + auto attr = arr_->schema().attribute(attr_name); + return AttributeExperimental::get_enumeration_name( + *ctx_->tiledb_ctx(), attr); +} + +bool SOMAArray::attr_has_enum(std::string attr_name) { + return get_enum_label_on_attr(attr_name).has_value(); +} + +void SOMAArray::set_metadata( + const std::string& key, + tiledb_datatype_t value_type, + uint32_t value_num, + const void* value, + bool force) { + if (!force && key.compare(SOMA_OBJECT_TYPE_KEY) == 0) + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be modified."); + + if (!force && key.compare(ENCODING_VERSION_KEY) == 0) + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be modified."); + + arr_->put_metadata(key, value_type, value_num, value); + + MetadataValue mdval(value_type, value_num, value); + std::pair mdpair(key, mdval); + metadata_.insert(mdpair); +} + +void SOMAArray::delete_metadata(const std::string& key) { + if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) { + throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be deleted."); + } + + if (key.compare(ENCODING_VERSION_KEY) == 0) { + throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be deleted."); + } + + arr_->delete_metadata(key); + metadata_.erase(key); +} + +std::optional SOMAArray::get_metadata(const std::string& key) { + if (metadata_.count(key) == 0) { + return std::nullopt; + } + + return metadata_[key]; +} + +std::map SOMAArray::get_metadata() { + return metadata_; +} + +bool SOMAArray::has_metadata(const std::string& key) { + return metadata_.count(key) != 0; +} + +uint64_t SOMAArray::metadata_num() const { + return metadata_.size(); +} + +void SOMAArray::validate( + OpenMode mode, + std::string_view name, + std::optional timestamp) { + // Validate parameters + auto tdb_mode = mode == OpenMode::read ? TILEDB_READ : TILEDB_WRITE; + + try { + LOG_DEBUG(fmt::format("[SOMAArray] opening array '{}'", uri_)); + if (timestamp) { + arr_ = std::make_shared( + *ctx_->tiledb_ctx(), + uri_, + tdb_mode, + TemporalPolicy( + TimestampStartEnd, timestamp->first, timestamp->second)); + } else { + arr_ = std::make_shared(*ctx_->tiledb_ctx(), uri_, tdb_mode); + } + LOG_TRACE(fmt::format("[SOMAArray] loading enumerations")); + ArrayExperimental::load_all_enumerations( + *ctx_->tiledb_ctx(), *(arr_.get())); + mq_ = std::make_unique(arr_, ctx_->tiledb_ctx(), name); + } catch (const std::exception& e) { + throw TileDBSOMAError( + fmt::format("Error opening array: '{}'\n {}", uri_, e.what())); + } +} + +std::optional SOMAArray::timestamp() { + return timestamp_; +} + uint64_t SOMAArray::nnz() { // Verify array is sparse if (mq_->schema()->array_type() != TILEDB_SPARSE) { @@ -1080,7 +1203,7 @@ uint64_t SOMAArray::nnz() { frag_ts.second <= timestamp_->second)) { // fragment overlaps read timestamp range, but isn't fully // contained within: fall back to count_cells to sort that out. - return nnz_slow(); + return _nnz_slow(); } } // fall through: fragment is fully contained within the read timestamp @@ -1093,7 +1216,7 @@ uint64_t SOMAArray::nnz() { // application's job to otherwise ensure uniqueness), then // sum-over-fragments is the right thing to do. if (!mq_->schema()->allows_dups() && frag_ts.first != frag_ts.second) { - return nnz_slow(); + return _nnz_slow(); } } @@ -1149,10 +1272,10 @@ uint64_t SOMAArray::nnz() { return total_cell_num; } // Found relevant fragments with overlap, count cells - return nnz_slow(); + return _nnz_slow(); } -uint64_t SOMAArray::nnz_slow() { +uint64_t SOMAArray::_nnz_slow() { LOG_DEBUG( "[SOMAArray] nnz() found consolidated or overlapping fragments, " "counting cells..."); @@ -1182,98 +1305,14 @@ std::vector SOMAArray::shape() { // * Even after the new-shape feature is fully released, there will be old // arrays on disk that were created before this feature existed. // So this is long-term code. - auto current_domain = _get_current_domain(); - return current_domain.is_empty() ? _tiledb_domain() : - _tiledb_current_domain(); + return _get_current_domain().is_empty() ? _tiledb_domain() : + _tiledb_current_domain(); } std::vector SOMAArray::maxshape() { return _tiledb_domain(); } -std::optional SOMAArray::_shape_slot_if_soma_joinid_dim() { - const std::string dim_name = "soma_joinid"; - - if (!arr_->schema().domain().has_dimension(dim_name)) { - return std::nullopt; - } - - auto current_domain = _get_current_domain(); - if (current_domain.is_empty()) { - return std::nullopt; - } - - auto t = current_domain.type(); - if (t != TILEDB_NDRECTANGLE) { - throw TileDBSOMAError("current_domain type is not NDRECTANGLE"); - } - - NDRectangle ndrect = current_domain.ndrectangle(); - - auto range = ndrect.range(dim_name); - auto max = range[1] + 1; - return std::optional(max); -} - -std::vector SOMAArray::_tiledb_domain() { - // Variant-indexed dataframes must use a separate path - _assert_dims_are_int64(); - - std::vector result; - auto dimensions = mq_->schema()->domain().dimensions(); - - for (const auto& dim : dimensions) { - // Callers inquiring about non-int64 shapes should not be here. - // - // In the SOMA data model: - // * SparseNDArray has dims which are all necessarily int64_t - // * DenseNDArray has dims which are all necessarily int64_t - // * DataFrame _default_ indexing is one dim named "soma_dim_0" of type - // int64_t, however: - // * Users can (and do) add other additional dims - // * The SOMA data model requires that soma_joinid be present in each - // DataFrame either as a dim or an attr -- and there are DataFrame - // objects for which soma_joinid is not a dim at all - // * These cases are all actively unit-tested within apis/python/tests - if (dim.type() != TILEDB_INT64) { - throw TileDBSOMAError("Found unexpected non-int64 dimension type."); - } - result.push_back( - dim.domain().second - dim.domain().first + 1); - } - - return result; -} - -std::vector SOMAArray::_tiledb_current_domain() { - // Variant-indexed dataframes must use a separate path - _assert_dims_are_int64(); - - std::vector result; - - auto current_domain = tiledb::ArraySchemaExperimental::current_domain( - *ctx_->tiledb_ctx(), arr_->schema()); - - if (current_domain.is_empty()) { - throw TileDBSOMAError( - "Internal error: current domain requested for an array which does " - "not support it"); - } - - auto t = current_domain.type(); - if (t != TILEDB_NDRECTANGLE) { - throw TileDBSOMAError("current_domain type is not NDRECTANGLE"); - } - - NDRectangle ndrect = current_domain.ndrectangle(); - - for (auto dimension_name : dimension_names()) { - auto range = ndrect.range(dimension_name); - result.push_back(range[1] + 1); - } - return result; -} - void SOMAArray::resize(const std::vector& newshape) { if (_get_current_domain().is_empty()) { throw TileDBSOMAError( @@ -1298,7 +1337,7 @@ void SOMAArray::_set_current_domain_from_shape( } // Variant-indexed dataframes must use a separate path - _assert_dims_are_int64(); + _check_dims_are_int64(); if (_get_current_domain().is_empty()) { throw TileDBSOMAError( @@ -1332,26 +1371,7 @@ void SOMAArray::_set_current_domain_from_shape( schema_evolution.array_evolve(uri_); } -bool SOMAArray::_dims_are_int64() { - ArraySchema schema = arr_->schema(); - Domain domain = schema.domain(); - for (auto dimension : domain.dimensions()) { - if (dimension.type() != TILEDB_INT64) { - return false; - } - } - return true; -} - -void SOMAArray::_assert_dims_are_int64() { - if (!_dims_are_int64()) { - throw TileDBSOMAError( - "[SOMAArray] internal coding error: expected all dims to be int64"); - } -} - -void SOMAArray::resize_soma_joinid_if_dim( - const std::vector& newshape) { +void SOMAArray::maybe_resize_soma_joinid(const std::vector& newshape) { if (mq_->query_type() != TILEDB_WRITE) { throw TileDBSOMAError( "[SOMAArray::resize] array must be opened in write mode"); @@ -1362,8 +1382,7 @@ void SOMAArray::resize_soma_joinid_if_dim( unsigned ndim = domain.ndim(); if (newshape.size() != 1) { throw TileDBSOMAError(fmt::format( - "[SOMAArray::resize]: newshape has dimension count {}; needed " - "1", + "[SOMAArray::resize]: newshape has dimension count {}; needed 1", newshape.size(), ndim)); } @@ -1388,127 +1407,132 @@ void SOMAArray::resize_soma_joinid_if_dim( schema_evolution.array_evolve(uri_); } -uint64_t SOMAArray::ndim() const { - return tiledb_schema()->domain().ndim(); -} +std::vector SOMAArray::_tiledb_current_domain() { + // Variant-indexed dataframes must use a separate path + _check_dims_are_int64(); -std::vector SOMAArray::dimension_names() const { - std::vector result; - auto dimensions = tiledb_schema()->domain().dimensions(); - for (const auto& dim : dimensions) { - result.push_back(dim.name()); + std::vector result; + + auto current_domain = tiledb::ArraySchemaExperimental::current_domain( + *ctx_->tiledb_ctx(), arr_->schema()); + + if (current_domain.is_empty()) { + throw TileDBSOMAError( + "Internal error: current domain requested for an array which does " + "not support it"); + } + + auto t = current_domain.type(); + if (t != TILEDB_NDRECTANGLE) { + throw TileDBSOMAError("current_domain type is not NDRECTANGLE"); + } + + NDRectangle ndrect = current_domain.ndrectangle(); + + for (auto dimension_name : dimension_names()) { + auto range = ndrect.range(dimension_name); + result.push_back(range[1] + 1); } return result; } -std::map SOMAArray::get_attr_to_enum_mapping() { - std::map result; - for (uint32_t i = 0; i < arr_->schema().attribute_num(); ++i) { - auto attr = arr_->schema().attribute(i); - if (attr_has_enum(attr.name())) { - auto enmr_label = *get_enum_label_on_attr(attr.name()); - auto enmr = ArrayExperimental::get_enumeration( - *ctx_->tiledb_ctx(), *arr_, enmr_label); - result.insert({attr.name(), enmr}); - } +std::vector SOMAArray::_tiledb_domain() { + // Variant-indexed dataframes must use a separate path + _check_dims_are_int64(); + + std::vector result; + auto dimensions = mq_->schema()->domain().dimensions(); + + for (const auto& dim : dimensions) { + result.push_back( + dim.domain().second - dim.domain().first + 1); } + return result; } -std::optional SOMAArray::get_enum_label_on_attr( - std::string attr_name) { - auto attr = arr_->schema().attribute(attr_name); - return AttributeExperimental::get_enumeration_name( - *ctx_->tiledb_ctx(), attr); +std::optional SOMAArray::_maybe_soma_joinid_shape() { + return _get_current_domain().is_empty() ? + _maybe_soma_joinid_tiledb_domain() : + _maybe_soma_joinid_tiledb_current_domain(); } -bool SOMAArray::attr_has_enum(std::string attr_name) { - return get_enum_label_on_attr(attr_name).has_value(); +std::optional SOMAArray::_maybe_soma_joinid_maxshape() { + return _maybe_soma_joinid_tiledb_domain(); } -void SOMAArray::set_metadata( - const std::string& key, - tiledb_datatype_t value_type, - uint32_t value_num, - const void* value, - bool force) { - if (!force && key.compare(SOMA_OBJECT_TYPE_KEY) == 0) - throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be modified."); - - if (!force && key.compare(ENCODING_VERSION_KEY) == 0) - throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be modified."); +std::optional SOMAArray::_maybe_soma_joinid_tiledb_current_domain() { + const std::string dim_name = "soma_joinid"; - arr_->put_metadata(key, value_type, value_num, value); + auto dom = arr_->schema().domain(); + if (!dom.has_dimension(dim_name)) { + return std::nullopt; + } - MetadataValue mdval(value_type, value_num, value); - std::pair mdpair(key, mdval); - metadata_.insert(mdpair); -} + auto current_domain = _get_current_domain(); + if (current_domain.is_empty()) { + throw TileDBSOMAError("internal coding error"); + } -void SOMAArray::delete_metadata(const std::string& key) { - if (key.compare(SOMA_OBJECT_TYPE_KEY) == 0) { - throw TileDBSOMAError(SOMA_OBJECT_TYPE_KEY + " cannot be deleted."); + auto t = current_domain.type(); + if (t != TILEDB_NDRECTANGLE) { + throw TileDBSOMAError("current_domain type is not NDRECTANGLE"); } - if (key.compare(ENCODING_VERSION_KEY) == 0) { - throw TileDBSOMAError(ENCODING_VERSION_KEY + " cannot be deleted."); + NDRectangle ndrect = current_domain.ndrectangle(); + + auto dim = dom.dimension(dim_name); + if (dim.type() != TILEDB_INT64) { + throw TileDBSOMAError(fmt::format( + "expected {} dim to be {}; got {}", + dim_name, + tiledb::impl::type_to_str(TILEDB_INT64), + tiledb::impl::type_to_str(dim.type()))); } - arr_->delete_metadata(key); - metadata_.erase(key); + auto range = ndrect.range(dim_name); + auto max = range[1] + 1; + return std::optional(max); } -std::optional SOMAArray::get_metadata(const std::string& key) { - if (metadata_.count(key) == 0) { +std::optional SOMAArray::_maybe_soma_joinid_tiledb_domain() { + const std::string dim_name = "soma_joinid"; + + auto dom = arr_->schema().domain(); + if (!dom.has_dimension(dim_name)) { return std::nullopt; } - return metadata_[key]; -} - -std::map SOMAArray::get_metadata() { - return metadata_; -} + auto dim = dom.dimension(dim_name); + if (dim.type() != TILEDB_INT64) { + throw TileDBSOMAError(fmt::format( + "expected {} dim to be {}; got {}", + dim_name, + tiledb::impl::type_to_str(TILEDB_INT64), + tiledb::impl::type_to_str(dim.type()))); + } -bool SOMAArray::has_metadata(const std::string& key) { - return metadata_.count(key) != 0; -} + auto max = dim.domain().second + 1; -uint64_t SOMAArray::metadata_num() const { - return metadata_.size(); + return std::optional(max); } -void SOMAArray::validate( - OpenMode mode, - std::string_view name, - std::optional timestamp) { - // Validate parameters - auto tdb_mode = mode == OpenMode::read ? TILEDB_READ : TILEDB_WRITE; - - try { - LOG_DEBUG(fmt::format("[SOMAArray] opening array '{}'", uri_)); - if (timestamp) { - arr_ = std::make_shared( - *ctx_->tiledb_ctx(), - uri_, - tdb_mode, - TemporalPolicy( - TimestampStartEnd, timestamp->first, timestamp->second)); - } else { - arr_ = std::make_shared(*ctx_->tiledb_ctx(), uri_, tdb_mode); +bool SOMAArray::_dims_are_int64() { + ArraySchema schema = arr_->schema(); + Domain domain = schema.domain(); + for (auto dimension : domain.dimensions()) { + if (dimension.type() != TILEDB_INT64) { + return false; } - LOG_TRACE(fmt::format("[SOMAArray] loading enumerations")); - ArrayExperimental::load_all_enumerations( - *ctx_->tiledb_ctx(), *(arr_.get())); - mq_ = std::make_unique(arr_, ctx_->tiledb_ctx(), name); - } catch (const std::exception& e) { - throw TileDBSOMAError( - fmt::format("Error opening array: '{}'\n {}", uri_, e.what())); } + return true; } -std::optional SOMAArray::timestamp() { - return timestamp_; +void SOMAArray::_check_dims_are_int64() { + if (!_dims_are_int64()) { + throw TileDBSOMAError( + "[SOMAArray] internal coding error: expected all dims to be int64"); + } } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 4d08e59a47..69d0494c4f 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -269,6 +269,20 @@ class SOMAArray : public SOMAObject { std::string_view batch_size = "auto", ResultOrder result_order = ResultOrder::automatic); + /** + * @brief Get the number of dimensions. + * + * @return uint64_t Number of dimensions. + */ + uint64_t ndim() const; + + /** + * @brief Get the name of each dimensions. + * + * @return std::vector Name of each dimensions. + */ + std::vector dimension_names() const; + /** * @brief Set the dimension slice using one point * @@ -543,13 +557,6 @@ class SOMAArray : public SOMAObject { return first_read_next_; } - /** - * @brief Get the total number of unique cells in the array. - * - * @return uint64_t Total number of unique cells - */ - uint64_t nnz(); - /** * @brief Get the TileDB ArraySchema. This should eventually * be removed in lieu of arrow_schema below. @@ -570,131 +577,6 @@ class SOMAArray : public SOMAObject { ctx_->tiledb_ctx(), arr_); } - /** - * @brief Get the current capacity of each dimension. - * - * This applies to arrays all of whose dims are of type int64_t: this - * includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed - * SOMADataFrame. - * - * At the TileDB-SOMA level we call this "shape". At the TileDB Core - * storage level this maps to "current domain". - * - * Further, we map this single n to the pair (0, n-1) since core permits a - * doubly inclusive pair (lo, hi) on each dimension slot. - * - * @return A vector with length equal to the number of dimensions; each - * value in the vector is the capacity of each dimension. - */ - std::vector shape(); - - /** - * @brief Get the maximum resizable capacity of each dimension. - * - * This applies to arrays all of whose dims are of type int64_t: this - * includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed - * SOMADataFrame. - * - * At the TileDB-SOMA level we call this "maxshape". At the TileDB Core - * storage level this maps to "domain". - * - * Further, we map this single n to the pair (0, n-1) since core permits a - * doubly inclusive pair (lo, hi) on each dimension slot. - * - * @return A vector with length equal to the number of dimensions; each - * value in the vector is the maximum capacity of each dimension. - */ - std::vector maxshape(); - - /** - * @brief Resize the shape (what core calls "current domain") up to the - * maxshape (what core calls "domain"). - * - * This applies to arrays all of whose dims are of type int64_t: this - * includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed - * SOMADataFrame. - * - * @return Nothing. Raises an exception if the resize would be a downsize, - * which is not supported. - */ - void resize(const std::vector& newshape); - - /** - * @brief Given an old-style array without current domain, sets its - * current domain. This is applicable only to arrays having all dims - * of int64 type. Namely, all SparseNDArray/DenseNDArray, and - * default-indexed DataFrame. - */ - void upgrade_shape(const std::vector& newshape); - - /** - * @brief Increases the tiledbsoma shape up to at most the maxshape, - * resizing the soma_joinid dimension if it is a dimension. - * - * While SOMA SparseNDArray and DenseNDArray, along with default-indexed - * DataFrame, have int64_t dims, non-default-indexed DataFrame objects need - * not: it is only required that they have a dim _or_ an attr called - * soma_joinid. If soma_joinid is one of the dims, it will be resized while - * the others will be preserved. If soma_joinid is not one of the dims, - * nothing will be changed, as nothing _needs_ to be changed. - * - * @return Throws if the requested shape exceeds the array's create-time - * maxshape. Throws if the array does not have current-domain support. - */ - void resize_soma_joinid_if_dim(const std::vector& newshape); - - /** - * @brief Get the number of dimensions. - * - * @return uint64_t Number of dimensions. - */ - uint64_t ndim() const; - - /** - * Retrieves the non-empty domain from the array. This is the union of the - * non-empty domains of the array fragments. - */ - template - std::pair non_empty_domain(const std::string& name) { - try { - return arr_->non_empty_domain(name); - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } - } - - /** - * Retrieves the non-empty domain from the array on the given dimension. - * This is the union of the non-empty domains of the array fragments. - * Applicable only to var-sized dimensions. - */ - std::pair non_empty_domain_var( - const std::string& name) { - try { - return arr_->non_empty_domain_var(name); - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } - } - - /** - * Returns the domain of the given dimension. - * - * @tparam T Domain datatype - * @return Pair of [lower, upper] inclusive bounds. - */ - template - std::pair domain(const std::string& name) const { - return arr_->schema().domain().dimension(name).domain(); - } - - /** - * @brief Get the name of each dimensions. - * - * @return std::vector Name of each dimensions. - */ - std::vector dimension_names() const; - /** * @brief Get the mapping of attributes to Enumerations. * @@ -755,12 +637,10 @@ class SOMAArray : public SOMAObject { /** * @brief Given a key, get the associated value datatype, number of * values, and value in binary form. The array must be opened in READ - mode, - * otherwise the function will error out. + * mode, otherwise the function will error out. * * The value may consist of more than one items of the same datatype. - Keys - * that do not exist in the metadata will be return NULL for the value. + * Keys that do not exist in the metadata will be return NULL for the value. * * **Example:** * @code{.cpp} @@ -776,8 +656,7 @@ class SOMAArray : public SOMAObject { * @endcode * * @param key The key of the metadata item to be retrieved. UTF-8 - encodings - * are acceptable. + * encodings are acceptable. * @return MetadataValue (std::tuple) */ @@ -820,6 +699,124 @@ class SOMAArray : public SOMAObject { */ std::optional timestamp(); + /** + * Retrieves the non-empty domain from the array. This is the union of the + * non-empty domains of the array fragments. + */ + template + std::pair non_empty_domain(const std::string& name) { + try { + return arr_->non_empty_domain(name); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + } + + /** + * Retrieves the non-empty domain from the array on the given dimension. + * This is the union of the non-empty domains of the array fragments. + * Applicable only to var-sized dimensions. + */ + std::pair non_empty_domain_var( + const std::string& name) { + try { + return arr_->non_empty_domain_var(name); + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } + } + + /** + * Returns the domain of the given dimension. + * + * @tparam T Domain datatype + * @return Pair of [lower, upper] inclusive bounds. + */ + template + std::pair domain(const std::string& name) const { + return arr_->schema().domain().dimension(name).domain(); + } + + /** + * @brief Get the total number of unique cells in the array. + * + * @return uint64_t Total number of unique cells + */ + uint64_t nnz(); + + /** + * @brief Get the current capacity of each dimension. + * + * This applies to arrays all of whose dims are of type int64_t: this + * includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed + * SOMADataFrame. + * + * At the TileDB-SOMA level we call this "shape". At the TileDB Core + * storage level this maps to "current domain". + * + * Further, we map this single n to the pair (0, n-1) since core permits a + * doubly inclusive pair (lo, hi) on each dimension slot. + * + * @return A vector with length equal to the number of dimensions; each + * value in the vector is the capacity of each dimension. + */ + std::vector shape(); + + /** + * @brief Get the maximum resizable capacity of each dimension. + * + * This applies to arrays all of whose dims are of type int64_t: this + * includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed + * SOMADataFrame. + * + * At the TileDB-SOMA level we call this "maxshape". At the TileDB Core + * storage level this maps to "domain". + * + * Further, we map this single n to the pair (0, n-1) since core permits a + * doubly inclusive pair (lo, hi) on each dimension slot. + * + * @return A vector with length equal to the number of dimensions; each + * value in the vector is the maximum capacity of each dimension. + */ + std::vector maxshape(); + + /** + * @brief Resize the shape (what core calls "current domain") up to the + * maxshape (what core calls "domain"). + * + * This applies to arrays all of whose dims are of type int64_t: this + * includes SOMASparseNDArray and SOMADenseNDArray, and default-indexed + * SOMADataFrame. + * + * @return Nothing. Raises an exception if the resize would be a downsize, + * which is not supported. + */ + void resize(const std::vector& newshape); + + /** + * @brief Given an old-style array without current domain, sets its + * current domain. This is applicable only to arrays having all dims + * of int64 type. Namely, all SparseNDArray/DenseNDArray, and + * default-indexed DataFrame. + */ + void upgrade_shape(const std::vector& newshape); + + /** + * @brief Increases the tiledbsoma shape up to at most the maxshape, + * resizing the soma_joinid dimension if it is a dimension. + * + * While SOMA SparseNDArray and DenseNDArray, along with default-indexed + * DataFrame, have int64_t dims, non-default-indexed DataFrame objects need + * not: it is only required that they have a dim _or_ an attr called + * soma_joinid. If soma_joinid is one of the dims, it will be resized while + * the others will be preserved. If soma_joinid is not one of the dims, + * nothing will be changed, as nothing _needs_ to be changed. + * + * @return Throws if the requested shape exceeds the array's create-time + * maxshape. Throws if the array does not have current-domain support. + */ + void maybe_resize_soma_joinid(const std::vector& newshape); + /** * Exposed for testing purposes. */ @@ -828,12 +825,18 @@ class SOMAArray : public SOMAObject { } protected: - // For use nominally by SOMADataFrame. This could be moved in its entirety - // to SOMADataFrame, but it would entail moving several SOMAArray attributes - // from private to protected, which has knock-on effects on the order of - // constructor initializers, etc.: in total it's simplest to place this - // here and have SOMADataFrame invoke it. - std::optional _shape_slot_if_soma_joinid_dim(); + // These two are for use nominally by SOMADataFrame. This could be moved in + // its entirety to SOMADataFrame, but it would entail moving several + // SOMAArray attributes from private to protected, which has knock-on + // effects on the order of constructor initializers, etc.: in total it's + // simplest to place this here and have SOMADataFrame invoke it. + // + // They return the shape and maxshape slots for the soma_joinid dim, if + // the array has one. These are important test-points and dev-internal + // access-points, in particular, for the tiledbsoma-io experiment-level + // resizer. + std::optional _maybe_soma_joinid_shape(); + std::optional _maybe_soma_joinid_maxshape(); private: //=================================================================== @@ -876,7 +879,7 @@ class SOMAArray : public SOMAObject { /** * Same, but throws. */ - void _assert_dims_are_int64(); + void _check_dims_are_int64(); /** * With old shape: core domain mapped to tiledbsoma shape; core current @@ -889,6 +892,8 @@ class SOMAArray : public SOMAObject { */ std::vector _tiledb_domain(); std::vector _tiledb_current_domain(); + std::optional _maybe_soma_joinid_tiledb_current_domain(); + std::optional _maybe_soma_joinid_tiledb_domain(); bool _extend_enumeration( ArrowSchema* value_schema, @@ -1379,7 +1384,7 @@ class SOMAArray : public SOMAObject { bool submitted_ = false; // Unoptimized method for computing nnz() (issue `count_cells` query) - uint64_t nnz_slow(); + uint64_t _nnz_slow(); // ArrayBuffers to hold ColumnBuffers alive when submitting to write // query diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 431b843c08..a65773a81e 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -94,10 +94,12 @@ uint64_t SOMADataFrame::count() { return this->nnz(); } -std::vector SOMADataFrame::shape() { - std::optional attempt = _shape_slot_if_soma_joinid_dim(); - int64_t max = attempt.has_value() ? attempt.value() : this->nnz(); - return std::vector({max}); +std::optional SOMADataFrame::maybe_soma_joinid_shape() { + return _maybe_soma_joinid_shape(); +} + +std::optional SOMADataFrame::maybe_soma_joinid_maxshape() { + return _maybe_soma_joinid_maxshape(); } } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index 6bf50ac97a..231e578baf 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -165,20 +165,23 @@ class SOMADataFrame : public SOMAArray { uint64_t count(); /** - * For DataFrame with default indexing, namely, a single int64_t - * soma_joinid, returns the same as SOMAArray. For DataFrame with - * soma_joinid being a dim along with other dims (optional behavior), return - * the slot along that dim. For DataFrame with soma_joinid being an attr, - * not a dim at all, returns nnz(). + * While application-level SOMA DataFrame doesn't have shape + * and maxshape, these are important test-point accessors, + * as well as crucial for experiment-level resize within tiledbsoma.io. * * Note that the SOMA spec for SOMADataFrame mandates a .domain() accessor, - * which is distinct, and type-polymorphic. This shape accessor exists - * because people can and do call .shape() on SOMA DataFrames, and we have - * to keep letting them do that. + * which is distinct, and type-polymorphic. * - * @return int64_t + * @return std::optional + */ + std::optional maybe_soma_joinid_shape(); + + /** + * See comments for maybe_soma_joinid_shape. + * + * @return std::optional */ - std::vector shape(); + std::optional maybe_soma_joinid_maxshape(); }; } // namespace tiledbsoma diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index dbe3abe1e9..f160a0221b 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -461,7 +461,7 @@ TEST_CASE_METHOD( soma_dataframe->close(); soma_dataframe = open(OpenMode::write); - soma_dataframe->resize_soma_joinid_if_dim(std::vector({new_max})); + soma_dataframe->maybe_resize_soma_joinid(std::vector({new_max})); soma_dataframe->close(); soma_dataframe = open(OpenMode::write); @@ -512,8 +512,11 @@ TEST_CASE_METHOD( } // Check shape before write - int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + int64_t expect = dim_infos[0].dim_max + 1; + std::optional actual = soma_dataframe + ->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); @@ -531,8 +534,7 @@ TEST_CASE_METHOD( soma_dataframe = open(OpenMode::write); // Array not resizeable if it has not already been sized - REQUIRE_THROWS( - soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + REQUIRE_THROWS(soma_dataframe->maybe_resize_soma_joinid(new_shape)); soma_dataframe->close(); } else { @@ -542,22 +544,28 @@ TEST_CASE_METHOD( // Check shape after write soma_dataframe = open(OpenMode::read); expect = dim_infos[0].dim_max + 1; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + + std::optional actual = soma_dataframe + ->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); soma_dataframe = open(OpenMode::read); - REQUIRE_THROWS( - soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + REQUIRE_THROWS(soma_dataframe->maybe_resize_soma_joinid(new_shape)); soma_dataframe->close(); soma_dataframe = open(OpenMode::write); - soma_dataframe->resize_soma_joinid_if_dim(new_shape); + soma_dataframe->maybe_resize_soma_joinid(new_shape); soma_dataframe->close(); // Check shape after resize soma_dataframe = open(OpenMode::read); - expect = SOMA_JOINID_RESIZE_DIM_MAX; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + expect = SOMA_JOINID_RESIZE_DIM_MAX; // XXX MISSING A + 1 SOMEWHERE + actual = soma_dataframe->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); + soma_dataframe->close(); // Implicitly we expect no throw @@ -610,9 +618,11 @@ TEST_CASE_METHOD( } // Check shape before write - int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); - + int64_t expect = dim_infos[0].dim_max + 1; + std::optional actual = soma_dataframe + ->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); // Write @@ -620,8 +630,10 @@ TEST_CASE_METHOD( // Check shape after write soma_dataframe = open(OpenMode::read); - expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + expect = dim_infos[0].dim_max + 1; + actual = soma_dataframe->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); // Resize @@ -636,8 +648,7 @@ TEST_CASE_METHOD( soma_dataframe = open(OpenMode::write); // Array not resizeable if it has not already been sized - REQUIRE_THROWS( - soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + REQUIRE_THROWS(soma_dataframe->maybe_resize_soma_joinid(new_shape)); soma_dataframe->close(); } else { @@ -647,22 +658,26 @@ TEST_CASE_METHOD( // Check shape after write soma_dataframe = open(OpenMode::read); expect = dim_infos[0].dim_max + 1; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + std::optional actual = soma_dataframe + ->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); soma_dataframe = open(OpenMode::read); - REQUIRE_THROWS( - soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + REQUIRE_THROWS(soma_dataframe->maybe_resize_soma_joinid(new_shape)); soma_dataframe->close(); soma_dataframe = open(OpenMode::write); - soma_dataframe->resize_soma_joinid_if_dim(new_shape); + soma_dataframe->maybe_resize_soma_joinid(new_shape); soma_dataframe->close(); // Check shape after resize soma_dataframe = open(OpenMode::read); expect = SOMA_JOINID_RESIZE_DIM_MAX; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + actual = soma_dataframe->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); // Implicitly we expect no throw @@ -675,7 +690,8 @@ TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, "SOMADataFrame: variant-indexed dataframe dim-sjid-str attr-u32", "[SOMADataFrame]") { - auto use_current_domain = GENERATE(false, true); + // auto use_current_domain = GENERATE(false, true); + auto use_current_domain = GENERATE(false); std::ostringstream section; section << "- use_current_domain=" << use_current_domain; SECTION(section.str()) { @@ -719,9 +735,11 @@ TEST_CASE_METHOD( } // Check shape before write - int64_t expect = use_current_domain ? dim_infos[0].dim_max + 1 : 0; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); - + int64_t expect = dim_infos[0].dim_max + 1; + std::optional actual = soma_dataframe + ->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); // Write @@ -729,8 +747,10 @@ TEST_CASE_METHOD( // Check shape after write soma_dataframe = open(OpenMode::read); - expect = use_current_domain ? dim_infos[0].dim_max + 1 : 2; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + expect = dim_infos[0].dim_max + 1; + actual = soma_dataframe->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); // Resize @@ -745,8 +765,7 @@ TEST_CASE_METHOD( soma_dataframe = open(OpenMode::write); // Array not resizeable if it has not already been sized - REQUIRE_THROWS( - soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + REQUIRE_THROWS(soma_dataframe->maybe_resize_soma_joinid(new_shape)); soma_dataframe->close(); } else { @@ -756,22 +775,26 @@ TEST_CASE_METHOD( // Check shape after write soma_dataframe = open(OpenMode::read); expect = dim_infos[0].dim_max + 1; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + std::optional actual = soma_dataframe + ->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); soma_dataframe = open(OpenMode::read); - REQUIRE_THROWS( - soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + REQUIRE_THROWS(soma_dataframe->maybe_resize_soma_joinid(new_shape)); soma_dataframe->close(); soma_dataframe = open(OpenMode::write); - soma_dataframe->resize_soma_joinid_if_dim(new_shape); + soma_dataframe->maybe_resize_soma_joinid(new_shape); soma_dataframe->close(); // Check shape after resize soma_dataframe = open(OpenMode::read); expect = SOMA_JOINID_RESIZE_DIM_MAX; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + actual = soma_dataframe->maybe_soma_joinid_shape(); + REQUIRE(actual.has_value()); + REQUIRE(actual.value() == expect); soma_dataframe->close(); // Implicitly we expect no throw @@ -828,9 +851,9 @@ TEST_CASE_METHOD( } // Check shape before write - int64_t expect = 0; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); - + std::optional actual = soma_dataframe + ->maybe_soma_joinid_shape(); + REQUIRE(!actual.has_value()); soma_dataframe->close(); // Write @@ -838,8 +861,8 @@ TEST_CASE_METHOD( // Check shape after write soma_dataframe = open(OpenMode::read); - expect = 2; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + actual = soma_dataframe->maybe_soma_joinid_shape(); + REQUIRE(!actual.has_value()); soma_dataframe->close(); // Resize @@ -854,30 +877,29 @@ TEST_CASE_METHOD( soma_dataframe = open(OpenMode::write); // Array not resizeable if it has not already been sized - REQUIRE_THROWS( - soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + REQUIRE_THROWS(soma_dataframe->maybe_resize_soma_joinid(new_shape)); soma_dataframe->close(); } else { // Check shape after write soma_dataframe = open(OpenMode::read); - expect = 2; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + std::optional actual = soma_dataframe + ->maybe_soma_joinid_shape(); + REQUIRE(!actual.has_value()); soma_dataframe->close(); soma_dataframe = open(OpenMode::read); - REQUIRE_THROWS( - soma_dataframe->resize_soma_joinid_if_dim(new_shape)); + REQUIRE_THROWS(soma_dataframe->maybe_resize_soma_joinid(new_shape)); soma_dataframe->close(); soma_dataframe = open(OpenMode::write); - soma_dataframe->resize_soma_joinid_if_dim(new_shape); + soma_dataframe->maybe_resize_soma_joinid(new_shape); soma_dataframe->close(); // Check shape after resize -- noting soma_joinid is not a dim here soma_dataframe = open(OpenMode::read); - expect = 2; - REQUIRE(soma_dataframe->shape() == std::vector({expect})); + actual = soma_dataframe->maybe_soma_joinid_shape(); + REQUIRE(!actual.has_value()); soma_dataframe->close(); // Implicitly we expect no throw From 79ec0c8a8025715a6d9bdbb655aa651575a66b6e Mon Sep 17 00:00:00 2001 From: Dirk Eddelbuettel Date: Thu, 5 Sep 2024 12:31:09 -0500 Subject: [PATCH 09/11] [c++] Also check for leading slash when checking for 'relative' URLs (#2956) * [c++] Also check for leading slash when checking for 'relative' URLs * [c++] clang-format --- libtiledbsoma/src/soma/soma_group.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_group.cc b/libtiledbsoma/src/soma/soma_group.cc index e62a259268..8d174b39ad 100644 --- a/libtiledbsoma/src/soma/soma_group.cc +++ b/libtiledbsoma/src/soma/soma_group.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2023 TileDB, Inc. + * @copyright Copyright (c) 2023-2024 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -220,7 +220,8 @@ void SOMAGroup::set( const std::string& soma_type) { bool relative = uri_type == URIType::relative; if (uri_type == URIType::automatic) { - relative = uri.find("://") != std::string::npos; + relative = !( + (uri.find("://") != std::string::npos) || (uri.find("/") == 0)); } group_->add_member(uri, relative, name); members_map_[name] = SOMAGroupEntry(uri, soma_type); From dd47db95aae5ddaa57d0b864eb6f1b237d7efdb8 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Thu, 5 Sep 2024 17:59:40 -0400 Subject: [PATCH 10/11] [python/r] Expose shape-related accessors to Python/R bindings (#2953) * [python/r] Expose shape-related accessors from C++ to bindings * code-review feedback * code-review feedback --- .../python/src/tiledbsoma/_common_nd_array.py | 22 ++ apis/python/src/tiledbsoma/_dataframe.py | 34 +++ apis/python/src/tiledbsoma/_tdb_handles.py | 84 ++++++- apis/python/src/tiledbsoma/soma_dataframe.cc | 10 +- .../src/tiledbsoma/soma_dense_ndarray.cc | 7 +- .../src/tiledbsoma/soma_sparse_ndarray.cc | 7 +- apis/python/tests/test_dataframe.py | 7 + apis/python/tests/test_shape.py | 211 ++++++++++++++++++ apis/python/tests/test_sparse_nd_array.py | 8 + apis/r/R/RcppExports.R | 16 ++ apis/r/R/SOMADataFrame.R | 24 +- apis/r/R/SOMANDArrayBase.R | 13 ++ apis/r/R/TileDBArray.R | 11 + apis/r/src/RcppExports.cpp | 52 +++++ apis/r/src/rinterface.cpp | 44 ++++ .../r/tests/testthat/test-SOMASparseNDArray.R | 11 + apis/r/tests/testthat/test-shape.r | 100 +++++++++ libtiledbsoma/src/soma/soma_array.h | 12 + libtiledbsoma/src/soma/soma_dataframe.cc | 8 + libtiledbsoma/src/soma/soma_dataframe.h | 56 +++++ libtiledbsoma/test/unit_soma_collection.cc | 1 + .../test/unit_soma_sparse_ndarray.cc | 2 + 22 files changed, 732 insertions(+), 8 deletions(-) create mode 100644 apis/python/tests/test_shape.py create mode 100644 apis/r/tests/testthat/test-shape.r diff --git a/apis/python/src/tiledbsoma/_common_nd_array.py b/apis/python/src/tiledbsoma/_common_nd_array.py index 5525220012..28632c5503 100644 --- a/apis/python/src/tiledbsoma/_common_nd_array.py +++ b/apis/python/src/tiledbsoma/_common_nd_array.py @@ -95,6 +95,28 @@ def shape(self) -> Tuple[int, ...]: """ return cast(Tuple[int, ...], tuple(self._handle.shape)) + @property + def maxshape(self) -> Tuple[int, ...]: + """Returns the maximum resizable capacity of each dimension, always a list of length + ``ndim``. This will not necessarily match the bounds of occupied cells within the array. + It is the upper limit for ``resize`` on the array. + + Lifecycle: + Maturing. + """ + return cast(Tuple[int, ...], tuple(self._handle.maxshape)) + + @property + def has_upgraded_shape(self) -> bool: + """Returns true if the array has the upgraded resizeable shape feature + from TileDB-SOMA 1.14: the array was created with this support, or it has + had ``.upgrade_shape`` applied to it. + + Lifecycle: + Maturing. + """ + return self._handle.has_upgraded_shape + @classmethod def _dim_capacity_and_extent( cls, diff --git a/apis/python/src/tiledbsoma/_dataframe.py b/apis/python/src/tiledbsoma/_dataframe.py index 84e0aa34e0..5f6373cb71 100644 --- a/apis/python/src/tiledbsoma/_dataframe.py +++ b/apis/python/src/tiledbsoma/_dataframe.py @@ -317,6 +317,40 @@ def count(self) -> int: # if is it in read open mode, then it is a DataFrameWrapper return cast(DataFrameWrapper, self._handle).count + @property + def _maybe_soma_joinid_shape(self) -> Optional[int]: + """An internal helper method that returns the shape + value along the ``soma_joinid`` index column, if the ``DataFrame + has one, else ``None``. + + + Lifecycle: + Experimental. + """ + return self._handle.maybe_soma_joinid_shape + + @property + def _maybe_soma_joinid_maxshape(self) -> Optional[int]: + """An internal helper method that returns the maxshape + value along the ``soma_joinid`` index column, if the ``DataFrame + has one, else ``None``. + + Lifecycle: + Experimental. + """ + return self._handle.maybe_soma_joinid_maxshape + + @property + def has_upgraded_domain(self) -> bool: + """Returns true if the array has the upgraded resizeable domain feature + from TileDB-SOMA 1.14: the array was created with this support, or it has + had ``.upgrade_domain`` applied to it. + + Lifecycle: + Maturing. + """ + return self._handle.has_upgraded_domain + def __len__(self) -> int: """Returns the number of rows in the dataframe. Same as ``df.count``.""" return self.count diff --git a/apis/python/src/tiledbsoma/_tdb_handles.py b/apis/python/src/tiledbsoma/_tdb_handles.py index edc2f4ed76..2c02ec2169 100644 --- a/apis/python/src/tiledbsoma/_tdb_handles.py +++ b/apis/python/src/tiledbsoma/_tdb_handles.py @@ -406,7 +406,33 @@ def dim_names(self) -> Tuple[str, ...]: @property def shape(self) -> Tuple[int, ...]: - return tuple(self._handle.shape) + """Not implemented for DataFrame.""" + return cast(Tuple[int, ...], tuple(self._handle.shape)) + + @property + def maxshape(self) -> Tuple[int, ...]: + """Not implemented for DataFrame.""" + return cast(Tuple[int, ...], tuple(self._handle.maxshape)) + + @property + def maybe_soma_joinid_shape(self) -> Optional[int]: + """Only implemented for DataFrame.""" + raise NotImplementedError + + @property + def maybe_soma_joinid_maxshape(self) -> Optional[int]: + """Only implemented for DataFrame.""" + raise NotImplementedError + + @property + def has_upgraded_shape(self) -> bool: + """Not implemented for DataFrame.""" + raise NotImplementedError + + @property + def has_upgraded_domain(self) -> bool: + """Only implemented for DataFrame.""" + raise NotImplementedError class DataFrameWrapper(SOMAArrayWrapper[clib.SOMADataFrame]): @@ -422,9 +448,37 @@ def write(self, values: pa.RecordBatch) -> None: self._handle.write(values) @property - def shape(self) -> Tuple[int, ...]: - # Shape is not implemented for DataFrames - raise NotImplementedError + def maybe_soma_joinid_shape(self) -> Optional[int]: + """Return the shape slot for the soma_joinid dim, if the array has one. + This is an important test-point and dev-internal access-point, + in particular, for the tiledbsoma-io experiment-level resizer. + + Lifecycle: + Maturing. + """ + return cast(Optional[int], self._handle.maybe_soma_joinid_shape) + + @property + def maybe_soma_joinid_maxshape(self) -> Optional[int]: + """Return the maxshape slot for the soma_joinid dim, if the array has one. + This is an important test-point and dev-internal access-point, + in particular, for the tiledbsoma-io experiment-level resizer. + + Lifecycle: + Maturing. + """ + return cast(Optional[int], self._handle.maybe_soma_joinid_maxshape) + + @property + def has_upgraded_domain(self) -> bool: + """Returns true if the array has the upgraded resizeable domain feature + from TileDB-SOMA 1.14: the array was created with this support, or it has + had ``.upgrade_domain`` applied to it. + + Lifecycle: + Maturing. + """ + return cast(bool, self._handle.has_upgraded_domain) class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]): @@ -432,6 +486,17 @@ class DenseNDArrayWrapper(SOMAArrayWrapper[clib.SOMADenseNDArray]): _ARRAY_WRAPPED_TYPE = clib.SOMADenseNDArray + @property + def has_upgraded_shape(self) -> bool: + """Returns true if the array has the upgraded resizeable shape feature + from TileDB-SOMA 1.14: the array was created with this support, or it has + had ``.upgrade_shape`` applied to it. + + Lifecycle: + Maturing. + """ + return cast(bool, self._handle.has_upgraded_shape) + class SparseNDArrayWrapper(SOMAArrayWrapper[clib.SOMASparseNDArray]): """Wrapper around a Pybind11 SparseNDArrayWrapper handle.""" @@ -442,6 +507,17 @@ class SparseNDArrayWrapper(SOMAArrayWrapper[clib.SOMASparseNDArray]): def nnz(self) -> int: return int(self._handle.nnz()) + @property + def has_upgraded_shape(self) -> bool: + """Returns true if the array has the upgraded resizeable shape feature + from TileDB-SOMA 1.14: the array was created with this support, or it has + had ``.upgrade_shape`` applied to it. + + Lifecycle: + Maturing. + """ + return cast(bool, self._handle.has_upgraded_shape) + class _DictMod(enum.Enum): """State machine to keep track of modifications to a dictionary. diff --git a/apis/python/src/tiledbsoma/soma_dataframe.cc b/apis/python/src/tiledbsoma/soma_dataframe.cc index 30ba59541e..0716a419f5 100644 --- a/apis/python/src/tiledbsoma/soma_dataframe.cc +++ b/apis/python/src/tiledbsoma/soma_dataframe.cc @@ -144,9 +144,17 @@ void load_soma_dataframe(py::module& m) { .def_static("exists", &SOMADataFrame::exists) .def_property_readonly( "index_column_names", &SOMADataFrame::index_column_names) + .def_property_readonly( "count", &SOMADataFrame::count, - py::call_guard()); + py::call_guard()) + .def_property_readonly( + "maybe_soma_joinid_shape", &SOMADataFrame::maybe_soma_joinid_shape) + .def_property_readonly( + "maybe_soma_joinid_maxshape", + &SOMADataFrame::maybe_soma_joinid_maxshape) + .def_property_readonly( + "has_upgraded_domain", &SOMAArray::has_current_domain); } } // namespace libtiledbsomacpp diff --git a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc index 0a8d8c3555..bd5a3b42b4 100644 --- a/apis/python/src/tiledbsoma/soma_dense_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_dense_ndarray.cc @@ -124,6 +124,11 @@ void load_soma_dense_ndarray(py::module& m) { .def_static("exists", &SOMADenseNDArray::exists) - .def("write", write); + .def("write", write) + + .def_property_readonly("shape", &SOMADenseNDArray::shape) + .def_property_readonly("maxshape", &SOMADenseNDArray::maxshape) + .def_property_readonly( + "has_upgraded_shape", &SOMAArray::has_current_domain); } } // namespace libtiledbsomacpp diff --git a/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc b/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc index 40bd4c8b43..bb45120f4e 100644 --- a/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc +++ b/apis/python/src/tiledbsoma/soma_sparse_ndarray.cc @@ -110,6 +110,11 @@ void load_soma_sparse_ndarray(py::module& m) { "result_order"_a = ResultOrder::automatic, "timestamp"_a = py::none()) - .def_static("exists", &SOMASparseNDArray::exists); + .def_static("exists", &SOMASparseNDArray::exists) + + .def_property_readonly("shape", &SOMASparseNDArray::shape) + .def_property_readonly("maxshape", &SOMASparseNDArray::maxshape) + .def_property_readonly( + "has_upgraded_shape", &SOMAArray::has_current_domain); } } // namespace libtiledbsomacpp diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py index 615a6ba4cd..edb5c6cfc8 100644 --- a/apis/python/tests/test_dataframe.py +++ b/apis/python/tests/test_dataframe.py @@ -90,9 +90,16 @@ def test_dataframe(tmp_path, arrow_schema): assert sdf.count == 5 assert len(sdf) == 5 + # More to come on https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + assert not sdf.has_upgraded_domain + with pytest.raises(AttributeError): assert sdf.shape is None + # soma_joinid is not a dim here + assert sdf._maybe_soma_joinid_shape is None + assert sdf._maybe_soma_joinid_maxshape is None + # Read all table = sdf.read().concat() assert table.num_rows == 5 diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py new file mode 100644 index 0000000000..42c27f2701 --- /dev/null +++ b/apis/python/tests/test_shape.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import pyarrow as pa +import pytest + +import tiledbsoma + +from tests._util import maybe_raises + + +@pytest.mark.parametrize( + "element_dtype", + [ + pa.float64(), + pa.float32(), + pa.int64(), + pa.uint16(), + ], +) +@pytest.mark.parametrize( + "shape_exc", + [ + # Note: non-None exceptions are coming on https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + [(100,), None], + [(100, 200), None], + [(100, 200, 300), None], + ], +) +def test_sparse_nd_array_basics( + tmp_path, + element_dtype, + shape_exc, +): + uri = tmp_path.as_posix() + arg_shape, arg_create_exc = shape_exc + ndim = len(arg_shape) + + # Create the array + with maybe_raises(arg_create_exc): + snda = tiledbsoma.SparseNDArray.create( + uri, + type=element_dtype, + shape=arg_shape, + ) + if arg_create_exc is not None: + return + + assert tiledbsoma.SparseNDArray.exists(uri) + + # Test the various accessors + with tiledbsoma.SparseNDArray.open(uri) as snda: + + assert snda.shape == arg_shape + + # More to come on https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + assert not snda.has_upgraded_shape + + # Before current-domain support: shape is maxshape. + # + # With current-domain support: We expect the maxshape to be set to a big + # signed int32. (There are details on the exact value of that number, + # involving R compatibility, and leaving room for a single tile + # capacity, etc ... we could check for some magic value but it suffices + # to check that it's over 2 billion.) + assert snda.shape == snda.maxshape + # for e in snda.maxshape: + # assert e > 2_000_000_000 + + # No data have been written for this test case + assert snda.non_empty_domain() == tuple([(0, 0)] * ndim) + + # soma_dim_0: (0,1) + # soma_dim_1: (2,3) + # soma_dim_2: (4,5) + coords = [] + dim_names = [] + for i in range(ndim): + dim_names.append(f"soma_dim_{i}") + coords.append((2 * i, 2 * i + 1)) + coords = tuple(coords) + + # Write some data + with tiledbsoma.SparseNDArray.open(uri, "w") as snda: + dikt = {"soma_data": [4, 5]} + for i in range(ndim): + dikt[dim_names[i]] = coords[i] + table = pa.Table.from_pydict(dikt) + snda.write(table) + + # Test the various accessors + with tiledbsoma.SparseNDArray.open(uri) as snda: + assert snda.shape == arg_shape + # This will change with current-domain support + assert snda.shape == snda.maxshape + # for e in snda.maxshape: + # assert e > 2_000_000_000 + assert snda.non_empty_domain() == coords + + # Test reads out of bounds + with tiledbsoma.SparseNDArray.open(uri) as snda: + # TODO: make sure this doesn't come through as RuntimeError + # https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + with pytest.raises((RuntimeError, ValueError)): + coords = tuple(arg_shape[i] + 10 for i in range(ndim)) + snda.read(coords).tables().concat() + + # Test writes out of bounds + with tiledbsoma.SparseNDArray.open(uri, "w") as snda: + with pytest.raises(tiledbsoma.SOMAError): + dikt = {"soma_data": [30]} + dikt = {name: [shape + 20] for name, shape in zip(dim_names, arg_shape)} + table = pa.Table.from_pydict(dikt) + snda.write(table) + + with tiledbsoma.SparseNDArray.open(uri) as snda: + assert snda.shape == arg_shape + assert snda.shape == snda.maxshape + + +## Pending 2.27 timeframe for dense support for current domain, including resize +## TODO: mark these with a linked GitHub tracking issue +def test_dense_nd_array_basics(tmp_path): + uri = tmp_path.as_posix() + shape = (100, 200) + tiledbsoma.DenseNDArray.create(uri, type=pa.float64(), shape=shape) + + with tiledbsoma.DenseNDArray.open(uri) as dnda: + assert dnda.shape == (100, 200) + assert dnda.maxshape == (100, 200) + + assert dnda.non_empty_domain() == ((0, 0), (0, 0)) + + +@pytest.mark.parametrize( + "soma_joinid_domain", + [ + # TODO: https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + # None, + (0, 1), + (0, 3), + (0, 100), + ], +) +@pytest.mark.parametrize( + "index_column_names", + [ + ["soma_joinid"], + ["soma_joinid", "myint"], + ["soma_joinid", "mystring"], + ["mystring", "myint"], + ], +) +def test_dataframe_basics(tmp_path, soma_joinid_domain, index_column_names): + uri = tmp_path.as_posix() + + schema = pa.schema( + [ + ("soma_joinid", pa.int64()), + ("mystring", pa.string()), + ("myint", pa.int16()), + ("myfloat", pa.float32()), + ] + ) + + data = pa.Table.from_pydict( + { + "soma_joinid": [0, 1, 2, 3], + "mystring": ["a", "b", "a", "b"], + "myint": [20, 30, 40, 50], + "myfloat": [1.0, 2.5, 4.0, 5.5], + } + ) + + domain_slots = { + "soma_joinid": soma_joinid_domain, + "mystring": None, + "myint": (-1000, 1000), + "myfloat": (-999.5, 999.5), + } + + domain = tuple([domain_slots[name] for name in index_column_names]) + + soma_joinid_coords = data["soma_joinid"] + oob_write = any( + e.as_py() < soma_joinid_domain[0] or e.as_py() > soma_joinid_domain[1] + for e in soma_joinid_coords + ) + oob_write = oob_write and "soma_joinid" in index_column_names + + with tiledbsoma.DataFrame.create( + uri, + schema=schema, + index_column_names=index_column_names, + domain=domain, + ) as sdf: + if oob_write: + with pytest.raises(tiledbsoma.SOMAError): + sdf.write(data) + else: + sdf.write(data) + + with tiledbsoma.DataFrame.open(uri) as sdf: + has_sjid_dim = "soma_joinid" in index_column_names + if has_sjid_dim: + assert sdf._maybe_soma_joinid_shape == 1 + soma_joinid_domain[1] + assert sdf._maybe_soma_joinid_maxshape == 1 + soma_joinid_domain[1] + else: + assert sdf._maybe_soma_joinid_shape is None + assert sdf._maybe_soma_joinid_maxshape is None + + assert len(sdf.non_empty_domain()) == len(index_column_names) diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py index 1ccc597020..e043e925ba 100644 --- a/apis/python/tests/test_sparse_nd_array.py +++ b/apis/python/tests/test_sparse_nd_array.py @@ -51,6 +51,14 @@ def test_sparse_nd_array_create_ok( assert a.uri == tmp_path.as_posix() assert a.ndim == len(shape) assert a.shape == tuple(shape) + + # TODO: more testing with current-domain feature integrated + # https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + assert isinstance(a.maxshape, tuple) + assert len(a.maxshape) == len(a.shape) + for ms, s in zip(a.maxshape, a.shape): + assert ms >= s + assert a.is_sparse is True assert a.schema is not None diff --git a/apis/r/R/RcppExports.R b/apis/r/R/RcppExports.R index 4f20189acf..8b0adbd287 100644 --- a/apis/r/R/RcppExports.R +++ b/apis/r/R/RcppExports.R @@ -126,6 +126,22 @@ shape <- function(uri, config = NULL) { .Call(`_tiledbsoma_shape`, uri, config) } +maxshape <- function(uri, config = NULL) { + .Call(`_tiledbsoma_maxshape`, uri, config) +} + +maybe_soma_joinid_shape <- function(uri, config = NULL) { + .Call(`_tiledbsoma_maybe_soma_joinid_shape`, uri, config) +} + +maybe_soma_joinid_maxshape <- function(uri, config = NULL) { + .Call(`_tiledbsoma_maybe_soma_joinid_maxshape`, uri, config) +} + +has_current_domain <- function(uri, config = NULL) { + .Call(`_tiledbsoma_has_current_domain`, uri, config) +} + #' Iterator-Style Access to SOMA Array via SOMAArray #' #' The `sr_*` functions provide low-level access to an instance of the SOMAArray diff --git a/apis/r/R/SOMADataFrame.R b/apis/r/R/SOMADataFrame.R index 62877e6a5f..3ac205f9d1 100644 --- a/apis/r/R/SOMADataFrame.R +++ b/apis/r/R/SOMADataFrame.R @@ -329,7 +329,29 @@ SOMADataFrame <- R6::R6Class( shape = function() stop(errorCondition( "'SOMADataFrame$shape()' is not implemented yet", class = 'notYetImplementedError' - )) + )), + + #' @description Retrieve the maxshape; as \code{SOMADataFrames} are shapeless, + #' simply raises an error + #' + #' @return None, instead a \code{\link{.NotYetImplemented}()} error is raised + #' + maxshape = function() stop(errorCondition( + "'SOMADataFrame$maxshape()' is not implemented", + class = 'notYetImplementedError' + )), + + #' @description Returns TRUE if the array has the upgraded resizeable domain + #' feature from TileDB-SOMA 1.14: the array was created with this support, + #' or it has had ``upgrade_domain`` applied to it. + #' (lifecycle: maturing) + #' @return Logical + has_upgraded_domain = function() { + has_current_domain( + self$uri, + config=as.character(tiledb::config(self$tiledbsoma_ctx$context())) + ) + } ), diff --git a/apis/r/R/SOMANDArrayBase.R b/apis/r/R/SOMANDArrayBase.R index a52fa13235..b2a3adcfb0 100644 --- a/apis/r/R/SOMANDArrayBase.R +++ b/apis/r/R/SOMANDArrayBase.R @@ -84,7 +84,20 @@ SOMANDArrayBase <- R6::R6Class( set_data_type = function(type) { spdl::debug("[SOMANDArrayBase::set_data_type] caching type {}", type$ToString()) private$.type <- type + }, + + #' @description Returns TRUE if the array has the upgraded resizeable shape + #' feature from TileDB-SOMA 1.14: the array was created with this support, + #' or it has had ``upgrade_domain`` applied to it. + #' (lifecycle: maturing) + #' @return Logical + has_upgraded_shape = function() { + has_current_domain( + self$uri, + config=as.character(tiledb::config(self$tiledbsoma_ctx$context())) + ) } + ), private = list( diff --git a/apis/r/R/TileDBArray.R b/apis/r/R/TileDBArray.R index 9ebba1a873..84c6bd358a 100644 --- a/apis/r/R/TileDBArray.R +++ b/apis/r/R/TileDBArray.R @@ -158,6 +158,17 @@ TileDBArray <- R6::R6Class( )) }, + #' @description Retrieve the maxshape, i.e. maximum possible that the + # shape can be resized up to. + #' (lifecycle: maturing) + #' @return A named vector of dimension length (and the same type as the dimension) + maxshape = function() { + as.integer64(maxshape( + self$uri, + config=as.character(tiledb::config(self$tiledbsoma_ctx$context())) + )) + }, + #' @description Retrieve the range of indexes for a dimension that were #' explicitly written. This method is deprecated as of TileDB-SOMA 1.13, and will be #' removed in TileDB-SOMA 1.14. diff --git a/apis/r/src/RcppExports.cpp b/apis/r/src/RcppExports.cpp index 4021362a23..1ffea59301 100644 --- a/apis/r/src/RcppExports.cpp +++ b/apis/r/src/RcppExports.cpp @@ -260,6 +260,54 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// maxshape +Rcpp::NumericVector maxshape(const std::string& uri, Rcpp::Nullable config); +RcppExport SEXP _tiledbsoma_maxshape(SEXP uriSEXP, SEXP configSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type config(configSEXP); + rcpp_result_gen = Rcpp::wrap(maxshape(uri, config)); + return rcpp_result_gen; +END_RCPP +} +// maybe_soma_joinid_shape +Rcpp::NumericVector maybe_soma_joinid_shape(const std::string& uri, Rcpp::Nullable config); +RcppExport SEXP _tiledbsoma_maybe_soma_joinid_shape(SEXP uriSEXP, SEXP configSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type config(configSEXP); + rcpp_result_gen = Rcpp::wrap(maybe_soma_joinid_shape(uri, config)); + return rcpp_result_gen; +END_RCPP +} +// maybe_soma_joinid_maxshape +Rcpp::NumericVector maybe_soma_joinid_maxshape(const std::string& uri, Rcpp::Nullable config); +RcppExport SEXP _tiledbsoma_maybe_soma_joinid_maxshape(SEXP uriSEXP, SEXP configSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type config(configSEXP); + rcpp_result_gen = Rcpp::wrap(maybe_soma_joinid_maxshape(uri, config)); + return rcpp_result_gen; +END_RCPP +} +// has_current_domain +Rcpp::LogicalVector has_current_domain(const std::string& uri, Rcpp::Nullable config); +RcppExport SEXP _tiledbsoma_has_current_domain(SEXP uriSEXP, SEXP configSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type uri(uriSEXP); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type config(configSEXP); + rcpp_result_gen = Rcpp::wrap(has_current_domain(uri, config)); + return rcpp_result_gen; +END_RCPP +} // sr_setup Rcpp::List sr_setup(const std::string& uri, Rcpp::CharacterVector config, Rcpp::Nullable colnames, Rcpp::Nullable> qc, Rcpp::Nullable dim_points, Rcpp::Nullable dim_ranges, std::string batch_size, std::string result_order, Rcpp::Nullable timestamprange, const std::string& loglevel); RcppExport SEXP _tiledbsoma_sr_setup(SEXP uriSEXP, SEXP configSEXP, SEXP colnamesSEXP, SEXP qcSEXP, SEXP dim_pointsSEXP, SEXP dim_rangesSEXP, SEXP batch_sizeSEXP, SEXP result_orderSEXP, SEXP timestamprangeSEXP, SEXP loglevelSEXP) { @@ -449,6 +497,10 @@ static const R_CallMethodDef CallEntries[] = { {"_tiledbsoma_check_arrow_schema_tag", (DL_FUNC) &_tiledbsoma_check_arrow_schema_tag, 1}, {"_tiledbsoma_check_arrow_array_tag", (DL_FUNC) &_tiledbsoma_check_arrow_array_tag, 1}, {"_tiledbsoma_shape", (DL_FUNC) &_tiledbsoma_shape, 2}, + {"_tiledbsoma_maxshape", (DL_FUNC) &_tiledbsoma_maxshape, 2}, + {"_tiledbsoma_maybe_soma_joinid_shape", (DL_FUNC) &_tiledbsoma_maybe_soma_joinid_shape, 2}, + {"_tiledbsoma_maybe_soma_joinid_maxshape", (DL_FUNC) &_tiledbsoma_maybe_soma_joinid_maxshape, 2}, + {"_tiledbsoma_has_current_domain", (DL_FUNC) &_tiledbsoma_has_current_domain, 2}, {"_tiledbsoma_sr_setup", (DL_FUNC) &_tiledbsoma_sr_setup, 10}, {"_tiledbsoma_sr_complete", (DL_FUNC) &_tiledbsoma_sr_complete, 1}, {"_tiledbsoma_create_empty_arrow_table", (DL_FUNC) &_tiledbsoma_create_empty_arrow_table, 0}, diff --git a/apis/r/src/rinterface.cpp b/apis/r/src/rinterface.cpp index d5a919c7cc..9437276625 100644 --- a/apis/r/src/rinterface.cpp +++ b/apis/r/src/rinterface.cpp @@ -234,3 +234,47 @@ Rcpp::NumericVector shape(const std::string& uri, auto sr = tdbs::SOMAArray::open(OpenMode::read, uri, "unnamed", config_vector_to_map(Rcpp::wrap(config))); return Rcpp::toInteger64(sr->shape()); } + +// [[Rcpp::export]] +Rcpp::NumericVector maxshape(const std::string& uri, + Rcpp::Nullable config = R_NilValue) { + auto sr = tdbs::SOMAArray::open(OpenMode::read, uri, "unnamed", config_vector_to_map(Rcpp::wrap(config))); + return Rcpp::toInteger64(sr->maxshape()); +} + +// [[Rcpp::export]] +Rcpp::NumericVector maybe_soma_joinid_shape(const std::string& uri, + Rcpp::Nullable config = R_NilValue) { + // Pro-tip: + // * Open with mode and uri gives a SOMAArray. + // * Open with uri and mode gives a SOMADataFrame. + // This was done intentionally to resolve an ambiguous-overload compiler error. + auto sr = tdbs::SOMADataFrame::open(uri, OpenMode::read, "unnamed", config_vector_to_map(Rcpp::wrap(config))); + auto retval = sr->maybe_soma_joinid_shape(); + if (retval.has_value()) { + return Rcpp::toInteger64(retval.value()); + } else { + // We use this sentinel to facilitate a pure-R return value of NA. + return Rcpp::toInteger64(-1); + } +} + +// [[Rcpp::export]] +Rcpp::NumericVector maybe_soma_joinid_maxshape(const std::string& uri, + Rcpp::Nullable config = R_NilValue) { + auto sr = tdbs::SOMADataFrame::open(uri, OpenMode::read, "unnamed", config_vector_to_map(Rcpp::wrap(config))); + auto retval = sr->maybe_soma_joinid_maxshape(); + if (retval.has_value()) { + return Rcpp::toInteger64(retval.value()); + } else { + // We use this sentinel to facilitate a pure-R return value of NA. + return Rcpp::toInteger64(-1); + } +} + +// [[Rcpp::export]] +Rcpp::LogicalVector has_current_domain(const std::string& uri, + Rcpp::Nullable config = R_NilValue) { + auto sr = tdbs::SOMAArray::open(OpenMode::read, uri, "unnamed", config_vector_to_map(Rcpp::wrap(config))); + return Rcpp::LogicalVector(sr->has_current_domain()); +} diff --git a/apis/r/tests/testthat/test-SOMASparseNDArray.R b/apis/r/tests/testthat/test-SOMASparseNDArray.R index a2d2881e74..9474f43ac6 100644 --- a/apis/r/tests/testthat/test-SOMASparseNDArray.R +++ b/apis/r/tests/testthat/test-SOMASparseNDArray.R @@ -59,6 +59,17 @@ test_that("SOMASparseNDArray creation", { ## shape expect_equal(ndarray$shape(), as.integer64(c(10, 10))) + ## maxshape + # TODO: more testing with current-domain feature integrated + # https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + expect_false(ndarray$has_upgraded_shape()) + shape <- ndarray$shape() + maxshape <- ndarray$maxshape() + expect_equal(length(shape), length(maxshape)) + for (i in 1:length(shape)) { + expect(maxshape[[i]] >= shape[[i]]) + } + ## ndim expect_equal(ndarray$ndim(), 2L) diff --git a/apis/r/tests/testthat/test-shape.r b/apis/r/tests/testthat/test-shape.r new file mode 100644 index 0000000000..1dc3626577 --- /dev/null +++ b/apis/r/tests/testthat/test-shape.r @@ -0,0 +1,100 @@ +test_that("SOMADataFrame shape", { + #uri <- withr::local_tempdir("soma-dataframe-shape") + uri <- "/tmp/fooze" + asch <- create_arrow_schema() + + index_column_name_choices = list( + "soma_joinid", + c("soma_joinid", "foo"), + c("soma_joinid", "baz"), + c("baz", "foo") + ) + + for (index_column_names in index_column_name_choices) { + has_soma_joinid_dim <- "soma_joinid" %in% index_column_names + + if (dir.exists(uri)) unlink(uri, recursive=TRUE) + + sdf <- SOMADataFrameCreate(uri, asch, index_column_names = index_column_names) + expect_true(sdf$exists()) + expect_true(dir.exists(uri)) + + tbl0 <- arrow::arrow_table(foo = 1L:4L, + soma_joinid = 1L:4L, + bar = 1.1:4.1, + baz = c("apple", "ball", "cat", "dog"), + schema = asch) + + sdf$write(tbl0) + sdf$close() + + sdf <- SOMADataFrameOpen(uri) + + expect_false(sdf$has_upgraded_domain()) + expect_error(sdf$shape(), class = "notYetImplementedError") + expect_error(sdf$maxshape(), class = "notYetImplementedError") + + # Not implemented this way per + # https://github.com/single-cell-data/TileDB-SOMA/pull/2953#discussion_r1746125089 + # sjid_shape <- sdf$.maybe_soma_joinid_shape() + # sjid_maxshape <- sdf$.maybe_soma_joinid_maxshape() + sjid_shape <- maybe_soma_joinid_shape(sdf$uri, config = as.character(tiledb::config(sdf$tiledbsoma_ctx$context()))) + sjid_maxshape <- maybe_soma_joinid_maxshape(sdf$uri, config = as.character(tiledb::config(sdf$tiledbsoma_ctx$context()))) + + if (has_soma_joinid_dim) { + # More testing to come on + # https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + expect_false(sjid_shape == -1) + expect_false(sjid_maxshape == -1) + } else { + expect_true(sjid_shape == -1) + expect_true(sjid_maxshape == -1) + } + + sdf$close() + + rm(sdf, tbl0) + + gc() + } +}) + +test_that("SOMASparseNDArray shape", { + uri <- withr::local_tempdir("soma-sparse-ndarray-shape") + asch <- create_arrow_schema() + + element_type_choices = list(arrow::float32(), arrow::int16()) + shape_choices = list( + c(100), + c(100,200), + c(100,200,300) + ) + for (element_type in element_type_choices) { + for (shape in shape_choices) { + + if (dir.exists(uri)) unlink(uri, recursive=TRUE) + ndarray <- SOMASparseNDArrayCreate(uri, element_type, shape = shape) + + expect_equal(ndarray$ndim(), length(shape)) + + expect_equal(ndarray$shape(), as.integer64(shape)) + + # More generally after current-domain support: + readback_shape <- ndarray$shape() + readback_maxshape <- ndarray$maxshape() + expect_equal(length(readback_shape), length(readback_maxshape)) + for (i in 1:length(shape)) { + s <- as.integer(readback_shape[[i]]) + ms <- as.integer(readback_maxshape[[i]]) + expect_true(s <= ms) + } + + # Resize tests upcoming on + # https://github.com/single-cell-data/TileDB-SOMA/issues/2407 + ndarray$close() + + rm(ndarray) + gc() + } + } +}) diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 69d0494c4f..1c2fe61a5d 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -824,6 +824,18 @@ class SOMAArray : public SOMAObject { return _get_current_domain(); } + /** + * @brief Returns true if the array has a non-empty current domain, else + * false. Note that at the core level it's "current domain" for all arrays; + * at the SOMA-API level it's "upgraded_shape" for SOMASparseNDArray and + * SOMADenseNDArray, and "upgraded_domain" for SOMADataFrame; here + * we use the core language and defer to Python/R to conform to + * SOMA-API syntax. + */ + bool has_current_domain() { + return !_get_current_domain().is_empty(); + } + protected: // These two are for use nominally by SOMADataFrame. This could be moved in // its entirety to SOMADataFrame, but it would entail moving several diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index a65773a81e..2900e152a5 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -68,6 +68,14 @@ std::unique_ptr SOMADataFrame::open( mode, uri, ctx, column_names, result_order, timestamp); } +std::unique_ptr SOMADataFrame::open( + std::string_view uri, + OpenMode mode, + std::string_view name, + std::map platform_config) { + return std::make_unique(mode, uri, name, platform_config); +} + bool SOMADataFrame::exists( std::string_view uri, std::shared_ptr ctx) { try { diff --git a/libtiledbsoma/src/soma/soma_dataframe.h b/libtiledbsoma/src/soma/soma_dataframe.h index 231e578baf..0ba3c53e78 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.h +++ b/libtiledbsoma/src/soma/soma_dataframe.h @@ -71,6 +71,13 @@ class SOMADataFrame : public SOMAArray { /** * @brief Open and return a SOMADataFrame object at the given URI. * + * Note: the first two arguments uri and mode are reversed from + * the SOMAArrayConstructor. This is an intentional decision to + * avoid ambiguous-overload compiler errors. Even though + * SOMADataFrame extends SOMAArray, callers using open + * and wishing to obtain a SOMADataFrame rather than a SOMAArray + * are advised to place the uri argument before the mode argument. + * * @param uri URI to create the SOMADataFrame * @param mode read or write * @param ctx SOMAContext @@ -91,6 +98,26 @@ class SOMADataFrame : public SOMAArray { ResultOrder result_order = ResultOrder::automatic, std::optional timestamp = std::nullopt); + /** + * @brief Open and return a SOMADataFrame object at the given URI. + * + * This is nominally for TileDB-SOMA R use, since while we have + * TileDB-SOMA-R and TileDB-R co-existing, it's less desirable + * to pass ctx from one copy of core to another, and more + * desirable to pass a config map. + * + * @param uri URI to create the SOMADataFrame + * @param mode read or write + * @param name Name of the array + * @param platform_config Config parameter dictionary + * @return std::unique_ptr SOMADataFrame + */ + static std::unique_ptr open( + std::string_view uri, + OpenMode mode, + std::string_view name, + std::map platform_config); + /** * @brief Check if the SOMADataFrame exists at the URI. * @@ -132,6 +159,35 @@ class SOMADataFrame : public SOMAArray { timestamp) { } + /** + * @brief Construct a new SOMADataFrame object. + * + * This is nominally for TileDB-SOMA R use, since while we have + * TileDB-SOMA-R and TileDB-R co-existing, it's less desirable + * to pass ctx from one copy of core to another, and more + * desirable to pass a config map. + * + * @param uri URI to create the SOMADataFrame + * @param mode read or write + * @param name Name of the array + * @param platform_config Config parameter dictionary + * @return std::unique_ptr SOMADataFrame + */ + SOMADataFrame( + OpenMode mode, + std::string_view uri, + std::string_view name, + std::map platform_config) + : SOMAArray( + mode, + uri, + std::make_shared(platform_config), + name, + {}, + "auto", + ResultOrder::automatic) { + } + SOMADataFrame(const SOMAArray& other) : SOMAArray(other) { } diff --git a/libtiledbsoma/test/unit_soma_collection.cc b/libtiledbsoma/test/unit_soma_collection.cc index 903c9710f3..42309e1d7b 100644 --- a/libtiledbsoma/test/unit_soma_collection.cc +++ b/libtiledbsoma/test/unit_soma_collection.cc @@ -119,6 +119,7 @@ TEST_CASE("SOMACollection: add SOMADenseNDArray") { SOMACollection::create(base_uri, ctx, ts); // TODO: add support for current domain in dense arrays once we have that // support from core + // https://github.com/single-cell-data/TileDB-SOMA/issues/2955 std::vector dim_infos( {{.name = dim_name, .tiledb_datatype = tiledb_datatype, diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index b1eaa52a93..65e1a94005 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -135,9 +135,11 @@ TEST_CASE("SOMASparseNDArray: basic", "[SOMASparseNDArray]") { soma_sparse = SOMASparseNDArray::open(uri, OpenMode::write, ctx); // Without current-domain support: this should throw since // one cannot resize what has not been sized. + REQUIRE(!soma_sparse->has_current_domain()); REQUIRE_THROWS(soma_sparse->resize(new_shape)); // Now set the shape soma_sparse->upgrade_shape(new_shape); + REQUIRE(soma_sparse->has_current_domain()); // Should not fail since we're setting it to what it already is. soma_sparse->resize(new_shape); soma_sparse->close(); From 796c168ae280e73020aecbd2110d4f2f3417dda2 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Thu, 5 Sep 2024 23:24:09 -0400 Subject: [PATCH 11/11] [python] Expose tiledbsoma stats as JSON string or Python-parsed (#2958) * [python] Expose tiledbsoma stats as JSON string or Python dict * code-review feedback --- apis/python/src/tiledbsoma/__init__.py | 4 ++++ apis/python/src/tiledbsoma/pytiledbsoma.cc | 7 +++++++ apis/python/src/tiledbsoma/stats.py | 22 ++++++++++++++++++++++ apis/python/tests/test_stats.py | 14 ++++++++++++++ 4 files changed, 47 insertions(+) create mode 100644 apis/python/src/tiledbsoma/stats.py create mode 100644 apis/python/tests/test_stats.py diff --git a/apis/python/src/tiledbsoma/__init__.py b/apis/python/src/tiledbsoma/__init__.py index a9c3055e74..491eaa35c4 100644 --- a/apis/python/src/tiledbsoma/__init__.py +++ b/apis/python/src/tiledbsoma/__init__.py @@ -172,6 +172,10 @@ tiledbsoma_stats_enable, tiledbsoma_stats_reset, ) +from .stats import ( + tiledbsoma_stats_json, + tiledbsoma_stats_as_py, +) __version__ = get_implementation_version() diff --git a/apis/python/src/tiledbsoma/pytiledbsoma.cc b/apis/python/src/tiledbsoma/pytiledbsoma.cc index 82670549ab..fced2214b2 100644 --- a/apis/python/src/tiledbsoma/pytiledbsoma.cc +++ b/apis/python/src/tiledbsoma/pytiledbsoma.cc @@ -98,6 +98,13 @@ PYBIND11_MODULE(pytiledbsoma, m) { py::print(stats); }, "Print TileDB internal statistics. Lifecycle: experimental."); + m.def( + "tiledbsoma_stats_string", + []() { + std::string stats = tiledbsoma::stats::dump(); + return stats; + }, + "Print TileDB internal statistics. Lifecycle: experimental."); py::class_(m, "PlatformConfig") .def(py::init<>()) diff --git a/apis/python/src/tiledbsoma/stats.py b/apis/python/src/tiledbsoma/stats.py new file mode 100644 index 0000000000..0462753a5d --- /dev/null +++ b/apis/python/src/tiledbsoma/stats.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024 TileDB, Inc. +# +# Licensed under the MIT License. + +import json +from typing import Dict, List, Literal, Union, cast + +from .pytiledbsoma import tiledbsoma_stats_string + +ParsedStats = List[Dict[Literal["counters", "timers"], Dict[str, Union[float, int]]]] + + +def tiledbsoma_stats_json() -> str: + """Returns tiledbsoma stats as a JSON string""" + # cast is needed for pybind11 things + return cast(str, tiledbsoma_stats_string()) + + +def tiledbsoma_stats_as_py() -> ParsedStats: + """Returns tiledbsoma stats as a Python dict""" + # cast is needed for pybind11 things + return cast(ParsedStats, json.loads(tiledbsoma_stats_string())) diff --git a/apis/python/tests/test_stats.py b/apis/python/tests/test_stats.py new file mode 100644 index 0000000000..5b2061c144 --- /dev/null +++ b/apis/python/tests/test_stats.py @@ -0,0 +1,14 @@ +import tiledbsoma + + +def test_stats_json(): + out = tiledbsoma.tiledbsoma_stats_json() + assert isinstance(out, str) + assert out[0] == "[" + assert out[-2] == "]" + assert out[-1] == "\n" + + +def test_stats_as_py(): + out = tiledbsoma.tiledbsoma_stats_as_py() + assert isinstance(out, list)