From 426e6a2cc244e9478ad5803609154348cc546eed Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 30 Aug 2024 15:59:13 -0400 Subject: [PATCH 1/2] [c++] Split out a test fixture for dataframes --- libtiledbsoma/test/unit_soma_dataframe.cc | 348 +++++++++++++--------- 1 file changed, 208 insertions(+), 140 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index bd75350caa..67f4d49ac4 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -32,56 +32,183 @@ #include "common.h" -#define DIM_MAX 1000 +// This is a keystroke-reduction fixture for some similar unit-test cases +struct VariouslyIndexedDataFrameFixture { + std::shared_ptr ctx_; + std::string uri_; + bool use_current_domain_; + + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Using Catch2's TEST_CASE_METHOD we can't pass constructor args. + // This is a call-after-construction method. + void set_up(std::shared_ptr ctx, std::string uri) { + ctx_ = ctx; + uri_ = uri; + } -TEST_CASE("SOMADataFrame: basic") { - int64_t dim_max = 1000; - auto use_current_domain = GENERATE(false, true); - // TODO this could be formatted with fmt::format which is part of internal - // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be - // replaced with std::format. - std::ostringstream section; - section << "- use_current_domain=" << use_current_domain; - SECTION(section.str()) { - auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-dataframe-basic"; - std::string dim_name = "d0"; - std::string attr_name = "a0"; - tiledb_datatype_t tiledb_datatype = TILEDB_INT64; - std::string arrow_format = helper::to_arrow_format(tiledb_datatype); + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Helpers for setting up dim/attr configs and data + int64_t i64_dim_max = 100; + int64_t u32_dim_max = 10000; + int64_t str_dim_max = 0; // not used for string dims + + std::string i64_name = "soma_joinid"; + std::string u32_name = "myuint32"; + std::string str_name = "mystring"; + + tiledb_datatype_t i64_datatype = TILEDB_INT64; + tiledb_datatype_t u32_datatype = TILEDB_UINT32; + tiledb_datatype_t str_datatype = TILEDB_STRING_ASCII; + + std::string i64_arrow_format = helper::to_arrow_format(i64_datatype); + std::string u32_arrow_format = helper::to_arrow_format(u32_datatype); + std::string attr_1_arrow_format = helper::to_arrow_format(str_datatype); + + helper::DimInfo i64_dim_info(bool use_current_domain) { + return helper::DimInfo( + {.name = i64_name, + .tiledb_datatype = i64_datatype, + .dim_max = i64_dim_max, + .use_current_domain = use_current_domain}); + } + helper::DimInfo u32_dim_info(bool use_current_domain) { + return helper::DimInfo( + {.name = u32_name, + .tiledb_datatype = u32_datatype, + .dim_max = u32_dim_max, + .use_current_domain = use_current_domain}); + } + helper::DimInfo str_dim_info(bool use_current_domain) { + return helper::DimInfo( + {.name = str_name, + .tiledb_datatype = str_datatype, + .dim_max = str_dim_max, + .use_current_domain = use_current_domain}); + } - REQUIRE(!SOMADataFrame::exists(uri, ctx)); + helper::AttrInfo i64_attr_info() { + return helper::AttrInfo( + {.name = i64_name, .tiledb_datatype = i64_datatype}); + } + helper::AttrInfo u32_attr_info() { + return helper::AttrInfo( + {.name = u32_name, .tiledb_datatype = u32_datatype}); + } + helper::AttrInfo str_attr_info() { + return helper::AttrInfo( + {.name = str_name, .tiledb_datatype = str_datatype}); + } - std::vector dim_infos( - {{.name = dim_name, - .tiledb_datatype = tiledb_datatype, - .dim_max = dim_max, - .use_current_domain = use_current_domain}}); + std::vector make_i64_data() { + return std::vector({1, 2}); + } + std::vector make_u32_data() { + return std::vector({1234, 5678}); + } + std::vector make_str_data() { + return std::vector({"apple", "bat"}); + } - std::vector attr_infos( - {{.name = attr_name, .tiledb_datatype = tiledb_datatype}}); + // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + // Helper methods for create/open/write/etc. + void create( + const std::vector& dim_infos, + const std::vector& attr_infos) { auto [schema, index_columns] = helper::create_arrow_schema_and_index_columns( dim_infos, attr_infos); + SOMADataFrame::create( + uri_, + std::move(schema), + ArrowTable( + std::move(index_columns.first), + std::move(index_columns.second)), + ctx_); + } + void create( + const std::vector& dim_infos, + const std::vector& attr_infos, + const PlatformConfig& platform_config, + std::optional timestamp_range = std::nullopt) { + auto [schema, index_columns] = + helper::create_arrow_schema_and_index_columns( + dim_infos, attr_infos); SOMADataFrame::create( - uri, + uri_, std::move(schema), ArrowTable( std::move(index_columns.first), std::move(index_columns.second)), - ctx); + ctx_, + platform_config, + timestamp_range); + } - REQUIRE(SOMADataFrame::exists(uri, ctx)); - REQUIRE(!SOMASparseNDArray::exists(uri, ctx)); - REQUIRE(!SOMADenseNDArray::exists(uri, ctx)); + std::unique_ptr open( + OpenMode mode, + ResultOrder result_order = ResultOrder::automatic, + std::optional timestamp_range = std::nullopt) { + return SOMADataFrame::open( + uri_, + mode, + ctx_, + {}, // column_names + result_order, + timestamp_range); + } - auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); - REQUIRE(soma_dataframe->uri() == uri); - REQUIRE(soma_dataframe->ctx() == ctx); + void write_generic_data() { + // No arguments -- for now. + // In a subsequent PR we'll vary writing in-bounds vs out of bounds. + auto soma_dataframe = SOMADataFrame::open(uri_, OpenMode::write, ctx_); + + auto i64_data = make_i64_data(); + auto u32_data = make_u32_data(); + auto str_data = make_str_data(); + + soma_dataframe->set_column_data( + i64_name, i64_data.size(), i64_data.data()); + soma_dataframe->set_column_data( + str_name, str_data.size(), str_data.data()); + soma_dataframe->set_column_data( + u32_name, u32_data.size(), u32_data.data()); + soma_dataframe->write(); + + soma_dataframe->close(); + } +}; + +TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: basic") { + auto use_current_domain = GENERATE(false, true); + // TODO this could be formatted with fmt::format which is part of internal + // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be + // replaced with std::format. + std::ostringstream section; + section << "- use_current_domain=" << use_current_domain; + SECTION(section.str()) { + set_up( + std::make_shared(), "mem://unit-test-dataframe-basic"); + + std::vector dim_infos( + {i64_dim_info(use_current_domain)}); + std::vector attr_infos({u32_attr_info()}); + + REQUIRE(!SOMADataFrame::exists(uri_, ctx_)); + + create(dim_infos, attr_infos); + + REQUIRE(SOMADataFrame::exists(uri_, ctx_)); + REQUIRE(!SOMASparseNDArray::exists(uri_, ctx_)); + REQUIRE(!SOMADenseNDArray::exists(uri_, ctx_)); + + auto soma_dataframe = open(OpenMode::read); + REQUIRE(soma_dataframe->uri() == uri_); + REQUIRE(soma_dataframe->ctx() == ctx_); REQUIRE(soma_dataframe->type() == "SOMADataFrame"); - std::vector expected_index_column_names = {dim_name}; + std::vector expected_index_column_names = { + dim_infos[0].name}; REQUIRE( soma_dataframe->index_column_names() == expected_index_column_names); @@ -93,30 +220,33 @@ TEST_CASE("SOMADataFrame: basic") { d0[j] = j; std::vector a0(10, 1); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); - soma_dataframe->set_column_data(attr_name, a0.size(), a0.data()); - soma_dataframe->set_column_data(dim_name, d0.size(), d0.data()); + soma_dataframe = open(OpenMode::write); + soma_dataframe->set_column_data( + dim_infos[0].name, d0.size(), d0.data()); + soma_dataframe->set_column_data( + attr_infos[0].name, a0.size(), a0.data()); soma_dataframe->write(); soma_dataframe->close(); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); + soma_dataframe = open(OpenMode::read); while (auto batch = soma_dataframe->read_next()) { auto arrbuf = batch.value(); - auto d0span = arrbuf->at(dim_name)->data(); - auto a0span = arrbuf->at(attr_name)->data(); + auto d0span = arrbuf->at(dim_infos[0].name)->data(); + auto a0span = arrbuf->at(attr_infos[0].name)->data(); REQUIRE(d0 == std::vector(d0span.begin(), d0span.end())); REQUIRE(a0 == std::vector(a0span.begin(), a0span.end())); } soma_dataframe->close(); - auto soma_object = SOMAObject::open(uri, OpenMode::read, ctx); - REQUIRE(soma_object->uri() == uri); + auto soma_object = SOMAObject::open(uri_, OpenMode::read, ctx_); + REQUIRE(soma_object->uri() == uri_); REQUIRE(soma_object->type() == "SOMADataFrame"); soma_object->close(); } } -TEST_CASE("SOMADataFrame: platform_config") { +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, "SOMADataFrame: platform_config") { std::pair filter = GENERATE( std::make_pair( R"({"name": "GZIP", "COMPRESSION_LEVEL": 3})", TILEDB_FILTER_GZIP), @@ -158,26 +288,20 @@ TEST_CASE("SOMADataFrame: platform_config") { std::make_pair(R"("BYTESHUFFLE")", TILEDB_FILTER_BYTESHUFFLE), std::make_pair(R"("NOOP")", TILEDB_FILTER_NONE)); - // TODO this use to be formatted with fmt::format which is part of internal + // TODO this used to be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be // replaced with std::format. std::ostringstream section; section << "- filter=" << filter.first; SECTION(section.str()) { - int64_t dim_max = 1000; auto use_current_domain = GENERATE(false, true); - // TODO this could be formatted with fmt::format which is part of - // internal header spd/log/fmt/fmt.h and should not be used. In C++20, - // this can be replaced with std::format. std::ostringstream section2; section2 << "- use_current_domain=" << use_current_domain; SECTION(section2.str()) { - auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-dataframe-platform-config"; - std::string dim_name = "d0"; - std::string attr_name = "a0"; - tiledb_datatype_t tiledb_datatype = TILEDB_INT64; + set_up( + std::make_shared(), + "mem://unit-test-dataframe-platform-config"); PlatformConfig platform_config; platform_config.dataframe_dim_zstd_level = 6; @@ -189,28 +313,14 @@ TEST_CASE("SOMADataFrame: platform_config") { } std::vector dim_infos( - {{.name = dim_name, - .tiledb_datatype = tiledb_datatype, - .dim_max = dim_max, - .use_current_domain = use_current_domain}}); - - std::vector attr_infos( - {{.name = attr_name, .tiledb_datatype = tiledb_datatype}}); - - auto [schema, index_columns] = - helper::create_arrow_schema_and_index_columns( - dim_infos, attr_infos); - - SOMADataFrame::create( - uri, - std::move(schema), - ArrowTable( - std::move(index_columns.first), - std::move(index_columns.second)), - ctx, - platform_config); - - auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::read, ctx); + {i64_dim_info(use_current_domain)}); + std::vector attr_infos({u32_attr_info()}); + + REQUIRE(!SOMADataFrame::exists(uri_, ctx_)); + + create(dim_infos, attr_infos, platform_config); + + auto soma_dataframe = open(OpenMode::read); auto sch = soma_dataframe->tiledb_schema(); REQUIRE( sch->offsets_filter_list().filter(0).filter_type() == @@ -221,7 +331,7 @@ TEST_CASE("SOMADataFrame: platform_config") { filter.second); auto dim_filter = sch->domain() - .dimension(dim_name) + .dimension(dim_infos[0].name) .filter_list() .filter(0); REQUIRE(dim_filter.filter_type() == TILEDB_FILTER_ZSTD); @@ -230,7 +340,7 @@ TEST_CASE("SOMADataFrame: platform_config") { if (filter.second != TILEDB_FILTER_WEBP) { REQUIRE( - sch->attribute(attr_name) + sch->attribute(attr_infos[0].name) .filter_list() .filter(0) .filter_type() == filter.second); @@ -240,8 +350,7 @@ TEST_CASE("SOMADataFrame: platform_config") { } } -TEST_CASE("SOMADataFrame: metadata") { - int64_t dim_max = 1000; +TEST_CASE_METHOD(VariouslyIndexedDataFrameFixture, "SOMADataFrame: metadata") { auto use_current_domain = GENERATE(false, true); // TODO this could be formatted with fmt::format which is part of internal // header spd/log/fmt/fmt.h and should not be used. In C++20, this can be @@ -249,43 +358,18 @@ TEST_CASE("SOMADataFrame: metadata") { std::ostringstream section; section << "- use_current_domain=" << use_current_domain; SECTION(section.str()) { - auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-collection"; - std::string dim_name = "d0"; - std::string attr_name = "a0"; - tiledb_datatype_t tiledb_datatype = TILEDB_INT64; - std::string arrow_format = helper::to_arrow_format(tiledb_datatype); + set_up(std::make_shared(), "mem://unit-test-collection"); std::vector dim_infos( - {{.name = dim_name, - .tiledb_datatype = tiledb_datatype, - .dim_max = dim_max, - .use_current_domain = use_current_domain}}); + {i64_dim_info(use_current_domain)}); + std::vector attr_infos({u32_attr_info()}); - std::vector attr_infos( - {{.name = attr_name, .tiledb_datatype = tiledb_datatype}}); + REQUIRE(!SOMADataFrame::exists(uri_, ctx_)); - auto [schema, index_columns] = - helper::create_arrow_schema_and_index_columns( - dim_infos, attr_infos); + create(dim_infos, attr_infos, PlatformConfig(), TimestampRange(0, 2)); - SOMADataFrame::create( - uri, - std::move(schema), - ArrowTable( - std::move(index_columns.first), - std::move(index_columns.second)), - ctx, - PlatformConfig(), - TimestampRange(0, 2)); - - auto soma_dataframe = SOMADataFrame::open( - uri, - OpenMode::write, - ctx, - {}, - ResultOrder::automatic, - TimestampRange(1, 1)); + auto soma_dataframe = open( + OpenMode::write, ResultOrder::automatic, TimestampRange(1, 1)); int32_t val = 100; soma_dataframe->set_metadata("md", TILEDB_INT32, 1, &val); @@ -336,54 +420,38 @@ TEST_CASE("SOMADataFrame: metadata") { } } -TEST_CASE("SOMADataFrame: bounds-checking") { +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, "SOMADataFrame: bounds-checking") { bool use_current_domain = true; int old_max = 100; int new_max = 200; - auto ctx = std::make_shared(); - std::string uri = "mem://unit-test-bounds-checking"; - std::string dim_name = "d0"; - std::string attr_name = "a0"; - tiledb_datatype_t tiledb_datatype = TILEDB_INT64; - std::string arrow_format = helper::to_arrow_format(tiledb_datatype); - - std::vector dim_infos( - {{.name = dim_name, - .tiledb_datatype = tiledb_datatype, - .dim_max = old_max, - .use_current_domain = use_current_domain}}); + set_up(std::make_shared(), "mem://unit-test-bounds-checking"); - std::vector attr_infos( - {{.name = attr_name, .tiledb_datatype = tiledb_datatype}}); + std::vector dim_infos({i64_dim_info(use_current_domain)}); + std::vector attr_infos({u32_attr_info()}); - auto [schema, index_columns] = - helper::create_arrow_schema_and_index_columns(dim_infos, attr_infos); + REQUIRE(!SOMADataFrame::exists(uri_, ctx_)); - SOMADataFrame::create( - uri, - std::move(schema), - ArrowTable( - std::move(index_columns.first), std::move(index_columns.second)), - ctx); + create(dim_infos, attr_infos); - auto soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); + auto soma_dataframe = open(OpenMode::write); std::vector d0({old_max + 1, old_max + 2}); std::vector a0({1.5, 2.5}); - soma_dataframe->set_column_data(dim_name, d0.size(), d0.data()); - soma_dataframe->set_column_data(attr_name, a0.size(), a0.data()); + soma_dataframe->set_column_data(dim_infos[0].name, d0.size(), d0.data()); + soma_dataframe->set_column_data(attr_infos[0].name, a0.size(), a0.data()); // Writing outside the current domain should fail REQUIRE_THROWS(soma_dataframe->write()); soma_dataframe->close(); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); + soma_dataframe = open(OpenMode::write); soma_dataframe->resize(std::vector({new_max})); soma_dataframe->close(); - soma_dataframe = SOMADataFrame::open(uri, OpenMode::write, ctx); - soma_dataframe->set_column_data(dim_name, d0.size(), d0.data()); - soma_dataframe->set_column_data(attr_name, a0.size(), a0.data()); + soma_dataframe = open(OpenMode::write); + soma_dataframe->set_column_data(dim_infos[0].name, d0.size(), d0.data()); + soma_dataframe->set_column_data(attr_infos[0].name, a0.size(), a0.data()); // Writing after resize should succeed soma_dataframe->write(); From 1bb4f6978025a21d5dced4fc9f3ca8cc2466955a Mon Sep 17 00:00:00 2001 From: John Kerl Date: Fri, 30 Aug 2024 17:04:56 -0400 Subject: [PATCH 2/2] fix a failure case --- libtiledbsoma/test/unit_soma_dataframe.cc | 24 +++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index 67f4d49ac4..f56a68cb94 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -32,7 +32,11 @@ #include "common.h" -// This is a keystroke-reduction fixture for some similar unit-test cases +// This is a keystroke-reduction fixture for some similar unit-test cases For +// convenience there are dims/attrs of type int64, uint32, and string. (Feel +// free to add more types.) The main value-adds of this fixture are (a) simple +// keystroke-reduction; (b) you get to pick which ones are the dim(s) and which +// are the attr(s). struct VariouslyIndexedDataFrameFixture { std::shared_ptr ctx_; std::string uri_; @@ -48,13 +52,13 @@ struct VariouslyIndexedDataFrameFixture { // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // Helpers for setting up dim/attr configs and data - int64_t i64_dim_max = 100; - int64_t u32_dim_max = 10000; - int64_t str_dim_max = 0; // not used for string dims + static const inline int64_t i64_dim_max = 100; + static const inline int64_t u32_dim_max = 10000; + static const inline int64_t str_dim_max = 0; // not used for string dims - std::string i64_name = "soma_joinid"; - std::string u32_name = "myuint32"; - std::string str_name = "mystring"; + static const inline std::string i64_name = "soma_joinid"; + static const inline std::string u32_name = "myuint32"; + static const inline std::string str_name = "mystring"; tiledb_datatype_t i64_datatype = TILEDB_INT64; tiledb_datatype_t u32_datatype = TILEDB_UINT32; @@ -86,9 +90,9 @@ struct VariouslyIndexedDataFrameFixture { .use_current_domain = use_current_domain}); } - helper::AttrInfo i64_attr_info() { + helper::AttrInfo i64_attr_info(std::string name = i64_name) { return helper::AttrInfo( - {.name = i64_name, .tiledb_datatype = i64_datatype}); + {.name = name, .tiledb_datatype = i64_datatype}); } helper::AttrInfo u32_attr_info() { return helper::AttrInfo( @@ -314,7 +318,7 @@ TEST_CASE_METHOD( std::vector dim_infos( {i64_dim_info(use_current_domain)}); - std::vector attr_infos({u32_attr_info()}); + std::vector attr_infos({i64_attr_info("a0")}); REQUIRE(!SOMADataFrame::exists(uri_, ctx_));