From a08bd0c914dc6d49afaa5fdd6654a61131902f33 Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Fri, 24 Nov 2023 09:42:01 -0500 Subject: [PATCH 1/4] Using C++23 to reimplement sort by using zip --- CMakeLists.txt | 2 +- include/DataFrame/DataFrameTypes.h | 4 +- include/DataFrame/Internals/DataFrame.tcc | 531 ++++++++++-------- .../DataFrame/Internals/DataFrame_functors.h | 3 +- .../DataFrame/Internals/DataFrame_misc.tcc | 3 +- .../Internals/DataFrame_standalone.tcc | 31 +- src/Makefile.Linux.GCC64 | 2 +- src/Makefile.Linux.GCC64D | 4 +- 8 files changed, 320 insertions(+), 260 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bfea8b352..1737fb05c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ add_library(DataFrame::DataFrame ALIAS DataFrame) target_sources(DataFrame PRIVATE src/Utils/DateTime.cc) -target_compile_features(DataFrame PUBLIC cxx_std_20) +target_compile_features(DataFrame PUBLIC cxx_std_23) target_compile_definitions( DataFrame PRIVATE $<$:HMDF_HAVE_CLOCK_GETTIME> diff --git a/include/DataFrame/DataFrameTypes.h b/include/DataFrame/DataFrameTypes.h index 7e2977ff2..f4fe86f73 100644 --- a/include/DataFrame/DataFrameTypes.h +++ b/include/DataFrame/DataFrameTypes.h @@ -579,10 +579,10 @@ struct RandGenParams { std::size_t t_dist { 1 }; // The μ distribution parameter (the mean of the distribution) // - double mean { 1.0 }; + double mean { 0 }; // the σ distribution parameter (standard deviation) // - double std { 0 }; + double std { 1 }; // The λ distribution parameter (the rate parameter) // double lambda { 1.0 }; diff --git a/include/DataFrame/Internals/DataFrame.tcc b/include/DataFrame/Internals/DataFrame.tcc index f88f398e6..a427a9d53 100644 --- a/include/DataFrame/Internals/DataFrame.tcc +++ b/include/DataFrame/Internals/DataFrame.tcc @@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include // ---------------------------------------------------------------------------- @@ -124,7 +125,7 @@ void DataFrame:: sort_common_(DataFrame &df, CF &&comp_func, bool ignore_index) { const size_type idx_s = df.indices_.size(); - StlVecType sorting_idxs(idx_s, 0); + StlVecType sorting_idxs(idx_s); std::iota(sorting_idxs.begin(), sorting_idxs.end(), 0); std::sort(sorting_idxs.begin(), sorting_idxs.end(), comp_func); @@ -691,81 +692,67 @@ sort(const char *name, sort_spec dir, bool ignore_index) { make_consistent(); - const SpinGuard guard (lock_); + ColumnVecType *vec { nullptr}; + const SpinGuard guard (lock_); if (! ::strcmp(name, DF_INDEX_COL_NAME)) { - const auto &idx_vec = get_index(); - auto a = - [&idx_vec](size_type i, size_type j) -> bool { - return (idx_vec[i] < idx_vec[j]); - }; - auto d = - [&idx_vec](size_type i, size_type j) -> bool { - return (idx_vec[i] > idx_vec[j]); - }; - auto aa = - [&idx_vec](size_type i, size_type j) -> bool { - return (abs__(idx_vec[i]) < abs__(idx_vec[j])); - }; - auto ad = - [&idx_vec](size_type i, size_type j) -> bool { - return (abs__(idx_vec[i]) > abs__(idx_vec[j])); - }; - - if (dir == sort_spec::ascen) - sort_common_(*this, - std::move(a), - ignore_index); - else if (dir == sort_spec::desce) - sort_common_(*this, - std::move(d), - ignore_index); - else if (dir == sort_spec::abs_ascen) - sort_common_(*this, - std::move(aa), - ignore_index); - else if (dir == sort_spec::abs_desce) - sort_common_(*this, - std::move(ad), - ignore_index); + vec = reinterpret_cast *>(&indices_); + ignore_index = true; } - else { - const auto &col_vec = get_column(name); - auto a = - [&col_vec](size_type i, size_type j) -> bool { - return (col_vec[i] < col_vec[j]); - }; - auto d = - [&col_vec](size_type i, size_type j) -> bool { - return (col_vec[i] > col_vec[j]); - }; - auto aa = - [&col_vec](size_type i, size_type j) -> bool { - return (abs__(col_vec[i]) < abs__(col_vec[j])); - }; - auto ad = - [&col_vec](size_type i, size_type j) -> bool { - return (abs__(col_vec[i]) > abs__(col_vec[j])); - }; - - if (dir == sort_spec::ascen) [[likely]] - sort_common_(*this, - std::move(a), - ignore_index); - else if (dir == sort_spec::desce) - sort_common_(*this, - std::move(d), - ignore_index); - else if (dir == sort_spec::abs_ascen) - sort_common_(*this, - std::move(aa), - ignore_index); - else if (dir == sort_spec::abs_desce) - sort_common_(*this, - std::move(ad), - ignore_index); + else + vec = &(get_column(name, false)); + + auto a = [](const auto &lhs, const auto &rhs) -> bool { + return (std::get<0>(lhs) < std::get<0>(rhs)); + }; + auto d = [](const auto &lhs, const auto &rhs) -> bool { + return (std::get<0>(lhs) > std::get<0>(rhs)); + }; + auto aa = [](const auto &lhs, const auto &rhs) -> bool { + return (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))); + }; + auto ad = [](const auto &lhs, const auto &rhs) -> bool { + return (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))); + }; + + const size_type idx_s = indices_.size(); + StlVecType sorting_idxs(idx_s); + + std::iota(sorting_idxs.begin(), sorting_idxs.end(), 0); + + auto zip = std::ranges::views::zip(*vec, sorting_idxs); + auto zip_idx = std::ranges::views::zip(*vec, indices_, sorting_idxs); + + if (dir == sort_spec::ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, a); + else + std::ranges::sort(zip, a); + } + else if (dir == sort_spec::desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, d); + else + std::ranges::sort(zip, d); + } + else if (dir == sort_spec::abs_ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, aa); + else + std::ranges::sort(zip, aa); } + else if (dir == sort_spec::abs_desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, ad); + else + std::ranges::sort(zip, ad); + } + + sort_functor_ functor (sorting_idxs, idx_s); + for (const auto &citer : column_list_) [[likely]] + if (citer.first != name) + data_[citer.second].change(functor); return; } @@ -774,219 +761,270 @@ sort(const char *name, sort_spec dir, bool ignore_index) { template template void DataFrame:: -sort(const char *name1, sort_spec dir1, const char *name2, sort_spec dir2, +sort(const char *name1, sort_spec dir1, + const char *name2, sort_spec dir2, bool ignore_index) { make_consistent(); - const ColumnVecType *vec1 { nullptr}; - const ColumnVecType *vec2 { nullptr}; - const SpinGuard guard (lock_); + ColumnVecType *vec1 { nullptr}; + ColumnVecType *vec2 { nullptr}; + const SpinGuard guard (lock_); - if (! ::strcmp(name1, DF_INDEX_COL_NAME)) + if (! ::strcmp(name1, DF_INDEX_COL_NAME)) { vec1 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec1 = &(get_column(name1, false)); - if (! ::strcmp(name2, DF_INDEX_COL_NAME)) + if (! ::strcmp(name2, DF_INDEX_COL_NAME)) { vec2 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec2 = &(get_column(name2, false)); auto a_a = - [vec1, vec2](size_type i, size_type j) -> bool { - if (vec1->at(i) < vec1->at(j)) + [](const auto &lhs, const auto &rhs) -> bool { + if (std::get<0>(lhs) < std::get<0>(rhs)) return (true); - else if (vec1->at(i) > vec1->at(j)) + else if (std::get<0>(lhs) > std::get<0>(rhs)) return (false); - return (vec2->at(i) < vec2->at(j)); + return (std::get<1>(lhs) < std::get<1>(rhs)); }; auto d_d = - [vec1, vec2](size_type i, size_type j) -> bool { - if (vec1->at(i) > vec1->at(j)) + [](const auto &lhs, const auto &rhs) -> bool { + if (std::get<0>(lhs) > std::get<0>(rhs)) return (true); - else if (vec1->at(i) < vec1->at(j)) + else if (std::get<0>(lhs) < std::get<0>(rhs)) return (false); - return (vec2->at(i) > vec2->at(j)); + return (std::get<1>(lhs) > std::get<1>(rhs)); }; auto a_d = - [vec1, vec2](size_type i, size_type j) -> bool { - if (vec1->at(i) < vec1->at(j)) + [](const auto &lhs, const auto &rhs) -> bool { + if (std::get<0>(lhs) < std::get<0>(rhs)) return (true); - else if (vec1->at(i) > vec1->at(j)) + else if (std::get<0>(lhs) > std::get<0>(rhs)) return (false); - return (vec2->at(i) > vec2->at(j)); + return (std::get<1>(lhs) > std::get<1>(rhs)); }; auto d_a = - [vec1, vec2](size_type i, size_type j) -> bool { - if (vec1->at(i) > vec1->at(j)) + [](const auto &lhs, const auto &rhs) -> bool { + if (std::get<0>(lhs) > std::get<0>(rhs)) return (true); - else if (vec1->at(i) < vec1->at(j)) + else if (std::get<0>(lhs) < std::get<0>(rhs)) return (false); - return (vec2->at(i) < vec2->at(j)); + return (std::get<1>(lhs) < std::get<1>(rhs)); }; auto aa_aa = - [vec1, vec2](size_type i, size_type j) -> bool { - if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + [](const auto &lhs, const auto &rhs) -> bool { + if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (false); - return (abs__(vec2->at(i)) < abs__(vec2->at(j))); + return (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))); }; auto ad_ad = - [vec1, vec2](size_type i, size_type j) -> bool { - if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + [](const auto &lhs, const auto &rhs) -> bool { + if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (false); - return (abs__(vec2->at(i)) > abs__(vec2->at(j))); + return (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))); }; auto aa_ad = - [vec1, vec2](size_type i, size_type j) -> bool { - if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + [](const auto &lhs, const auto &rhs) -> bool { + if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (false); - return (abs__(vec2->at(i)) > abs__(vec2->at(j))); + return (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))); }; auto ad_aa = - [vec1, vec2](size_type i, size_type j) -> bool { - if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + [](const auto &lhs, const auto &rhs) -> bool { + if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (false); - return (abs__(vec2->at(i)) < abs__(vec2->at(j))); + return (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))); }; auto a_aa = - [vec1, vec2](size_type i, size_type j) -> bool { - if (vec1->at(i) < vec1->at(j)) + [](const auto &lhs, const auto &rhs) -> bool { + if (std::get<0>(lhs) < std::get<0>(rhs)) return (true); - else if (vec1->at(i) > vec1->at(j)) + else if (std::get<0>(lhs) > std::get<0>(rhs)) return (false); - return (abs__(vec2->at(i)) < abs__(vec2->at(j))); + return (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))); }; auto a_ad = - [vec1, vec2](size_type i, size_type j) -> bool { - if (vec1->at(i) < vec1->at(j)) + [](const auto &lhs, const auto &rhs) -> bool { + if (std::get<0>(lhs) < std::get<0>(rhs)) return (true); - else if (vec1->at(i) > vec1->at(j)) + else if (std::get<0>(lhs) > std::get<0>(rhs)) return (false); - return (abs__(vec2->at(i)) > abs__(vec2->at(j))); + return (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))); }; auto d_aa = - [vec1, vec2](size_type i, size_type j) -> bool { - if (vec1->at(i) > vec1->at(j)) + [](const auto &lhs, const auto &rhs) -> bool { + if (std::get<0>(lhs) > std::get<0>(rhs)) return (true); - else if (vec1->at(i) < vec1->at(j)) + else if (std::get<0>(lhs) < std::get<0>(rhs)) return (false); - return (abs__(vec2->at(i)) < abs__(vec2->at(j))); + return (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))); }; auto d_ad = - [vec1, vec2](size_type i, size_type j) -> bool { - if (vec1->at(i) > vec1->at(j)) + [](const auto &lhs, const auto &rhs) -> bool { + if (std::get<0>(lhs) > std::get<0>(rhs)) return (true); - else if (vec1->at(i) < vec1->at(j)) + else if (std::get<0>(lhs) < std::get<0>(rhs)) return (false); - return (abs__(vec2->at(i)) > abs__(vec2->at(j))); + return (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))); }; auto aa_a = - [vec1, vec2](size_type i, size_type j) -> bool { - if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + [](const auto &lhs, const auto &rhs) -> bool { + if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (false); - return (vec2->at(i) < vec2->at(j)); + return (std::get<1>(lhs) < std::get<1>(rhs)); }; auto ad_a = - [vec1, vec2](size_type i, size_type j) -> bool { - if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + [](const auto &lhs, const auto &rhs) -> bool { + if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (false); - return (vec2->at(i) < vec2->at(j)); + return (std::get<1>(lhs) < std::get<1>(rhs)); }; auto aa_d = - [vec1, vec2](size_type i, size_type j) -> bool { - if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + [](const auto &lhs, const auto &rhs) -> bool { + if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (false); - return (vec2->at(i) > vec2->at(j)); + return (std::get<1>(lhs) > std::get<1>(rhs)); }; auto ad_d = - [vec1, vec2](size_type i, size_type j) -> bool { - if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + [](const auto &lhs, const auto &rhs) -> bool { + if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (false); - return (vec2->at(i) > vec2->at(j)); + return (std::get<1>(lhs) > std::get<1>(rhs)); }; + const size_type idx_s = indices_.size(); + StlVecType sorting_idxs(idx_s); + + std::iota(sorting_idxs.begin(), sorting_idxs.end(), 0); + + auto zip = std::ranges::views::zip(*vec1, *vec2, sorting_idxs); + auto zip_idx = + std::ranges::views::zip(*vec1, *vec2, indices_, sorting_idxs); + + if (dir1 == sort_spec::ascen && dir2 == sort_spec::ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, a_a); + else + std::ranges::sort(zip, a_a); + } + else if (dir1 == sort_spec::desce && dir2 == sort_spec::desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, d_d); + else + std::ranges::sort(zip, d_d); + } + else if (dir1 == sort_spec::ascen && dir2 == sort_spec::desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, a_d); + else + std::ranges::sort(zip, a_d); + } + else if (dir1 == sort_spec::desce && dir2 == sort_spec::ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, d_a); + else + std::ranges::sort(zip, d_a); + } + else if (dir1 == sort_spec::abs_ascen && dir2 == sort_spec::abs_ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, aa_aa); + else + std::ranges::sort(zip, aa_aa); + } + else if (dir1 == sort_spec::abs_desce && dir2 == sort_spec::abs_desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, ad_ad); + else + std::ranges::sort(zip, ad_ad); + } + else if (dir1 == sort_spec::abs_ascen && dir2 == sort_spec::abs_desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, aa_ad); + else + std::ranges::sort(zip, aa_ad); + } + else if (dir1 == sort_spec::abs_desce && dir2 == sort_spec::abs_ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, ad_aa); + else + std::ranges::sort(zip, ad_aa); + } + else if (dir1 == sort_spec::ascen && dir2 == sort_spec::abs_ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, a_aa); + else + std::ranges::sort(zip, a_aa); + } + else if (dir1 == sort_spec::ascen && dir2 == sort_spec::abs_desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, a_ad); + else + std::ranges::sort(zip, a_ad); + } + else if (dir1 == sort_spec::desce && dir2 == sort_spec::abs_ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, d_aa); + else + std::ranges::sort(zip, d_aa); + } + else if (dir1 == sort_spec::desce && dir2 == sort_spec::abs_desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, d_ad); + else + std::ranges::sort(zip, d_ad); + } + else if (dir1 == sort_spec::abs_ascen && dir2 == sort_spec::ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, aa_a); + else + std::ranges::sort(zip, aa_a); + } + else if (dir1 == sort_spec::abs_desce && dir2 == sort_spec::ascen) { + if (! ignore_index) + std::ranges::sort(zip_idx, ad_a); + else + std::ranges::sort(zip, ad_a); + } + else if (dir1 == sort_spec::abs_ascen && dir2 == sort_spec::desce) { + if (! ignore_index) + std::ranges::sort(zip_idx, aa_d); + else + std::ranges::sort(zip, aa_d); + } + else { // dir1 == sort_spec::abs_desce && dir2 == sort_spec::desce + if (! ignore_index) + std::ranges::sort(zip_idx, ad_d); + else + std::ranges::sort(zip, ad_d); + } - if (dir1 == sort_spec::ascen && dir2 == sort_spec::ascen) - sort_common_(*this, - std::move(a_a), - ignore_index); - else if (dir1 == sort_spec::desce && dir2 == sort_spec::desce) - sort_common_(*this, - std::move(d_d), - ignore_index); - else if (dir1 == sort_spec::ascen && dir2 == sort_spec::desce) - sort_common_(*this, - std::move(a_d), - ignore_index); - else if (dir1 == sort_spec::desce && dir2 == sort_spec::ascen) - sort_common_(*this, - std::move(d_a), - ignore_index); - else if (dir1 == sort_spec::abs_ascen && dir2 == sort_spec::abs_ascen) - sort_common_(*this, - std::move(aa_aa), - ignore_index); - else if (dir1 == sort_spec::abs_desce && dir2 == sort_spec::abs_desce) - sort_common_(*this, - std::move(ad_ad), - ignore_index); - else if (dir1 == sort_spec::abs_ascen && dir2 == sort_spec::abs_desce) - sort_common_(*this, - std::move(aa_ad), - ignore_index); - else if (dir1 == sort_spec::abs_desce && dir2 == sort_spec::abs_ascen) - sort_common_(*this, - std::move(ad_aa), - ignore_index); - else if (dir1 == sort_spec::ascen && dir2 == sort_spec::abs_ascen) - sort_common_(*this, - std::move(a_aa), - ignore_index); - else if (dir1 == sort_spec::ascen && dir2 == sort_spec::abs_desce) - sort_common_(*this, - std::move(a_ad), - ignore_index); - else if (dir1 == sort_spec::desce && dir2 == sort_spec::abs_ascen) - sort_common_(*this, - std::move(d_aa), - ignore_index); - else if (dir1 == sort_spec::desce && dir2 == sort_spec::abs_desce) - sort_common_(*this, - std::move(d_ad), - ignore_index); - else if (dir1 == sort_spec::abs_ascen && dir2 == sort_spec::ascen) - sort_common_(*this, - std::move(aa_a), - ignore_index); - else if (dir1 == sort_spec::abs_desce && dir2 == sort_spec::ascen) - sort_common_(*this, - std::move(ad_a), - ignore_index); - else if (dir1 == sort_spec::abs_ascen && dir2 == sort_spec::desce) - sort_common_(*this, - std::move(aa_d), - ignore_index); - else // dir1 == sort_spec::abs_desce && dir2 == sort_spec::desce - sort_common_(*this, - std::move(ad_d), - ignore_index); + sort_functor_ functor (sorting_idxs, idx_s); + + for (const auto &citer : column_list_) [[likely]] + if (citer.first != name1 && citer.first != name2) + data_[citer.second].change(functor); return; } @@ -1002,90 +1040,113 @@ sort(const char *name1, sort_spec dir1, make_consistent(); - const ColumnVecType *vec1 { nullptr}; - const ColumnVecType *vec2 { nullptr}; - const ColumnVecType *vec3 { nullptr}; - const SpinGuard guard (lock_); + ColumnVecType *vec1 { nullptr}; + ColumnVecType *vec2 { nullptr}; + ColumnVecType *vec3 { nullptr}; + const SpinGuard guard (lock_); - if (! ::strcmp(name1, DF_INDEX_COL_NAME)) + if (! ::strcmp(name1, DF_INDEX_COL_NAME)) { vec1 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec1 = &(get_column(name1, false)); - if (! ::strcmp(name2, DF_INDEX_COL_NAME)) + if (! ::strcmp(name2, DF_INDEX_COL_NAME)) { vec2 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec2 = &(get_column(name2, false)); - if (! ::strcmp(name3, DF_INDEX_COL_NAME)) + if (! ::strcmp(name3, DF_INDEX_COL_NAME)) { vec3 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec3 = &(get_column(name3, false)); auto cf = - [vec1, vec2, vec3, dir1, dir2, dir3] - (size_type i, size_type j) -> bool { + [dir1, dir2, dir3](const auto &lhs, const auto &rhs) -> bool { if (dir1 == sort_spec::ascen) { - if (vec1->at(i) < vec1->at(j)) + if (std::get<0>(lhs) < std::get<0>(rhs)) return (true); - else if (vec1->at(i) > vec1->at(j)) + else if (std::get<0>(lhs) > std::get<0>(rhs)) return (false); } else if (dir1 == sort_spec::desce) { - if (vec1->at(i) > vec1->at(j)) + if (std::get<0>(lhs) > std::get<0>(rhs)) return (true); - else if (vec1->at(i) < vec1->at(j)) + else if (std::get<0>(lhs) < std::get<0>(rhs)) return (false); } else if (dir1 == sort_spec::abs_ascen) { - if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (false); } if (dir2 == sort_spec::ascen) { - if (vec2->at(i) < vec2->at(j)) + if (std::get<1>(lhs) < std::get<1>(rhs)) return (true); - else if (vec2->at(i) > vec2->at(j)) + else if (std::get<1>(lhs) > std::get<1>(rhs)) return (false); } else if (dir2 == sort_spec::desce) { - if (vec2->at(i) > vec2->at(j)) + if (std::get<1>(lhs) > std::get<1>(rhs)) return (true); - else if (vec2->at(i) < vec2->at(j)) + else if (std::get<1>(lhs) < std::get<1>(rhs)) return (false); } else if (dir2 == sort_spec::abs_ascen) { - if (abs__(vec2->at(i)) < abs__(vec2->at(j))) + if (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))) return (true); - else if (abs__(vec2->at(i)) > abs__(vec2->at(j))) + else if (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec2->at(i)) > abs__(vec2->at(j))) + if (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))) return (true); - else if (abs__(vec2->at(i)) < abs__(vec2->at(j))) + else if (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))) return (false); } if (dir3 == sort_spec::ascen) - return (vec3->at(i) < vec3->at(j)); + return (std::get<2>(lhs) < std::get<2>(rhs)); else if (dir3 == sort_spec::desce) - return (vec3->at(i) > vec3->at(j)); + return (std::get<2>(lhs) > std::get<2>(rhs)); else if (dir3 == sort_spec::abs_ascen) - return (abs__(vec3->at(i)) < abs__(vec3->at(j))); + return (abs__(std::get<2>(lhs)) < abs__(std::get<2>(rhs))); else // sort_spec::abs_desce - return (abs__(vec3->at(i)) > abs__(vec3->at(j))); + return (abs__(std::get<2>(lhs)) > abs__(std::get<2>(rhs))); }; - sort_common_(*this, std::move(cf), ignore_index); + const size_type idx_s = indices_.size(); + StlVecType sorting_idxs(idx_s); + + std::iota(sorting_idxs.begin(), sorting_idxs.end(), 0); + + auto zip = std::ranges::views::zip(*vec1, *vec2, *vec3, sorting_idxs); + auto zip_idx = + std::ranges::views::zip(*vec1, *vec2, *vec3, indices_, sorting_idxs); + + if (! ignore_index) + std::ranges::sort(zip_idx, cf); + else + std::ranges::sort(zip, cf); + + sort_functor_ functor (sorting_idxs, idx_s); + + for (const auto &citer : column_list_) [[likely]] + if (citer.first != name1 && citer.first != name2 && citer.first != name3) + data_[citer.second].change(functor); return; } diff --git a/include/DataFrame/Internals/DataFrame_functors.h b/include/DataFrame/Internals/DataFrame_functors.h index 754c4f059..5eb7e4e6a 100644 --- a/include/DataFrame/Internals/DataFrame_functors.h +++ b/include/DataFrame/Internals/DataFrame_functors.h @@ -67,8 +67,7 @@ struct sort_functor_ : DataVec::template visitor_base { : sorted_idxs(si), idx_s(is) { } const StlVecType &sorted_idxs; - StlVecType sorted_idxs_copy; - const size_t idx_s; + const size_t idx_s; template void operator() (T2 &vec); diff --git a/include/DataFrame/Internals/DataFrame_misc.tcc b/include/DataFrame/Internals/DataFrame_misc.tcc index 2c9203156..e6fb1f88d 100644 --- a/include/DataFrame/Internals/DataFrame_misc.tcc +++ b/include/DataFrame/Internals/DataFrame_misc.tcc @@ -77,8 +77,7 @@ template void DataFrame::sort_functor_::operator() (T2 &vec) { - sorted_idxs_copy = sorted_idxs; - _sort_by_sorted_index_(vec, sorted_idxs_copy, idx_s); + _sort_by_sorted_index_(vec, sorted_idxs, idx_s); return; } diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index e655e485e..ebeaec5c7 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -865,23 +865,24 @@ inline static O _remove_copy_if_(I first, I last, O d_first, PRE predicate) { template static inline void -_sort_by_sorted_index_(T &to_be_sorted, V &sorting_idxs, size_t idx_s) { - - if (idx_s > 0) { - idx_s -= 1; - for (size_t i = 0; i < idx_s; ++i) [[likely]] { - // while the element i is not yet in place - // - while (sorting_idxs[i] != sorting_idxs[sorting_idxs[i]]) { - // swap it with the element at its final place - // - const size_t j = sorting_idxs[i]; - - std::swap(to_be_sorted[j], to_be_sorted[sorting_idxs[j]]); - std::swap(sorting_idxs[i], sorting_idxs[j]); +_sort_by_sorted_index_(T &to_be_sorted, const V &sorting_idxs, size_t idx_s) { + + std::vector done (idx_s, false); + + for (std::size_t i = 0; i < idx_s; ++i) [[likely]] + if (! done[i]) { + done[i] = true; + + std::size_t prev_j = i; + std::size_t j = sorting_idxs[i]; + + while (i != j) { + std::swap(to_be_sorted[prev_j], to_be_sorted[j]); + done[j] = true; + prev_j = j; + j = sorting_idxs[j]; } } - } } // ---------------------------------------------------------------------------- diff --git a/src/Makefile.Linux.GCC64 b/src/Makefile.Linux.GCC64 index e67e1a0b4..ed055558c 100644 --- a/src/Makefile.Linux.GCC64 +++ b/src/Makefile.Linux.GCC64 @@ -11,7 +11,7 @@ CXX = /usr/bin/g++ INCLUDES = -I/usr/include/c++/7 -I/usr/include LFLAGS = -CXXFLAGS = -O3 $(INCLUDES) $(DEFINES) -std=c++20 +CXXFLAGS = -O3 $(INCLUDES) $(DEFINES) -std=c++2b PLATFORM_LIBS = -lpthread -ldl -lm -lstdc++ diff --git a/src/Makefile.Linux.GCC64D b/src/Makefile.Linux.GCC64D index 51f306649..bbaadbacf 100644 --- a/src/Makefile.Linux.GCC64D +++ b/src/Makefile.Linux.GCC64D @@ -11,8 +11,8 @@ CXX = /usr/bin/g++ INCLUDES = -I/usr/include/c++/7 -I/usr/inc17lude LFLAGS = -CXXFLAGS = -g $(INCLUDES) $(DEFINES) -D_GLIBCXX_DEBUG -pedantic -Wall -Wextra -std=c++20 -# CXXFLAGS = -g $(INCLUDES) $(DEFINES) -std=c++20 +CXXFLAGS = -g $(INCLUDES) $(DEFINES) -D_GLIBCXX_DEBUG -pedantic -Wall -Wextra -std=c++2b +# CXXFLAGS = -g $(INCLUDES) $(DEFINES) -std=c++2b PLATFORM_LIBS = -lpthread -ldl -lm -lstdc++ -fsanitize-address-use-after-scope -fsanitize=address From 80c2a93599c94e50f0fbaf375907d9865fce2a9f Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Fri, 24 Nov 2023 11:14:26 -0500 Subject: [PATCH 2/4] more code toward zip sorting --- README.md | 6 +- include/DataFrame/DataFrameStatsVisitors.h | 8 +- include/DataFrame/Internals/DataFrame.tcc | 251 +++++++++++------- .../DataFrame/Internals/DataFrame_functors.h | 3 +- .../DataFrame/Internals/DataFrame_misc.tcc | 2 +- .../Internals/DataFrame_private_decl.h | 4 - .../Internals/DataFrame_standalone.tcc | 16 +- 7 files changed, 169 insertions(+), 121 deletions(-) diff --git a/README.md b/README.md index 07c5c20e5..2ef8428cf 100644 --- a/README.md +++ b/README.md @@ -24,15 +24,13 @@ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --> +[![C++23](https://img.shields.io/badge/C%2B%2B-23-blue.svg)](https://isocpp.org/std/the-standard ) [![Build status](https://ci.appveyor.com/api/projects/status/hjw01qui3bvxs8yi?svg=true)](https://ci.appveyor.com/project/hosseinmoein/dataframe) +
![GitHub](https://img.shields.io/github/license/hosseinmoein/DataFrame.svg?color=red&style=popout) -[![C++20](https://img.shields.io/badge/C%2B%2B-20-blue.svg)](https://isocpp.org/std/the-standard ) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/db646376a4014c3788c7224e670fe451)](https://app.codacy.com/manual/hosseinmoein/DataFrame?utm_source=github.com&utm_medium=referral&utm_content=hosseinmoein/DataFrame&utm_campaign=Badge_Grade_Dashboard)
-[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/hosseinmoein/DataFrame/master) -[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://GitHub.com/hosseinmoein/DataFrame/graphs/commit-activity) ![GitHub tag (latest by date)](https://img.shields.io/github/tag-date/hosseinmoein/DataFrame.svg?color=blue&label=Official%20Release&style=popout) -
![Conan Center](https://img.shields.io/conan/v/dataframe) [![VCPKG package](https://repology.org/badge/version-for-repo/vcpkg/dataframe.svg)](https://repology.org/project/dataframe/versions) diff --git a/include/DataFrame/DataFrameStatsVisitors.h b/include/DataFrame/DataFrameStatsVisitors.h index e07611e27..014d90cf6 100644 --- a/include/DataFrame/DataFrameStatsVisitors.h +++ b/include/DataFrame/DataFrameStatsVisitors.h @@ -4842,6 +4842,9 @@ struct LowessVisitor { const Y &y_begin, const Y &y_end, // dependent variable const X &x_begin, const X &x_end) { // independent variable + using bool_vec_t = + std::vector::type>; + assert(frac_ >= 0 && frac_ <= 1); assert(loop_n_ > 2); @@ -4862,7 +4865,10 @@ struct LowessVisitor { [] (auto lhs, auto rhs) -> bool { return (lhs < rhs); }); - _sort_by_sorted_index_(yvals, sorting_idxs, col_s); + + bool_vec_t done_vec (col_s); + + _sort_by_sorted_index_(yvals, sorting_idxs, done_vec, col_s); lowess_(idx_begin, idx_end, yvals.begin(), yvals.end(), xvals.begin(), xvals.end()); diff --git a/include/DataFrame/Internals/DataFrame.tcc b/include/DataFrame/Internals/DataFrame.tcc index a427a9d53..27c9eb9a6 100644 --- a/include/DataFrame/Internals/DataFrame.tcc +++ b/include/DataFrame/Internals/DataFrame.tcc @@ -118,28 +118,6 @@ void DataFrame::remove_lock () { lock_ = nullptr; } // ---------------------------------------------------------------------------- - -template -template -void DataFrame:: -sort_common_(DataFrame &df, CF &&comp_func, bool ignore_index) { - - const size_type idx_s = df.indices_.size(); - StlVecType sorting_idxs(idx_s); - - std::iota(sorting_idxs.begin(), sorting_idxs.end(), 0); - std::sort(sorting_idxs.begin(), sorting_idxs.end(), comp_func); - - sort_functor_ functor (sorting_idxs, idx_s); - - for (const auto &iter : df.data_) [[likely]] - iter.change(functor); - if (! ignore_index) - _sort_by_sorted_index_(df.indices_, sorting_idxs, idx_s); -} - -// ---------------------------------------------------------------------------- - template template void @@ -723,7 +701,7 @@ sort(const char *name, sort_spec dir, bool ignore_index) { auto zip = std::ranges::views::zip(*vec, sorting_idxs); auto zip_idx = std::ranges::views::zip(*vec, indices_, sorting_idxs); - if (dir == sort_spec::ascen) { + if (dir == sort_spec::ascen) { if (! ignore_index) std::ranges::sort(zip_idx, a); else @@ -1145,7 +1123,9 @@ sort(const char *name1, sort_spec dir1, sort_functor_ functor (sorting_idxs, idx_s); for (const auto &citer : column_list_) [[likely]] - if (citer.first != name1 && citer.first != name2 && citer.first != name3) + if (citer.first != name1 && + citer.first != name2 && + citer.first != name3) data_[citer.second].change(functor); return; } @@ -1169,115 +1149,145 @@ sort(const char *name1, sort_spec dir1, const ColumnVecType *vec4 { nullptr}; const SpinGuard guard (lock_); - if (! ::strcmp(name1, DF_INDEX_COL_NAME)) + if (! ::strcmp(name1, DF_INDEX_COL_NAME)) { vec1 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec1 = &(get_column(name1, false)); - if (! ::strcmp(name2, DF_INDEX_COL_NAME)) + if (! ::strcmp(name2, DF_INDEX_COL_NAME)) { vec2 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec2 = &(get_column(name2, false)); - if (! ::strcmp(name3, DF_INDEX_COL_NAME)) + if (! ::strcmp(name3, DF_INDEX_COL_NAME)) { vec3 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec3 = &(get_column(name3, false)); - if (! ::strcmp(name4, DF_INDEX_COL_NAME)) + if (! ::strcmp(name4, DF_INDEX_COL_NAME)) { vec4 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec4 = &(get_column(name4, false)); auto cf = - [vec1, vec2, vec3, vec4, dir1, dir2, dir3, dir4] - (size_type i, size_type j) -> bool { + [dir1, dir2, dir3, dir4](const auto &lhs, const auto &rhs) -> bool { if (dir1 == sort_spec::ascen) { - if (vec1->at(i) < vec1->at(j)) + if (std::get<0>(lhs) < std::get<0>(rhs)) return (true); - else if (vec1->at(i) > vec1->at(j)) + else if (std::get<0>(lhs) > std::get<0>(rhs)) return (false); } else if (dir1 == sort_spec::desce) { - if (vec1->at(i) > vec1->at(j)) + if (std::get<0>(lhs) > std::get<0>(rhs)) return (true); - else if (vec1->at(i) < vec1->at(j)) + else if (std::get<0>(lhs) < std::get<0>(rhs)) return (false); } else if (dir1 == sort_spec::abs_ascen) { - if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (false); } if (dir2 == sort_spec::ascen) { - if (vec2->at(i) < vec2->at(j)) + if (std::get<1>(lhs) < std::get<1>(rhs)) return (true); - else if (vec2->at(i) > vec2->at(j)) + else if (std::get<1>(lhs) > std::get<1>(rhs)) return (false); } else if (dir2 == sort_spec::desce) { - if (vec2->at(i) > vec2->at(j)) + if (std::get<1>(lhs) > std::get<1>(rhs)) return (true); - else if (vec2->at(i) < vec2->at(j)) + else if (std::get<1>(lhs) < std::get<1>(rhs)) return (false); } else if (dir2 == sort_spec::abs_ascen) { - if (abs__(vec2->at(i)) < abs__(vec2->at(j))) + if (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))) return (true); - else if (abs__(vec2->at(i)) > abs__(vec2->at(j))) + else if (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec2->at(i)) > abs__(vec2->at(j))) + if (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))) return (true); - else if (abs__(vec2->at(i)) < abs__(vec2->at(j))) + else if (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))) return (false); } if (dir3 == sort_spec::ascen) { - if (vec3->at(i) < vec3->at(j)) + if (std::get<2>(lhs) < std::get<2>(rhs)) return (true); - else if (vec3->at(i) > vec3->at(j)) + else if (std::get<2>(lhs) > std::get<2>(rhs)) return (false); } else if (dir3 == sort_spec::desce) { - if (vec3->at(i) > vec3->at(j)) + if (std::get<2>(lhs) > std::get<2>(rhs)) return (true); - else if (vec3->at(i) < vec3->at(j)) + else if (std::get<2>(lhs) < std::get<2>(rhs)) return (false); } else if (dir3 == sort_spec::abs_ascen) { - if (abs__(vec3->at(i)) < abs__(vec3->at(j))) + if (abs__(std::get<2>(lhs)) < abs__(std::get<2>(rhs))) return (true); - else if (abs__(vec3->at(i)) > abs__(vec3->at(j))) + else if (abs__(std::get<2>(lhs)) > abs__(std::get<2>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec3->at(i)) > abs__(vec3->at(j))) + if (abs__(std::get<2>(lhs)) > abs__(std::get<2>(rhs))) return (true); - else if (abs__(vec3->at(i)) < abs__(vec3->at(j))) + else if (abs__(std::get<2>(lhs)) < abs__(std::get<2>(rhs))) return (false); } if (dir4 == sort_spec::ascen) - return (vec4->at(i) < vec4->at(j)); + return (std::get<3>(lhs) < std::get<3>(rhs)); else if (dir4 == sort_spec::desce) - return (vec4->at(i) > vec4->at(j)); + return (std::get<3>(lhs) > std::get<3>(rhs)); else if (dir4 == sort_spec::abs_ascen) - return (abs__(vec4->at(i)) < abs__(vec4->at(j))); + return (abs__(std::get<3>(lhs)) < abs__(std::get<3>(rhs))); else // sort_spec::abs_desce - return (abs__(vec4->at(i)) > abs__(vec4->at(j))); + return (abs__(std::get<3>(lhs)) > abs__(std::get<3>(rhs))); }; - sort_common_(*this, std::move(cf), ignore_index); + const size_type idx_s = indices_.size(); + StlVecType sorting_idxs(idx_s); + + std::iota(sorting_idxs.begin(), sorting_idxs.end(), 0); + + auto zip = + std::ranges::views::zip(*vec1, *vec2, *vec3, *vec4, sorting_idxs); + auto zip_idx = + std::ranges::views::zip(*vec1, *vec2, *vec3, *vec4, + indices_, sorting_idxs); + + if (! ignore_index) + std::ranges::sort(zip_idx, cf); + else + std::ranges::sort(zip, cf); + + sort_functor_ functor (sorting_idxs, idx_s); + + for (const auto &citer : column_list_) [[likely]] + if (citer.first != name1 && + citer.first != name2 && + citer.first != name3 && + citer.first != name4) + data_[citer.second].change(functor); return; } @@ -1303,145 +1313,180 @@ sort(const char *name1, sort_spec dir1, const ColumnVecType *vec5 { nullptr}; const SpinGuard guard (lock_); - if (! ::strcmp(name1, DF_INDEX_COL_NAME)) + if (! ::strcmp(name1, DF_INDEX_COL_NAME)) { vec1 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec1 = &(get_column(name1, false)); - if (! ::strcmp(name2, DF_INDEX_COL_NAME)) + if (! ::strcmp(name2, DF_INDEX_COL_NAME)) { vec2 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec2 = &(get_column(name2, false)); - if (! ::strcmp(name3, DF_INDEX_COL_NAME)) + if (! ::strcmp(name3, DF_INDEX_COL_NAME)) { vec3 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec3 = &(get_column(name3, false)); - if (! ::strcmp(name4, DF_INDEX_COL_NAME)) + if (! ::strcmp(name4, DF_INDEX_COL_NAME)) { vec4 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec4 = &(get_column(name4, false)); - if (! ::strcmp(name4, DF_INDEX_COL_NAME)) + if (! ::strcmp(name4, DF_INDEX_COL_NAME)) { vec5 = reinterpret_cast *>(&indices_); + ignore_index = true; + } else vec5 = &(get_column(name5, false)); auto cf = - [vec1, vec2, vec3, vec4, vec5, dir1, dir2, dir3, dir4, dir5] - (size_type i, size_type j) -> bool { + [dir1, dir2, dir3, dir4, dir5] + (const auto &lhs, const auto &rhs) -> bool { if (dir1 == sort_spec::ascen) { - if (vec1->at(i) < vec1->at(j)) + if (std::get<0>(lhs) < std::get<0>(rhs)) return (true); - else if (vec1->at(i) > vec1->at(j)) + else if (std::get<0>(lhs) > std::get<0>(rhs)) return (false); } else if (dir1 == sort_spec::desce) { - if (vec1->at(i) > vec1->at(j)) + if (std::get<0>(lhs) > std::get<0>(rhs)) return (true); - else if (vec1->at(i) < vec1->at(j)) + else if (std::get<0>(lhs) < std::get<0>(rhs)) return (false); } else if (dir1 == sort_spec::abs_ascen) { - if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec1->at(i)) > abs__(vec1->at(j))) + if (abs__(std::get<0>(lhs)) > abs__(std::get<0>(rhs))) return (true); - else if (abs__(vec1->at(i)) < abs__(vec1->at(j))) + else if (abs__(std::get<0>(lhs)) < abs__(std::get<0>(rhs))) return (false); } if (dir2 == sort_spec::ascen) { - if (vec2->at(i) < vec2->at(j)) + if (std::get<1>(lhs) < std::get<1>(rhs)) return (true); - else if (vec2->at(i) > vec2->at(j)) + else if (std::get<1>(lhs) > std::get<1>(rhs)) return (false); } else if (dir2 == sort_spec::desce) { - if (vec2->at(i) > vec2->at(j)) + if (std::get<1>(lhs) > std::get<1>(rhs)) return (true); - else if (vec2->at(i) < vec2->at(j)) + else if (std::get<1>(lhs) < std::get<1>(rhs)) return (false); } else if (dir2 == sort_spec::abs_ascen) { - if (abs__(vec2->at(i)) < abs__(vec2->at(j))) + if (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))) return (true); - else if (abs__(vec2->at(i)) > abs__(vec2->at(j))) + else if (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec2->at(i)) > abs__(vec2->at(j))) + if (abs__(std::get<1>(lhs)) > abs__(std::get<1>(rhs))) return (true); - else if (abs__(vec2->at(i)) < abs__(vec2->at(j))) + else if (abs__(std::get<1>(lhs)) < abs__(std::get<1>(rhs))) return (false); } if (dir3 == sort_spec::ascen) { - if (vec3->at(i) < vec3->at(j)) + if (std::get<2>(lhs) < std::get<2>(rhs)) return (true); - else if (vec3->at(i) > vec3->at(j)) + else if (std::get<2>(lhs) > std::get<2>(rhs)) return (false); } else if (dir3 == sort_spec::desce) { - if (vec3->at(i) > vec3->at(j)) + if (std::get<2>(lhs) > std::get<2>(rhs)) return (true); - else if (vec3->at(i) < vec3->at(j)) + else if (std::get<2>(lhs) < std::get<2>(rhs)) return (false); } else if (dir3 == sort_spec::abs_ascen) { - if (abs__(vec3->at(i)) < abs__(vec3->at(j))) + if (abs__(std::get<2>(lhs)) < abs__(std::get<2>(rhs))) return (true); - else if (abs__(vec3->at(i)) > abs__(vec3->at(j))) + else if (abs__(std::get<2>(lhs)) > abs__(std::get<2>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec3->at(i)) > abs__(vec3->at(j))) + if (abs__(std::get<2>(lhs)) > abs__(std::get<2>(rhs))) return (true); - else if (abs__(vec3->at(i)) < abs__(vec3->at(j))) + else if (abs__(std::get<2>(lhs)) < abs__(std::get<2>(rhs))) return (false); } if (dir4 == sort_spec::ascen) { - if (vec4->at(i) < vec4->at(j)) + if (std::get<3>(lhs) < std::get<3>(rhs)) return (true); - else if (vec4->at(i) > vec4->at(j)) + else if (std::get<3>(lhs) > std::get<3>(rhs)) return (false); } else if (dir4 == sort_spec::desce) { - if (vec4->at(i) > vec4->at(j)) + if (std::get<3>(lhs) > std::get<3>(rhs)) return (true); - else if (vec4->at(i) < vec4->at(j)) + else if (std::get<3>(lhs) < std::get<3>(rhs)) return (false); } else if (dir4 == sort_spec::abs_ascen) { - if (abs__(vec4->at(i)) < abs__(vec4->at(j))) + if (abs__(std::get<3>(lhs)) < abs__(std::get<3>(rhs))) return (true); - else if (abs__(vec4->at(i)) > abs__(vec4->at(j))) + else if (abs__(std::get<3>(lhs)) > abs__(std::get<3>(rhs))) return (false); } else { // sort_spec::abs_desce - if (abs__(vec4->at(i)) > abs__(vec4->at(j))) + if (abs__(std::get<3>(lhs)) > abs__(std::get<3>(rhs))) return (true); - else if (abs__(vec4->at(i)) < abs__(vec4->at(j))) + else if (abs__(std::get<3>(lhs)) < abs__(std::get<3>(rhs))) return (false); } if (dir5 == sort_spec::ascen) - return (vec5->at(i) < vec5->at(j)); + return (std::get<4>(lhs) < std::get<4>(rhs)); else if (dir5 == sort_spec::desce) - return (vec5->at(i) > vec5->at(j)); + return (std::get<4>(lhs) > std::get<4>(rhs)); else if (dir5 == sort_spec::abs_ascen) - return (abs__(vec5->at(i)) < abs__(vec5->at(j))); + return (abs__(std::get<4>(lhs)) < abs__(std::get<4>(rhs))); else // sort_spec::abs_desce - return (abs__(vec5->at(i)) > abs__(vec5->at(j))); + return (abs__(std::get<4>(lhs)) > abs__(std::get<4>(rhs))); }; - sort_common_(*this, std::move(cf), ignore_index); + const size_type idx_s = indices_.size(); + StlVecType sorting_idxs(idx_s); + + std::iota(sorting_idxs.begin(), sorting_idxs.end(), 0); + + auto zip = + std::ranges::views::zip(*vec1, *vec2, *vec3, *vec4, *vec5, + sorting_idxs); + auto zip_idx = + std::ranges::views::zip(*vec1, *vec2, *vec3, *vec4, *vec5, + indices_, sorting_idxs); + + if (! ignore_index) + std::ranges::sort(zip_idx, cf); + else + std::ranges::sort(zip, cf); + + sort_functor_ functor (sorting_idxs, idx_s); + + for (const auto &citer : column_list_) [[likely]] + if (citer.first != name1 && + citer.first != name2 && + citer.first != name3 && + citer.first != name4 && + citer.first != name5) + data_[citer.second].change(functor); return; } diff --git a/include/DataFrame/Internals/DataFrame_functors.h b/include/DataFrame/Internals/DataFrame_functors.h index 5eb7e4e6a..10c4885f3 100644 --- a/include/DataFrame/Internals/DataFrame_functors.h +++ b/include/DataFrame/Internals/DataFrame_functors.h @@ -64,10 +64,11 @@ template struct sort_functor_ : DataVec::template visitor_base { inline sort_functor_ (const StlVecType &si, size_t is) - : sorted_idxs(si), idx_s(is) { } + : sorted_idxs(si), idx_s(is), done_vec(idx_s) { } const StlVecType &sorted_idxs; const size_t idx_s; + StlVecType done_vec; template void operator() (T2 &vec); diff --git a/include/DataFrame/Internals/DataFrame_misc.tcc b/include/DataFrame/Internals/DataFrame_misc.tcc index e6fb1f88d..205f9283e 100644 --- a/include/DataFrame/Internals/DataFrame_misc.tcc +++ b/include/DataFrame/Internals/DataFrame_misc.tcc @@ -77,7 +77,7 @@ template void DataFrame::sort_functor_::operator() (T2 &vec) { - _sort_by_sorted_index_(vec, sorted_idxs, idx_s); + _sort_by_sorted_index_(vec, sorted_idxs, done_vec, idx_s); return; } diff --git a/include/DataFrame/Internals/DataFrame_private_decl.h b/include/DataFrame/Internals/DataFrame_private_decl.h index 750602655..828a2511a 100644 --- a/include/DataFrame/Internals/DataFrame_private_decl.h +++ b/include/DataFrame/Internals/DataFrame_private_decl.h @@ -56,10 +56,6 @@ void read_csv2_(std::istream &file, size_type starting_row, size_type num_rows); -template -static void -sort_common_(DataFrame &df, CF &&comp_func, bool ignore_index); - template static void fill_missing_value_(ColumnVecType &vec, diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index ebeaec5c7..a654ab5c6 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -863,22 +863,24 @@ inline static O _remove_copy_if_(I first, I last, O d_first, PRE predicate) { // ---------------------------------------------------------------------------- -template +template static inline void -_sort_by_sorted_index_(T &to_be_sorted, const V &sorting_idxs, size_t idx_s) { - - std::vector done (idx_s, false); +_sort_by_sorted_index_(T &to_be_sorted, + const V &sorting_idxs, + BV &done_vec, + size_t idx_s) { + std::fill(done_vec.begin(), done_vec.end(), false); for (std::size_t i = 0; i < idx_s; ++i) [[likely]] - if (! done[i]) { - done[i] = true; + if (! done_vec[i]) { + done_vec[i] = true; std::size_t prev_j = i; std::size_t j = sorting_idxs[i]; while (i != j) { std::swap(to_be_sorted[prev_j], to_be_sorted[j]); - done[j] = true; + done_vec[j] = true; prev_j = j; j = sorting_idxs[j]; } From fe702e8f15c71891c2b1f6f993a8e8b9c76b231a Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Sat, 25 Nov 2023 09:49:24 -0500 Subject: [PATCH 3/4] more code for sorting --- benchmarks/dataframe_performance.cc | 23 ++++++++++++++++++----- benchmarks/polars_performance.py | 17 ++++++++++++----- include/DataFrame/Internals/DataFrame.tcc | 6 ++++++ src/Makefile.Linux.GCC64 | 3 ++- src/Makefile.Linux.GCC64D | 3 ++- 5 files changed, 40 insertions(+), 12 deletions(-) diff --git a/benchmarks/dataframe_performance.cc b/benchmarks/dataframe_performance.cc index 16e502e5d..446eca99c 100644 --- a/benchmarks/dataframe_performance.cc +++ b/benchmarks/dataframe_performance.cc @@ -36,7 +36,8 @@ using namespace hmdf; using namespace std::chrono; constexpr std::size_t ALIGNMENT = 64; -constexpr std::size_t SIZE = 300000000; +// constexpr std::size_t SIZE = 300000000; +constexpr std::size_t SIZE = 10000000; typedef StdDataFrame64 MyDataFrame; @@ -57,7 +58,7 @@ int main(int, char *[]) { std::cout << "Data generation/load time: " << double(duration_cast(second - first).count()) / 1000000.0 - << std::endl; + << " secs" << std::endl; MeanVisitor n_mv; VarVisitor ln_vv; @@ -81,14 +82,26 @@ int main(int, char *[]) { const auto fourth = high_resolution_clock::now(); + df.sort("log_normal", sort_spec::ascen, + "exponential", sort_spec::ascen); + // df.sort("log_normal", sort_spec::ascen); + std::cout << "Number of rows after sort: " + << df.get_column("normal").size() << std::endl; + + const auto fifth = high_resolution_clock::now(); + std::cout << "Calculation time: " << double(duration_cast(third - second).count()) / 1000000.0 - << '\n' + << " secs\n" << "Selection time: " << double(duration_cast(fourth - third).count()) / 1000000.0 - << '\n' + << " secs\n" + << "Sorting time: " + << double(duration_cast(fifth - fourth).count()) / 1000000.0 + << " secs\n" << "Overall time: " - << double(duration_cast(fourth - first).count()) / 1000000.0 + << double(duration_cast(fifth - first).count()) / 1000000.0 + << " secs" << std::endl; return (0); } diff --git a/benchmarks/polars_performance.py b/benchmarks/polars_performance.py index 315962446..55c6000f1 100644 --- a/benchmarks/polars_performance.py +++ b/benchmarks/polars_performance.py @@ -4,7 +4,8 @@ # ------------------------------------------------------------------------------ -SIZE: int = 300000000 +# SIZE: int = 300000000 +SIZE: int = 10000000 first = datetime.datetime.now() df = pl.DataFrame({"normal": np.random.normal(size=SIZE), @@ -13,7 +14,7 @@ }) second = datetime.datetime.now() print(f"Data generation/load time: " - f"{(second - first).seconds}.{(second - first).microseconds}") + f"{(second - first).seconds}.{(second - first).microseconds} secs") df2 = df.select( mean = pl.col("normal").mean(), @@ -32,9 +33,15 @@ print(f"Number of rows after select: {df3.select(pl.count()).item()}") fourth = datetime.datetime.now() -print(f"Calculation time: {(third - second).seconds}.{(third - second).microseconds}") -print(f"Selection time: {(fourth - third).seconds}.{(fourth - third).microseconds}") -print(f"Overall time: {(fourth - first).seconds}.{(fourth - first).microseconds}") +df4 = df.sort(["log_normal", "exponential"]); +# df4 = df.sort("log_normal"); +print(f"Number of rows after sort: {df4.select(pl.count()).item()}") +fifth = datetime.datetime.now() + +print(f"Calculation time: {(third - second).seconds}.{(third - second).microseconds} secs") +print(f"Selection time: {(fourth - third).seconds}.{(fourth - third).microseconds} secs") +print(f"Sorting time: {(fifth - fourth).seconds}.{(fifth - fourth).microseconds} secs") +print(f"Overall time: {(fifth - first).seconds}.{(fifth - first).microseconds} secs") # ------------------------------------------------------------------------------ diff --git a/include/DataFrame/Internals/DataFrame.tcc b/include/DataFrame/Internals/DataFrame.tcc index 27c9eb9a6..538a2cb14 100644 --- a/include/DataFrame/Internals/DataFrame.tcc +++ b/include/DataFrame/Internals/DataFrame.tcc @@ -31,6 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -902,6 +903,11 @@ sort(const char *name1, sort_spec dir1, std::ranges::views::zip(*vec1, *vec2, indices_, sorting_idxs); if (dir1 == sort_spec::ascen && dir2 == sort_spec::ascen) { + // if (! ignore_index) + // std::sort(std::execution::par_unseq, + // zip_idx.begin(), zip_idx.end(), a_a); + // else + // std::sort(std::execution::par_unseq, zip.begin(), zip.end(), a_a); if (! ignore_index) std::ranges::sort(zip_idx, a_a); else diff --git a/src/Makefile.Linux.GCC64 b/src/Makefile.Linux.GCC64 index ed055558c..38f35349b 100644 --- a/src/Makefile.Linux.GCC64 +++ b/src/Makefile.Linux.GCC64 @@ -8,7 +8,8 @@ BUILD_DEFINE = Linux_GCC64 CXX = /usr/bin/g++ -INCLUDES = -I/usr/include/c++/7 -I/usr/include +# INCLUDES = -I/usr/include/c++/7 -I/usr/include +INCLUDES = LFLAGS = CXXFLAGS = -O3 $(INCLUDES) $(DEFINES) -std=c++2b diff --git a/src/Makefile.Linux.GCC64D b/src/Makefile.Linux.GCC64D index bbaadbacf..ed4387179 100644 --- a/src/Makefile.Linux.GCC64D +++ b/src/Makefile.Linux.GCC64D @@ -8,7 +8,8 @@ BUILD_DEFINE = Linux_GCC64 CXX = /usr/bin/g++ -INCLUDES = -I/usr/include/c++/7 -I/usr/inc17lude +# INCLUDES = -I/usr/include/c++/7 -I/usr/include +INCLUDES = LFLAGS = CXXFLAGS = -g $(INCLUDES) $(DEFINES) -D_GLIBCXX_DEBUG -pedantic -Wall -Wextra -std=c++2b From 0abd9e4c1206393d3c7e17cad62116006b58752a Mon Sep 17 00:00:00 2001 From: Hossein Moein Date: Sun, 26 Nov 2023 09:34:56 -0500 Subject: [PATCH 4/4] minor changes --- benchmarks/dataframe_performance.cc | 15 +++++++-------- benchmarks/polars_performance.py | 7 +++---- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/benchmarks/dataframe_performance.cc b/benchmarks/dataframe_performance.cc index 446eca99c..f617a91cf 100644 --- a/benchmarks/dataframe_performance.cc +++ b/benchmarks/dataframe_performance.cc @@ -82,11 +82,10 @@ int main(int, char *[]) { const auto fourth = high_resolution_clock::now(); - df.sort("log_normal", sort_spec::ascen, - "exponential", sort_spec::ascen); - // df.sort("log_normal", sort_spec::ascen); - std::cout << "Number of rows after sort: " - << df.get_column("normal").size() << std::endl; + // df.sort("log_normal", sort_spec::ascen, + // "exponential", sort_spec::ascen); + // std::cout << "1001th value in normal column: " + // << df.get_column("normal")[1001] << std::endl; const auto fifth = high_resolution_clock::now(); @@ -96,9 +95,9 @@ int main(int, char *[]) { << "Selection time: " << double(duration_cast(fourth - third).count()) / 1000000.0 << " secs\n" - << "Sorting time: " - << double(duration_cast(fifth - fourth).count()) / 1000000.0 - << " secs\n" + // << "Sorting time: " + // << double(duration_cast(fifth - fourth).count()) / 1000000.0 + // << " secs\n" << "Overall time: " << double(duration_cast(fifth - first).count()) / 1000000.0 << " secs" diff --git a/benchmarks/polars_performance.py b/benchmarks/polars_performance.py index 55c6000f1..f6b9463f5 100644 --- a/benchmarks/polars_performance.py +++ b/benchmarks/polars_performance.py @@ -33,14 +33,13 @@ print(f"Number of rows after select: {df3.select(pl.count()).item()}") fourth = datetime.datetime.now() -df4 = df.sort(["log_normal", "exponential"]); -# df4 = df.sort("log_normal"); -print(f"Number of rows after sort: {df4.select(pl.count()).item()}") +# df4 = df.sort(["log_normal", "exponential"]); +# print(f"1001th value in normal column: {df4['normal'][1001]}") fifth = datetime.datetime.now() print(f"Calculation time: {(third - second).seconds}.{(third - second).microseconds} secs") print(f"Selection time: {(fourth - third).seconds}.{(fourth - third).microseconds} secs") -print(f"Sorting time: {(fifth - fourth).seconds}.{(fifth - fourth).microseconds} secs") +# print(f"Sorting time: {(fifth - fourth).seconds}.{(fifth - fourth).microseconds} secs") print(f"Overall time: {(fifth - first).seconds}.{(fifth - first).microseconds} secs") # ------------------------------------------------------------------------------