From 3354cc38ba2bf066f7b0498e15db3e3b9a20de72 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 19 Sep 2024 13:38:15 -0600 Subject: [PATCH 1/5] batched - dense: Testing and fixing Serial QR The serial QR algorithms does not have unit-tests and is failing for non square matrices. See issue #2328. This first commit fixes the issue with rectangular matrices and adds a basic test for that use case. Next will work on adding a test that exercises the interfaces on multiple matrices of different sizes within a parallel_for. Finally equivalent tests will be added for the square case as well. Fixing unused variable error It looks like the Left NoTranspose ApplyQ is not doing the correct thing. Will have a look at that next. Signed-off-by: Luc Signed-off-by: Luc Berger-Vergiat --- ...KokkosBatched_QR_FormQ_Serial_Internal.hpp | 14 +- .../impl/KokkosBatched_QR_Serial_Internal.hpp | 2 +- .../dense/unit_test/Test_Batched_Dense.hpp | 1 + .../dense/unit_test/Test_Batched_SerialQR.hpp | 407 ++++++++++++++++++ 4 files changed, 418 insertions(+), 6 deletions(-) create mode 100644 batched/dense/unit_test/Test_Batched_SerialQR.hpp diff --git a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp index aaacb45ede..99b0fa9dc3 100644 --- a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp @@ -49,12 +49,16 @@ struct SerialQR_FormQ_Internal { /// B is m x m // set identity - if (is_Q_zero) - SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1); - else - SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); + if (is_Q_zero) { + for (int idx = 0; idx < m; ++idx) { + Q[(qs0 + qs1) * idx] = value_type(1); + // SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1); + } + } else { + SerialSetIdentityInternal::invoke(m, m, Q, qs0, qs1); + } - return SerialApplyQ_LeftNoTransForwardInternal ::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w); + return SerialApplyQ_LeftForwardInternal::invoke(m, m, k, A, as0, as1, t, ts, Q, qs0, qs1, w); } }; diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 8aa4a6361c..851bd85ab9 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -53,7 +53,7 @@ struct SerialQR_Internal { A_part2x2.partWithATL(A, m, n, 0, 0); t_part2x1.partWithAT(t, m, 0); - for (int m_atl = 0; m_atl < m; ++m_atl) { + for (int m_atl = 0; m_atl < Kokkos::min(m, n); ++m_atl) { // part 2x2 into 3x3 A_part3x3.partWithABR(A_part2x2, 1, 1); const int m_A22 = m - m_atl - 1; diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 551f200101..f209ba0f26 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -30,6 +30,7 @@ #include "Test_Batched_SerialLU.hpp" #include "Test_Batched_SerialLU_Real.hpp" #include "Test_Batched_SerialLU_Complex.hpp" +#include "Test_Batched_SerialQR.hpp" #include "Test_Batched_SerialSolveLU.hpp" #include "Test_Batched_SerialSolveLU_Real.hpp" #include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/batched/dense/unit_test/Test_Batched_SerialQR.hpp b/batched/dense/unit_test/Test_Batched_SerialQR.hpp new file mode 100644 index 0000000000..4ccb81400a --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialQR.hpp @@ -0,0 +1,407 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Luc Berger-Vergiat (lberg@sandia.gov) +/// \author Cameron Smith (smithc11@rpi.edu) + +#include "gtest/gtest.h" +#include + +#include // KokkosBatched::QR +#include "KokkosBatched_ApplyQ_Decl.hpp" // KokkosBatched::ApplyQ +#include "KokkosBatched_QR_FormQ_Serial_Internal.hpp" +#include // KokkosBlas::Algo +#include + +template +struct qrFunctor { + using Scalar = typename MatricesType::value_type; + + int maxMatSize; + MatricesType As; + IndexViewType numRows; + IndexViewType numCols; + IndexViewType offsetA; + TauViewType taus; + TmpViewType ws; + MatricesType Qs; + IndexViewType offsetQ; + + qrFunctor(const int maxMatSize_, MatricesType As_, IndexViewType numRows_, IndexViewType numCols_, + IndexViewType offsetA_, TauViewType taus_, TmpViewType ws_, MatricesType Qs_, IndexViewType offsetQ_) + : maxMatSize(maxMatSize_), + As(As_), + numRows(numRows_), + numCols(numCols_), + offsetA(offsetA_), + taus(taus_), + ws(ws_), + Qs(Qs_), + offsetQ(offsetQ_) {} + + KOKKOS_FUNCTION + void operator()(const int matIdx) const { + Kokkos::View> A(&As(offsetA(matIdx)), numRows(matIdx), + numCols(matIdx)); + Kokkos::View> tau(&taus(matIdx * maxMatSize), + Kokkos::min(numRows(matIdx), numCols(matIdx))); + Kokkos::View> w(&ws(matIdx * maxMatSize), + Kokkos::min(numRows(matIdx), numCols(matIdx))); + Kokkos::View> Q(&Qs(offsetQ(matIdx)), numRows(matIdx), + numRows(matIdx)); + + KokkosBatched::SerialQR::invoke(A, tau, w); + + // Store identity in Q + for (int idx = 0; idx < numRows(matIdx); ++idx) { + Q(matIdx, matIdx) = Kokkos::ArithTraits::one(); + } + + // Call ApplyQ on Q + KokkosBatched::SerialApplyQ::invoke(A, tau, Q, w); + } +}; + +template +void test_QR_square() { + // Analytical test with a rectangular matrix + // [12, -51, 4] [150, -69, 58] [14, 21, -14] + // A = [ 6, 167, -68] Q = 1/175 [ 75, 158, -6] R = [ 0, 175, -70] + // [-4, 24, -41] [-50, 30, 165] [ 0, 0, -35] + // + // Expected outputs: + // [ -5, -3] + // tau = [5/8, 10/18] A = [1/2, 5] + // [ 0, 1/3] + // + + using MatrixViewType = Kokkos::View; + using ColVectorViewType = Kokkos::View; + using ColWorkViewType = Kokkos::View; + + const Scalar tol = 10 * Kokkos::ArithTraits::eps(); + constexpr int m = 3, n = 3; + + MatrixViewType A("A", m, n), B("B", m, n), Q("Q", m, m); + ColVectorViewType t("t", n); + ColWorkViewType w("w", n); + + typename MatrixViewType::HostMirror A_h = Kokkos::create_mirror_view(A); + A_h(0, 0) = 12; + A_h(0, 1) = -51; + A_h(0, 2) = 4; + A_h(1, 0) = 6; + A_h(1, 1) = 167; + A_h(1, 2) = -68; + A_h(2, 0) = -4; + A_h(2, 1) = 24; + A_h(2, 2) = -41; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(B, A_h); + + Kokkos::parallel_for( + "serialQR", 1, KOKKOS_LAMBDA(int) { + // compute the QR factorization of A and store the results in A and t + // (tau) - see the lapack dgeqp3(...) documentation: + // www.netlib.org/lapack/explore-html-3.6.1/dd/d9a/group__double_g_ecomputational_ga1b0500f49e03d2771b797c6e88adabbb.html + KokkosBatched::SerialQR::invoke(A, t, w); + }); + + Kokkos::fence(); + Kokkos::deep_copy(A_h, A); + typename ColVectorViewType::HostMirror tau = Kokkos::create_mirror_view(t); + Kokkos::deep_copy(tau, t); + + Test::EXPECT_NEAR_KK_REL(A_h(0, 0), static_cast(-14), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 1), static_cast(-21), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 2), static_cast(14), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 0), static_cast(6. / 26.), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 1), static_cast(-175), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 2), static_cast(70), tol); + Test::EXPECT_NEAR_KK_REL(A_h(2, 0), static_cast(-4.0 / 26.0), tol); + // Test::EXPECT_NEAR_KK_REL(A_h(2, 1), 35.0, tol); // Analytical expression too painful to compute... + Test::EXPECT_NEAR_KK_REL(A_h(2, 2), static_cast(35), tol); + + Test::EXPECT_NEAR_KK_REL(tau(0), static_cast(7. / 13.), tol); + // Test::EXPECT_NEAR_KK_REL(tau(1), 25. / 32., tol); // Analytical expression too painful to compute... + Test::EXPECT_NEAR_KK_REL(tau(2), static_cast(1. / 2.), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, B, w); + }); + typename MatrixViewType::HostMirror B_h = Kokkos::create_mirror_view(B); + Kokkos::deep_copy(B_h, B); + + Test::EXPECT_NEAR_KK_REL(static_cast(-14), B_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-21), B_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(14), B_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-175), B_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(70), B_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(35), B_h(2, 2), tol); + + Kokkos::parallel_for( + "serialFormQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), + Q.data(), Q.stride(0), Q.stride(1), w.data()); + }); + typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(-6. / 7.), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(69. / 175.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-58. / 175.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-3. / 7.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-158. / 175.), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(6. / 175.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(2. / 7.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-6. / 35.), Q_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-33. / 35.), Q_h(2, 2), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, Q, w); + }); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(2, 2), tol); + + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 1), tol); +} + +template +void test_QR_rectangular() { + // Analytical test with a rectangular matrix + // [3, 5] [-0.60, -0.64, 0.48] [-5, -3] + // A = [4, 0] Q = [-0.80, -0.48, -0.36] R = [ 0, 5] + // [0, -3] [ 0.00, -0.60, -0.80] [ 0, 0] + // + // Expected outputs: + // [ -5, -3] + // tau = [5/8, 10/18] A = [1/2, 5] + // [ 0, 1/3] + // + + using MatrixViewType = Kokkos::View; + using ColVectorViewType = Kokkos::View; + using ColWorkViewType = Kokkos::View; + + const Scalar tol = 10 * Kokkos::ArithTraits::eps(); + constexpr int m = 3, n = 2; + + MatrixViewType A("A", m, n), B("B", m, n), Q("Q", m, m); + ColVectorViewType t("t", n); + ColWorkViewType w("w", n); + + typename MatrixViewType::HostMirror A_h = Kokkos::create_mirror_view(A); + A_h(0, 0) = 3; + A_h(0, 1) = 5; + A_h(1, 0) = 4; + A_h(1, 1) = 0; + A_h(2, 0) = 0; + A_h(2, 1) = -3; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(B, A_h); + + Kokkos::parallel_for( + "serialQR", 1, KOKKOS_LAMBDA(int) { + // compute the QR factorization of A and store the results in A and t + // (tau) - see the lapack dgeqp3(...) documentation: + // www.netlib.org/lapack/explore-html-3.6.1/dd/d9a/group__double_g_ecomputational_ga1b0500f49e03d2771b797c6e88adabbb.html + KokkosBatched::SerialQR::invoke(A, t, w); + }); + + Kokkos::fence(); + Kokkos::deep_copy(A_h, A); + typename ColVectorViewType::HostMirror tau = Kokkos::create_mirror_view(t); + Kokkos::deep_copy(tau, t); + + Test::EXPECT_NEAR_KK_REL(A_h(0, 0), static_cast(-5.0), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 1), static_cast(-3.0), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 0), static_cast(0.5), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 1), static_cast(5.0), tol); + Test::EXPECT_NEAR_KK(A_h(2, 0), static_cast(0.), tol); + Test::EXPECT_NEAR_KK_REL(A_h(2, 1), static_cast(1. / 3.), tol); + + Test::EXPECT_NEAR_KK_REL(tau(0), 5. / 8., tol); + Test::EXPECT_NEAR_KK_REL(tau(1), 10. / 18., tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, B, w); + }); + typename MatrixViewType::HostMirror B_h = Kokkos::create_mirror_view(B); + Kokkos::deep_copy(B_h, B); + + Test::EXPECT_NEAR_KK_REL(static_cast(-5.0), B_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-3.0), B_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(5.0), B_h(1, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(2, 1), tol); + + Kokkos::parallel_for( + "serialFormQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), + Q.data(), Q.stride(0), Q.stride(1), w.data()); + }); + typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(-0.60), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.64), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.48), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.80), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.48), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.36), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.60), Q_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.80), Q_h(2, 2), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, Q, w); + }); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(2, 2), tol); + + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 1), tol); +} + +template +void test_QR_batch() { + // Generate a batch of matrices + // Compute QR factorization + // Verify that R is triangular + // Verify that Q is unitary + // Check that Q*R = A + + using ExecutionSpace = typename Device::execution_space; + + constexpr int numMat = 314; + constexpr int maxMatSize = 100; + Kokkos::View numRows("rows in matrix", numMat); + Kokkos::View numCols("cols in matrix", numMat); + Kokkos::View offsetA("matrix offset", numMat + 1); + Kokkos::View offsetQ("matrix offset", numMat + 1); + Kokkos::View tau("tau", maxMatSize * numMat); + Kokkos::View tmp("work buffer", maxMatSize * numMat); + + typename Kokkos::View::HostMirror numRows_h = Kokkos::create_mirror_view(numRows); + typename Kokkos::View::HostMirror numCols_h = Kokkos::create_mirror_view(numCols); + typename Kokkos::View::HostMirror offsetA_h = Kokkos::create_mirror_view(offsetA); + typename Kokkos::View::HostMirror offsetQ_h = Kokkos::create_mirror_view(offsetQ); + + std::mt19937 gen; + gen.seed(27182818); + std::uniform_int_distribution distrib(1, maxMatSize); + + offsetA_h(0) = 0; + offsetQ_h(0) = 0; + int a = 0, b = 0; + for (int matIdx = 0; matIdx < numMat; ++matIdx) { + a = distrib(gen); + b = distrib(gen); + + numRows_h(matIdx) = Kokkos::max(a, b); + numCols_h(matIdx) = Kokkos::min(a, b); + offsetA_h(matIdx + 1) = offsetA_h(matIdx) + a * b; + offsetQ_h(matIdx + 1) = offsetQ_h(matIdx) + numRows_h(matIdx) * numRows_h(matIdx); + } + + Kokkos::deep_copy(numRows, numRows_h); + Kokkos::deep_copy(numCols, numCols_h); + Kokkos::deep_copy(offsetA, offsetA_h); + Kokkos::deep_copy(offsetQ, offsetQ_h); + + const int numVals = offsetA_h(numMat); + Kokkos::View mats("matrices", numVals); + Kokkos::View Qs("Q matrices", offsetQ_h(numMat)); + + Kokkos::Random_XorShift64_Pool rand_pool(2718); + constexpr double max_val = 1000; + { + Scalar randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(ExecutionSpace{}, mats, rand_pool, randStart, randEnd); + } + + qrFunctor myFunc(maxMatSize, mats, numRows, numCols, offsetA, tau, tmp, Qs, offsetQ); + Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, serial_qr_square_analytic_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_square(); +} +TEST_F(TestCategory, serial_qr_rectangular_analytic_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_rectangular(); +} +TEST_F(TestCategory, serial_qr_batch_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_batch(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, serial_qr_square_analytic_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_square(); +} +TEST_F(TestCategory, serial_qr_rectangular_analytic_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_rectangular(); +} +TEST_F(TestCategory, serial_qr_batch_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_batch(); +} +#endif + +// #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +// TEST_F(TestCategory, batched_scalar_serial_qr_scomplex) { +// typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; +// test_QR_rectangular, AlgoTagType>(); +// } +// #endif + +// #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +// TEST_F(TestCategory, batched_scalar_serial_qr_dcomplex) { +// typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; +// test_QR_rectangular, AlgoTagType>(); +// } +// #endif From 65bcc139a80f0dfb3769fa6d086992d1375ac6db Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 7 Oct 2024 08:26:22 -0600 Subject: [PATCH 2/5] Batched - Dense Serial QR: fixing issue with work array We did not pass the stride of the work array to internal routines and we are not enforcing a contiguous memory allocation either so when using subviews to pass the work array, we run into troubles. Signed-off-by: Luc Berger-Vergiat --- ...tched_ApplyHouseholder_Serial_Internal.hpp | 21 +- .../impl/KokkosBatched_ApplyQ_Serial_Impl.hpp | 6 +- .../KokkosBatched_ApplyQ_Serial_Internal.hpp | 13 +- ...KokkosBatched_QR_FormQ_Serial_Internal.hpp | 6 +- .../impl/KokkosBatched_QR_Serial_Impl.hpp | 2 +- .../impl/KokkosBatched_QR_Serial_Internal.hpp | 4 +- .../KokkosBatched_SVD_Serial_Internal.hpp | 11 +- .../dense/src/KokkosBatched_ApplyQ_Decl.hpp | 6 +- .../dense/unit_test/Test_Batched_SerialQR.hpp | 224 +++++++++++------- 9 files changed, 179 insertions(+), 114 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index 574280c82b..33b59b81b2 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -34,8 +34,8 @@ struct SerialApplyLeftHouseholderInternal { /* */ ValueType* u2, const int u2s, /* */ ValueType* a1t, const int a1ts, /* */ ValueType* A2, const int as0, const int as1, - /* */ ValueType* w1t) { - typedef ValueType value_type; + /* */ ValueType* w1t, const int ws0) { + using value_type = ValueType; /// u2 m x 1 /// a1t 1 x n @@ -54,15 +54,15 @@ struct SerialApplyLeftHouseholderInternal { for (int j = 0; j < n; ++j) { value_type tmp = a1t[j * a1ts]; for (int i = 0; i < m; ++i) tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; - w1t[j] = tmp * inv_tau; // /= (*tau); + w1t[j * ws0] = tmp * inv_tau; // /= (*tau); } // a1t -= w1t (axpy) - for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j]; + for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j * ws0]; // A2 -= u2 w1t (ger) for (int j = 0; j < n; ++j) - for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; + for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j * ws0]; return 0; } @@ -74,8 +74,8 @@ struct SerialApplyRightHouseholderInternal { /* */ ValueType* u2, const int u2s, /* */ ValueType* a1, const int a1s, /* */ ValueType* A2, const int as0, const int as1, - /* */ ValueType* w1) { - typedef ValueType value_type; + /* */ ValueType* w1, const int ws0) { + using value_type = ValueType; /// u2 n x 1 /// a1 m x 1 /// A2 m x n @@ -93,15 +93,16 @@ struct SerialApplyRightHouseholderInternal { for (int i = 0; i < m; ++i) { value_type tmp = a1[i * a1s]; for (int j = 0; j < n; ++j) tmp += A2[i * as0 + j * as1] * u2[j * u2s]; - w1[i] = tmp * inv_tau; // \= (*tau); + w1[i * ws0] = tmp * inv_tau; // \= (*tau); } // a1 -= w1 (axpy) - for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i]; + for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i * ws0]; // A2 -= w1 * u2' (ger with conjugate) for (int j = 0; j < n; ++j) - for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + for (int i = 0; i < m; ++i) + A2[i * as0 + j * as1] -= w1[i * ws0] * Kokkos::ArithTraits::conj(u2[j * u2s]); return 0; } diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp index f1f2d29089..02335f319f 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp @@ -33,7 +33,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ @@ -42,7 +42,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ @@ -51,7 +51,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ KOKKOS_INLINE_FUNCTION int SerialQR::invoke(const AViewType &A, const tViewType &t, const wViewType &w) { return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(), - t.stride_0(), w.data()); + t.stride_0(), w.data(), w.stride_0()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 851bd85ab9..04eda350f8 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -36,7 +36,7 @@ struct SerialQR_Internal { const int n, // n = NumCols(A) /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, - /* */ ValueType *w) { + /* */ ValueType *w, const int ws) { typedef ValueType value_type; /// Given a matrix A, it computes QR decomposition of the matrix @@ -69,7 +69,7 @@ struct SerialQR_Internal { // left apply householder to A22 SerialApplyLeftHouseholderInternal::invoke(m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + A_part3x3.A22, as0, as1, w, ws); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); t_part2x1.mergeToAT(t_part3x1); diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 56a5619e06..6b4461e5ec 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -176,11 +176,12 @@ struct SerialSVDInternal { if (n - i > 1) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1), - As0, As1, work); + As0, As1, work, 1); } if (U) { - KokkosBatched::SerialApplyRightHouseholderInternal::invoke( - m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work); + KokkosBatched::SerialApplyRightHouseholderInternal::invoke(m, m - i - 1, &tau, &SVDIND(A, i + 1, i), + As0, &SVDIND(U, 0, i), Us0, + &SVDIND(U, 0, i + 1), Us0, Us1, work, 1); } // Zero out A subdiag explicitly (NOTE: may not be necessary...) for (int j = i + 1; j < m; j++) { @@ -193,12 +194,12 @@ struct SerialSVDInternal { if (m - i > 1) { KokkosBatched::SerialApplyRightHouseholderInternal::invoke( m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(A, i + 1, i + 1), As0, - &SVDIND(A, i + 1, i + 2), As0, As1, work); + &SVDIND(A, i + 1, i + 2), As0, As1, work, 1); } if (Vt) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0, - Vts1, work); + Vts1, work, 1); } // Zero out A superdiag row explicitly for (int j = i + 2; j < n; j++) { diff --git a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp index 9d17a8b435..589ca192dd 100644 --- a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp @@ -64,11 +64,11 @@ struct ApplyQ { KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { int r_val = 0; - if (std::is_same::value) { + if constexpr (std::is_same_v) { r_val = SerialApplyQ::invoke(A, t, B, w); - } else if (std::is_same::value) { + } else if constexpr (std::is_same_v) { r_val = TeamApplyQ::invoke(member, A, t, B, w); - } else if (std::is_same::value) { + } else if constexpr (std::is_same_v) { r_val = TeamVectorApplyQ::invoke(member, A, t, B, w); } return r_val; diff --git a/batched/dense/unit_test/Test_Batched_SerialQR.hpp b/batched/dense/unit_test/Test_Batched_SerialQR.hpp index 4ccb81400a..375b1d1664 100644 --- a/batched/dense/unit_test/Test_Batched_SerialQR.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialQR.hpp @@ -25,52 +25,92 @@ #include // KokkosBlas::Algo #include -template +template struct qrFunctor { using Scalar = typename MatricesType::value_type; - int maxMatSize; MatricesType As; - IndexViewType numRows; - IndexViewType numCols; - IndexViewType offsetA; TauViewType taus; TmpViewType ws; - MatricesType Qs; - IndexViewType offsetQ; - - qrFunctor(const int maxMatSize_, MatricesType As_, IndexViewType numRows_, IndexViewType numCols_, - IndexViewType offsetA_, TauViewType taus_, TmpViewType ws_, MatricesType Qs_, IndexViewType offsetQ_) - : maxMatSize(maxMatSize_), - As(As_), - numRows(numRows_), - numCols(numCols_), - offsetA(offsetA_), - taus(taus_), - ws(ws_), - Qs(Qs_), - offsetQ(offsetQ_) {} + MatricesType Qs, Bs; + ErrorViewType global_error; + + qrFunctor(MatricesType As_, TauViewType taus_, TmpViewType ws_, MatricesType Qs_, MatricesType Bs_, + ErrorViewType global_error_) + : As(As_), taus(taus_), ws(ws_), Qs(Qs_), Bs(Bs_), global_error(global_error_) {} KOKKOS_FUNCTION void operator()(const int matIdx) const { - Kokkos::View> A(&As(offsetA(matIdx)), numRows(matIdx), - numCols(matIdx)); - Kokkos::View> tau(&taus(matIdx * maxMatSize), - Kokkos::min(numRows(matIdx), numCols(matIdx))); - Kokkos::View> w(&ws(matIdx * maxMatSize), - Kokkos::min(numRows(matIdx), numCols(matIdx))); - Kokkos::View> Q(&Qs(offsetQ(matIdx)), numRows(matIdx), - numRows(matIdx)); + auto A = Kokkos::subview(As, matIdx, Kokkos::ALL, Kokkos::ALL); + auto tau = Kokkos::subview(taus, matIdx, Kokkos::ALL); + auto w = Kokkos::subview(ws, matIdx, Kokkos::ALL); + auto Q = Kokkos::subview(Qs, matIdx, Kokkos::ALL, Kokkos::ALL); + auto B = Kokkos::subview(Bs, matIdx, Kokkos::ALL, Kokkos::ALL); + + const Scalar SC_one = Kokkos::ArithTraits::one(); + const Scalar tol = Kokkos::ArithTraits::eps() * 10; + + int error = 0; KokkosBatched::SerialQR::invoke(A, tau, w); // Store identity in Q - for (int idx = 0; idx < numRows(matIdx); ++idx) { - Q(matIdx, matIdx) = Kokkos::ArithTraits::one(); + for (int diagIdx = 0; diagIdx < Q.extent_int(0); ++diagIdx) { + Q(diagIdx, diagIdx) = SC_one; } // Call ApplyQ on Q + for (int idx = 0; idx < w.extent_int(0); ++idx) { + w(idx) = 0.0; + } KokkosBatched::SerialApplyQ::invoke(A, tau, Q, w); + + // Now apply Q' to Q + for (int idx = 0; idx < w.extent_int(0); ++idx) { + w(idx) = 0.0; + } + KokkosBatched::SerialApplyQ::invoke(A, tau, Q, w); + + // At this point Q stores Q'Q + // which should be the identity matrix + for (int rowIdx = 0; rowIdx < Q.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < Q.extent_int(1); ++colIdx) { + if (rowIdx == colIdx) { + if (Kokkos::abs(Q(rowIdx, colIdx) - SC_one) > tol) { + error += 1; + Kokkos::printf("matIdx=%d, Q(%d, %d)=%e instead of 1.0.\n", matIdx, rowIdx, colIdx, Q(rowIdx, colIdx)); + } + } else { + if (Kokkos::abs(Q(rowIdx, colIdx)) > tol) { + error += 1; + Kokkos::printf("matIdx=%d, Q(%d, %d)=%e instead of 0.0.\n", matIdx, rowIdx, colIdx, Q(rowIdx, colIdx)); + } + } + } + } + Kokkos::atomic_add(&global_error(), error); + + // Apply Q' to B which holds a copy of the orginal A + // Afterwards B should hold a copy of R and be zero below its diagonal + KokkosBatched::SerialApplyQ::invoke(A, tau, B, w); + for (int rowIdx = 0; rowIdx < B.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < B.extent_int(1); ++colIdx) { + if (rowIdx <= colIdx) { + if (Kokkos::abs(B(rowIdx, colIdx) - A(rowIdx, colIdx)) > tol * Kokkos::abs(A(rowIdx, colIdx))) { + error += 1; + Kokkos::printf("B stores R\nmatIdx=%d, B(%d, %d)=%e instead of %e.\n", matIdx, rowIdx, colIdx, + B(rowIdx, colIdx), A(rowIdx, colIdx)); + } + } else { + if (Kokkos::abs(B(rowIdx, colIdx)) > 1000 * tol) { + error += 1; + Kokkos::printf("B stores R\nmatIdx=%d, B(%d, %d)=%e instead of 0.0.\n", matIdx, rowIdx, colIdx, + B(rowIdx, colIdx)); + } + } + } + } + Kokkos::atomic_add(&global_error(), error); } }; @@ -159,7 +199,7 @@ void test_QR_square() { Kokkos::parallel_for( "serialFormQ", 1, KOKKOS_LAMBDA(int) { KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), - Q.data(), Q.stride(0), Q.stride(1), w.data()); + Q.data(), Q.stride(0), Q.stride(1), w.data(), w.stride(0)); }); typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); Kokkos::deep_copy(Q_h, Q); @@ -247,8 +287,8 @@ void test_QR_rectangular() { Test::EXPECT_NEAR_KK(A_h(2, 0), static_cast(0.), tol); Test::EXPECT_NEAR_KK_REL(A_h(2, 1), static_cast(1. / 3.), tol); - Test::EXPECT_NEAR_KK_REL(tau(0), 5. / 8., tol); - Test::EXPECT_NEAR_KK_REL(tau(1), 10. / 18., tol); + Test::EXPECT_NEAR_KK_REL(tau(0), static_cast(5. / 8.), tol); + Test::EXPECT_NEAR_KK_REL(tau(1), static_cast(10. / 18.), tol); Kokkos::parallel_for( "serialApplyQ", 1, KOKKOS_LAMBDA(int) { @@ -267,7 +307,7 @@ void test_QR_rectangular() { Kokkos::parallel_for( "serialFormQ", 1, KOKKOS_LAMBDA(int) { KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), - Q.data(), Q.stride(0), Q.stride(1), w.data()); + Q.data(), Q.stride(0), Q.stride(1), w.data(), w.stride(0)); }); typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); Kokkos::deep_copy(Q_h, Q); @@ -310,56 +350,80 @@ void test_QR_batch() { using ExecutionSpace = typename Device::execution_space; - constexpr int numMat = 314; - constexpr int maxMatSize = 100; - Kokkos::View numRows("rows in matrix", numMat); - Kokkos::View numCols("cols in matrix", numMat); - Kokkos::View offsetA("matrix offset", numMat + 1); - Kokkos::View offsetQ("matrix offset", numMat + 1); - Kokkos::View tau("tau", maxMatSize * numMat); - Kokkos::View tmp("work buffer", maxMatSize * numMat); - - typename Kokkos::View::HostMirror numRows_h = Kokkos::create_mirror_view(numRows); - typename Kokkos::View::HostMirror numCols_h = Kokkos::create_mirror_view(numCols); - typename Kokkos::View::HostMirror offsetA_h = Kokkos::create_mirror_view(offsetA); - typename Kokkos::View::HostMirror offsetQ_h = Kokkos::create_mirror_view(offsetQ); - - std::mt19937 gen; - gen.seed(27182818); - std::uniform_int_distribution distrib(1, maxMatSize); - - offsetA_h(0) = 0; - offsetQ_h(0) = 0; - int a = 0, b = 0; - for (int matIdx = 0; matIdx < numMat; ++matIdx) { - a = distrib(gen); - b = distrib(gen); - - numRows_h(matIdx) = Kokkos::max(a, b); - numCols_h(matIdx) = Kokkos::min(a, b); - offsetA_h(matIdx + 1) = offsetA_h(matIdx) + a * b; - offsetQ_h(matIdx + 1) = offsetQ_h(matIdx) + numRows_h(matIdx) * numRows_h(matIdx); - } + { // Square matrix case + constexpr int numMat = 314; // 314 + constexpr int maxMatSize = 36; // 36 + Kokkos::View tau("tau", numMat, maxMatSize); + Kokkos::View tmp("work buffer", numMat, maxMatSize); + Kokkos::View As("A matrices", numMat, maxMatSize, maxMatSize); + Kokkos::View Bs("B matrices", numMat, maxMatSize, maxMatSize); + Kokkos::View Qs("Q matrices", numMat, maxMatSize, maxMatSize); + Kokkos::View global_error("global number of error"); + + Kokkos::Random_XorShift64_Pool rand_pool(2718); + constexpr double max_val = 1000; + { + Scalar randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd); + } + Kokkos::deep_copy(Bs, As); + + qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error); + Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); - Kokkos::deep_copy(numRows, numRows_h); - Kokkos::deep_copy(numCols, numCols_h); - Kokkos::deep_copy(offsetA, offsetA_h); - Kokkos::deep_copy(offsetQ, offsetQ_h); - - const int numVals = offsetA_h(numMat); - Kokkos::View mats("matrices", numVals); - Kokkos::View Qs("Q matrices", offsetQ_h(numMat)); - - Kokkos::Random_XorShift64_Pool rand_pool(2718); - constexpr double max_val = 1000; - { - Scalar randStart, randEnd; - Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(ExecutionSpace{}, mats, rand_pool, randStart, randEnd); + typename Kokkos::View::HostMirror global_error_h = Kokkos::create_mirror_view(global_error); + Kokkos::deep_copy(global_error_h, global_error); + EXPECT_EQ(global_error_h(), 0); } - qrFunctor myFunc(maxMatSize, mats, numRows, numCols, offsetA, tau, tmp, Qs, offsetQ); - Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); + { // Rectangular matrix case + constexpr int numMat = 2; // 314 + constexpr int numRows = 3; // 42 + constexpr int numCols = 2; // 36 + Kokkos::View tau("tau", numMat, numCols); + Kokkos::View tmp("work buffer", numMat, numCols); + Kokkos::View As("A matrices", numMat, numRows, numCols); + Kokkos::View Bs("B matrices", numMat, numRows, numCols); + Kokkos::View Qs("Q matrices", numMat, numRows, numRows); + Kokkos::View global_error("global number of error"); + + Kokkos::Random_XorShift64_Pool rand_pool(2718); + constexpr double max_val = 1000; + { + Scalar randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd); + } + + { + typename Kokkos::View::HostMirror As_h = Kokkos::create_mirror_view(As); + Kokkos::deep_copy(As_h, As); + As_h(0, 0, 0) = 3.0; + As_h(0, 0, 1) = 5.0; + As_h(0, 1, 0) = 4.0; + As_h(0, 1, 1) = 0.0; + As_h(0, 2, 0) = 0.0; + As_h(0, 2, 1) = -3.0; + + As_h(1, 0, 0) = 3.0; + As_h(1, 0, 1) = 5.0; + As_h(1, 1, 0) = 4.0; + As_h(1, 1, 1) = 0.0; + As_h(1, 2, 0) = 0.0; + As_h(1, 2, 1) = -3.0; + + Kokkos::deep_copy(As, As_h); + } + Kokkos::deep_copy(Bs, As); + + qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error); + Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); + + typename Kokkos::View::HostMirror global_error_h = Kokkos::create_mirror_view(global_error); + Kokkos::deep_copy(global_error_h, global_error); + EXPECT_EQ(global_error_h(), 0); + } } #if defined(KOKKOSKERNELS_INST_FLOAT) From 5f0d50c1e40ec02a06c8dc38e1dfd97ee130beff Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 4 Dec 2024 15:54:43 -0700 Subject: [PATCH 3/5] Batched - Dense QR: small additional fixes Signed-off-by: Luc Berger-Vergiat --- .../KokkosBatched_ApplyQ_Serial_Internal.hpp | 2 +- .../dense/unit_test/Test_Batched_SerialQR.hpp | 46 ++++++------------- 2 files changed, 14 insertions(+), 34 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp index 605e3061a3..2e45867f05 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp @@ -180,7 +180,7 @@ struct SerialApplyQ_RightForwardInternal { /// ----------------------------------------------------- // right apply householder to partitioned B1 and B2 SerialApplyRightHouseholderInternal::invoke(m, n_B2, tau, A_part3x3.A21, as0, B_part1x3.A1, bs0, B_part1x3.A2, - bs0, bs1, ws); + bs0, bs1, w, ws); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); t_part2x1.mergeToAT(t_part3x1); diff --git a/batched/dense/unit_test/Test_Batched_SerialQR.hpp b/batched/dense/unit_test/Test_Batched_SerialQR.hpp index 375b1d1664..607d8e0413 100644 --- a/batched/dense/unit_test/Test_Batched_SerialQR.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialQR.hpp @@ -131,7 +131,7 @@ void test_QR_square() { using ColVectorViewType = Kokkos::View; using ColWorkViewType = Kokkos::View; - const Scalar tol = 10 * Kokkos::ArithTraits::eps(); + const Scalar tol = 20 * Kokkos::ArithTraits::eps(); constexpr int m = 3, n = 3; MatrixViewType A("A", m, n), B("B", m, n), Q("Q", m, m); @@ -350,14 +350,14 @@ void test_QR_batch() { using ExecutionSpace = typename Device::execution_space; - { // Square matrix case - constexpr int numMat = 314; // 314 - constexpr int maxMatSize = 36; // 36 - Kokkos::View tau("tau", numMat, maxMatSize); - Kokkos::View tmp("work buffer", numMat, maxMatSize); - Kokkos::View As("A matrices", numMat, maxMatSize, maxMatSize); - Kokkos::View Bs("B matrices", numMat, maxMatSize, maxMatSize); - Kokkos::View Qs("Q matrices", numMat, maxMatSize, maxMatSize); + { // Square matrix case + constexpr int numMat = 314; + constexpr int numRows = 36; + Kokkos::View tau("tau", numMat, numRows); + Kokkos::View tmp("work buffer", numMat, numRows); + Kokkos::View As("A matrices", numMat, numRows, numRows); + Kokkos::View Bs("B matrices", numMat, numRows, numRows); + Kokkos::View Qs("Q matrices", numMat, numRows, numRows); Kokkos::View global_error("global number of error"); Kokkos::Random_XorShift64_Pool rand_pool(2718); @@ -377,10 +377,10 @@ void test_QR_batch() { EXPECT_EQ(global_error_h(), 0); } - { // Rectangular matrix case - constexpr int numMat = 2; // 314 - constexpr int numRows = 3; // 42 - constexpr int numCols = 2; // 36 + { // Rectangular matrix case + constexpr int numMat = 314; // 314 + constexpr int numRows = 42; // 42 + constexpr int numCols = 36; // 36 Kokkos::View tau("tau", numMat, numCols); Kokkos::View tmp("work buffer", numMat, numCols); Kokkos::View As("A matrices", numMat, numRows, numCols); @@ -395,26 +395,6 @@ void test_QR_batch() { Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd); } - - { - typename Kokkos::View::HostMirror As_h = Kokkos::create_mirror_view(As); - Kokkos::deep_copy(As_h, As); - As_h(0, 0, 0) = 3.0; - As_h(0, 0, 1) = 5.0; - As_h(0, 1, 0) = 4.0; - As_h(0, 1, 1) = 0.0; - As_h(0, 2, 0) = 0.0; - As_h(0, 2, 1) = -3.0; - - As_h(1, 0, 0) = 3.0; - As_h(1, 0, 1) = 5.0; - As_h(1, 1, 0) = 4.0; - As_h(1, 1, 1) = 0.0; - As_h(1, 2, 0) = 0.0; - As_h(1, 2, 1) = -3.0; - - Kokkos::deep_copy(As, As_h); - } Kokkos::deep_copy(Bs, As); qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error); From 01029cb6f669b6045a9bb61d1ae90789675f495c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 10 Dec 2024 08:13:06 -0700 Subject: [PATCH 4/5] Batched - QR: increasing the sized of problems and batch After fixing the issue with the workspace utilization now looking at issues when the problem and batch sizes are increased. Signed-off-by: Luc Berger-Vergiat --- .../dense/unit_test/Test_Batched_SerialQR.hpp | 52 +++++++++++-------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_SerialQR.hpp b/batched/dense/unit_test/Test_Batched_SerialQR.hpp index 607d8e0413..643c6e63fc 100644 --- a/batched/dense/unit_test/Test_Batched_SerialQR.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialQR.hpp @@ -33,11 +33,11 @@ struct qrFunctor { TauViewType taus; TmpViewType ws; MatricesType Qs, Bs; - ErrorViewType global_error; + ErrorViewType error; qrFunctor(MatricesType As_, TauViewType taus_, TmpViewType ws_, MatricesType Qs_, MatricesType Bs_, - ErrorViewType global_error_) - : As(As_), taus(taus_), ws(ws_), Qs(Qs_), Bs(Bs_), global_error(global_error_) {} + ErrorViewType error_) + : As(As_), taus(taus_), ws(ws_), Qs(Qs_), Bs(Bs_), error(error_) {} KOKKOS_FUNCTION void operator()(const int matIdx) const { @@ -50,7 +50,7 @@ struct qrFunctor { const Scalar SC_one = Kokkos::ArithTraits::one(); const Scalar tol = Kokkos::ArithTraits::eps() * 10; - int error = 0; + int error_lcl = 0; KokkosBatched::SerialQR::invoke(A, tau, w); @@ -77,18 +77,17 @@ struct qrFunctor { for (int colIdx = 0; colIdx < Q.extent_int(1); ++colIdx) { if (rowIdx == colIdx) { if (Kokkos::abs(Q(rowIdx, colIdx) - SC_one) > tol) { - error += 1; + error_lcl += 1; Kokkos::printf("matIdx=%d, Q(%d, %d)=%e instead of 1.0.\n", matIdx, rowIdx, colIdx, Q(rowIdx, colIdx)); } } else { if (Kokkos::abs(Q(rowIdx, colIdx)) > tol) { - error += 1; + error_lcl += 1; Kokkos::printf("matIdx=%d, Q(%d, %d)=%e instead of 0.0.\n", matIdx, rowIdx, colIdx, Q(rowIdx, colIdx)); } } } } - Kokkos::atomic_add(&global_error(), error); // Apply Q' to B which holds a copy of the orginal A // Afterwards B should hold a copy of R and be zero below its diagonal @@ -97,20 +96,20 @@ struct qrFunctor { for (int colIdx = 0; colIdx < B.extent_int(1); ++colIdx) { if (rowIdx <= colIdx) { if (Kokkos::abs(B(rowIdx, colIdx) - A(rowIdx, colIdx)) > tol * Kokkos::abs(A(rowIdx, colIdx))) { - error += 1; + error_lcl += 1; Kokkos::printf("B stores R\nmatIdx=%d, B(%d, %d)=%e instead of %e.\n", matIdx, rowIdx, colIdx, B(rowIdx, colIdx), A(rowIdx, colIdx)); } } else { if (Kokkos::abs(B(rowIdx, colIdx)) > 1000 * tol) { - error += 1; + error_lcl += 1; Kokkos::printf("B stores R\nmatIdx=%d, B(%d, %d)=%e instead of 0.0.\n", matIdx, rowIdx, colIdx, B(rowIdx, colIdx)); } } } } - Kokkos::atomic_add(&global_error(), error); + error(matIdx) = error_lcl; } }; @@ -358,7 +357,7 @@ void test_QR_batch() { Kokkos::View As("A matrices", numMat, numRows, numRows); Kokkos::View Bs("B matrices", numMat, numRows, numRows); Kokkos::View Qs("Q matrices", numMat, numRows, numRows); - Kokkos::View global_error("global number of error"); + Kokkos::View error("global number of error", numMat); Kokkos::Random_XorShift64_Pool rand_pool(2718); constexpr double max_val = 1000; @@ -369,16 +368,19 @@ void test_QR_batch() { } Kokkos::deep_copy(Bs, As); - qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error); + qrFunctor myFunc(As, tau, tmp, Qs, Bs, error); Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); + Kokkos::fence(); - typename Kokkos::View::HostMirror global_error_h = Kokkos::create_mirror_view(global_error); - Kokkos::deep_copy(global_error_h, global_error); - EXPECT_EQ(global_error_h(), 0); + typename Kokkos::View::HostMirror error_h = Kokkos::create_mirror_view(error); + Kokkos::deep_copy(error_h, error); + int global_error = 0; + for(int matIdx = 0; matIdx < numMat; ++matIdx) { global_error += error_h(matIdx); } + EXPECT_EQ(global_error, 0); } { // Rectangular matrix case - constexpr int numMat = 314; // 314 + constexpr int numMat = 25; // 314 constexpr int numRows = 42; // 42 constexpr int numCols = 36; // 36 Kokkos::View tau("tau", numMat, numCols); @@ -386,7 +388,7 @@ void test_QR_batch() { Kokkos::View As("A matrices", numMat, numRows, numCols); Kokkos::View Bs("B matrices", numMat, numRows, numCols); Kokkos::View Qs("Q matrices", numMat, numRows, numRows); - Kokkos::View global_error("global number of error"); + Kokkos::View error("global number of error", numMat); Kokkos::Random_XorShift64_Pool rand_pool(2718); constexpr double max_val = 1000; @@ -397,12 +399,20 @@ void test_QR_batch() { } Kokkos::deep_copy(Bs, As); - qrFunctor myFunc(As, tau, tmp, Qs, Bs, global_error); + qrFunctor myFunc(As, tau, tmp, Qs, Bs, error); Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); - typename Kokkos::View::HostMirror global_error_h = Kokkos::create_mirror_view(global_error); - Kokkos::deep_copy(global_error_h, global_error); - EXPECT_EQ(global_error_h(), 0); + typename Kokkos::View::HostMirror error_h = Kokkos::create_mirror_view(error); + Kokkos::deep_copy(error_h, error); + + int global_error = 0; + for(int matIdx = 0; matIdx < numMat; ++matIdx) { + if(0 < error_h(matIdx)) { + std::cout << "Errors found for matrix " << matIdx << ": " << error_h(matIdx) << std::endl; + } + global_error += error_h(matIdx); + } + EXPECT_EQ(global_error, 0); } } From e2771fdb4f808f895a1624948bdd91291ff8ec21 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 21 Jan 2025 07:20:43 -0700 Subject: [PATCH 5/5] Batched - QR: making test more generic for all shapes Signed-off-by: Luc Berger-Vergiat --- .../dense/unit_test/Test_Batched_SerialQR.hpp | 67 +++++++++---------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_SerialQR.hpp b/batched/dense/unit_test/Test_Batched_SerialQR.hpp index 643c6e63fc..465a3747cb 100644 --- a/batched/dense/unit_test/Test_Batched_SerialQR.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialQR.hpp @@ -340,7 +340,7 @@ void test_QR_rectangular() { } template -void test_QR_batch() { +void test_QR_batch(const int numMat, const int numRows, const int numCols) { // Generate a batch of matrices // Compute QR factorization // Verify that R is triangular @@ -349,40 +349,8 @@ void test_QR_batch() { using ExecutionSpace = typename Device::execution_space; - { // Square matrix case - constexpr int numMat = 314; - constexpr int numRows = 36; - Kokkos::View tau("tau", numMat, numRows); - Kokkos::View tmp("work buffer", numMat, numRows); - Kokkos::View As("A matrices", numMat, numRows, numRows); - Kokkos::View Bs("B matrices", numMat, numRows, numRows); - Kokkos::View Qs("Q matrices", numMat, numRows, numRows); - Kokkos::View error("global number of error", numMat); - - Kokkos::Random_XorShift64_Pool rand_pool(2718); - constexpr double max_val = 1000; - { - Scalar randStart, randEnd; - Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd); - } - Kokkos::deep_copy(Bs, As); - - qrFunctor myFunc(As, tau, tmp, Qs, Bs, error); - Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); - Kokkos::fence(); - - typename Kokkos::View::HostMirror error_h = Kokkos::create_mirror_view(error); - Kokkos::deep_copy(error_h, error); - int global_error = 0; - for(int matIdx = 0; matIdx < numMat; ++matIdx) { global_error += error_h(matIdx); } - EXPECT_EQ(global_error, 0); - } - - { // Rectangular matrix case - constexpr int numMat = 25; // 314 - constexpr int numRows = 42; // 42 - constexpr int numCols = 36; // 36 + std::cout << "batched QR, running (" << numMat << ", " << numRows << ", " << numCols << ") case" << std::endl; + { Kokkos::View tau("tau", numMat, numCols); Kokkos::View tmp("work buffer", numMat, numCols); Kokkos::View As("A matrices", numMat, numRows, numCols); @@ -401,14 +369,22 @@ void test_QR_batch() { qrFunctor myFunc(As, tau, tmp, Qs, Bs, error); Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); + Kokkos::fence(); typename Kokkos::View::HostMirror error_h = Kokkos::create_mirror_view(error); Kokkos::deep_copy(error_h, error); + bool first_error = true; int global_error = 0; for(int matIdx = 0; matIdx < numMat; ++matIdx) { if(0 < error_h(matIdx)) { std::cout << "Errors found for matrix " << matIdx << ": " << error_h(matIdx) << std::endl; + if(first_error == true) { + auto mat_view = Kokkos::subview(As, matIdx, Kokkos::ALL, Kokkos::ALL); + auto mat_host = Kokkos::create_mirror_view(mat_view); + + first_error = false; + } } global_error += error_h(matIdx); } @@ -416,6 +392,12 @@ void test_QR_batch() { } } +template +void test_QR_batch(const int numMat, const int numRows) { + test_QR_batch(numMat, numRows, numRows); +} + + #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F(TestCategory, serial_qr_square_analytic_float) { typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; @@ -427,7 +409,12 @@ TEST_F(TestCategory, serial_qr_rectangular_analytic_float) { } TEST_F(TestCategory, serial_qr_batch_float) { typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; - test_QR_batch(); + test_QR_batch(314, 36); + test_QR_batch(10, 42, 36); + test_QR_batch(100, 42, 36); + test_QR_batch(200, 42, 36); + test_QR_batch(250, 42, 36); + test_QR_batch(300, 42, 36); } #endif @@ -442,7 +429,13 @@ TEST_F(TestCategory, serial_qr_rectangular_analytic_double) { } TEST_F(TestCategory, serial_qr_batch_double) { typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; - test_QR_batch(); + test_QR_batch(314, 36); + + test_QR_batch(10, 42, 36); + test_QR_batch(100, 42, 36); + test_QR_batch(200, 42, 36); + test_QR_batch(250, 42, 36); + test_QR_batch(300, 42, 36); } #endif