diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index 574280c82b..33b59b81b2 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -34,8 +34,8 @@ struct SerialApplyLeftHouseholderInternal { /* */ ValueType* u2, const int u2s, /* */ ValueType* a1t, const int a1ts, /* */ ValueType* A2, const int as0, const int as1, - /* */ ValueType* w1t) { - typedef ValueType value_type; + /* */ ValueType* w1t, const int ws0) { + using value_type = ValueType; /// u2 m x 1 /// a1t 1 x n @@ -54,15 +54,15 @@ struct SerialApplyLeftHouseholderInternal { for (int j = 0; j < n; ++j) { value_type tmp = a1t[j * a1ts]; for (int i = 0; i < m; ++i) tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; - w1t[j] = tmp * inv_tau; // /= (*tau); + w1t[j * ws0] = tmp * inv_tau; // /= (*tau); } // a1t -= w1t (axpy) - for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j]; + for (int j = 0; j < n; ++j) a1t[j * a1ts] -= w1t[j * ws0]; // A2 -= u2 w1t (ger) for (int j = 0; j < n; ++j) - for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; + for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j * ws0]; return 0; } @@ -74,8 +74,8 @@ struct SerialApplyRightHouseholderInternal { /* */ ValueType* u2, const int u2s, /* */ ValueType* a1, const int a1s, /* */ ValueType* A2, const int as0, const int as1, - /* */ ValueType* w1) { - typedef ValueType value_type; + /* */ ValueType* w1, const int ws0) { + using value_type = ValueType; /// u2 n x 1 /// a1 m x 1 /// A2 m x n @@ -93,15 +93,16 @@ struct SerialApplyRightHouseholderInternal { for (int i = 0; i < m; ++i) { value_type tmp = a1[i * a1s]; for (int j = 0; j < n; ++j) tmp += A2[i * as0 + j * as1] * u2[j * u2s]; - w1[i] = tmp * inv_tau; // \= (*tau); + w1[i * ws0] = tmp * inv_tau; // \= (*tau); } // a1 -= w1 (axpy) - for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i]; + for (int i = 0; i < m; ++i) a1[i * a1s] -= w1[i * ws0]; // A2 -= w1 * u2' (ger with conjugate) for (int j = 0; j < n; ++j) - for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); + for (int i = 0; i < m; ++i) + A2[i * as0 + j * as1] -= w1[i * ws0] * Kokkos::ArithTraits::conj(u2[j * u2s]); return 0; } diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp index f1f2d29089..02335f319f 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp @@ -33,7 +33,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ @@ -42,7 +42,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ @@ -51,7 +51,7 @@ KOKKOS_INLINE_FUNCTION int SerialApplyQ KOKKOS_INLINE_FUNCTION int SerialQR::invoke(const AViewType &A, const tViewType &t, const wViewType &w) { return SerialQR_Internal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), t.data(), - t.stride_0(), w.data()); + t.stride_0(), w.data(), w.stride_0()); } } // namespace KokkosBatched diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 8aa4a6361c..04eda350f8 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -36,7 +36,7 @@ struct SerialQR_Internal { const int n, // n = NumCols(A) /* */ ValueType *A, const int as0, const int as1, /* */ ValueType *t, const int ts, - /* */ ValueType *w) { + /* */ ValueType *w, const int ws) { typedef ValueType value_type; /// Given a matrix A, it computes QR decomposition of the matrix @@ -53,7 +53,7 @@ struct SerialQR_Internal { A_part2x2.partWithATL(A, m, n, 0, 0); t_part2x1.partWithAT(t, m, 0); - for (int m_atl = 0; m_atl < m; ++m_atl) { + for (int m_atl = 0; m_atl < Kokkos::min(m, n); ++m_atl) { // part 2x2 into 3x3 A_part3x3.partWithABR(A_part2x2, 1, 1); const int m_A22 = m - m_atl - 1; @@ -69,7 +69,7 @@ struct SerialQR_Internal { // left apply householder to A22 SerialApplyLeftHouseholderInternal::invoke(m_A22, n_A22, tau, A_part3x3.A21, as0, A_part3x3.A12, as1, - A_part3x3.A22, as0, as1, w); + A_part3x3.A22, as0, as1, w, ws); /// ----------------------------------------------------- A_part2x2.mergeToATL(A_part3x3); t_part2x1.mergeToAT(t_part3x1); diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 56a5619e06..6b4461e5ec 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -176,11 +176,12 @@ struct SerialSVDInternal { if (n - i > 1) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( m - i - 1, n - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(A, i, i + 1), As1, &SVDIND(A, i + 1, i + 1), - As0, As1, work); + As0, As1, work, 1); } if (U) { - KokkosBatched::SerialApplyRightHouseholderInternal::invoke( - m, m - i - 1, &tau, &SVDIND(A, i + 1, i), As0, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, i + 1), Us0, Us1, work); + KokkosBatched::SerialApplyRightHouseholderInternal::invoke(m, m - i - 1, &tau, &SVDIND(A, i + 1, i), + As0, &SVDIND(U, 0, i), Us0, + &SVDIND(U, 0, i + 1), Us0, Us1, work, 1); } // Zero out A subdiag explicitly (NOTE: may not be necessary...) for (int j = i + 1; j < m; j++) { @@ -193,12 +194,12 @@ struct SerialSVDInternal { if (m - i > 1) { KokkosBatched::SerialApplyRightHouseholderInternal::invoke( m - i - 1, n - i - 2, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(A, i + 1, i + 1), As0, - &SVDIND(A, i + 1, i + 2), As0, As1, work); + &SVDIND(A, i + 1, i + 2), As0, As1, work, 1); } if (Vt) { KokkosBatched::SerialApplyLeftHouseholderInternal::invoke( n - i - 2, n, &tau, &SVDIND(A, i, i + 2), As1, &SVDIND(Vt, i + 1, 0), Vts1, &SVDIND(Vt, i + 2, 0), Vts0, - Vts1, work); + Vts1, work, 1); } // Zero out A superdiag row explicitly for (int j = i + 2; j < n; j++) { diff --git a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp index 9d17a8b435..589ca192dd 100644 --- a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp @@ -64,11 +64,11 @@ struct ApplyQ { KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const tViewType &t, const BViewType &B, const wViewType &w) { int r_val = 0; - if (std::is_same::value) { + if constexpr (std::is_same_v) { r_val = SerialApplyQ::invoke(A, t, B, w); - } else if (std::is_same::value) { + } else if constexpr (std::is_same_v) { r_val = TeamApplyQ::invoke(member, A, t, B, w); - } else if (std::is_same::value) { + } else if constexpr (std::is_same_v) { r_val = TeamVectorApplyQ::invoke(member, A, t, B, w); } return r_val; diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 551f200101..f209ba0f26 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -30,6 +30,7 @@ #include "Test_Batched_SerialLU.hpp" #include "Test_Batched_SerialLU_Real.hpp" #include "Test_Batched_SerialLU_Complex.hpp" +#include "Test_Batched_SerialQR.hpp" #include "Test_Batched_SerialSolveLU.hpp" #include "Test_Batched_SerialSolveLU_Real.hpp" #include "Test_Batched_SerialSolveLU_Complex.hpp" diff --git a/batched/dense/unit_test/Test_Batched_SerialQR.hpp b/batched/dense/unit_test/Test_Batched_SerialQR.hpp new file mode 100644 index 0000000000..465a3747cb --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialQR.hpp @@ -0,0 +1,454 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Luc Berger-Vergiat (lberg@sandia.gov) +/// \author Cameron Smith (smithc11@rpi.edu) + +#include "gtest/gtest.h" +#include + +#include // KokkosBatched::QR +#include "KokkosBatched_ApplyQ_Decl.hpp" // KokkosBatched::ApplyQ +#include "KokkosBatched_QR_FormQ_Serial_Internal.hpp" +#include // KokkosBlas::Algo +#include + +template +struct qrFunctor { + using Scalar = typename MatricesType::value_type; + + MatricesType As; + TauViewType taus; + TmpViewType ws; + MatricesType Qs, Bs; + ErrorViewType error; + + qrFunctor(MatricesType As_, TauViewType taus_, TmpViewType ws_, MatricesType Qs_, MatricesType Bs_, + ErrorViewType error_) + : As(As_), taus(taus_), ws(ws_), Qs(Qs_), Bs(Bs_), error(error_) {} + + KOKKOS_FUNCTION + void operator()(const int matIdx) const { + auto A = Kokkos::subview(As, matIdx, Kokkos::ALL, Kokkos::ALL); + auto tau = Kokkos::subview(taus, matIdx, Kokkos::ALL); + auto w = Kokkos::subview(ws, matIdx, Kokkos::ALL); + auto Q = Kokkos::subview(Qs, matIdx, Kokkos::ALL, Kokkos::ALL); + auto B = Kokkos::subview(Bs, matIdx, Kokkos::ALL, Kokkos::ALL); + + const Scalar SC_one = Kokkos::ArithTraits::one(); + const Scalar tol = Kokkos::ArithTraits::eps() * 10; + + int error_lcl = 0; + + KokkosBatched::SerialQR::invoke(A, tau, w); + + // Store identity in Q + for (int diagIdx = 0; diagIdx < Q.extent_int(0); ++diagIdx) { + Q(diagIdx, diagIdx) = SC_one; + } + + // Call ApplyQ on Q + for (int idx = 0; idx < w.extent_int(0); ++idx) { + w(idx) = 0.0; + } + KokkosBatched::SerialApplyQ::invoke(A, tau, Q, w); + + // Now apply Q' to Q + for (int idx = 0; idx < w.extent_int(0); ++idx) { + w(idx) = 0.0; + } + KokkosBatched::SerialApplyQ::invoke(A, tau, Q, w); + + // At this point Q stores Q'Q + // which should be the identity matrix + for (int rowIdx = 0; rowIdx < Q.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < Q.extent_int(1); ++colIdx) { + if (rowIdx == colIdx) { + if (Kokkos::abs(Q(rowIdx, colIdx) - SC_one) > tol) { + error_lcl += 1; + Kokkos::printf("matIdx=%d, Q(%d, %d)=%e instead of 1.0.\n", matIdx, rowIdx, colIdx, Q(rowIdx, colIdx)); + } + } else { + if (Kokkos::abs(Q(rowIdx, colIdx)) > tol) { + error_lcl += 1; + Kokkos::printf("matIdx=%d, Q(%d, %d)=%e instead of 0.0.\n", matIdx, rowIdx, colIdx, Q(rowIdx, colIdx)); + } + } + } + } + + // Apply Q' to B which holds a copy of the orginal A + // Afterwards B should hold a copy of R and be zero below its diagonal + KokkosBatched::SerialApplyQ::invoke(A, tau, B, w); + for (int rowIdx = 0; rowIdx < B.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < B.extent_int(1); ++colIdx) { + if (rowIdx <= colIdx) { + if (Kokkos::abs(B(rowIdx, colIdx) - A(rowIdx, colIdx)) > tol * Kokkos::abs(A(rowIdx, colIdx))) { + error_lcl += 1; + Kokkos::printf("B stores R\nmatIdx=%d, B(%d, %d)=%e instead of %e.\n", matIdx, rowIdx, colIdx, + B(rowIdx, colIdx), A(rowIdx, colIdx)); + } + } else { + if (Kokkos::abs(B(rowIdx, colIdx)) > 1000 * tol) { + error_lcl += 1; + Kokkos::printf("B stores R\nmatIdx=%d, B(%d, %d)=%e instead of 0.0.\n", matIdx, rowIdx, colIdx, + B(rowIdx, colIdx)); + } + } + } + } + error(matIdx) = error_lcl; + } +}; + +template +void test_QR_square() { + // Analytical test with a rectangular matrix + // [12, -51, 4] [150, -69, 58] [14, 21, -14] + // A = [ 6, 167, -68] Q = 1/175 [ 75, 158, -6] R = [ 0, 175, -70] + // [-4, 24, -41] [-50, 30, 165] [ 0, 0, -35] + // + // Expected outputs: + // [ -5, -3] + // tau = [5/8, 10/18] A = [1/2, 5] + // [ 0, 1/3] + // + + using MatrixViewType = Kokkos::View; + using ColVectorViewType = Kokkos::View; + using ColWorkViewType = Kokkos::View; + + const Scalar tol = 20 * Kokkos::ArithTraits::eps(); + constexpr int m = 3, n = 3; + + MatrixViewType A("A", m, n), B("B", m, n), Q("Q", m, m); + ColVectorViewType t("t", n); + ColWorkViewType w("w", n); + + typename MatrixViewType::HostMirror A_h = Kokkos::create_mirror_view(A); + A_h(0, 0) = 12; + A_h(0, 1) = -51; + A_h(0, 2) = 4; + A_h(1, 0) = 6; + A_h(1, 1) = 167; + A_h(1, 2) = -68; + A_h(2, 0) = -4; + A_h(2, 1) = 24; + A_h(2, 2) = -41; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(B, A_h); + + Kokkos::parallel_for( + "serialQR", 1, KOKKOS_LAMBDA(int) { + // compute the QR factorization of A and store the results in A and t + // (tau) - see the lapack dgeqp3(...) documentation: + // www.netlib.org/lapack/explore-html-3.6.1/dd/d9a/group__double_g_ecomputational_ga1b0500f49e03d2771b797c6e88adabbb.html + KokkosBatched::SerialQR::invoke(A, t, w); + }); + + Kokkos::fence(); + Kokkos::deep_copy(A_h, A); + typename ColVectorViewType::HostMirror tau = Kokkos::create_mirror_view(t); + Kokkos::deep_copy(tau, t); + + Test::EXPECT_NEAR_KK_REL(A_h(0, 0), static_cast(-14), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 1), static_cast(-21), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 2), static_cast(14), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 0), static_cast(6. / 26.), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 1), static_cast(-175), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 2), static_cast(70), tol); + Test::EXPECT_NEAR_KK_REL(A_h(2, 0), static_cast(-4.0 / 26.0), tol); + // Test::EXPECT_NEAR_KK_REL(A_h(2, 1), 35.0, tol); // Analytical expression too painful to compute... + Test::EXPECT_NEAR_KK_REL(A_h(2, 2), static_cast(35), tol); + + Test::EXPECT_NEAR_KK_REL(tau(0), static_cast(7. / 13.), tol); + // Test::EXPECT_NEAR_KK_REL(tau(1), 25. / 32., tol); // Analytical expression too painful to compute... + Test::EXPECT_NEAR_KK_REL(tau(2), static_cast(1. / 2.), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, B, w); + }); + typename MatrixViewType::HostMirror B_h = Kokkos::create_mirror_view(B); + Kokkos::deep_copy(B_h, B); + + Test::EXPECT_NEAR_KK_REL(static_cast(-14), B_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-21), B_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(14), B_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-175), B_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(70), B_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0), B_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(35), B_h(2, 2), tol); + + Kokkos::parallel_for( + "serialFormQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), + Q.data(), Q.stride(0), Q.stride(1), w.data(), w.stride(0)); + }); + typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(-6. / 7.), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(69. / 175.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-58. / 175.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-3. / 7.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-158. / 175.), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(6. / 175.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(2. / 7.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-6. / 35.), Q_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-33. / 35.), Q_h(2, 2), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, Q, w); + }); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.), Q_h(2, 2), tol); + + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 1), tol); +} + +template +void test_QR_rectangular() { + // Analytical test with a rectangular matrix + // [3, 5] [-0.60, -0.64, 0.48] [-5, -3] + // A = [4, 0] Q = [-0.80, -0.48, -0.36] R = [ 0, 5] + // [0, -3] [ 0.00, -0.60, -0.80] [ 0, 0] + // + // Expected outputs: + // [ -5, -3] + // tau = [5/8, 10/18] A = [1/2, 5] + // [ 0, 1/3] + // + + using MatrixViewType = Kokkos::View; + using ColVectorViewType = Kokkos::View; + using ColWorkViewType = Kokkos::View; + + const Scalar tol = 10 * Kokkos::ArithTraits::eps(); + constexpr int m = 3, n = 2; + + MatrixViewType A("A", m, n), B("B", m, n), Q("Q", m, m); + ColVectorViewType t("t", n); + ColWorkViewType w("w", n); + + typename MatrixViewType::HostMirror A_h = Kokkos::create_mirror_view(A); + A_h(0, 0) = 3; + A_h(0, 1) = 5; + A_h(1, 0) = 4; + A_h(1, 1) = 0; + A_h(2, 0) = 0; + A_h(2, 1) = -3; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(B, A_h); + + Kokkos::parallel_for( + "serialQR", 1, KOKKOS_LAMBDA(int) { + // compute the QR factorization of A and store the results in A and t + // (tau) - see the lapack dgeqp3(...) documentation: + // www.netlib.org/lapack/explore-html-3.6.1/dd/d9a/group__double_g_ecomputational_ga1b0500f49e03d2771b797c6e88adabbb.html + KokkosBatched::SerialQR::invoke(A, t, w); + }); + + Kokkos::fence(); + Kokkos::deep_copy(A_h, A); + typename ColVectorViewType::HostMirror tau = Kokkos::create_mirror_view(t); + Kokkos::deep_copy(tau, t); + + Test::EXPECT_NEAR_KK_REL(A_h(0, 0), static_cast(-5.0), tol); + Test::EXPECT_NEAR_KK_REL(A_h(0, 1), static_cast(-3.0), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 0), static_cast(0.5), tol); + Test::EXPECT_NEAR_KK_REL(A_h(1, 1), static_cast(5.0), tol); + Test::EXPECT_NEAR_KK(A_h(2, 0), static_cast(0.), tol); + Test::EXPECT_NEAR_KK_REL(A_h(2, 1), static_cast(1. / 3.), tol); + + Test::EXPECT_NEAR_KK_REL(tau(0), static_cast(5. / 8.), tol); + Test::EXPECT_NEAR_KK_REL(tau(1), static_cast(10. / 18.), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, B, w); + }); + typename MatrixViewType::HostMirror B_h = Kokkos::create_mirror_view(B); + Kokkos::deep_copy(B_h, B); + + Test::EXPECT_NEAR_KK_REL(static_cast(-5.0), B_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-3.0), B_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(5.0), B_h(1, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), B_h(2, 1), tol); + + Kokkos::parallel_for( + "serialFormQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialQR_FormQ_Internal::invoke(m, n, A.data(), A.stride(0), A.stride(1), t.data(), t.stride(0), + Q.data(), Q.stride(0), Q.stride(1), w.data(), w.stride(0)); + }); + typename MatrixViewType::HostMirror Q_h = Kokkos::create_mirror_view(Q); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(-0.60), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.64), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.48), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.80), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.48), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.36), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(-0.60), Q_h(2, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(0.80), Q_h(2, 2), tol); + + Kokkos::parallel_for( + "serialApplyQ", 1, KOKKOS_LAMBDA(int) { + KokkosBatched::SerialApplyQ::invoke(A, t, Q, w); + }); + Kokkos::deep_copy(Q_h, Q); + + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(0, 0), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(1, 1), tol); + Test::EXPECT_NEAR_KK_REL(static_cast(1.0), Q_h(2, 2), tol); + + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 1), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(0, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(1, 2), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 0), tol); + Test::EXPECT_NEAR_KK(static_cast(0.), Q_h(2, 1), tol); +} + +template +void test_QR_batch(const int numMat, const int numRows, const int numCols) { + // Generate a batch of matrices + // Compute QR factorization + // Verify that R is triangular + // Verify that Q is unitary + // Check that Q*R = A + + using ExecutionSpace = typename Device::execution_space; + + std::cout << "batched QR, running (" << numMat << ", " << numRows << ", " << numCols << ") case" << std::endl; + { + Kokkos::View tau("tau", numMat, numCols); + Kokkos::View tmp("work buffer", numMat, numCols); + Kokkos::View As("A matrices", numMat, numRows, numCols); + Kokkos::View Bs("B matrices", numMat, numRows, numCols); + Kokkos::View Qs("Q matrices", numMat, numRows, numRows); + Kokkos::View error("global number of error", numMat); + + Kokkos::Random_XorShift64_Pool rand_pool(2718); + constexpr double max_val = 1000; + { + Scalar randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(ExecutionSpace{}, As, rand_pool, randStart, randEnd); + } + Kokkos::deep_copy(Bs, As); + + qrFunctor myFunc(As, tau, tmp, Qs, Bs, error); + Kokkos::parallel_for("KokkosBatched::test_QR_batch", Kokkos::RangePolicy(0, numMat), myFunc); + Kokkos::fence(); + + typename Kokkos::View::HostMirror error_h = Kokkos::create_mirror_view(error); + Kokkos::deep_copy(error_h, error); + + bool first_error = true; + int global_error = 0; + for(int matIdx = 0; matIdx < numMat; ++matIdx) { + if(0 < error_h(matIdx)) { + std::cout << "Errors found for matrix " << matIdx << ": " << error_h(matIdx) << std::endl; + if(first_error == true) { + auto mat_view = Kokkos::subview(As, matIdx, Kokkos::ALL, Kokkos::ALL); + auto mat_host = Kokkos::create_mirror_view(mat_view); + + first_error = false; + } + } + global_error += error_h(matIdx); + } + EXPECT_EQ(global_error, 0); + } +} + +template +void test_QR_batch(const int numMat, const int numRows) { + test_QR_batch(numMat, numRows, numRows); +} + + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, serial_qr_square_analytic_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_square(); +} +TEST_F(TestCategory, serial_qr_rectangular_analytic_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_rectangular(); +} +TEST_F(TestCategory, serial_qr_batch_float) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_batch(314, 36); + test_QR_batch(10, 42, 36); + test_QR_batch(100, 42, 36); + test_QR_batch(200, 42, 36); + test_QR_batch(250, 42, 36); + test_QR_batch(300, 42, 36); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, serial_qr_square_analytic_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_square(); +} +TEST_F(TestCategory, serial_qr_rectangular_analytic_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_rectangular(); +} +TEST_F(TestCategory, serial_qr_batch_double) { + typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; + test_QR_batch(314, 36); + + test_QR_batch(10, 42, 36); + test_QR_batch(100, 42, 36); + test_QR_batch(200, 42, 36); + test_QR_batch(250, 42, 36); + test_QR_batch(300, 42, 36); +} +#endif + +// #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +// TEST_F(TestCategory, batched_scalar_serial_qr_scomplex) { +// typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; +// test_QR_rectangular, AlgoTagType>(); +// } +// #endif + +// #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +// TEST_F(TestCategory, batched_scalar_serial_qr_dcomplex) { +// typedef KokkosBlas::Algo::QR::Unblocked AlgoTagType; +// test_QR_rectangular, AlgoTagType>(); +// } +// #endif