From 9c8eedf0fa5d1dcbf5f232b9e75e5090da07acae Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Mon, 27 Jan 2025 22:10:11 +0900 Subject: [PATCH 1/3] fix: getrs serial internal implementations Signed-off-by: Yuuichi Asahi --- .../KokkosBatched_Getrs_Serial_Internal.hpp | 64 +++++++++---------- .../unit_test/Test_Batched_SerialGetrs.hpp | 2 - 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_Getrs_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Getrs_Serial_Internal.hpp index bc6a981fc9..6042e4911c 100644 --- a/batched/dense/impl/KokkosBatched_Getrs_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Getrs_Serial_Internal.hpp @@ -30,48 +30,46 @@ struct SerialGetrsInternal { //// Non-transpose //// template <> -struct SerialGetrsInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const PivViewType &piv, const BViewType &b) { - KokkosBatched::SerialLaswp::invoke(piv, b); - KokkosBatched::SerialTrsm::invoke( - 1.0, A, b); - KokkosBatched::SerialTrsm::invoke(1.0, A, b); +template +KOKKOS_INLINE_FUNCTION int SerialGetrsInternal::invoke( + const AViewType &A, const PivViewType &piv, const BViewType &b) { + KokkosBatched::SerialLaswp::invoke(piv, b); + KokkosBatched::SerialTrsm::invoke( + 1.0, A, b); + KokkosBatched::SerialTrsm::invoke( + 1.0, A, b); - return 0; - } -}; + return 0; +} //// Transpose //// template <> -struct SerialGetrsInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const PivViewType &piv, const BViewType &b) { - KokkosBatched::SerialTrsm::invoke( - 1.0, A, b); - KokkosBatched::SerialTrsm::invoke( - 1.0, A, b); - KokkosBatched::SerialLaswp::invoke(piv, b); +template +KOKKOS_INLINE_FUNCTION int SerialGetrsInternal::invoke(const AViewType &A, + const PivViewType &piv, + const BViewType &b) { + KokkosBatched::SerialTrsm::invoke( + 1.0, A, b); + KokkosBatched::SerialTrsm::invoke(1.0, + A, b); + KokkosBatched::SerialLaswp::invoke(piv, b); - return 0; - } -}; + return 0; +} //// Conj-Transpose //// template <> -struct SerialGetrsInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const PivViewType &piv, const BViewType &b) { - KokkosBatched::SerialTrsm::invoke(1.0, A, b); - KokkosBatched::SerialTrsm::invoke( - 1.0, A, b); - KokkosBatched::SerialLaswp::invoke(piv, b); +template +KOKKOS_INLINE_FUNCTION int SerialGetrsInternal::invoke( + const AViewType &A, const PivViewType &piv, const BViewType &b) { + KokkosBatched::SerialTrsm::invoke(1.0, A, b); + KokkosBatched::SerialTrsm::invoke( + 1.0, A, b); + KokkosBatched::SerialLaswp::invoke(piv, b); - return 0; - } -}; + return 0; +} } // namespace Impl } // namespace KokkosBatched diff --git a/batched/dense/unit_test/Test_Batched_SerialGetrs.hpp b/batched/dense/unit_test/Test_Batched_SerialGetrs.hpp index 22f4ff58a2..9cfef9d8a0 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGetrs.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGetrs.hpp @@ -23,8 +23,6 @@ #include #include "Test_Batched_DenseUtils.hpp" -using namespace KokkosBatched; - namespace Test { namespace Getrs { From 3e27b0d43f989781432e92ad3807fa9fa9b0a14f Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Wed, 29 Jan 2025 06:53:06 +0900 Subject: [PATCH 2/3] disallow optimization to fix serial getrs Signed-off-by: Yuuichi Asahi --- batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp | 2 +- .../dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp index 445251a647..9a59d74832 100644 --- a/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp @@ -83,7 +83,7 @@ struct SerialLaswp { template <> struct SerialLaswp { template - KOKKOS_INLINE_FUNCTION static int invoke(const PivViewType piv, const AViewType &A) { + KOKKOS_INLINE_FUNCTION static int invoke(const PivViewType &piv, const AViewType &A) { auto info = KokkosBatched::Impl::checkLaswpInput(piv, A); if (info) return info; diff --git a/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp index dc49f367b1..3516cca430 100644 --- a/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp @@ -96,6 +96,15 @@ struct SerialLaswpVectorBackwardInternal { /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { for (int i = (plen - 1); i >= 0; --i) { const int piv = p[i * ps0]; + +// On H100 with Cuda 12.0.0, the compiler seems to apply +// an aggressive optimization which crashes this function +// Insert unnecessary operation to disallow optimization +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_HOPPER90) +#if CUDA_VERSION == 12000 + if (piv < 0) return 0; +#endif +#endif if (piv != i) { const int idx_i = i * as0, idx_p = piv * as0; const ValueType tmp = A[idx_i]; From dda5d2ea59aac044c6e2294ede1b3acbbc25f195 Mon Sep 17 00:00:00 2001 From: Yuuichi Asahi Date: Wed, 29 Jan 2025 18:40:58 +0900 Subject: [PATCH 3/3] no unroll in Laswp Signed-off-by: Yuuichi Asahi --- .../impl/KokkosBatched_Laswp_Serial_Internal.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp index 3516cca430..fd0056ddf8 100644 --- a/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp @@ -94,17 +94,16 @@ struct SerialLaswpVectorBackwardInternal { template KOKKOS_INLINE_FUNCTION static int invoke(const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { - for (int i = (plen - 1); i >= 0; --i) { - const int piv = p[i * ps0]; - // On H100 with Cuda 12.0.0, the compiler seems to apply // an aggressive optimization which crashes this function -// Insert unnecessary operation to disallow optimization +// Disabling loop unrolling fixes the issue #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_HOPPER90) -#if CUDA_VERSION == 12000 - if (piv < 0) return 0; +#if CUDA_VERSION >= 12000 && CUDA_VERSION < 12100 +#pragma unroll 1 #endif #endif + for (int i = (plen - 1); i >= 0; --i) { + const int piv = p[i * ps0]; if (piv != i) { const int idx_i = i * as0, idx_p = piv * as0; const ValueType tmp = A[idx_i];