ROCm · zoranjovanovic-ns · Dec 14, 2024
diff --git a/third_party/llvm/capture.patch b/third_party/llvm/capture.patch
@@ -0,0 +1,11 @@
+--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
++++ a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+@@ -119,7 +119,7 @@
+
+ std::optional<SmallVector<int64_t>>
+ getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
+-  bool failed = false;
++  bool failed = false;__asm__("":"+r"(failed));
+   SmallVector<int64_t> res = llvm::map_to_vector(ofrs, [&](OpFoldResult ofr) {
+     auto cv = getConstantIntValue(ofr);
+     if (!cv.has_value())
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
@@ -23,6 +23,7 @@ def repo(name):
             "//third_party/llvm:toolchains.patch",
             "//third_party/llvm:zstd.patch",
             "//third_party/llvm:rocdl_shuffle_down.patch",
+            "//third_party/llvm:capture.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
     )
diff --git a/third_party/tsl/third_party/llvm/capture.patch b/third_party/tsl/third_party/llvm/capture.patch
@@ -0,0 +1,11 @@
+--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
++++ a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+@@ -119,7 +119,7 @@
+
+ std::optional<SmallVector<int64_t>>
+ getConstantIntValues(ArrayRef<OpFoldResult> ofrs) {
+-  bool failed = false;
++  bool failed = false;__asm__("":"+r"(failed));
+   SmallVector<int64_t> res = llvm::map_to_vector(ofrs, [&](OpFoldResult ofr) {
+     auto cv = getConstantIntValue(ofr);
+     if (!cv.has_value())
diff --git a/third_party/tsl/third_party/llvm/workspace.bzl b/third_party/tsl/third_party/llvm/workspace.bzl
@@ -23,6 +23,7 @@ def repo(name):
             "//third_party/llvm:toolchains.patch",
             "//third_party/llvm:zstd.patch",
             "//third_party/llvm:rocdl_shuffle_down.patch",
+            "//third_party/llvm:capture.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
     )
diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD
@@ -484,12 +484,89 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gemm_fusion_autotuner_cuda",
+    srcs = [
+        "gemm_fusion_autotuner.h",
+        "gemm_fusion_autotuner_cuda.cc",
+    ],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = [
+        ":autotuner_compile_util",
+        ":autotuner_util",
+        "//xla:autotuning_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_pass",
+        "//xla/pjrt/distributed:key_value_store_interface",
+        "//xla/service:algorithm_util",
+        "//xla/service:executable",
+        "//xla/service:shaped_buffer",
+        "//xla/service/gpu:ir_emission_utils",
+        "//xla/service/gpu:matmul_utils",
+        "//xla/service/gpu:stream_executor_util",
+        "//xla/service/gpu/transforms:cudnn_fusion_compiler",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:semantic_version",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@tsl//tsl/platform:env",
+    ],
+)
+
+cc_library(
+    name = "gemm_fusion_autotuner_rocm",
+    srcs = [
+        "gemm_fusion_autotuner.h",
+        "gemm_fusion_autotuner_rocm.cc",
+    ],
+    tags = [
+        "gpu",
+        "rocm-only",
+    ],
+    deps = [
+        ":autotuner_compile_util",
+        ":autotuner_util",
+        "//xla:autotuning_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_pass",
+        "//xla/pjrt/distributed:key_value_store_interface",
+        "//xla/service:executable",
+        "//xla/service:shaped_buffer",
+        "//xla/service/gpu:matmul_utils",
+        "//xla/stream_executor:device_description",
+        #"//xla/stream_executor:semantic_version",
+        "//xla/stream_executor/rocm:rocblas_plugin",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+        "@local_config_rocm//rocm:rocm_headers",
+        "@tsl//tsl/platform:env",
+    ],
+)
+
 cc_library(
     name = "gemm_fusion_autotuner",
-    srcs = if_cuda_is_configured(["gemm_fusion_autotuner.cc"]),
-    hdrs = if_cuda_is_configured(["gemm_fusion_autotuner.h"]),
+    srcs = ["gemm_fusion_autotuner.cc"],
+    hdrs = ["gemm_fusion_autotuner.h"],
+    tags = ["gpu"],
     local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
-    deps = if_cuda_is_configured([
+    deps = if_cuda_is_configured([":gemm_fusion_autotuner_cuda"]) + if_rocm_is_configured([
+        ":gemm_fusion_autotuner_rocm",
+    ]) + [
         ":autotuner_compile_util",
         ":autotuner_util",
         ":backend_configs_cc",
@@ -552,15 +629,12 @@ cc_library(
         "//xla/service/gpu/model:gpu_hlo_cost_analysis",
         "//xla/stream_executor:stream_executor_memory_allocator",
         "@tsl//tsl/platform:path",
-    ]),
+    ],
 )
 
 xla_test(
     name = "gemm_fusion_autotuner_test",
-    srcs = if_cuda_is_configured(["gemm_fusion_autotuner_test.cc"]),
-    backend_tags = {"gpu": [
-        "requires-gpu-sm80",
-    ]},
+    srcs = if_gpu_is_configured(["gemm_fusion_autotuner_test.cc"]),
     backends = [
         "gpu",
     ],
@@ -3803,6 +3877,7 @@ cc_library(
         ":cudnn_fused_conv_rewriter",
         ":cusolver_rewriter",
         ":gemm_algorithm_picker",
+        ":gemm_fusion_autotuner",
         ":gpu_algebraic_simplifier",
         ":gpu_compiler",
         ":gpu_conv_padding_legalization",

diff --git a/xla/service/gpu/amdgpu_compiler.cc b/xla/service/gpu/amdgpu_compiler.cc
@@ -35,6 +35,8 @@ limitations under the License.
 #include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/conv_algorithm_picker.h"
 #include "xla/service/gpu/cublas_pad_for_gemms.h"
+#include "xla/service/gpu/gemm_algorithm_picker.h"
+#include "xla/service/gpu/gemm_fusion_autotuner.h"
 #include "xla/service/gpu/cublas_padding_requirements.h"
 #include "xla/service/gpu/cudnn_fused_conv_rewriter.h"
 #include "xla/service/gpu/cusolver_rewriter.h"
@@ -277,5 +279,14 @@ AMDGPUCompiler::CompileTargetBinary(const HloModuleConfig& module_config,
   return BackendCompileResult{"", std::move(hsaco)};
 }
 
+absl::Status AMDGPUCompiler::AddGemmFusionAutotuningPasses(
+    HloPassPipeline* pipeline, HloModule* hlo_module,
+    AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool,
+    const MultiProcessKeyValueStore& key_value_store) {
+  pipeline->AddPass<GemmFusionAutotuner>(autotune_config, GetToolkitVersion(),
+                                         thread_pool, key_value_store);
+  return absl::OkStatus();
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/xla/service/gpu/amdgpu_compiler.h b/xla/service/gpu/amdgpu_compiler.h
@@ -66,6 +66,11 @@ class AMDGPUCompiler : public GpuCompiler {
       se::GpuComputeCapability gpu_version, bool relocatable,
       const HloModule* debug_module, const CompileOptions& options) override;
 
+  absl::Status AddGemmFusionAutotuningPasses(
+      HloPassPipeline* pipeline, HloModule* hlo_module,
+      AutotuneConfig& autotune_config, tsl::thread::ThreadPool* thread_pool,
+      const MultiProcessKeyValueStore& key_value_store) override;
+
  private:
   AMDGPUCompiler(const AMDGPUCompiler&) = delete;
   AMDGPUCompiler& operator=(const AMDGPUCompiler&) = delete;

diff --git a/xla/service/gpu/gemm_fusion_autotuner.cc b/xla/service/gpu/gemm_fusion_autotuner.cc
@@ -40,7 +40,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
 #include "absl/types/span.h"
-#include "third_party/gpus/cuda/include/cublas_v2.h"
+#include "xla/autotune_results.pb.h"
 #include "xla/autotuning.pb.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
@@ -61,7 +61,6 @@ limitations under the License.
 #include "xla/service/gpu/autotuner_util.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/buffer_comparator.h"
-#include "xla/service/gpu/cudnn_fusion_compiler.h"
 #include "xla/service/gpu/fusion_wrapper.h"
 #include "xla/service/gpu/gemm_rewriter.h"
 #include "xla/service/gpu/gpu_float_support.h"
@@ -438,29 +437,11 @@ absl::StatusOr<std::unique_ptr<HloModule>> CuDnnFusionExtractor(
   return module;
 }
 
-bool IsFusionKind(const HloInstruction& hlo, absl::string_view kind) {
-  auto gpu_config = hlo.backend_config<GpuBackendConfig>();
-  if (!gpu_config.ok()) {
-    return false;
-  }
-  return gpu_config->fusion_backend_config().kind() == kind;
-}
-
-int GetCuDnnPlanCount(const HloInstruction& hlo,
-                      const AutotuneConfig& autotune_config) {
-  if (auto gpu_config = hlo.backend_config<GpuBackendConfig>();
-      !gpu_config.ok() ||
-      gpu_config->fusion_backend_config().has_cudnn_fusion_config()) {
-    return {};
-  }
-  return CuDnnFusionCompiler::GetAvailablePlanCount(
-      *autotune_config.GetExecutor(), *DynCast<HloFusionInstruction>(&hlo));
-}
-
 AutotuneResult FromConfig(const Config& config) {
   AutotuneResult res;
   if (std::holds_alternative<GemmFusionAutotunerImpl::CuBlasConfig>(config)) {
-    res.mutable_gemm()->set_algorithm(CUBLAS_GEMM_DEFAULT);
+    res.mutable_gemm()->set_algorithm(
+        GemmFusionAutotunerImpl::BLAS_GEMM_DEFAULT);
   } else if (std::holds_alternative<GemmFusionAutotunerImpl::CuDnnConfig>(
                  config)) {
     res.mutable_algorithm()->set_algo_id(
@@ -550,6 +531,15 @@ std::string Serialize(const Config& config) {
 
 }  // anonymous namespace
 
+bool GemmFusionAutotunerImpl::IsFusionKind(const HloInstruction& hlo,
+                                           absl::string_view kind) {
+  auto gpu_config = hlo.backend_config<GpuBackendConfig>();
+  if (!gpu_config.ok()) {
+    return false;
+  }
+  return gpu_config->fusion_backend_config().kind() == kind;
+}
+
 // Methods required for sorting the configs.
 bool GemmFusionAutotunerImpl::CuBlasConfig::operator<(
     const CuBlasConfig& other) const {
@@ -584,30 +574,17 @@ absl::StatusOr<std::vector<Config>> GemmFusionAutotunerImpl::GenerateConfigs(
       Cast<HloDotInstruction>(hlo_query::GetFirstInstructionWithOpcode(
           *fusion.called_computations().at(0), HloOpcode::kDot));
 
-  // Add cuBLAS reference config, if available.
-  std::vector<Config> configs;
-  if (algorithm_util::IsSupportedByCublasOrCublasLt(
-          dot->precision_config().algorithm()) &&
-      !dot->sparse_operands() && IsAutotuningEnabled()) {
-    configs.push_back(CuBlasConfig{});
-  }
-
-  // Add cuDNN plans, if available.
-  bool is_hopper =
-      !config_.IsDeviceless() && GetComputeCapability().IsAtLeastHopper();
-  bool is_cudnn_enabled =
-      debug_options_.xla_gpu_cudnn_gemm_fusion_level() > 0 && is_hopper &&
-      GetDnnVersionInfoOrDefault(config_.GetExecutor()).major_version() >= 9;
-  if ((IsFusionKind(fusion, kCuDnnFusionKind) && IsAutotuningEnabled()) ||
-      (IsFusionKind(fusion, kTritonGemmFusionKind) && is_cudnn_enabled &&
-       algorithm_util::IsSupportedByCudnn(
-           dot->precision_config().algorithm()) &&
-       !dot->sparse_operands() && IsAutotuningEnabled())) {
-    const int plan_count = GetCuDnnPlanCount(fusion, config_);
-    for (int plan_id = 0; plan_id < plan_count; ++plan_id) {
-      configs.push_back(CuDnnConfig{plan_id});
+    // Add cuBLAS reference config, if available.
+    std::vector<Config> configs;
+    if (algorithm_util::IsSupportedByCublasOrCublasLt(
+            dot->precision_config().algorithm()) &&
+        !dot->sparse_operands() && IsAutotuningEnabled()) {
+      configs.push_back(CuBlasConfig{});
     }
-  }
+
+    // Add lib (e.g. cuDNN) plans, if available.
+    if (AddLibConfigs(fusion, dot, configs)) return configs;
+
   if (IsFusionKind(fusion, kCuDnnFusionKind)) {
     if (!IsAutotuningEnabled()) {
       configs.push_back(CuDnnConfig{-1});
@@ -675,8 +652,6 @@ GemmFusionAutotunerImpl::GenerateTritonConfigs(const HloDotInstruction& dot) {
 
   // Triton configurations are adjusted and deduplicated.
   absl::flat_hash_set<TritonGemmConfig> added;
-  bool is_hopper =
-      !config_.IsDeviceless() && GetComputeCapability().IsAtLeastHopper();
   for (TritonGemmConfig& config : triton_configs) {
     config.block_m = std::min(config.block_m, limits.block_m);
     config.block_n = std::min(config.block_n, limits.block_n);
@@ -699,10 +674,8 @@ GemmFusionAutotunerImpl::GenerateTritonConfigs(const HloDotInstruction& dot) {
     // Sparse meta should have at least one element per thread.
     // Note: only 2:4 structured sparsity is currently supported.
     if (dot.sparse_operands()) {
-      if (is_hopper) {
-        config.block_m = std::max(config.block_m, 64);
-        config.num_warps = std::max(config.num_warps, 4);
-      }
+      config.block_m = std::max(config.block_m, 64);
+      config.num_warps = std::max(config.num_warps, 4);
       config.block_k = std::max(
           config.block_k,
           2 * std::max(kMinTileSize, kLdmatrixGranularity / minBitWidth));
@@ -972,15 +945,15 @@ absl::StatusOr<std::vector<AutotuneResult>> GemmFusionAutotunerImpl::Profile(
 std::vector<TritonGemmConfig>
 GemmFusionAutotunerImpl::GetExhaustiveTritonConfigs() const {
   std::vector<TritonGemmConfig> configs;
-  se::CudaComputeCapability cc = GetComputeCapability();
-  bool tune_ctas =
-      debug_options_.xla_gpu_enable_triton_hopper() && cc.IsAtLeastHopper();
+  se::GpuComputeCapability gcc = GetComputeCapability();
+  bool tune_ctas = false;
+
+  if (!isRocm()) {
+    auto cc = std::get<se::CudaComputeCapability>(gcc);
+    debug_options_.xla_gpu_enable_triton_hopper() && cc.IsAtLeastHopper();
+  }
 
   for (int num_stages : kNumStages) {
-    // Volta doesn't support num_stages > 2.
-    if (!cc.IsAtLeastAmpere() && num_stages > 2) {
-      break;
-    }
     for (int tile_m : kBlockSizes) {
       for (int tile_n : kBlockSizes) {
         for (int tile_k : kBlockSizes) {
@@ -1019,44 +992,6 @@ GemmFusionAutotunerImpl::GetExhaustiveTritonConfigs() const {
   return configs;
 }
 
-std::vector<TritonGemmConfig> GemmFusionAutotunerImpl::GetDefaultTritonConfigs()
-    const {
-  using Config = TritonGemmConfig;
-  std::vector<Config> configs = {
-      Config(32, 32, 256, 1, 1, 4), Config(64, 32, 32, 16, 1, 4),
-      Config(32, 64, 64, 4, 1, 4),  Config(128, 128, 64, 4, 1, 4),
-      Config(16, 16, 256, 1, 1, 4), Config(16, 128, 32, 16, 1, 4),
-      Config(16, 64, 128, 1, 1, 4), Config(16, 128, 32, 8, 1, 4),
-      Config(16, 16, 512, 1, 1, 4), Config(32, 16, 512, 1, 1, 4),
-      Config(64, 32, 64, 1, 2, 8)};
-  if (GetComputeCapability().IsAtLeastAmpere()) {
-    absl::c_copy(
-        std::vector<Config>{
-            Config(128, 256, 32, 1, 3, 8),  Config(256, 128, 32, 1, 3, 8),
-            Config(256, 64, 32, 1, 4, 4),   Config(64, 256, 32, 1, 4, 4),
-            Config(128, 64, 32, 1, 4, 4),   Config(64, 128, 32, 1, 4, 4),
-            Config(256, 128, 128, 1, 3, 8), Config(256, 64, 128, 1, 4, 4),
-            Config(64, 256, 128, 1, 4, 4),  Config(128, 128, 128, 1, 4, 4),
-            Config(128, 64, 64, 1, 4, 4),   Config(64, 128, 64, 1, 4, 4),
-            Config(128, 32, 64, 1, 4, 4),   Config(64, 32, 64, 1, 4, 4),
-            Config(32, 128, 32, 1, 4, 4),   Config(128, 128, 32, 1, 4, 4),
-            Config(16, 16, 256, 1, 3, 4),   Config(128, 128, 64, 2, 1, 8),
-            Config(64, 64, 64, 1, 2, 4),    Config(16, 64, 256, 8, 1, 4),
-            Config(256, 256, 128, 1, 3, 8)},
-        std::back_inserter(configs));
-  }
-  if (GetComputeCapability().IsAtLeastHopper()) {
-    absl::c_copy(
-        std::vector<Config>{
-            Config(16, 32, 32, 8, 1, 2),
-            Config(16, 64, 128, 8, 1, 4),
-            Config(16, 64, 128, 16, 3, 4),
-        },
-        std::back_inserter(configs));
-  }
-  return configs;
-}
-
 absl::Status DumpAutotuningLogs(const DebugOptions& debug_opts,
                                 const AutotuningLogs& autotuning_logs) {
   if (absl::string_view file_path = debug_opts.xla_gpu_dump_autotune_logs_to();