From c2c10b78356130a7750d68a9c3d6f9ffcae05776 Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Fri, 3 Jan 2025 12:01:41 -0800 Subject: [PATCH] Add launch bounds --- csrc/codegen.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/csrc/codegen.cpp b/csrc/codegen.cpp index 3a5f31c74d5..bbbf7b70939 100644 --- a/csrc/codegen.cpp +++ b/csrc/codegen.cpp @@ -274,6 +274,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor { // Generates the kernel function declaration void genDeclaration(const std::string& kernel_name) { code_ << "__global__ void "; + if (kernel_->hasManaged("warp_specialized_num_registers")) { + constexpr int64_t threads_per_cta = 384; + code_ << "__launch_bounds__(/*MAX_THREADS_PER_BLOCK=*/" << threads_per_cta + << ") "; + } if (kernel_->hasManaged("cluster_dims")) { auto cluster_dims = kernel_->getManaged>(