Skip to content

Commit

Permalink
Add launch bounds
Browse files: browse the repository at this point in the history
  • Loading branch information
rdspring1 committed Jan 3, 2025
1 parent a82c3fe commit c2c10b7
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions csrc/codegen.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -274,6 +274,11 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
// Generates the kernel function declaration
void genDeclaration(const std::string& kernel_name) {
code_ << "__global__ void ";
if (kernel_->hasManaged("warp_specialized_num_registers")) {
constexpr int64_t threads_per_cta = 384;
code_ << "__launch_bounds__(/*MAX_THREADS_PER_BLOCK=*/" << threads_per_cta
<< ") ";
}
if (kernel_->hasManaged("cluster_dims")) {
auto cluster_dims =
kernel_->getManaged<std::tuple<int64_t, int64_t, int64_t>>(
Expand Down

0 comments on commit c2c10b7

Please sign in to comment.