From d9141a2119a6fba6daf8dd3f244afc2dbd4b1d5a Mon Sep 17 00:00:00 2001
From: Troels Henriksen
Date: Thu, 22 Aug 2024 13:01:50 +0200
Subject: [PATCH] Add tuning params for GPU cache and shared memory.

---
 CHANGELOG.md            |  4 +++-
 rts/c/backends/cuda.h   | 22 +++++++++++++++-------
 rts/c/backends/hip.h    | 15 +++++++++++++--
 rts/c/backends/opencl.h | 25 +++++++++++++++++--------
 rts/c/gpu.h             |  9 +++++++++
 rts/c/gpu_prototypes.h  |  1 +
 6 files changed, 58 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e7a890469..2d22bbbbdf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 * Logging now prints more GPU information on context initialisation.
 
-* GPU cache size can now be configured.
+* GPU cache size can now be configured (tuning param: `default_cache`).
+
+* GPU shared memory can now be configured (tuning param: `default_shared_memory`).
 
 * GPU register capacity can now be configured.
 
diff --git a/rts/c/backends/cuda.h b/rts/c/backends/cuda.h
index 274100cded..8c91d8cf4e 100644
--- a/rts/c/backends/cuda.h
+++ b/rts/c/backends/cuda.h
@@ -779,28 +779,36 @@ int backend_context_setup(struct futhark_context* ctx) {
     }
   }
 
-  // MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably
-  // for backwards compatibility. Add _OPTIN and you seem to get the
-  // right number.
-  ctx->max_shared_memory = device_query(ctx->dev, MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);
-#if CUDART_VERSION >= 12000
-  ctx->max_shared_memory -= device_query(ctx->dev, RESERVED_SHARED_MEMORY_PER_BLOCK);
-#endif
   ctx->max_thread_block_size = device_query(ctx->dev, MAX_THREADS_PER_BLOCK);
   ctx->max_grid_size = device_query(ctx->dev, MAX_GRID_DIM_X);
   ctx->max_tile_size = sqrt(ctx->max_thread_block_size);
   ctx->max_threshold = 1U<<31; // No limit.
   ctx->max_bespoke = 1U<<31; // No limit.
+
   if (ctx->cfg->gpu.default_registers != 0) {
     ctx->max_registers = ctx->cfg->gpu.default_registers;
   } else {
     ctx->max_registers = device_query(ctx->dev, MAX_REGISTERS_PER_BLOCK);
   }
+
+  if (ctx->cfg->gpu.default_shared_memory != 0) {
+    ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
+  } else {
+    // MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably
+    // for backwards compatibility. Add _OPTIN and you seem to get the
+    // right number.
+    ctx->max_shared_memory = device_query(ctx->dev, MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);
+#if CUDART_VERSION >= 12000
+    ctx->max_shared_memory -= device_query(ctx->dev, RESERVED_SHARED_MEMORY_PER_BLOCK);
+#endif
+  }
+
   if (ctx->cfg->gpu.default_cache != 0) {
     ctx->max_cache = ctx->cfg->gpu.default_cache;
   } else {
     ctx->max_cache = device_query(ctx->dev, L2_CACHE_SIZE);
   }
+
   ctx->lockstep_width = device_query(ctx->dev, WARP_SIZE);
   CUDA_SUCCEED_FATAL(cuStreamCreate(&ctx->stream, CU_STREAM_DEFAULT));
   cuda_size_setup(ctx);
diff --git a/rts/c/backends/hip.h b/rts/c/backends/hip.h
index dcaed3e7bb..63aad6b042 100644
--- a/rts/c/backends/hip.h
+++ b/rts/c/backends/hip.h
@@ -634,14 +634,25 @@ int backend_context_setup(struct futhark_context* ctx) {
     }
   }
 
-  ctx->max_shared_memory = device_query(ctx->dev, hipDeviceAttributeMaxSharedMemoryPerBlock);
   ctx->max_thread_block_size = device_query(ctx->dev, hipDeviceAttributeMaxThreadsPerBlock);
   ctx->max_grid_size = device_query(ctx->dev, hipDeviceAttributeMaxGridDimX);
   ctx->max_tile_size = sqrt(ctx->max_thread_block_size);
   ctx->max_threshold = 1U<<31; // No limit.
   ctx->max_bespoke = 0;
   ctx->max_registers = device_query(ctx->dev, hipDeviceAttributeMaxRegistersPerBlock);
-  ctx->max_cache = device_query(ctx->dev, hipDeviceAttributeL2CacheSize);
+
+  if (ctx->cfg->gpu.default_shared_memory != 0) {
+    ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
+  } else {
+    ctx->max_shared_memory = device_query(ctx->dev, hipDeviceAttributeMaxSharedMemoryPerBlock);
+  }
+
+  if (ctx->cfg->gpu.default_cache != 0) {
+    ctx->max_cache = ctx->cfg->gpu.default_cache;
+  } else {
+    ctx->max_cache = device_query(ctx->dev, hipDeviceAttributeL2CacheSize);
+  }
+
   // FIXME: in principle we should query hipDeviceAttributeWarpSize
   // from the device, which will provide 64 on AMD GPUs.
   // Unfortunately, we currently do nasty implicit intra-warp
diff --git a/rts/c/backends/opencl.h b/rts/c/backends/opencl.h
index 56d3230716..e5bae526af 100644
--- a/rts/c/backends/opencl.h
+++ b/rts/c/backends/opencl.h
@@ -710,6 +710,9 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
     }
   }
 
+  bool is_amd = strstr(device_option.platform_name, "AMD") != NULL;
+  bool is_nvidia = strstr(device_option.platform_name, "NVIDIA CUDA") != NULL;
+
   size_t max_thread_block_size;
   OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                                        sizeof(size_t), &max_thread_block_size, NULL));
@@ -723,9 +726,6 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
   // Futhark reserves 4 bytes for bookkeeping information.
   max_shared_memory -= 4;
 
-  bool is_amd = strstr(device_option.platform_name, "AMD") != NULL;
-  bool is_nvidia = strstr(device_option.platform_name, "NVIDIA CUDA") != NULL;
-
   // The OpenCL implementation may reserve some local memory bytes for
   // various purposes. In principle, we should use
   // clGetKernelWorkGroupInfo() to figure out for each kernel how much
@@ -792,7 +792,15 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
     }
   }
 
-  ctx->max_cache = l2_cache_size;
+  ctx->max_thread_block_size = max_thread_block_size;
+  ctx->max_tile_size = max_tile_size; // No limit.
+  ctx->max_threshold = ctx->max_grid_size = 1U<<31; // No limit.
+
+  if (ctx->cfg->gpu.default_cache != 0) {
+    ctx->max_cache = ctx->cfg->gpu.default_cache;
+  } else {
+    ctx->max_cache = l2_cache_size;
+  }
 
   if (ctx->cfg->gpu.default_registers != 0) {
     ctx->max_registers = ctx->cfg->gpu.default_registers;
@@ -800,10 +808,11 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
     ctx->max_registers = 1<<16; // I cannot find a way to query for this.
   }
 
-  ctx->max_thread_block_size = max_thread_block_size;
-  ctx->max_tile_size = max_tile_size; // No limit.
-  ctx->max_threshold = ctx->max_grid_size = 1U<<31; // No limit.
-  ctx->max_shared_memory = max_shared_memory;
+  if (ctx->cfg->gpu.default_shared_memory != 0) {
+    ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
+  } else {
+    ctx->max_shared_memory = max_shared_memory;
+  }
 
   // Now we go through all the sizes, clamp them to the valid range,
   // or set them to the default.
diff --git a/rts/c/gpu.h b/rts/c/gpu.h
index 52ec479777..69e741cb8f 100644
--- a/rts/c/gpu.h
+++ b/rts/c/gpu.h
@@ -120,6 +120,15 @@ int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg,
     cfg->gpu.default_reg_tile_size = new_value;
     return 0;
   }
+  if (strcmp(param_name, "default_cache") == 0) {
+    cfg->gpu.default_cache = new_value;
+    return 0;
+  }
+  if (strcmp(param_name, "default_shared_memory") == 0) {
+    cfg->gpu.default_shared_memory = new_value;
+    return 0;
+  }
+
   return 1;
 }
 
diff --git a/rts/c/gpu_prototypes.h b/rts/c/gpu_prototypes.h
index 13521be2e8..34ec093c30 100644
--- a/rts/c/gpu_prototypes.h
+++ b/rts/c/gpu_prototypes.h
@@ -12,6 +12,7 @@ struct gpu_config {
   size_t default_tile_size;
   size_t default_reg_tile_size;
   size_t default_cache;
+  size_t default_shared_memory;
   size_t default_registers;
   size_t default_threshold;
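
Usage note (not part of the patch): a minimal sketch of how the two new tuning
params could be set from host code, assuming a Futhark program compiled as a C
library with one of the GPU backends; the header name prog.h and the byte
values are placeholders. It goes through the futhark_context_config_set_tuning_param
entry point extended in rts/c/gpu.h above, which returns nonzero for
unrecognised parameter names.

#include <stdio.h>
#include "prog.h"

int main(void) {
  struct futhark_context_config *cfg = futhark_context_config_new();

  // Override the detected L2 cache size with 4 MiB (placeholder value).
  if (futhark_context_config_set_tuning_param(cfg, "default_cache", 4 * 1024 * 1024) != 0)
    fprintf(stderr, "default_cache not recognised\n");

  // Override the detected per-block shared memory limit with 48 KiB (placeholder value).
  if (futhark_context_config_set_tuning_param(cfg, "default_shared_memory", 48 * 1024) != 0)
    fprintf(stderr, "default_shared_memory not recognised\n");

  struct futhark_context *ctx = futhark_context_new(cfg);
  // ... call generated entry points here ...
  futhark_context_free(ctx);
  futhark_context_config_free(cfg);
  return 0;
}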