From d9141a2119a6fba6daf8dd3f244afc2dbd4b1d5a Mon Sep 17 00:00:00 2001
From: Troels Henriksen
Date: Thu, 22 Aug 2024 13:01:50 +0200
Subject: [PATCH] Add tuning params for GPU cache and shared memory.

---
 CHANGELOG.md            |  4 +++-
 rts/c/backends/cuda.h   | 22 +++++++++++++++-------
 rts/c/backends/hip.h    | 15 +++++++++++++--
 rts/c/backends/opencl.h | 25 +++++++++++++++++--------
 rts/c/gpu.h             |  9 +++++++++
 rts/c/gpu_prototypes.h  |  1 +
 6 files changed, 58 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e7a890469..2d22bbbbdf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 
 * Logging now prints more GPU information on context initialisation.
 
-* GPU cache size can now be configured.
+* GPU cache size can now be configured (tuning param: `default_cache`).
+
+* GPU shared memory can now be configured (tuning param: `default_shared_memory`).
 
 * GPU register capacity can now be configured.
 
diff --git a/rts/c/backends/cuda.h b/rts/c/backends/cuda.h
index 274100cded..8c91d8cf4e 100644
--- a/rts/c/backends/cuda.h
+++ b/rts/c/backends/cuda.h
@@ -779,28 +779,36 @@ int backend_context_setup(struct futhark_context* ctx) {
     }
   }
 
-  // MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably
-  // for backwards compatibility. Add _OPTIN and you seem to get the
-  // right number.
-  ctx->max_shared_memory = device_query(ctx->dev, MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);
-#if CUDART_VERSION >= 12000
-  ctx->max_shared_memory -= device_query(ctx->dev, RESERVED_SHARED_MEMORY_PER_BLOCK);
-#endif
   ctx->max_thread_block_size = device_query(ctx->dev, MAX_THREADS_PER_BLOCK);
   ctx->max_grid_size = device_query(ctx->dev, MAX_GRID_DIM_X);
   ctx->max_tile_size = sqrt(ctx->max_thread_block_size);
   ctx->max_threshold = 1U<<31; // No limit.
   ctx->max_bespoke = 1U<<31; // No limit.
+
   if (ctx->cfg->gpu.default_registers != 0) {
     ctx->max_registers = ctx->cfg->gpu.default_registers;
   } else {
     ctx->max_registers = device_query(ctx->dev, MAX_REGISTERS_PER_BLOCK);
   }
+
+  if (ctx->cfg->gpu.default_shared_memory != 0) {
+    ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
+  } else {
+    // MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably
+    // for backwards compatibility. Add _OPTIN and you seem to get the
+    // right number.
+    ctx->max_shared_memory = device_query(ctx->dev, MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);
+#if CUDART_VERSION >= 12000
+    ctx->max_shared_memory -= device_query(ctx->dev, RESERVED_SHARED_MEMORY_PER_BLOCK);
+#endif
+  }
+
   if (ctx->cfg->gpu.default_cache != 0) {
     ctx->max_cache = ctx->cfg->gpu.default_cache;
   } else {
     ctx->max_cache = device_query(ctx->dev, L2_CACHE_SIZE);
   }
+
   ctx->lockstep_width = device_query(ctx->dev, WARP_SIZE);
   CUDA_SUCCEED_FATAL(cuStreamCreate(&ctx->stream, CU_STREAM_DEFAULT));
   cuda_size_setup(ctx);
diff --git a/rts/c/backends/hip.h b/rts/c/backends/hip.h
index dcaed3e7bb..63aad6b042 100644
--- a/rts/c/backends/hip.h
+++ b/rts/c/backends/hip.h
@@ -634,14 +634,25 @@ int backend_context_setup(struct futhark_context* ctx) {
     }
   }
 
-  ctx->max_shared_memory = device_query(ctx->dev, hipDeviceAttributeMaxSharedMemoryPerBlock);
   ctx->max_thread_block_size = device_query(ctx->dev, hipDeviceAttributeMaxThreadsPerBlock);
   ctx->max_grid_size = device_query(ctx->dev, hipDeviceAttributeMaxGridDimX);
   ctx->max_tile_size = sqrt(ctx->max_thread_block_size);
   ctx->max_threshold = 1U<<31; // No limit.
   ctx->max_bespoke = 0;
   ctx->max_registers = device_query(ctx->dev, hipDeviceAttributeMaxRegistersPerBlock);
-  ctx->max_cache = device_query(ctx->dev, hipDeviceAttributeL2CacheSize);
+
+  if (ctx->cfg->gpu.default_shared_memory != 0) {
+    ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
+  } else {
+    ctx->max_shared_memory = device_query(ctx->dev, hipDeviceAttributeMaxSharedMemoryPerBlock);
+  }
+
+  if (ctx->cfg->gpu.default_cache != 0) {
+    ctx->max_cache = ctx->cfg->gpu.default_cache;
+  } else {
+    ctx->max_cache = device_query(ctx->dev, hipDeviceAttributeL2CacheSize);
+  }
+
   // FIXME: in principle we should query hipDeviceAttributeWarpSize
   // from the device, which will provide 64 on AMD GPUs.
   // Unfortunately, we currently do nasty implicit intra-warp
diff --git a/rts/c/backends/opencl.h b/rts/c/backends/opencl.h
index 56d3230716..e5bae526af 100644
--- a/rts/c/backends/opencl.h
+++ b/rts/c/backends/opencl.h
@@ -710,6 +710,9 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
     }
   }
 
+  bool is_amd = strstr(device_option.platform_name, "AMD") != NULL;
+  bool is_nvidia = strstr(device_option.platform_name, "NVIDIA CUDA") != NULL;
+
   size_t max_thread_block_size;
   OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                                        sizeof(size_t), &max_thread_block_size, NULL));
@@ -723,9 +726,6 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
   // Futhark reserves 4 bytes for bookkeeping information.
   max_shared_memory -= 4;
 
-  bool is_amd = strstr(device_option.platform_name, "AMD") != NULL;
-  bool is_nvidia = strstr(device_option.platform_name, "NVIDIA CUDA") != NULL;
-
   // The OpenCL implementation may reserve some local memory bytes for
   // various purposes. In principle, we should use
   // clGetKernelWorkGroupInfo() to figure out for each kernel how much
@@ -792,7 +792,15 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
     }
   }
 
-  ctx->max_cache = l2_cache_size;
+  ctx->max_thread_block_size = max_thread_block_size;
+  ctx->max_tile_size = max_tile_size; // No limit.
+  ctx->max_threshold = ctx->max_grid_size = 1U<<31; // No limit.
+
+  if (ctx->cfg->gpu.default_cache != 0) {
+    ctx->max_cache = ctx->cfg->gpu.default_cache;
+  } else {
+    ctx->max_cache = l2_cache_size;
+  }
 
   if (ctx->cfg->gpu.default_registers != 0) {
     ctx->max_registers = ctx->cfg->gpu.default_registers;
@@ -800,10 +808,11 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
     ctx->max_registers = 1<<16; // I cannot find a way to query for this.
   }
 
-  ctx->max_thread_block_size = max_thread_block_size;
-  ctx->max_tile_size = max_tile_size; // No limit.
-  ctx->max_threshold = ctx->max_grid_size = 1U<<31; // No limit.
-  ctx->max_shared_memory = max_shared_memory;
+  if (ctx->cfg->gpu.default_shared_memory != 0) {
+    ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
+  } else {
+    ctx->max_shared_memory = max_shared_memory;
+  }
 
   // Now we go through all the sizes, clamp them to the valid range,
   // or set them to the default.
diff --git a/rts/c/gpu.h b/rts/c/gpu.h
index 52ec479777..69e741cb8f 100644
--- a/rts/c/gpu.h
+++ b/rts/c/gpu.h
@@ -120,6 +120,15 @@ int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg,
     cfg->gpu.default_reg_tile_size = new_value;
     return 0;
   }
+  if (strcmp(param_name, "default_cache") == 0) {
+    cfg->gpu.default_cache = new_value;
+    return 0;
+  }
+  if (strcmp(param_name, "default_shared_memory") == 0) {
+    cfg->gpu.default_shared_memory = new_value;
+    return 0;
+  }
+
   return 1;
 }
 
diff --git a/rts/c/gpu_prototypes.h b/rts/c/gpu_prototypes.h
index 13521be2e8..34ec093c30 100644
--- a/rts/c/gpu_prototypes.h
+++ b/rts/c/gpu_prototypes.h
@@ -12,6 +12,7 @@ struct gpu_config {
   size_t default_tile_size;
   size_t default_reg_tile_size;
   size_t default_cache;
+  size_t default_shared_memory;
   size_t default_registers;
   size_t default_threshold;
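
Usage note (not part of the patch): a minimal sketch of how the two new tuning
params could be set from host code, assuming a Futhark program compiled as a C
library with one of the GPU backends; the header name prog.h and the byte
values are placeholders. It goes through the futhark_context_config_set_tuning_param
entry point extended in rts/c/gpu.h above, which returns nonzero for
unrecognised parameter names.

#include <stdio.h>
#include "prog.h"

int main(void) {
  struct futhark_context_config *cfg = futhark_context_config_new();

  // Override the detected L2 cache size with 4 MiB (placeholder value).
  if (futhark_context_config_set_tuning_param(cfg, "default_cache", 4 * 1024 * 1024) != 0)
    fprintf(stderr, "default_cache not recognised\n");

  // Override the detected per-block shared memory limit with 48 KiB (placeholder value).
  if (futhark_context_config_set_tuning_param(cfg, "default_shared_memory", 48 * 1024) != 0)
    fprintf(stderr, "default_shared_memory not recognised\n");

  struct futhark_context *ctx = futhark_context_new(cfg);
  // ... call generated entry points here ...
  futhark_context_free(ctx);
  futhark_context_config_free(cfg);
  return 0;
}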