Skip to content

Commit

Permalink
Add tuning params for GPU cache and shared memory.
Browse files Browse the repository at this point in the history
  • Loading branch information
athas committed Aug 22, 2024
1 parent 1b13667 commit d9141a2
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 18 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

* Logging now prints more GPU information on context initialisation.

* GPU cache size can now be configured.
* GPU cache size can now be configured (tuning param: `default_cache`).

* GPU shared memory can now be configured (tuning param: `default_shared_memory`).

* GPU register capacity can now be configured (tuning param: `default_registers`).

Expand Down
22 changes: 15 additions & 7 deletions rts/c/backends/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -779,28 +779,36 @@ int backend_context_setup(struct futhark_context* ctx) {
}
}

// MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably
// for backwards compatibility. Add _OPTIN and you seem to get the
// right number.
ctx->max_shared_memory = device_query(ctx->dev, MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);
#if CUDART_VERSION >= 12000
ctx->max_shared_memory -= device_query(ctx->dev, RESERVED_SHARED_MEMORY_PER_BLOCK);
#endif
ctx->max_thread_block_size = device_query(ctx->dev, MAX_THREADS_PER_BLOCK);
ctx->max_grid_size = device_query(ctx->dev, MAX_GRID_DIM_X);
ctx->max_tile_size = sqrt(ctx->max_thread_block_size);
ctx->max_threshold = 1U<<31; // No limit.
ctx->max_bespoke = 1U<<31; // No limit.

if (ctx->cfg->gpu.default_registers != 0) {
ctx->max_registers = ctx->cfg->gpu.default_registers;
} else {
ctx->max_registers = device_query(ctx->dev, MAX_REGISTERS_PER_BLOCK);
}

if (ctx->cfg->gpu.default_shared_memory != 0) {
ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
} else {
// MAX_SHARED_MEMORY_PER_BLOCK gives bogus numbers (48KiB); probably
// for backwards compatibility. Add _OPTIN and you seem to get the
// right number.
ctx->max_shared_memory = device_query(ctx->dev, MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);
#if CUDART_VERSION >= 12000
ctx->max_shared_memory -= device_query(ctx->dev, RESERVED_SHARED_MEMORY_PER_BLOCK);
#endif
}

if (ctx->cfg->gpu.default_cache != 0) {
ctx->max_cache = ctx->cfg->gpu.default_cache;
} else {
ctx->max_cache = device_query(ctx->dev, L2_CACHE_SIZE);
}

ctx->lockstep_width = device_query(ctx->dev, WARP_SIZE);
CUDA_SUCCEED_FATAL(cuStreamCreate(&ctx->stream, CU_STREAM_DEFAULT));
cuda_size_setup(ctx);
Expand Down
15 changes: 13 additions & 2 deletions rts/c/backends/hip.h
Original file line number Diff line number Diff line change
Expand Up @@ -634,14 +634,25 @@ int backend_context_setup(struct futhark_context* ctx) {
}
}

ctx->max_shared_memory = device_query(ctx->dev, hipDeviceAttributeMaxSharedMemoryPerBlock);
ctx->max_thread_block_size = device_query(ctx->dev, hipDeviceAttributeMaxThreadsPerBlock);
ctx->max_grid_size = device_query(ctx->dev, hipDeviceAttributeMaxGridDimX);
ctx->max_tile_size = sqrt(ctx->max_thread_block_size);
ctx->max_threshold = 1U<<31; // No limit.
ctx->max_bespoke = 0;
ctx->max_registers = device_query(ctx->dev, hipDeviceAttributeMaxRegistersPerBlock);
ctx->max_cache = device_query(ctx->dev, hipDeviceAttributeL2CacheSize);

if (ctx->cfg->gpu.default_shared_memory != 0) {
ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
} else {
ctx->max_shared_memory = device_query(ctx->dev, hipDeviceAttributeMaxSharedMemoryPerBlock);
}

if (ctx->cfg->gpu.default_cache != 0) {
ctx->max_cache = ctx->cfg->gpu.default_cache;
} else {
ctx->max_cache = device_query(ctx->dev, hipDeviceAttributeL2CacheSize);
}

// FIXME: in principle we should query hipDeviceAttributeWarpSize
// from the device, which will provide 64 on AMD GPUs.
// Unfortunately, we currently do nasty implicit intra-warp
Expand Down
25 changes: 17 additions & 8 deletions rts/c/backends/opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,9 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
}
}

bool is_amd = strstr(device_option.platform_name, "AMD") != NULL;
bool is_nvidia = strstr(device_option.platform_name, "NVIDIA CUDA") != NULL;

size_t max_thread_block_size;
OPENCL_SUCCEED_FATAL(clGetDeviceInfo(device_option.device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(size_t), &max_thread_block_size, NULL));
Expand All @@ -723,9 +726,6 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
// Futhark reserves 4 bytes for bookkeeping information.
max_shared_memory -= 4;

bool is_amd = strstr(device_option.platform_name, "AMD") != NULL;
bool is_nvidia = strstr(device_option.platform_name, "NVIDIA CUDA") != NULL;

// The OpenCL implementation may reserve some local memory bytes for
// various purposes. In principle, we should use
// clGetKernelWorkGroupInfo() to figure out for each kernel how much
Expand Down Expand Up @@ -792,18 +792,27 @@ static void setup_opencl_with_command_queue(struct futhark_context *ctx,
}
}

ctx->max_cache = l2_cache_size;
ctx->max_thread_block_size = max_thread_block_size;
ctx->max_tile_size = max_tile_size; // No limit.
ctx->max_threshold = ctx->max_grid_size = 1U<<31; // No limit.

if (ctx->cfg->gpu.default_cache != 0) {
ctx->max_cache = ctx->cfg->gpu.default_cache;
} else {
ctx->max_cache = l2_cache_size;
}

if (ctx->cfg->gpu.default_registers != 0) {
ctx->max_registers = ctx->cfg->gpu.default_registers;
} else {
ctx->max_registers = 1<<16; // I cannot find a way to query for this.
}

ctx->max_thread_block_size = max_thread_block_size;
ctx->max_tile_size = max_tile_size; // No limit.
ctx->max_threshold = ctx->max_grid_size = 1U<<31; // No limit.
ctx->max_shared_memory = max_shared_memory;
if (ctx->cfg->gpu.default_shared_memory != 0) {
ctx->max_shared_memory = ctx->cfg->gpu.default_shared_memory;
} else {
ctx->max_shared_memory = max_shared_memory;
}

// Now we go through all the sizes, clamp them to the valid range,
// or set them to the default.
Expand Down
9 changes: 9 additions & 0 deletions rts/c/gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,15 @@ int futhark_context_config_set_tuning_param(struct futhark_context_config *cfg,
cfg->gpu.default_reg_tile_size = new_value;
return 0;
}
if (strcmp(param_name, "default_cache") == 0) {
cfg->gpu.default_cache = new_value;
return 0;
}
if (strcmp(param_name, "default_shared_memory") == 0) {
cfg->gpu.default_shared_memory = new_value;
return 0;
}

return 1;
}

Expand Down
1 change: 1 addition & 0 deletions rts/c/gpu_prototypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ struct gpu_config {
size_t default_tile_size;
size_t default_reg_tile_size;
size_t default_cache;
size_t default_shared_memory;
size_t default_registers;
size_t default_threshold;

Expand Down

0 comments on commit d9141a2

Please sign in to comment.