Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ocl: avoid overhead/regression and code-cleanup #750

Merged
merged 1 commit into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 9 additions & 38 deletions src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,14 +170,6 @@ int c_dbcsr_acc_opencl_order_devices(const void* dev_a, const void* dev_b) {
}


int c_dbcsr_acc_opencl_order_streams(const void* /*a*/, const void* /*b*/);
/**
 * Comparator (qsort-style) for an array of cl_command_queue handles.
 *
 * @param a Pointer to the first array element (must not be NULL).
 * @param b Pointer to the second array element (must not be NULL).
 * @return Negative, zero, or positive value if the handle at a orders
 *         before, equal to, or after the handle at b.
 *
 * NULL-handles are explicitly ordered after every non-NULL handle, i.e.,
 * sorting moves them to the upper end of the array. Non-NULL handles are
 * ordered by comparing the handle values.
 */
int c_dbcsr_acc_opencl_order_streams(const void* a, const void* b) {
  const cl_command_queue *const p = (const cl_command_queue*)a, *const q = (const cl_command_queue*)b;
  int result = 0;
  assert(NULL != p && NULL != q);
  if (*p != *q) {
    if (NULL == *p) { /* NULL-handle goes to the upper end */
      result = 1;
    }
    else if (NULL == *q) { /* non-NULL handle precedes NULL-handle */
      result = -1;
    }
    else {
      result = (*p < *q ? -1 : 1);
    }
  }
  return result;
}


LIBXSMM_ATTRIBUTE_CTOR void c_dbcsr_acc_opencl_init(void) {
/* attempt to automatically initialize backend */
ACC_OPENCL_EXPECT(EXIT_SUCCESS == c_dbcsr_acc_init());
Expand Down Expand Up @@ -229,8 +221,11 @@ int c_dbcsr_acc_init(void) {
# if defined(_OPENMP)
const int max_threads = omp_get_max_threads(), num_threads = omp_get_num_threads();
c_dbcsr_acc_opencl_config.nthreads = (num_threads < max_threads ? max_threads : num_threads);
c_dbcsr_acc_opencl_config.nstreams = (num_threads < max_threads ? (ACC_OPENCL_STREAMS_MAXCOUNT + max_threads)
: (ACC_OPENCL_STREAMS_MAXCOUNT));
# else
c_dbcsr_acc_opencl_config.nthreads = 1;
c_dbcsr_acc_opencl_config.nstreams = ACC_OPENCL_STREAMS_MAXCOUNT;
# endif
c_dbcsr_acc_opencl_config.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose));
c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority));
Expand Down Expand Up @@ -574,12 +569,9 @@ int c_dbcsr_acc_init(void) {
}
# endif
if (EXIT_SUCCESS == result) {
const int nelements = ACC_OPENCL_STREAMS_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads;
const int nelements = c_dbcsr_acc_opencl_config.nthreads * c_dbcsr_acc_opencl_config.nstreams;
c_dbcsr_acc_opencl_config.streams = (void**)calloc(nelements, sizeof(void*)); /* allocate streams */
if (NULL != c_dbcsr_acc_opencl_config.streams) { /* allocate counters */
c_dbcsr_acc_opencl_config.stats = (cl_command_queue*)calloc(nelements, sizeof(cl_command_queue));
}
else result = EXIT_FAILURE;
if (NULL == c_dbcsr_acc_opencl_config.streams) result = EXIT_FAILURE;
}
}
}
Expand Down Expand Up @@ -627,29 +619,6 @@ int c_dbcsr_acc_finalize(void) {
{
fprintf(stderr, " device=%i", d);
}
if (NULL != c_dbcsr_acc_opencl_config.stats) {
const int nelements = ACC_OPENCL_STREAMS_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads;
cl_command_queue s = NULL;
int nstreams, j;
fprintf(stderr, " streams={");
for (i = 0; i < nelements; i += ACC_OPENCL_STREAMS_MAXCOUNT) {
for (j = 0, nstreams = 0; j < ACC_OPENCL_STREAMS_MAXCOUNT; ++j) {
if (NULL != c_dbcsr_acc_opencl_config.stats[i + j]) ++nstreams;
}
if (0 != nstreams || 0 == i) fprintf(stderr, 0 < i ? " %i" : "%i", nstreams);
}
qsort(c_dbcsr_acc_opencl_config.stats, nelements, sizeof(cl_command_queue),
c_dbcsr_acc_opencl_order_streams); /* NULL -> upper end */
for (i = 0, nstreams = 0; i < nelements; ++i) {
const cl_command_queue q = c_dbcsr_acc_opencl_config.stats[i];
if (NULL != q && s != q) {
s = q;
++nstreams;
}
}
free(c_dbcsr_acc_opencl_config.stats); /* release buffer */
fprintf(stderr, "} nstreams=%i", nstreams);
}
fprintf(stderr, "\n");
}
# if defined(__DBCSR_ACC)
Expand Down Expand Up @@ -1086,18 +1055,20 @@ int c_dbcsr_acc_set_active_device(int device_id) {


/**
 * Synchronizes the streams registered in the slots of the given thread.
 *
 * @param thread_id Index of the thread whose slots are visited; must satisfy
 *                  0 <= thread_id < c_dbcsr_acc_opencl_config.nthreads.
 * @return EXIT_SUCCESS if every visited stream synchronized successfully,
 *         otherwise the first non-success result of c_dbcsr_acc_stream_sync.
 */
int c_dbcsr_acc_opencl_device_synchronize(int thread_id) {
  /* per-thread partition: each thread owns nstreams consecutive slots */
  void** const streams = c_dbcsr_acc_opencl_config.streams + thread_id * c_dbcsr_acc_opencl_config.nstreams;
  int result = EXIT_SUCCESS;
  int i = 0;
  assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads);
  assert(NULL != c_dbcsr_acc_opencl_config.streams);
  for (; i < c_dbcsr_acc_opencl_config.nstreams; ++i) {
    void* const stream = streams[i];
    if (NULL != stream) {
      result = c_dbcsr_acc_stream_sync(stream);
      if (EXIT_SUCCESS != result) break; /* stop at the first failure */
    }
# if defined(ACC_OPENCL_STREAM_COMPACT)
    else break; /* compact registration: first empty slot ends the scan */
# endif
  }
  return result;
}
Expand Down
12 changes: 8 additions & 4 deletions src/acc/opencl/acc_opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,11 @@
#endif
/** Counted on a per-thread basis! */
#if !defined(ACC_OPENCL_HANDLES_MAXCOUNT)
# define ACC_OPENCL_HANDLES_MAXCOUNT 1024
# define ACC_OPENCL_HANDLES_MAXCOUNT 64
#endif
/** Counted on a per-thread basis! */
#if !defined(ACC_OPENCL_STREAMS_MAXCOUNT)
# define ACC_OPENCL_STREAMS_MAXCOUNT 1024
# define ACC_OPENCL_STREAMS_MAXCOUNT 64
#endif
#if !defined(ACC_OPENCL_OVERMALLOC)
# if defined(__DBCSR_ACC) || 1
Expand All @@ -115,6 +115,10 @@
# define ACC_OPENCL_STREAM_PRIORITIES
# endif
#endif
/** Streams are registered in compact/consecutive fashion */
#if !defined(ACC_OPENCL_STREAM_COMPACT) && 1
# define ACC_OPENCL_STREAM_COMPACT
#endif
/** Stream-argument (ACC-interface) can be NULL (synchronous) */
#if !defined(ACC_OPENCL_STREAM_NULL) && 1
# define ACC_OPENCL_STREAM_NULL
Expand Down Expand Up @@ -251,8 +255,6 @@ typedef struct c_dbcsr_acc_opencl_config_t {
void **clmems, **events, *storage;
/** All created streams partitioned by thread-ID (thread-local slots). */
void** streams;
/** Counts number of streams created (thread-local). */
cl_command_queue* stats;
/** Kind of timer used for built-in execution-profile. */
c_dbcsr_acc_opencl_timer_t timer; /* c_dbcsr_acc_opencl_device_t? */
/** Kernel-parameters are matched against device's UID */
Expand All @@ -263,6 +265,8 @@ typedef struct c_dbcsr_acc_opencl_config_t {
cl_int ndevices;
/** Maximum number of threads (omp_get_max_threads). */
cl_int nthreads;
/** Maximum number of streams per thread. */
cl_int nstreams;
/** How to apply/use stream priorities. */
cl_int priority;
/** How to zero/copy device-side buffers. */
Expand Down
25 changes: 18 additions & 7 deletions src/acc/opencl/acc_opencl_mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,11 @@ void* c_dbcsr_acc_opencl_info_devptr(const void* memory, const size_t* amount, s
char* const mem = (char*)(NULL != handle ? *handle : NULL);
if (mem == buffer) { /* fast-path */
if (NULL != offset) *offset = 0;
assert(NULL != mem);
result = handle;
break;
}
else {
else if (NULL != mem) {
size_t d = buffer - mem, s = 0;
if (d < hit && NULL != offset &&
(NULL == amount ||
Expand Down Expand Up @@ -148,7 +149,9 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
*host_mem = NULL;
}
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
}
else { /* error: mapping host buffer */
Expand Down Expand Up @@ -195,7 +198,9 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) {
}
# endif
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
result_release = clReleaseMemObject(info.memory);
if (EXIT_SUCCESS == result) result = result_release;
Expand Down Expand Up @@ -397,7 +402,9 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
result = clEnqueueWriteBuffer(
queue, buffer, 0 == (1 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL);
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
}
}
Expand Down Expand Up @@ -445,7 +452,7 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v
queue, buffer, 0 == (2 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL);
if (CL_SUCCESS == result) {
# if defined(ACC_OPENCL_STREAM_NULL)
result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream) result = c_dbcsr_acc_stream_sync(&queue);
# endif
}
else { /* synchronous */
Expand Down Expand Up @@ -533,7 +540,9 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED);
}
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
}
}
Expand Down Expand Up @@ -604,7 +613,9 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb
LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED);
}
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
}
}
Expand Down
43 changes: 24 additions & 19 deletions src/acc/opencl/acc_opencl_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,12 @@ const int* c_dbcsr_acc_opencl_stream_priority(const void* stream) {


void* c_dbcsr_acc_opencl_stream_default(void) {
const int tid = ACC_OPENCL_OMP_TID();
const int base = ACC_OPENCL_STREAMS_MAXCOUNT * tid;
const int tid = ACC_OPENCL_OMP_TID(), base = tid * c_dbcsr_acc_opencl_config.nstreams;
void* result = NULL;
int i = base;
assert(tid < c_dbcsr_acc_opencl_config.nthreads);
assert(NULL != c_dbcsr_acc_opencl_config.streams);
for (; i < (base + ACC_OPENCL_STREAMS_MAXCOUNT); ++i) {
for (; i < (base + c_dbcsr_acc_opencl_config.nstreams); ++i) {
if (NULL != c_dbcsr_acc_opencl_config.streams[i]) {
result = c_dbcsr_acc_opencl_config.streams + i;
break;
Expand All @@ -74,7 +73,6 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
int result, i, tid = 0, offset = 0;
cl_command_queue queue = NULL;
cl_context context = NULL;
void** streams = NULL;
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
int routine_handle;
static const char* const routine_name_ptr = LIBXSMM_FUNCNAME;
Expand Down Expand Up @@ -181,14 +179,15 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
else {
result = EXIT_FAILURE;
}
# if defined(_OPENMP) && 0
# pragma omp critical(c_dbcsr_acc_opencl_stream)
# endif
if (EXIT_SUCCESS == result) {
const int base = ACC_OPENCL_STREAMS_MAXCOUNT * tid;
cl_command_queue* const stats = c_dbcsr_acc_opencl_config.stats + base;
streams = c_dbcsr_acc_opencl_config.streams + base;
for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) {
void** const streams = c_dbcsr_acc_opencl_config.streams + tid * c_dbcsr_acc_opencl_config.nstreams;
for (i = 0; i < c_dbcsr_acc_opencl_config.nstreams; ++i) {
if (NULL == streams[i]) break;
}
if (i < ACC_OPENCL_STREAMS_MAXCOUNT) { /* register stream */
if (i < c_dbcsr_acc_opencl_config.nstreams) { /* register stream */
const size_t size_info = sizeof(c_dbcsr_acc_opencl_info_stream_t);
const size_t size = sizeof(cl_command_queue) + sizeof(void*) + size_info - 1;
void* const handle = malloc(size);
Expand All @@ -201,7 +200,7 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
info->pointer = (void*)address;
info->priority = priority;
info->tid = tid;
stats[i] = *(cl_command_queue*)aligned = queue;
*(cl_command_queue*)aligned = queue;
streams[i] = *stream_p = (void*)aligned;
assert(queue == *ACC_OPENCL_STREAM(streams[i]));
assert(queue == *ACC_OPENCL_STREAM(*stream_p));
Expand Down Expand Up @@ -241,26 +240,32 @@ int c_dbcsr_acc_stream_destroy(void* stream) {
# endif
if (NULL != stream) {
const cl_command_queue queue = *ACC_OPENCL_STREAM(stream);
assert(NULL != c_dbcsr_acc_opencl_config.streams);
if (NULL != queue) {
int tid = 0, i = ACC_OPENCL_STREAMS_MAXCOUNT;
void** streams = NULL;
const int result_release = clReleaseCommandQueue(queue); /* soft-error */
int tid = 0, i = c_dbcsr_acc_opencl_config.nstreams;
assert(NULL != c_dbcsr_acc_opencl_config.streams);
for (; tid < c_dbcsr_acc_opencl_config.nthreads; ++tid) { /* unregister */
streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * tid;
for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) {
void** const streams = c_dbcsr_acc_opencl_config.streams + tid * c_dbcsr_acc_opencl_config.nstreams;
for (i = 0; i < c_dbcsr_acc_opencl_config.nstreams; ++i) {
if (stream == streams[i]) {
const int j = i + 1, result_release = clReleaseCommandQueue(queue); /* soft-error */
if (j < ACC_OPENCL_STREAMS_MAXCOUNT && NULL != streams[j]) { /* compacting streams is not thread-safe */
memmove(streams + i, streams + j, sizeof(void*) * (ACC_OPENCL_STREAMS_MAXCOUNT - j));
# if defined(ACC_OPENCL_STREAM_COMPACT)
const int j = i + 1, k = c_dbcsr_acc_opencl_config.nstreams - j;
if (j < c_dbcsr_acc_opencl_config.nstreams && NULL != streams[j]) { /* compacting streams is not thread-safe */
memmove(streams + i, streams + j, sizeof(void*) * k);
}
streams[ACC_OPENCL_STREAMS_MAXCOUNT - j] = NULL;
# else
const int k = i;
# endif
streams[k] = NULL;
tid = c_dbcsr_acc_opencl_config.nthreads; /* leave outer loop */
result = result_release; /* promote */
break;
}
# if defined(ACC_OPENCL_STREAM_COMPACT)
else if (NULL == streams[i]) { /* compact streams */
break;
}
# endif
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/acc/opencl/smm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ add_custom_target(
${LIBSMM_ACC_HEADER_KERNELS}
DEPENDS ${SMM_ACC_KERNEL_SCRIPT} ${SMM_ACC_KERNELS}
BYPRODUCTS ${LIBSMM_ACC_HEADER_KERNELS}
COMMENT "libsmm_acc: generating kernels")
COMMENT "ACC/LIBSMM OpenCL: collecting tuned kernel parameters...")

add_dependencies(dbcsr parameters)
target_include_directories(dbcsr PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
Expand Down
Loading