Skip to content

Commit

Permalink
ocl: avoid overhead/regression and code-cleanup
Browse files Browse the repository at this point in the history
* Adjusted ACC_OPENCL_HANDLES_MAXCOUNT and ACC_OPENCL_STREAMS_MAXCOUNT.
* Avoid overhead of supporting ACC_OPENCL_STREAM_NULL.
* Introduced ACC_OPENCL_STREAM_COMPACT.
* Removed stream statistics.
  • Loading branch information
hfp committed Jan 11, 2024
1 parent 34178f4 commit cf6cfbe
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 67 deletions.
47 changes: 9 additions & 38 deletions src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,14 +170,6 @@ int c_dbcsr_acc_opencl_order_devices(const void* dev_a, const void* dev_b) {
}


int c_dbcsr_acc_opencl_order_streams(const void* /*a*/, const void* /*b*/);
/**
 * Comparator (qsort-compatible signature) for elements of type cl_command_queue.
 * Arguments a and b point to the elements to be compared (must not be NULL).
 * Returns a negative value, zero, or a positive value if *a is ordered before,
 * equal to, or after *b. NULL handles compare greater than every non-NULL
 * handle, i.e., NULL-handles are sorted to the upper end.
 */
int c_dbcsr_acc_opencl_order_streams(const void* a, const void* b) {
  const cl_command_queue *const p = (const cl_command_queue*)a, *const q = (const cl_command_queue*)b;
  int result;
  assert(NULL != p && NULL != q);
  if (*p == *q) { /* identical handles (includes both being NULL) */
    result = 0;
  }
  else if (NULL == *p) { /* NULL is ordered after any non-NULL handle */
    result = 1;
  }
  else if (NULL == *q) {
    result = -1;
  }
  else { /* both handles are non-NULL: ascending by handle value (as before) */
    result = (*p < *q ? -1 : 1);
  }
  return result;
}


LIBXSMM_ATTRIBUTE_CTOR void c_dbcsr_acc_opencl_init(void) {
/* attempt to automatically initialize backend */
ACC_OPENCL_EXPECT(EXIT_SUCCESS == c_dbcsr_acc_init());
Expand Down Expand Up @@ -229,8 +221,11 @@ int c_dbcsr_acc_init(void) {
# if defined(_OPENMP)
const int max_threads = omp_get_max_threads(), num_threads = omp_get_num_threads();
c_dbcsr_acc_opencl_config.nthreads = (num_threads < max_threads ? max_threads : num_threads);
c_dbcsr_acc_opencl_config.nstreams = (num_threads < max_threads ? (ACC_OPENCL_STREAMS_MAXCOUNT + max_threads)
: (ACC_OPENCL_STREAMS_MAXCOUNT));
# else
c_dbcsr_acc_opencl_config.nthreads = 1;
c_dbcsr_acc_opencl_config.nstreams = ACC_OPENCL_STREAMS_MAXCOUNT;
# endif
c_dbcsr_acc_opencl_config.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose));
c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority));
Expand Down Expand Up @@ -574,12 +569,9 @@ int c_dbcsr_acc_init(void) {
}
# endif
if (EXIT_SUCCESS == result) {
const int nelements = ACC_OPENCL_STREAMS_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads;
const int nelements = c_dbcsr_acc_opencl_config.nthreads * c_dbcsr_acc_opencl_config.nstreams;
c_dbcsr_acc_opencl_config.streams = (void**)calloc(nelements, sizeof(void*)); /* allocate streams */
if (NULL != c_dbcsr_acc_opencl_config.streams) { /* allocate counters */
c_dbcsr_acc_opencl_config.stats = (cl_command_queue*)calloc(nelements, sizeof(cl_command_queue));
}
else result = EXIT_FAILURE;
if (NULL == c_dbcsr_acc_opencl_config.streams) result = EXIT_FAILURE;
}
}
}
Expand Down Expand Up @@ -627,29 +619,6 @@ int c_dbcsr_acc_finalize(void) {
{
fprintf(stderr, " device=%i", d);
}
if (NULL != c_dbcsr_acc_opencl_config.stats) {
const int nelements = ACC_OPENCL_STREAMS_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads;
cl_command_queue s = NULL;
int nstreams, j;
fprintf(stderr, " streams={");
for (i = 0; i < nelements; i += ACC_OPENCL_STREAMS_MAXCOUNT) {
for (j = 0, nstreams = 0; j < ACC_OPENCL_STREAMS_MAXCOUNT; ++j) {
if (NULL != c_dbcsr_acc_opencl_config.stats[i + j]) ++nstreams;
}
if (0 != nstreams || 0 == i) fprintf(stderr, 0 < i ? " %i" : "%i", nstreams);
}
qsort(c_dbcsr_acc_opencl_config.stats, nelements, sizeof(cl_command_queue),
c_dbcsr_acc_opencl_order_streams); /* NULL -> upper end */
for (i = 0, nstreams = 0; i < nelements; ++i) {
const cl_command_queue q = c_dbcsr_acc_opencl_config.stats[i];
if (NULL != q && s != q) {
s = q;
++nstreams;
}
}
free(c_dbcsr_acc_opencl_config.stats); /* release buffer */
fprintf(stderr, "} nstreams=%i", nstreams);
}
fprintf(stderr, "\n");
}
# if defined(__DBCSR_ACC)
Expand Down Expand Up @@ -1086,18 +1055,20 @@ int c_dbcsr_acc_set_active_device(int device_id) {


/**
 * Synchronizes the streams stored in the slots belonging to the given thread.
 * thread_id must satisfy 0 <= thread_id < c_dbcsr_acc_opencl_config.nthreads (asserted).
 * Returns EXIT_SUCCESS, or the first result of c_dbcsr_acc_stream_sync
 * that differs from EXIT_SUCCESS (remaining slots are then not visited).
 */
int c_dbcsr_acc_opencl_device_synchronize(int thread_id) {
/* NOTE(review): the two declarations below look like the previous and the updated
 * form of the same line (both compute the first slot of this thread, using the
 * compile-time ACC_OPENCL_STREAMS_MAXCOUNT and the runtime nstreams value,
 * respectively) - confirm which one applies. */
void** const streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * thread_id;
void** const streams = c_dbcsr_acc_opencl_config.streams + thread_id * c_dbcsr_acc_opencl_config.nstreams;
int result = EXIT_SUCCESS;
int i = 0;
assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads);
assert(NULL != c_dbcsr_acc_opencl_config.streams);
/* NOTE(review): likewise, the two loop headers below look like the previous and
 * the updated form of a single loop over this thread's slots - confirm. */
for (; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) {
for (; i < c_dbcsr_acc_opencl_config.nstreams; ++i) {
void* const stream = streams[i];
/* only occupied (non-NULL) slots are synchronized */
if (NULL != stream) {
result = c_dbcsr_acc_stream_sync(stream);
/* stop at the first failure and report it to the caller */
if (EXIT_SUCCESS != result) break;
}
/* with ACC_OPENCL_STREAM_COMPACT, the first empty slot ends the scan */
# if defined(ACC_OPENCL_STREAM_COMPACT)
else break;
# endif
}
return result;
}
Expand Down
12 changes: 8 additions & 4 deletions src/acc/opencl/acc_opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,11 @@
#endif
/** Counted on a per-thread basis! */
#if !defined(ACC_OPENCL_HANDLES_MAXCOUNT)
# define ACC_OPENCL_HANDLES_MAXCOUNT 1024
# define ACC_OPENCL_HANDLES_MAXCOUNT 64
#endif
/** Counted on a per-thread basis! */
#if !defined(ACC_OPENCL_STREAMS_MAXCOUNT)
# define ACC_OPENCL_STREAMS_MAXCOUNT 1024
# define ACC_OPENCL_STREAMS_MAXCOUNT 64
#endif
#if !defined(ACC_OPENCL_OVERMALLOC)
# if defined(__DBCSR_ACC) || 1
Expand All @@ -115,6 +115,10 @@
# define ACC_OPENCL_STREAM_PRIORITIES
# endif
#endif
/** Streams are registered in compact/consecutive fashion */
#if !defined(ACC_OPENCL_STREAM_COMPACT) && 1
# define ACC_OPENCL_STREAM_COMPACT
#endif
/** Stream-argument (ACC-interface) can be NULL (synchronous) */
#if !defined(ACC_OPENCL_STREAM_NULL) && 1
# define ACC_OPENCL_STREAM_NULL
Expand Down Expand Up @@ -251,8 +255,6 @@ typedef struct c_dbcsr_acc_opencl_config_t {
void **clmems, **events, *storage;
/** All created streams partitioned by thread-ID (thread-local slots). */
void** streams;
/** Counts number of streams created (thread-local). */
cl_command_queue* stats;
/** Kind of timer used for built-in execution-profile. */
c_dbcsr_acc_opencl_timer_t timer; /* c_dbcsr_acc_opencl_device_t? */
/** Kernel-parameters are matched against device's UID */
Expand All @@ -263,6 +265,8 @@ typedef struct c_dbcsr_acc_opencl_config_t {
cl_int ndevices;
/** Maximum number of threads (omp_get_max_threads). */
cl_int nthreads;
/** Maximum number of streams per thread. */
cl_int nstreams;
/** How to apply/use stream priorities. */
cl_int priority;
/** How to zero/copy device-side buffers. */
Expand Down
22 changes: 16 additions & 6 deletions src/acc/opencl/acc_opencl_mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,9 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
*host_mem = NULL;
}
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
}
else { /* error: mapping host buffer */
Expand Down Expand Up @@ -195,7 +197,9 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) {
}
# endif
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
result_release = clReleaseMemObject(info.memory);
if (EXIT_SUCCESS == result) result = result_release;
Expand Down Expand Up @@ -397,7 +401,9 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
result = clEnqueueWriteBuffer(
queue, buffer, 0 == (1 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL);
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
}
}
Expand Down Expand Up @@ -445,7 +451,7 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v
queue, buffer, 0 == (2 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL);
if (CL_SUCCESS == result) {
# if defined(ACC_OPENCL_STREAM_NULL)
result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream) result = c_dbcsr_acc_stream_sync(&queue);
# endif
}
else { /* synchronous */
Expand Down Expand Up @@ -533,7 +539,9 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED);
}
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
}
}
Expand Down Expand Up @@ -604,7 +612,9 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb
LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED);
}
# if defined(ACC_OPENCL_STREAM_NULL)
if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue);
if (NULL == stream && EXIT_SUCCESS == result) {
result = c_dbcsr_acc_stream_sync(&queue);
}
# endif
}
}
Expand Down
43 changes: 24 additions & 19 deletions src/acc/opencl/acc_opencl_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,12 @@ const int* c_dbcsr_acc_opencl_stream_priority(const void* stream) {


void* c_dbcsr_acc_opencl_stream_default(void) {
const int tid = ACC_OPENCL_OMP_TID();
const int base = ACC_OPENCL_STREAMS_MAXCOUNT * tid;
const int tid = ACC_OPENCL_OMP_TID(), base = tid * c_dbcsr_acc_opencl_config.nstreams;
void* result = NULL;
int i = base;
assert(tid < c_dbcsr_acc_opencl_config.nthreads);
assert(NULL != c_dbcsr_acc_opencl_config.streams);
for (; i < (base + ACC_OPENCL_STREAMS_MAXCOUNT); ++i) {
for (; i < (base + c_dbcsr_acc_opencl_config.nstreams); ++i) {
if (NULL != c_dbcsr_acc_opencl_config.streams[i]) {
result = c_dbcsr_acc_opencl_config.streams + i;
break;
Expand All @@ -74,7 +73,6 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
int result, i, tid = 0, offset = 0;
cl_command_queue queue = NULL;
cl_context context = NULL;
void** streams = NULL;
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
int routine_handle;
static const char* const routine_name_ptr = LIBXSMM_FUNCNAME;
Expand Down Expand Up @@ -181,14 +179,15 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
else {
result = EXIT_FAILURE;
}
# if defined(_OPENMP) && 0
# pragma omp critical(c_dbcsr_acc_opencl_stream)
# endif
if (EXIT_SUCCESS == result) {
const int base = ACC_OPENCL_STREAMS_MAXCOUNT * tid;
cl_command_queue* const stats = c_dbcsr_acc_opencl_config.stats + base;
streams = c_dbcsr_acc_opencl_config.streams + base;
for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) {
void** const streams = c_dbcsr_acc_opencl_config.streams + tid * c_dbcsr_acc_opencl_config.nstreams;
for (i = 0; i < c_dbcsr_acc_opencl_config.nstreams; ++i) {
if (NULL == streams[i]) break;
}
if (i < ACC_OPENCL_STREAMS_MAXCOUNT) { /* register stream */
if (i < c_dbcsr_acc_opencl_config.nstreams) { /* register stream */
const size_t size_info = sizeof(c_dbcsr_acc_opencl_info_stream_t);
const size_t size = sizeof(cl_command_queue) + sizeof(void*) + size_info - 1;
void* const handle = malloc(size);
Expand All @@ -201,7 +200,7 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
info->pointer = (void*)address;
info->priority = priority;
info->tid = tid;
stats[i] = *(cl_command_queue*)aligned = queue;
*(cl_command_queue*)aligned = queue;
streams[i] = *stream_p = (void*)aligned;
assert(queue == *ACC_OPENCL_STREAM(streams[i]));
assert(queue == *ACC_OPENCL_STREAM(*stream_p));
Expand Down Expand Up @@ -241,26 +240,32 @@ int c_dbcsr_acc_stream_destroy(void* stream) {
# endif
if (NULL != stream) {
const cl_command_queue queue = *ACC_OPENCL_STREAM(stream);
assert(NULL != c_dbcsr_acc_opencl_config.streams);
if (NULL != queue) {
int tid = 0, i = ACC_OPENCL_STREAMS_MAXCOUNT;
void** streams = NULL;
const int result_release = clReleaseCommandQueue(queue); /* soft-error */
int tid = 0, i = c_dbcsr_acc_opencl_config.nstreams;
assert(NULL != c_dbcsr_acc_opencl_config.streams);
for (; tid < c_dbcsr_acc_opencl_config.nthreads; ++tid) { /* unregister */
streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * tid;
for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) {
void** const streams = c_dbcsr_acc_opencl_config.streams + tid * c_dbcsr_acc_opencl_config.nstreams;
for (i = 0; i < c_dbcsr_acc_opencl_config.nstreams; ++i) {
if (stream == streams[i]) {
const int j = i + 1, result_release = clReleaseCommandQueue(queue); /* soft-error */
if (j < ACC_OPENCL_STREAMS_MAXCOUNT && NULL != streams[j]) { /* compacting streams is not thread-safe */
memmove(streams + i, streams + j, sizeof(void*) * (ACC_OPENCL_STREAMS_MAXCOUNT - j));
# if defined(ACC_OPENCL_STREAM_COMPACT)
const int j = i + 1, k = c_dbcsr_acc_opencl_config.nstreams - j;
if (j < c_dbcsr_acc_opencl_config.nstreams && NULL != streams[j]) { /* compacting streams is not thread-safe */
memmove(streams + i, streams + j, sizeof(void*) * k);
}
streams[ACC_OPENCL_STREAMS_MAXCOUNT - j] = NULL;
# else
const int k = i;
# endif
streams[k] = NULL;
tid = c_dbcsr_acc_opencl_config.nthreads; /* leave outer loop */
result = result_release; /* promote */
break;
}
# if defined(ACC_OPENCL_STREAM_COMPACT)
else if (NULL == streams[i]) { /* compact streams */
break;
}
# endif
}
}
}
Expand Down

0 comments on commit cf6cfbe

Please sign in to comment.