diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 9c6fd919bfc..b4488b1e524 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -170,14 +170,6 @@ int c_dbcsr_acc_opencl_order_devices(const void* dev_a, const void* dev_b) { } -int c_dbcsr_acc_opencl_order_streams(const void* /*a*/, const void* /*b*/); -int c_dbcsr_acc_opencl_order_streams(const void* a, const void* b) { /* NULL-pointers are sorted to the upper end */ - const cl_command_queue *const p = (const cl_command_queue*)a, *const q = (const cl_command_queue*)b; - assert(NULL != p && NULL != q); - return *p < *q ? -1 : (*p > *q ? 1 : 0); -} - - LIBXSMM_ATTRIBUTE_CTOR void c_dbcsr_acc_opencl_init(void) { /* attempt to automatically initialize backend */ ACC_OPENCL_EXPECT(EXIT_SUCCESS == c_dbcsr_acc_init()); @@ -229,8 +221,11 @@ int c_dbcsr_acc_init(void) { # if defined(_OPENMP) const int max_threads = omp_get_max_threads(), num_threads = omp_get_num_threads(); c_dbcsr_acc_opencl_config.nthreads = (num_threads < max_threads ? max_threads : num_threads); + c_dbcsr_acc_opencl_config.nstreams = (num_threads < max_threads ? (ACC_OPENCL_STREAMS_MAXCOUNT + max_threads) + : (ACC_OPENCL_STREAMS_MAXCOUNT)); # else c_dbcsr_acc_opencl_config.nthreads = 1; + c_dbcsr_acc_opencl_config.nstreams = ACC_OPENCL_STREAMS_MAXCOUNT; # endif c_dbcsr_acc_opencl_config.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose)); c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority)); @@ -574,12 +569,9 @@ int c_dbcsr_acc_init(void) { } # endif if (EXIT_SUCCESS == result) { - const int nelements = ACC_OPENCL_STREAMS_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; + const int nelements = c_dbcsr_acc_opencl_config.nthreads * c_dbcsr_acc_opencl_config.nstreams; c_dbcsr_acc_opencl_config.streams = (void**)calloc(nelements, sizeof(void*)); /* allocate streams */ - if (NULL != c_dbcsr_acc_opencl_config.streams) { /* allocate counters */ - c_dbcsr_acc_opencl_config.stats = (cl_command_queue*)calloc(nelements, sizeof(cl_command_queue)); - } - else result = EXIT_FAILURE; + if (NULL == c_dbcsr_acc_opencl_config.streams) result = EXIT_FAILURE; } } } @@ -627,29 +619,6 @@ int c_dbcsr_acc_finalize(void) { { fprintf(stderr, " device=%i", d); } - if (NULL != c_dbcsr_acc_opencl_config.stats) { - const int nelements = ACC_OPENCL_STREAMS_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; - cl_command_queue s = NULL; - int nstreams, j; - fprintf(stderr, " streams={"); - for (i = 0; i < nelements; i += ACC_OPENCL_STREAMS_MAXCOUNT) { - for (j = 0, nstreams = 0; j < ACC_OPENCL_STREAMS_MAXCOUNT; ++j) { - if (NULL != c_dbcsr_acc_opencl_config.stats[i + j]) ++nstreams; - } - if (0 != nstreams || 0 == i) fprintf(stderr, 0 < i ? " %i" : "%i", nstreams); - } - qsort(c_dbcsr_acc_opencl_config.stats, nelements, sizeof(cl_command_queue), - c_dbcsr_acc_opencl_order_streams); /* NULL -> upper end */ - for (i = 0, nstreams = 0; i < nelements; ++i) { - const cl_command_queue q = c_dbcsr_acc_opencl_config.stats[i]; - if (NULL != q && s != q) { - s = q; - ++nstreams; - } - } - free(c_dbcsr_acc_opencl_config.stats); /* release buffer */ - fprintf(stderr, "} nstreams=%i", nstreams); - } fprintf(stderr, "\n"); } # if defined(__DBCSR_ACC) @@ -1086,18 +1055,20 @@ int c_dbcsr_acc_set_active_device(int device_id) { int c_dbcsr_acc_opencl_device_synchronize(int thread_id) { - void** const streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * thread_id; + void** const streams = c_dbcsr_acc_opencl_config.streams + thread_id * c_dbcsr_acc_opencl_config.nstreams; int result = EXIT_SUCCESS; int i = 0; assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads); assert(NULL != c_dbcsr_acc_opencl_config.streams); - for (; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) { + for (; i < c_dbcsr_acc_opencl_config.nstreams; ++i) { void* const stream = streams[i]; if (NULL != stream) { result = c_dbcsr_acc_stream_sync(stream); if (EXIT_SUCCESS != result) break; } +# if defined(ACC_OPENCL_STREAM_COMPACT) else break; +# endif } return result; } diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index 8207201754d..352cf8a1e93 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -89,11 +89,11 @@ #endif /** Counted on a per-thread basis! */ #if !defined(ACC_OPENCL_HANDLES_MAXCOUNT) -# define ACC_OPENCL_HANDLES_MAXCOUNT 1024 +# define ACC_OPENCL_HANDLES_MAXCOUNT 64 #endif /** Counted on a per-thread basis! */ #if !defined(ACC_OPENCL_STREAMS_MAXCOUNT) -# define ACC_OPENCL_STREAMS_MAXCOUNT 1024 +# define ACC_OPENCL_STREAMS_MAXCOUNT 64 #endif #if !defined(ACC_OPENCL_OVERMALLOC) # if defined(__DBCSR_ACC) || 1 @@ -115,6 +115,10 @@ # define ACC_OPENCL_STREAM_PRIORITIES # endif #endif +/** Streams are registered in compact/consecutive fashion */ +#if !defined(ACC_OPENCL_STREAM_COMPACT) && 1 +# define ACC_OPENCL_STREAM_COMPACT +#endif /** Stream-argument (ACC-interface) can be NULL (synchronous) */ #if !defined(ACC_OPENCL_STREAM_NULL) && 1 # define ACC_OPENCL_STREAM_NULL @@ -251,8 +255,6 @@ typedef struct c_dbcsr_acc_opencl_config_t { void **clmems, **events, *storage; /** All created streams partitioned by thread-ID (thread-local slots). */ void** streams; - /** Counts number of streams created (thread-local). */ - cl_command_queue* stats; /** Kind of timer used for built-in execution-profile. */ c_dbcsr_acc_opencl_timer_t timer; /* c_dbcsr_acc_opencl_device_t? */ /** Kernel-parameters are matched against device's UID */ @@ -263,6 +265,8 @@ typedef struct c_dbcsr_acc_opencl_config_t { cl_int ndevices; /** Maximum number of threads (omp_get_max_threads). */ cl_int nthreads; + /** Maximum number of streams per thread. */ + cl_int nstreams; /** How to apply/use stream priorities. */ cl_int priority; /** How to zero/copy device-side buffers. */ diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index 5ef0c32712c..19420c73dd1 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ b/src/acc/opencl/acc_opencl_mem.c @@ -148,7 +148,9 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) *host_mem = NULL; } # if defined(ACC_OPENCL_STREAM_NULL) - if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue); + if (NULL == stream && EXIT_SUCCESS == result) { + result = c_dbcsr_acc_stream_sync(&queue); + } # endif } else { /* error: mapping host buffer */ @@ -195,7 +197,9 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) { } # endif # if defined(ACC_OPENCL_STREAM_NULL) - if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue); + if (NULL == stream && EXIT_SUCCESS == result) { + result = c_dbcsr_acc_stream_sync(&queue); + } # endif result_release = clReleaseMemObject(info.memory); if (EXIT_SUCCESS == result) result = result_release; @@ -397,7 +401,9 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v result = clEnqueueWriteBuffer( queue, buffer, 0 == (1 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL); # if defined(ACC_OPENCL_STREAM_NULL) - if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue); + if (NULL == stream && EXIT_SUCCESS == result) { + result = c_dbcsr_acc_stream_sync(&queue); + } # endif } } @@ -445,7 +451,7 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v queue, buffer, 0 == (2 & c_dbcsr_acc_opencl_config.async), offset, nbytes, host_mem, 0, NULL, NULL); if (CL_SUCCESS == result) { # if defined(ACC_OPENCL_STREAM_NULL) - result = c_dbcsr_acc_stream_sync(&queue); + if (NULL == stream) result = c_dbcsr_acc_stream_sync(&queue); # endif } else { /* synchronous */ @@ -533,7 +539,9 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED); } # if defined(ACC_OPENCL_STREAM_NULL) - if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue); + if (NULL == stream && EXIT_SUCCESS == result) { + result = c_dbcsr_acc_stream_sync(&queue); + } # endif } } @@ -604,7 +612,9 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED); } # if defined(ACC_OPENCL_STREAM_NULL) - if (EXIT_SUCCESS == result) result = c_dbcsr_acc_stream_sync(&queue); + if (NULL == stream && EXIT_SUCCESS == result) { + result = c_dbcsr_acc_stream_sync(&queue); + } # endif } } diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c index d52b7910b30..0b432d6061d 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -51,13 +51,12 @@ const int* c_dbcsr_acc_opencl_stream_priority(const void* stream) { void* c_dbcsr_acc_opencl_stream_default(void) { - const int tid = ACC_OPENCL_OMP_TID(); - const int base = ACC_OPENCL_STREAMS_MAXCOUNT * tid; + const int tid = ACC_OPENCL_OMP_TID(), base = tid * c_dbcsr_acc_opencl_config.nstreams; void* result = NULL; int i = base; assert(tid < c_dbcsr_acc_opencl_config.nthreads); assert(NULL != c_dbcsr_acc_opencl_config.streams); - for (; i < (base + ACC_OPENCL_STREAMS_MAXCOUNT); ++i) { + for (; i < (base + c_dbcsr_acc_opencl_config.nstreams); ++i) { if (NULL != c_dbcsr_acc_opencl_config.streams[i]) { result = c_dbcsr_acc_opencl_config.streams + i; break; @@ -74,7 +73,6 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { int result, i, tid = 0, offset = 0; cl_command_queue queue = NULL; cl_context context = NULL; - void** streams = NULL; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; @@ -181,14 +179,15 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { else { result = EXIT_FAILURE; } +# if defined(_OPENMP) && 0 +# pragma omp critical(c_dbcsr_acc_opencl_stream) +# endif if (EXIT_SUCCESS == result) { - const int base = ACC_OPENCL_STREAMS_MAXCOUNT * tid; - cl_command_queue* const stats = c_dbcsr_acc_opencl_config.stats + base; - streams = c_dbcsr_acc_opencl_config.streams + base; - for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) { + void** const streams = c_dbcsr_acc_opencl_config.streams + tid * c_dbcsr_acc_opencl_config.nstreams; + for (i = 0; i < c_dbcsr_acc_opencl_config.nstreams; ++i) { if (NULL == streams[i]) break; } - if (i < ACC_OPENCL_STREAMS_MAXCOUNT) { /* register stream */ + if (i < c_dbcsr_acc_opencl_config.nstreams) { /* register stream */ const size_t size_info = sizeof(c_dbcsr_acc_opencl_info_stream_t); const size_t size = sizeof(cl_command_queue) + sizeof(void*) + size_info - 1; void* const handle = malloc(size); @@ -201,7 +200,7 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { info->pointer = (void*)address; info->priority = priority; info->tid = tid; - stats[i] = *(cl_command_queue*)aligned = queue; + *(cl_command_queue*)aligned = queue; streams[i] = *stream_p = (void*)aligned; assert(queue == *ACC_OPENCL_STREAM(streams[i])); assert(queue == *ACC_OPENCL_STREAM(*stream_p)); @@ -241,26 +240,32 @@ int c_dbcsr_acc_stream_destroy(void* stream) { # endif if (NULL != stream) { const cl_command_queue queue = *ACC_OPENCL_STREAM(stream); - assert(NULL != c_dbcsr_acc_opencl_config.streams); if (NULL != queue) { - int tid = 0, i = ACC_OPENCL_STREAMS_MAXCOUNT; - void** streams = NULL; + const int result_release = clReleaseCommandQueue(queue); /* soft-error */ + int tid = 0, i = c_dbcsr_acc_opencl_config.nstreams; + assert(NULL != c_dbcsr_acc_opencl_config.streams); for (; tid < c_dbcsr_acc_opencl_config.nthreads; ++tid) { /* unregister */ - streams = c_dbcsr_acc_opencl_config.streams + ACC_OPENCL_STREAMS_MAXCOUNT * tid; - for (i = 0; i < ACC_OPENCL_STREAMS_MAXCOUNT; ++i) { + void** const streams = c_dbcsr_acc_opencl_config.streams + tid * c_dbcsr_acc_opencl_config.nstreams; + for (i = 0; i < c_dbcsr_acc_opencl_config.nstreams; ++i) { if (stream == streams[i]) { - const int j = i + 1, result_release = clReleaseCommandQueue(queue); /* soft-error */ - if (j < ACC_OPENCL_STREAMS_MAXCOUNT && NULL != streams[j]) { /* compacting streams is not thread-safe */ - memmove(streams + i, streams + j, sizeof(void*) * (ACC_OPENCL_STREAMS_MAXCOUNT - j)); +# if defined(ACC_OPENCL_STREAM_COMPACT) + const int j = i + 1, k = c_dbcsr_acc_opencl_config.nstreams - j; + if (j < c_dbcsr_acc_opencl_config.nstreams && NULL != streams[j]) { /* compacting streams is not thread-safe */ + memmove(streams + i, streams + j, sizeof(void*) * k); } - streams[ACC_OPENCL_STREAMS_MAXCOUNT - j] = NULL; +# else + const int k = i; +# endif + streams[k] = NULL; tid = c_dbcsr_acc_opencl_config.nthreads; /* leave outer loop */ result = result_release; /* promote */ break; } +# if defined(ACC_OPENCL_STREAM_COMPACT) else if (NULL == streams[i]) { /* compact streams */ break; } +# endif } } }