diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index b4488b1e524..db5d2d79d6f 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -56,7 +56,6 @@ extern "C" { c_dbcsr_acc_opencl_config_t c_dbcsr_acc_opencl_config; -# if !defined(NDEBUG) void c_dbcsr_acc_opencl_notify(const char /*errinfo*/[], const void* /*private_info*/, size_t /*cb*/, void* /*user_data*/); void c_dbcsr_acc_opencl_notify(const char errinfo[], const void* private_info, size_t cb, void* user_data) { LIBXSMM_UNUSED(private_info); @@ -64,7 +63,6 @@ void c_dbcsr_acc_opencl_notify(const char errinfo[], const void* private_info, s LIBXSMM_UNUSED(user_data); fprintf(stderr, "ERROR ACC/OpenCL: %s\n", errinfo); } -# endif cl_context c_dbcsr_acc_opencl_context(int* thread_id) { @@ -919,11 +917,8 @@ int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id active_id) { result = clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL); assert(CL_SUCCESS != result || NULL != platform); if (CL_SUCCESS == result) { -# if defined(NDEBUG) - void (*const notify)(const char*, const void*, size_t, void*) = NULL; -# else - void (*const notify)(const char*, const void*, size_t, void*) = c_dbcsr_acc_opencl_notify; -# endif + void (*const notify)( + const char*, const void*, size_t, void*) = (0 != c_dbcsr_acc_opencl_config.verbosity ? c_dbcsr_acc_opencl_notify : NULL); cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, 0 /*placeholder*/, 0 /* end of properties */ }; @@ -1007,27 +1002,46 @@ int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id) { } } if (EXIT_SUCCESS == result) { /* update/cache device-specific information */ - char devname[ACC_OPENCL_BUFFERSIZE]; + result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device[thread_id].level, + c_dbcsr_acc_opencl_config.device[thread_id].level + 1, NULL /*cl_std*/, + &c_dbcsr_acc_opencl_config.device[thread_id].type); + if (EXIT_SUCCESS == result) { + char devname[ACC_OPENCL_BUFFERSIZE]; # if defined(CL_VERSION_2_0) - const char* const env_svm = getenv("ACC_OPENCL_SVM"); - int level_major = 0; - const int nok = (NULL == env_svm || EXIT_SUCCESS != c_dbcsr_acc_opencl_device_level(active_id, &level_major, - NULL /*level_minor*/, NULL /*cl_std*/, NULL /*type*/)); - c_dbcsr_acc_opencl_config.device[thread_id].svm_interop = ((0 != nok || 2 > level_major) ? 0 : atoi(env_svm)); + const char* const env_svm = getenv("ACC_OPENCL_SVM"); + c_dbcsr_acc_opencl_config.device[thread_id].svm_interop = + ((NULL == env_svm || 2 > *c_dbcsr_acc_opencl_config.device[thread_id].level) ? 0 : atoi(env_svm)); # endif - if (CL_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), - &c_dbcsr_acc_opencl_config.device[thread_id].unified, NULL)) - { - c_dbcsr_acc_opencl_config.device[thread_id].unified = CL_FALSE; - } - if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, - 0 /*platform_maxlen*/, /*cleanup*/ 1) || - EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device[thread_id].uid)) - { - c_dbcsr_acc_opencl_config.device[thread_id].uid = (cl_uint)-1; + if (CL_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), + &c_dbcsr_acc_opencl_config.device[thread_id].unified, NULL)) + { + c_dbcsr_acc_opencl_config.device[thread_id].unified = CL_FALSE; + } + if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, + 0 /*platform_maxlen*/, /*cleanup*/ 1) || + EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device[thread_id].uid)) + { + c_dbcsr_acc_opencl_config.device[thread_id].uid = (cl_uint)-1; + } + c_dbcsr_acc_opencl_config.device[thread_id].intel = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor( + active_id, "intel", 0 /*use_platform_name*/)); + c_dbcsr_acc_opencl_config.device[thread_id].nv = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor( + active_id, "nvidia", 0 /*use_platform_name*/)); + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) || + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/)) + { + char buffer[ACC_OPENCL_BUFFERSIZE]; + c_dbcsr_acc_opencl_config.device[thread_id].amd = 1; + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_id, buffer, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, + 0 /*platform_maxlen*/, /*cleanup*/ 1)) + { + const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx"); + if (NULL != gfxname && 90 <= atoi(gfxname + 3)) { + c_dbcsr_acc_opencl_config.device[thread_id].amd = 2; + } + } + } } - c_dbcsr_acc_opencl_config.device[thread_id].intel = - (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/) ? CL_TRUE : CL_FALSE); } } } @@ -1135,7 +1149,148 @@ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, size_t* max } -int c_dbcsr_acc_opencl_build_flags(const char build_params[], const char build_options[], const char try_build_options[], +int c_dbcsr_acc_opencl_flags_atomics(cl_device_id device_id, c_dbcsr_acc_opencl_atomic_fp_t kind, + const c_dbcsr_acc_opencl_device_t* devinfo, const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen) { + int result = 0, ext1, ext2; + for (ext1 = 0; ext1 < exts_maxlen; ++ext1) + if (NULL == exts[ext1] || '\0' == *exts[ext1]) break; + for (ext2 = ext1 + 1; ext2 < exts_maxlen; ++ext2) + if (NULL == exts[ext2] || '\0' == *exts[ext2]) break; + if (NULL != devinfo && ext2 < exts_maxlen) { + const char* atomic_type = ""; + switch (kind) { + case c_dbcsr_acc_opencl_atomic_fp_64: { + exts[ext1] = "cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"; + if (2 <= *devinfo->level && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) { + atomic_type = "-DTA=long -DTA2=atomic_long -DTF=atomic_double"; + } + else { + exts[ext1] = "cl_khr_fp64 cl_khr_int64_base_atomics"; + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) { + atomic_type = "-DTA=long"; + } + else { /* fallback */ + exts[ext1] = "cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics"; + if (2 <= *devinfo->level && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) { + atomic_type = "-DATOMIC32_ADD64 -DTA=int -DTA2=atomic_int -DTF=atomic_double"; + } + else { + exts[ext1] = "cl_khr_fp64 cl_khr_global_int32_base_atomics"; + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) { + atomic_type = "-DATOMIC32_ADD64 -DTA=int"; + } + else kind = c_dbcsr_acc_opencl_atomic_fp_no; + } + } + } + } break; + case c_dbcsr_acc_opencl_atomic_fp_32: { + exts[ext1] = "cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics"; + if (2 <= *devinfo->level && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) { + exts[ext2] = "cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"; + atomic_type = "-DTA=int -DTA2=atomic_int -DTF=atomic_float"; + } + else { + exts[ext1] = "cl_khr_global_int32_base_atomics"; + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) { + exts[ext2] = "cl_khr_int64_base_atomics"; + atomic_type = "-DTA=int"; + } + else kind = c_dbcsr_acc_opencl_atomic_fp_no; + } + } break; + default: assert(c_dbcsr_acc_opencl_atomic_fp_no == kind); + } + if (c_dbcsr_acc_opencl_atomic_fp_no != kind) { + const char *barrier_expr = NULL, *atomic_exp = NULL, *atomic_ops = ""; + const char* const env_barrier = getenv("ACC_OPENCL_BARRIER"); + const char* const env_atomics = getenv("ACC_OPENCL_ATOMICS"); + if (NULL == env_barrier || '0' != *env_barrier) { + barrier_expr = ((2 <= *devinfo->level && (0 == devinfo->intel || (CL_DEVICE_TYPE_CPU != devinfo->type))) + ? "-D\"BARRIER(A)=work_group_barrier(A,memory_scope_work_group)\"" + : "-D\"BARRIER(A)=barrier(A)\""); + } + else barrier_expr = ""; /* no barrier */ + assert(NULL != barrier_expr); + if (NULL == env_atomics || '0' != *env_atomics) { + /* can signal/force atomics without confirmation */ + const int force_atomics = ((NULL == env_atomics || '\0' == *env_atomics) ? 0 : atoi(env_atomics)); + if (NULL == env_atomics || '\0' == *env_atomics || 0 != force_atomics) { + cl_bitfield fp_atomics; + if (CL_SUCCESS == clGetDeviceInfo(device_id, (cl_device_info)(c_dbcsr_acc_opencl_atomic_fp_64 == kind ? 0x4232 : 0x4231), + sizeof(cl_bitfield), &fp_atomics, NULL) && + 0 != (/*add*/ (1 << 1) & fp_atomics)) + { + exts[ext2] = "cl_ext_float_atomics"; + atomic_exp = (c_dbcsr_acc_opencl_atomic_fp_64 == kind + ? "atomic_fetch_add_explicit((GLOBAL_VOLATILE(atomic_double)*)A,B," + "memory_order_relaxed,memory_scope_work_group)" + : "atomic_fetch_add_explicit((GLOBAL_VOLATILE(atomic_float)*)A,B," + "memory_order_relaxed,memory_scope_work_group)"); + } + else if (0 != force_atomics || (0 != devinfo->intel && ((0x4905 != devinfo->uid && 0 == devinfo->unified)))) { + if ((((0 != force_atomics || (0 != devinfo->intel && ((0x0bd0 <= devinfo->uid && 0x0bdb >= devinfo->uid) || + c_dbcsr_acc_opencl_atomic_fp_32 == kind)))))) + { + if (0 == force_atomics && (0 == devinfo->intel || 0x0bd0 > devinfo->uid || 0x0bdb < devinfo->uid)) { + exts[ext2] = "cl_intel_global_float_atomics"; + atomic_ops = "-Dcl_intel_global_float_atomics"; + } + else { + atomic_ops = ((2 > *devinfo->level && 2 > force_atomics) + ? "-DATOMIC_PROTOTYPES=1" + : (3 > force_atomics ? "-DATOMIC_PROTOTYPES=2" : "-DATOMIC_PROTOTYPES=3")); + } + atomic_exp = ((2 > *devinfo->level && 2 > force_atomics) ? "atomic_add(A,B)" + : "atomic_fetch_add_explicit((GLOBAL_VOLATILE(TF)*)A,B," + "memory_order_relaxed,memory_scope_work_group)"); + } + else { + atomic_exp = "atomic_add_global_cmpxchg(A,B)"; + atomic_ops = "-DCMPXCHG=atom_cmpxchg"; + } + } + else if (0 == devinfo->nv) { + if (1 >= devinfo->amd) { + atomic_ops = (c_dbcsr_acc_opencl_atomic_fp_32 == kind ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg"); + atomic_exp = "atomic_add_global_cmpxchg(A,B)"; + exts[ext2] = NULL; + } + else { /* GCN */ + atomic_exp = (c_dbcsr_acc_opencl_atomic_fp_64 == kind + ? "__builtin_amdgcn_global_atomic_fadd_f64(A,B,__ATOMIC_RELAXED)" + : "__builtin_amdgcn_global_atomic_fadd_f32(A,B,__ATOMIC_RELAXED)"); + } + } + else { /* xchg */ + assert(NULL != atomic_ops && '\0' == *atomic_ops); + atomic_exp = "atomic_add_global_xchg(A,B)"; + } + } + else if (NULL != LIBXSMM_STRISTR(env_atomics, "cmpxchg")) { + atomic_ops = (c_dbcsr_acc_opencl_atomic_fp_32 == kind ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg"); + atomic_exp = "atomic_add_global_cmpxchg(A,B)"; + exts[ext2] = NULL; + } + else { /* xchg */ + atomic_exp = "atomic_add_global_xchg(A,B)"; + atomic_ops = (c_dbcsr_acc_opencl_atomic_fp_32 == kind ? "-DXCHG=atomic_xchg" : "-DXCHG=atom_xchg"); + } + } + else { /* unsynchronized */ + atomic_exp = "*(A)+=(B)"; /* non-atomic update */ + } + assert(NULL != atomic_exp); + /* compose build parameters and flags */ + result = LIBXSMM_SNPRINTF(flags, flags_maxlen, "-DTAN=%i %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s", kind, atomic_type, + atomic_ops, atomic_exp, barrier_expr); + } + } + return result; +} + + +int c_dbcsr_acc_opencl_flags(const char build_params[], const char build_options[], const char try_build_options[], const char cl_std[], char buffer[], size_t buffer_size) { int result; if (NULL != buffer) { @@ -1334,13 +1489,12 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha program = clCreateProgramWithSource(context, 1 /*nlines*/, &ext_source, NULL, &result); if (CL_SUCCESS == result) { assert(NULL != program); - result = c_dbcsr_acc_opencl_build_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer)); + result = c_dbcsr_acc_opencl_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer)); if (EXIT_SUCCESS == result) { result = clBuildProgram(program, 1 /*num_devices*/, &active_id, buffer, NULL /*callback*/, NULL /*user_data*/); } if (CL_SUCCESS != result && NULL != try_build_options && '\0' != *try_build_options) { - result = c_dbcsr_acc_opencl_build_flags( - build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer)); + result = c_dbcsr_acc_opencl_flags(build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer)); if (EXIT_SUCCESS == result) { ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); /* recreate below (to avoid unclean state) */ program = clCreateProgramWithSource(context, 1 /*nlines*/, &ext_source, NULL, &result); @@ -1428,13 +1582,12 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha } if (CL_SUCCESS == result) { assert(NULL != program); - result = c_dbcsr_acc_opencl_build_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer)); + result = c_dbcsr_acc_opencl_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer)); if (EXIT_SUCCESS == result) { result = clBuildProgram(program, 1 /*num_devices*/, &active_id, buffer, NULL /*callback*/, NULL /*user_data*/); } if (CL_SUCCESS != result && NULL != try_build_options && '\0' != *try_build_options) { - result = c_dbcsr_acc_opencl_build_flags( - build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer)); + result = c_dbcsr_acc_opencl_flags(build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer)); if (EXIT_SUCCESS == result) { ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); /* recreate below (to avoid unclean state) */ # if defined(CL_VERSION_2_1) diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index 352cf8a1e93..58e1e4ea9e9 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -218,28 +218,32 @@ extern "C" { #endif -/** Enumeration of timer kinds used for built-in execution-profile. */ -typedef enum c_dbcsr_acc_opencl_timer_t { - c_dbcsr_acc_opencl_timer_device, - c_dbcsr_acc_opencl_timer_host -} c_dbcsr_acc_opencl_timer_t; - /** Settings updated during c_dbcsr_acc_set_active_device. */ typedef struct c_dbcsr_acc_opencl_device_t { /** Activated device context. */ cl_context context; + /** OpenCL support-level of device. */ + cl_int level[2]; + /** Kind of device (GPU, CPU, or other). */ + cl_device_type type; #if defined(CL_VERSION_2_0) /** Runtime SVM support. */ cl_bool svm_interop; #endif + /** Whether host memory is unified. */ + cl_bool unified; /** Device-ID. */ cl_uint uid; - /** Intel device? */ - cl_bool intel; - /** Whether host memory is unified or not. */ - cl_bool unified; + /** Main vendor? */ + cl_int intel, amd, nv; } c_dbcsr_acc_opencl_device_t; +/** Enumeration of timer kinds used for built-in execution-profile. */ +typedef enum c_dbcsr_acc_opencl_timer_t { + c_dbcsr_acc_opencl_timer_device, + c_dbcsr_acc_opencl_timer_host +} c_dbcsr_acc_opencl_timer_t; + /** * Settings discovered/setup during c_dbcsr_acc_init (independent of the device) * and settings updated during c_dbcsr_acc_set_active_device (devinfo). @@ -295,7 +299,7 @@ typedef struct c_dbcsr_acc_opencl_info_hostptr_t { c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory); /** Determines cl_mem object and offset of memory. */ -void* c_dbcsr_acc_opencl_info_devptr(const void* memory, const size_t* amount, size_t* offset); +void* c_dbcsr_acc_opencl_info_devptr(const void* memory, size_t elsize, const size_t* amount, size_t* offset); /** Information about streams (c_dbcsr_acc_stream_create). */ typedef struct c_dbcsr_acc_opencl_info_stream_t { @@ -324,7 +328,7 @@ int c_dbcsr_acc_opencl_device_uid(cl_device_id device, const char devname[], uns /** Based on the device-ID, return the device's UID (capture or calculate), device name, and platform name. */ int c_dbcsr_acc_opencl_device_name( cl_device_id device, char name[], size_t name_maxlen, char platform[], size_t platform_maxlen, int cleanup); -/** Return the OpenCL support level for the given device. */ +/** Return the OpenCL support-level for the given device. */ int c_dbcsr_acc_opencl_device_level(cl_device_id device, int* level_major, int* level_minor, char cl_std[16], cl_device_type* type); /** Check if given device supports the extensions. */ int c_dbcsr_acc_opencl_device_ext(cl_device_id device, const char* const extnames[], int num_exts); @@ -334,10 +338,6 @@ int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id device_id); int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id); /** Get preferred multiple and max. size of workgroup (kernel- or device-specific). */ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, size_t* max_value, size_t* preferred_multiple); -/** Assemble various flags for calling clBuildProgram into the given buffer.*/ -/** Combines build-params and build-options, some optional flags (try_build_options), and applies language std. (cl_std). */ -int c_dbcsr_acc_opencl_build_flags(const char build_params[], const char build_options[], const char try_build_options[], - const char cl_std[], char buffer[], size_t buffer_size); /** * Build kernel from source with given kernel_name, build_params and build_options. * The build_params are meant to instantiate the kernel (-D) whereas build_options @@ -351,6 +351,21 @@ int c_dbcsr_acc_opencl_device_synchronize(int thread_id); /** Create user-event if not created and sets initial state. */ int c_dbcsr_acc_opencl_event_create(cl_event* event_p); +/** Enumeration of FP-atomic kinds. */ +typedef enum c_dbcsr_acc_opencl_atomic_fp_t { + c_dbcsr_acc_opencl_atomic_fp_no = 0, + c_dbcsr_acc_opencl_atomic_fp_32 = 1, + c_dbcsr_acc_opencl_atomic_fp_64 = 2 +} c_dbcsr_acc_opencl_atomic_fp_t; + +/** Assemble flags to support atomic operations. */ +int c_dbcsr_acc_opencl_flags_atomics(cl_device_id device_id, c_dbcsr_acc_opencl_atomic_fp_t kind, + const c_dbcsr_acc_opencl_device_t* devinfo, const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen); + +/** Combines build-params and build-options, some optional flags (try_build_options), and applies language std. (cl_std). */ +int c_dbcsr_acc_opencl_flags(const char build_params[], const char build_options[], const char try_build_options[], + const char cl_std[], char buffer[], size_t buffer_size); + #if defined(__cplusplus) } #endif diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index fa25994fedc..f35b258e572 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ b/src/acc/opencl/acc_opencl_mem.c @@ -61,11 +61,11 @@ c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory) } -void* c_dbcsr_acc_opencl_info_devptr(const void* memory, const size_t* amount, size_t* offset) { +void* c_dbcsr_acc_opencl_info_devptr(const void* memory, size_t elsize, const size_t* amount, size_t* offset) { void* result = NULL; # if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) - if (NULL != c_dbcsr_acc_opencl_config.clmems && NULL != memory) { + if (NULL != c_dbcsr_acc_opencl_config.clmems && NULL != memory && 0 < elsize) { const char* const buffer = (const char*)memory; const size_t n = ACC_OPENCL_HANDLES_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads; size_t i = c_dbcsr_acc_opencl_config.nclmems, hit = (size_t)-1; @@ -78,20 +78,21 @@ void* c_dbcsr_acc_opencl_info_devptr(const void* memory, const size_t* amount, s result = handle; break; } - else if (NULL != mem) { + else if (NULL != mem && mem < buffer && NULL != offset) { size_t d = buffer - mem, s = 0; - if (d < hit && NULL != offset && - (NULL == amount || - (CL_SUCCESS == clGetMemObjectInfo((cl_mem)mem, CL_MEM_SIZE, sizeof(size_t), &s, NULL) && (*amount + d) <= s))) + if (d < hit && CL_SUCCESS == clGetMemObjectInfo((cl_mem)mem, CL_MEM_SIZE, sizeof(size_t), &s, NULL) && + (1 == elsize || (0 == (d % elsize) && 0 == (s % elsize))) && (NULL == amount || (*amount + d) <= s)) { - *offset = hit = d; + *offset = (1 == elsize ? d : (d / elsize)); result = handle; + hit = d; } } } } # else LIBXSMM_UNUSED(memory); + LIBXSMM_UNUSED(elsize); LIBXSMM_UNUSED(amount); LIBXSMM_UNUSED(offset); # endif @@ -311,7 +312,7 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) { # pragma omp critical(c_dbcsr_acc_dev_mem_deallocate) # endif { - void** handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, NULL /*amount*/, NULL /*offset*/); + void** handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/); if (NULL != handle) { void** const pfree = c_dbcsr_acc_opencl_config.clmems[c_dbcsr_acc_opencl_config.nclmems]; libxsmm_pfree(pfree, c_dbcsr_acc_opencl_config.clmems, &c_dbcsr_acc_opencl_config.nclmems); @@ -383,7 +384,7 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) assert(NULL != c_dbcsr_acc_opencl_config.clmems); { - void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, &nbytes, &offset); + void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset); if (NULL != handle) buffer = *(cl_mem*)handle; # if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) else result = EXIT_FAILURE; @@ -432,7 +433,7 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) assert(NULL != c_dbcsr_acc_opencl_config.clmems); { - void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, &nbytes, &offset); + void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset); if (NULL != handle) buffer = *(cl_mem*)handle; # if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) else result = EXIT_FAILURE; @@ -489,8 +490,8 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) assert(NULL != c_dbcsr_acc_opencl_config.clmems); { - void* const handle_src = c_dbcsr_acc_opencl_info_devptr(devmem_src, &nbytes, &src_offset); - void* const handle_dst = c_dbcsr_acc_opencl_info_devptr(devmem_dst, &nbytes, &dst_offset); + void* const handle_src = c_dbcsr_acc_opencl_info_devptr(devmem_src, 1 /*elsize*/, &nbytes, &src_offset); + void* const handle_dst = c_dbcsr_acc_opencl_info_devptr(devmem_dst, 1 /*elsize*/, &nbytes, &dst_offset); if (NULL != handle_src) src = *(cl_mem*)handle_src; # if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) else result = EXIT_FAILURE; @@ -567,7 +568,7 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb # if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \ defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT) if (0 == offset && NULL != c_dbcsr_acc_opencl_config.clmems) { - void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, &nbytes, &offset); + void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset); if (NULL != handle) buffer = *(cl_mem*)handle; # if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG) else result = EXIT_FAILURE; diff --git a/src/acc/opencl/common/opencl_atomics.h b/src/acc/opencl/common/opencl_atomics.h index 9d285b9a7c5..7809a5216cd 100644 --- a/src/acc/opencl/common/opencl_atomics.h +++ b/src/acc/opencl/common/opencl_atomics.h @@ -11,6 +11,20 @@ #include "opencl_common.h" +#if (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) +# if !defined(T) +# define T double +# endif +# define ZERO 0.0 +#elif (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/) +# if !defined(T) +# define T float +# endif +# define ZERO 0.f +#elif defined(T) /*c_dbcsr_acc_opencl_atomic_fp_no*/ +# define ZERO 0 +#endif + #define GLOBAL_VOLATILE(A) global volatile A #if defined(ATOMIC_PROTOTYPES) || defined(__opencl_c_ext_fp64_global_atomic_add) @@ -38,7 +52,7 @@ __attribute__((overloadable)) T atomic_add(GLOBAL_VOLATILE(T) *, T); #define ACCUMULATE(A, B) ATOMIC_ADD_GLOBAL(A, B) -#if !defined(cl_intel_global_float_atomics) || (1 != TN) +#if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) # if defined(ATOMIC32_ADD64) __attribute__((always_inline)) inline void atomic32_add64_global(GLOBAL_VOLATILE(double) * dst, double inc) { *dst += inc; /* TODO */ @@ -47,7 +61,7 @@ __attribute__((always_inline)) inline void atomic32_add64_global(GLOBAL_VOLATILE #endif -#if !defined(cl_intel_global_float_atomics) || (1 != TN) +#if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) # if defined(CMPXCHG) __attribute__((always_inline)) inline void atomic_add_global_cmpxchg(GLOBAL_VOLATILE(T) * dst, T inc) { # if !defined(ATOMIC32_ADD64) @@ -74,8 +88,8 @@ __attribute__((always_inline)) inline void atomic_add_global_cmpxchg(GLOBAL_VOLA #endif -#if !defined(cl_intel_global_float_atomics) || (1 != TN) -# if defined(ATOMIC_ADD2_GLOBAL) && (1 == TN) +#if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) +# if defined(ATOMIC_ADD2_GLOBAL) && (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/) __attribute__((always_inline)) inline void atomic_add_global_cmpxchg2(GLOBAL_VOLATILE(float) * dst, float2 inc) { union { float2 f; @@ -97,13 +111,13 @@ __attribute__((always_inline)) inline void atomic_add_global_cmpxchg2(GLOBAL_VOL #endif -#if !defined(cl_intel_global_float_atomics) || (1 != TN) +#if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) # if defined(XCHG) || (defined(__NV_CL_C_VERSION) && !defined(CMPXCHG) && !defined(ATOMIC_PROTOTYPES)) __attribute__((always_inline)) inline void atomic_add_global_xchg(GLOBAL_VOLATILE(T) * dst, T inc) { # if !defined(ATOMIC32_ADD64) -# if (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (1 == TN) +# if (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/) asm("{ .reg .f32 t; atom.global.add.f32 t, [%0], %1; }" ::"l"(dst), "f"(inc)); -# elif (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (3 == TN) +# elif (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/) asm("{ .reg .f64 t; atom.global.add.f64 t, [%0], %1; }" ::"l"(dst), "d"(inc)); # else union { diff --git a/src/acc/opencl/common/opencl_common.h b/src/acc/opencl/common/opencl_common.h index 9cf10fb4887..1b890707b76 100644 --- a/src/acc/opencl/common/opencl_common.h +++ b/src/acc/opencl/common/opencl_common.h @@ -30,12 +30,4 @@ # define UNROLL(N) UNROLL_FORCE(N) #endif -#if (1 == TN) -# define ZERO 0.f -#elif (3 == TN) -# define ZERO 0.0 -#else -# define ZERO 0 -#endif - #endif /*OPENCL_COMMON_H*/ diff --git a/src/acc/opencl/smm/README.md b/src/acc/opencl/smm/README.md index e4b8d6a470d..a34c0700942 100644 --- a/src/acc/opencl/smm/README.md +++ b/src/acc/opencl/smm/README.md @@ -13,7 +13,6 @@ There are two categories for the two domains in LIBSMM, i.e., matrix transpose ( The most common settings for multiplying matrices are: * `OPENCL_LIBSMM_SMM_BUILDOPTS`: character string with build options (compile and link) supplied to the OpenCL runtime compiler. -* `OPENCL_LIBSMM_SMM_ATOMICS`: selects the kind of atomic operation used for global memory updates (`xchg`, `cmpxchg`, `cmpxchg2`), attempts to force atomic instructions, or disables atomic instructions (`0`). The latter is for instance to quantify the impact of atomic operations. * `OPENCL_LIBSMM_SMM_PARAMS`: Disable embedded/auto-tuned parameters (`0`), or load CSV-file (e.g., `path/to/tune_multiply.csv`). * `OPENCL_LIBSMM_SMM_BS`: non-negative integer number denoting the intra-kernel (mini-)batchsize mainly used to amortize atomic updates of data in global/main memory. The remainder with respect to the "stacksize" is handled by the kernel. * `OPENCL_LIBSMM_SMM_BM`: non-negative integer number (less/equal than the M-extent) denoting the blocksize in M-direction. diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index a446d8d39de..934d075e893 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -1160,14 +1160,11 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, /* determine kernel-kind (mini-batch vs. mini-kernel) */ if (1 == bs || 0 > s || (bs * s) > stack_size) kernel_idx = bs = 1; if (NULL == config || NULL == config->kernel[kernel_idx]) { - char buffer[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE]; - char fname[ACC_OPENCL_MAXSTRLEN]; - int cl_level_major, nchar = LIBXSMM_SNPRINTF(fname, sizeof(fname), - /* kernel name are meant to be unambiguous (BLAS-typeprefix and kernelsize) */ - "x" OPENCL_LIBSMM_KERNELNAME_SMM "%ix%ix%i", m_max, n_max, k_max); - const char* extensions[] = {NULL, NULL}; + char buffer[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE], fname[ACC_OPENCL_MAXSTRLEN]; + int nchar = LIBXSMM_SNPRINTF(fname, sizeof(fname), + /* kernel name are meant to be unambiguous (BLAS-typeprefix and kernelsize) */ + "x" OPENCL_LIBSMM_KERNELNAME_SMM "%ix%ix%i", m_max, n_max, k_max); cl_device_id active_device = NULL; - cl_device_type device_type = 0; # if defined(__DBCSR_ACC) int routine_handle; c_dbcsr_timeset(LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR, LIBSMM_ACC_PROCESS_ROUTINE_NAME_LENPTR, &routine_handle); @@ -1176,65 +1173,23 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, ? clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &active_device, NULL) : EXIT_FAILURE); if (EXIT_SUCCESS == result) { - result = c_dbcsr_acc_opencl_device_level( - active_device, &cl_level_major, NULL /*level_minor*/, NULL /*cl_std*/, &device_type); - } - if (EXIT_SUCCESS == result) { - const char *tname = NULL, *atomic_type = ""; - int std_c11 = 0; + c_dbcsr_acc_opencl_atomic_fp_t tkind = c_dbcsr_acc_opencl_atomic_fp_no; + const char* tname = NULL; switch (datatype) { case dbcsr_type_real_8: { - extensions[0] = "cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"; + tkind = c_dbcsr_acc_opencl_atomic_fp_64; tname = "double"; fname[0] = 'd'; - if (2 <= cl_level_major && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) { - atomic_type = "-DTA=long -DTA2=atomic_long -DTF=atomic_double"; - std_c11 = 1; - } - else { - extensions[0] = "cl_khr_fp64 cl_khr_int64_base_atomics"; - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) { - atomic_type = "-DTA=long"; - } - else { /* fallback */ - extensions[0] = "cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics"; - if (2 <= cl_level_major && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) { - atomic_type = "-DATOMIC32_ADD64 -DTA=int -DTA2=atomic_int -DTF=atomic_double"; - std_c11 = 1; - } - else { - extensions[0] = "cl_khr_fp64 cl_khr_global_int32_base_atomics"; - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) { - atomic_type = "-DATOMIC32_ADD64 -DTA=int"; - } - else tname = NULL; - } - } - } } break; case dbcsr_type_real_4: { - extensions[0] = "cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics"; - if (2 <= cl_level_major && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) { - extensions[1] = "cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"; - atomic_type = "-DTA=int -DTA2=atomic_int -DTF=atomic_float"; - std_c11 = 1; - tname = "float"; - fname[0] = 's'; - } - else { - extensions[0] = "cl_khr_global_int32_base_atomics"; - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) { - extensions[1] = "cl_khr_int64_base_atomics"; - atomic_type = "-DTA=int"; - tname = "float"; - fname[0] = 's'; - } - } + tkind = c_dbcsr_acc_opencl_atomic_fp_32; + tname = "float"; + fname[0] = 's'; } break; default: assert(NULL == tname); } if (NULL != tname) { - const char* const env_devid = getenv("OPENCL_LIBSMM_SMM_DEVID"); + const char *extensions[] = {NULL, NULL}, *const env_devid = getenv("OPENCL_LIBSMM_SMM_DEVID"); const unsigned int devuid = (NULL == env_devid || '\0' == *env_devid) ? devinfo->uid : (unsigned int)strtoul(env_devid, NULL, 0); size_t wgsize_max, wgsize_prf, sgs = 0; @@ -1257,12 +1212,6 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, const char *const env_ab = getenv("OPENCL_LIBSMM_SMM_AB"), *const env_ac = getenv("OPENCL_LIBSMM_SMM_AC"); const char *const env_xf = getenv("OPENCL_LIBSMM_SMM_XF"), *const env_cl = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS"); const char* const intel_xf = "-cl-intel-256-GRF-per-thread"; - const int cl_nonv = (0 != devinfo->intel || EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor( - active_device, "nvidia", 0 /*use_platform_name*/)); - const int cl_noamd = - (0 != devinfo->intel || !cl_nonv || - (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 0 /*use_platform_name*/) && - EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 1 /*use_platform_name*/))); const int default_lu = (0 != devinfo->intel ? -1 : 0); const int unroll = LIBXSMM_MAX(-2, (NULL == env_lu || '\0' == *env_lu) ? (0 == kernel_idx ? (NULL == config ? default_lu : config->lu) : default_lu) @@ -1305,7 +1254,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, 0, 1); new_config.al = LIBXSMM_CLMP( (NULL == env_al || '\0' == *env_al) - ? (cl_noamd ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) : 1) + ? (0 == devinfo->amd ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) : 1) : atoi(env_al), 0, 1); new_config.tb = LIBXSMM_CLMP((NULL == env_tb || '\0' == *env_tb) @@ -1336,7 +1285,9 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, : atoi(env_ac), 0, 1); if (NULL == env_xf || '\0' == *env_xf) { - if (0 == devinfo->intel || NULL == env_cl || NULL == strstr(env_cl, intel_xf)) { + if (0 == devinfo->intel || CL_DEVICE_TYPE_GPU != devinfo->type || NULL == env_cl || + NULL == strstr(env_cl, intel_xf)) + { new_config.flags = (NULL == config ? /*default*/ 0 : config->flags); } else new_config.flags = 1; @@ -1418,143 +1369,40 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } if (new_config.wgsize[kernel_idx] <= wgsize_max) { /* SMMs can be potentially handled by device */ const char* const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant"); - const char* const env_barrier = getenv("OPENCL_LIBSMM_SMM_BARRIER"); - const char* const env_atomics = getenv("OPENCL_LIBSMM_SMM_ATOMICS"); const char* const env_nrepeat = getenv("SMM_NREPEAT"); const int typesize = OPENCL_LIBSMM_TYPESIZE(datatype); const int slm_a = (1 != new_config.aa ? 0 : (LIBXSMM_ISPOT(k_max * typesize) + 1)); const int slm_b = (1 != new_config.ab ? 0 : (LIBXSMM_ISPOT(k_max * typesize) + 1)); const int slm_c = (1 != new_config.ac ? 0 : (LIBXSMM_ISPOT(m_max * typesize) + 1)); - const char *barrier_expr = NULL, *atomic_ops = ""; - const char *atomic_exp = NULL, *atomic_expr2 = ""; - if (NULL == env_barrier || '0' != *env_barrier) { - barrier_expr = ((0 != std_c11 && (0 == devinfo->intel || (CL_DEVICE_TYPE_CPU != device_type))) - ? "-D\"BARRIER(A)=work_group_barrier(A,memory_scope_work_group)\"" - : "-D\"BARRIER(A)=barrier(A)\""); - } - else barrier_expr = ""; /* no barrier */ - assert(NULL != barrier_expr); - if (NULL == env_atomics || '0' != *env_atomics) { - /* atomics_force: attempt to force atomics without confirmation */ - const int atomics_force = ((NULL == env_atomics || '\0' == *env_atomics) ? 0 : atoi(env_atomics)); - if (NULL == env_atomics || '\0' == *env_atomics || 0 != atomics_force) { - cl_bitfield fp_atomics; - assert(dbcsr_type_real_8 == datatype || dbcsr_type_real_4 == datatype); - if (CL_SUCCESS == clGetDeviceInfo(active_device, - (cl_device_info)(dbcsr_type_real_8 == datatype ? 0x4232 : 0x4231), sizeof(cl_bitfield), - &fp_atomics, NULL) && - 0 != (/*add*/ (1 << 1) & fp_atomics)) - { - extensions[1] = "cl_ext_float_atomics"; - atomic_exp = (dbcsr_type_real_8 == datatype - ? "atomic_fetch_add_explicit((GLOBAL_VOLATILE(atomic_double)*)A,B," - "memory_order_relaxed,memory_scope_work_group)" - : "atomic_fetch_add_explicit((GLOBAL_VOLATILE(atomic_float)*)A,B," - "memory_order_relaxed,memory_scope_work_group)"); - } - else if ((0 != devinfo->intel && 0x4905 != devinfo->uid && 0 == devinfo->unified) || 0 != atomics_force) { - if ((0 != devinfo->intel && - (dbcsr_type_real_4 == datatype || (0x0bd0 <= devinfo->uid && 0x0bdb >= devinfo->uid))) || - (0 != atomics_force)) - { - if (0 == atomics_force && (0 == devinfo->intel || 0x0bd0 > devinfo->uid || 0x0bdb < devinfo->uid)) { - extensions[1] = "cl_intel_global_float_atomics"; - atomic_ops = "-Dcl_intel_global_float_atomics"; - } - else { - atomic_ops = ((0 == std_c11 && 2 > atomics_force) - ? "-DATOMIC_PROTOTYPES=1" - : (3 > atomics_force ? "-DATOMIC_PROTOTYPES=2" : "-DATOMIC_PROTOTYPES=3")); - } - atomic_exp = ((0 == std_c11 && 2 > atomics_force) ? "atomic_add(A,B)" - : "atomic_fetch_add_explicit((GLOBAL_VOLATILE(TF)*)A,B," - "memory_order_relaxed,memory_scope_work_group)"); - } - else { - atomic_exp = "atomic_add_global_cmpxchg(A,B)"; - atomic_ops = "-DCMPXCHG=atom_cmpxchg"; - } - } - else if (cl_nonv) { - int gfx90 = 0; - if (!cl_noamd && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_device, buffer, ACC_OPENCL_BUFFERSIZE, - NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1)) - { - const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx"); - if (NULL != gfxname && 90 <= atoi(gfxname + 3)) gfx90 = 1; - } - if (0 == gfx90) { - if (NULL != extensions[1] && 1 < bs && 1 == new_config.bn && new_config.bm >= m_max && 0 == new_config.al && - (0 == (m_max & 1) || (0 == devinfo->intel /*&& cl_nonv*/)) /* TODO */ - && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) - { - assert(dbcsr_type_real_4 == datatype); - atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A,B)\""; - } - else { - extensions[1] = NULL; - } - atomic_exp = "atomic_add_global_cmpxchg(A,B)"; - atomic_ops = (dbcsr_type_real_4 == datatype ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg"); - } - else { - atomic_exp = (dbcsr_type_real_8 == datatype - ? "__builtin_amdgcn_global_atomic_fadd_f64(A,B,__ATOMIC_RELAXED)" - : "__builtin_amdgcn_global_atomic_fadd_f32(A,B,__ATOMIC_RELAXED)"); - } - } - else { - assert(NULL != atomic_ops && '\0' == *atomic_ops); - atomic_exp = "atomic_add_global_xchg(A,B)"; - } - } - else if (NULL != LIBXSMM_STRISTR(env_atomics, "cmpxchg")) { - if (NULL != extensions[1] && 1 < bs && 1 == new_config.bn && new_config.bm >= m_max && 0 == new_config.al && - (0 == (m_max & 1) || (0 == devinfo->intel && cl_nonv)) /* TODO */ - && '2' == env_atomics[strlen(env_atomics) - 1] && - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1)) - { - assert(dbcsr_type_real_4 == datatype); - atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A,B)\""; - } - else { - extensions[1] = NULL; - } - atomic_exp = "atomic_add_global_cmpxchg(A,B)"; - atomic_ops = (dbcsr_type_real_4 == datatype ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg"); - } - else { - atomic_exp = "atomic_add_global_xchg(A,B)"; - atomic_ops = (dbcsr_type_real_4 == datatype ? "-DXCHG=atomic_xchg" : "-DXCHG=atom_xchg"); - } - } - else { /* unsynchronized */ - assert(NULL != env_atomics); - atomic_exp = "*(A)+=(B)"; /* non-atomic update */ - } - assert(NULL != atomic_exp); /* compose build parameters and flags */ nchar = LIBXSMM_SNPRINTF(build_params, sizeof(build_params), - "-DMAD=fma -DINTEL=%u -DGLOBAL=%s -DSWG=%i -DSGS=%i -DFN=%s -DREPEAT=%i -DLU=%i " - "-DSM=%i -DSN=%i -DSK=%i -DBS=%i -DVL=%i %s -DBM=%i -DBN=%i -DBK=%i -DT=%s -DTN=%i " - "%s %s %s %s %s %s %s %s %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s %s", - 0 != devinfo->intel ? devinfo->uid : 0, cmem, (int)new_config.wgsize[kernel_idx], (int)sgs, fname, + "-DMAD=fma -DT=%s -DINTEL=%u -DGLOBAL=%s -DSWG=%i -DSGS=%i -DFN=%s -DREPEAT=%i -DLU=%i " + "-DSM=%i -DSN=%i -DSK=%i -DBS=%i -DVL=%i %s -DBM=%i -DBN=%i -DBK=%i " + "%s %s %s %s %s %s %s %s ", /* space! */ + tname, 0 != devinfo->intel ? devinfo->uid : 0, cmem, (int)new_config.wgsize[kernel_idx], (int)sgs, fname, NULL == env_nrepeat ? 1 : atoi(env_nrepeat), new_config.lu, m_max, n_max, k_max, bs, OPENCL_LIBSMM_VMIN, - bs == new_config.bs ? "-DBSC" : "", new_config.bm, new_config.bn, new_config.bk, tname, datatype, + bs == new_config.bs ? "-DBSC" : "", new_config.bm, new_config.bn, new_config.bk, + 0 == new_config.tb ? "" : "-DTRACK_B", 0 != new_config.tc ? "-DTRACK_C" : "", 0 == new_config.nz ? "" : "-DATOMIC_INC_NZ", 0 == new_config.al ? "" : "-DAL", - 0 == new_config.tb ? "" : "-DTRACK_B", 0 != new_config.tc ? "-DTRACK_C" : "", 0 == new_config.ap ? "" : "-DSLM_P", + 0 == new_config.ap ? "" : "-DSLM_P", 0 == new_config.aa ? "" : (1 == slm_a ? "-DSLM_A=1" : (0 != slm_a ? "-DSLM_A=2" : "-DREG_A")), 0 == new_config.ab ? "" : (1 == slm_b ? "-DSLM_B=1" : (0 != slm_b ? "-DSLM_B=2" : "-DREG_B")), - 0 == new_config.ac ? "" : (1 == slm_c ? "-DSLM_C=1" : "-DSLM_C=2"), atomic_type, atomic_ops, atomic_exp, - atomic_expr2, barrier_expr); + 0 == new_config.ac ? "" : (1 == slm_c ? "-DSLM_C=1" : "-DSLM_C=2")); + /* apply support for FP-atomics */ + if (0 < nchar && (int)sizeof(build_params) > nchar) { + nchar = c_dbcsr_acc_opencl_flags_atomics(active_device, tkind, devinfo, extensions, + sizeof(extensions) / sizeof(*extensions), build_params + nchar, sizeof(build_params) - nchar); + } + else result = EXIT_FAILURE; if (0 < nchar && (int)sizeof(build_params) > nchar) { const char* const cl_debug = ( # if !defined(NDBGDEV) - (0 != devinfo->intel && CL_DEVICE_TYPE_CPU != device_type) ? "-gline-tables-only" : + (0 != devinfo->intel && CL_DEVICE_TYPE_CPU != devinfo->type) ? "-gline-tables-only" : # endif - ""); - nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "-cl-fast-relaxed-math -cl-denorms-are-zero %s %s %s", - NULL == env_cl ? "" : env_cl, (0 == new_config.flags || 0 == devinfo->intel) ? "" : intel_xf, cl_debug); + ""); + nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s %s -cl-fast-relaxed-math -cl-denorms-are-zero %s", + (0 == new_config.flags || 0 == devinfo->intel || CL_DEVICE_TYPE_GPU != devinfo->type) ? "" : intel_xf, cl_debug, + NULL == env_cl ? "" : env_cl); if (0 >= nchar || (int)sizeof(buffer) <= nchar) result = EXIT_FAILURE; } else result = EXIT_FAILURE;