diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
index b4488b1e524..db5d2d79d6f 100644
--- a/src/acc/opencl/acc_opencl.c
+++ b/src/acc/opencl/acc_opencl.c
@@ -56,7 +56,6 @@ extern "C" {
 c_dbcsr_acc_opencl_config_t c_dbcsr_acc_opencl_config;
 
 
-#  if !defined(NDEBUG)
 void c_dbcsr_acc_opencl_notify(const char /*errinfo*/[], const void* /*private_info*/, size_t /*cb*/, void* /*user_data*/);
 void c_dbcsr_acc_opencl_notify(const char errinfo[], const void* private_info, size_t cb, void* user_data) {
   LIBXSMM_UNUSED(private_info);
@@ -64,7 +63,6 @@ void c_dbcsr_acc_opencl_notify(const char errinfo[], const void* private_info, s
   LIBXSMM_UNUSED(user_data);
   fprintf(stderr, "ERROR ACC/OpenCL: %s\n", errinfo);
 }
-#  endif
 
 
 cl_context c_dbcsr_acc_opencl_context(int* thread_id) {
@@ -919,11 +917,8 @@ int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id active_id) {
   result = clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL);
   assert(CL_SUCCESS != result || NULL != platform);
   if (CL_SUCCESS == result) {
-#  if defined(NDEBUG)
-    void (*const notify)(const char*, const void*, size_t, void*) = NULL;
-#  else
-    void (*const notify)(const char*, const void*, size_t, void*) = c_dbcsr_acc_opencl_notify;
-#  endif
+    void (*const notify)(
+      const char*, const void*, size_t, void*) = (0 != c_dbcsr_acc_opencl_config.verbosity ? c_dbcsr_acc_opencl_notify : NULL);
     cl_context_properties properties[] = {
       CL_CONTEXT_PLATFORM, 0 /*placeholder*/, 0 /* end of properties */
     };
@@ -1007,27 +1002,46 @@ int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id) {
           }
         }
         if (EXIT_SUCCESS == result) { /* update/cache device-specific information */
-          char devname[ACC_OPENCL_BUFFERSIZE];
+          result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device[thread_id].level,
+            c_dbcsr_acc_opencl_config.device[thread_id].level + 1, NULL /*cl_std*/,
+            &c_dbcsr_acc_opencl_config.device[thread_id].type);
+          if (EXIT_SUCCESS == result) {
+            char devname[ACC_OPENCL_BUFFERSIZE];
 #  if defined(CL_VERSION_2_0)
-          const char* const env_svm = getenv("ACC_OPENCL_SVM");
-          int level_major = 0;
-          const int nok = (NULL == env_svm || EXIT_SUCCESS != c_dbcsr_acc_opencl_device_level(active_id, &level_major,
-                                                                NULL /*level_minor*/, NULL /*cl_std*/, NULL /*type*/));
-          c_dbcsr_acc_opencl_config.device[thread_id].svm_interop = ((0 != nok || 2 > level_major) ? 0 : atoi(env_svm));
+            const char* const env_svm = getenv("ACC_OPENCL_SVM");
+            c_dbcsr_acc_opencl_config.device[thread_id].svm_interop =
+              ((NULL == env_svm || 2 > *c_dbcsr_acc_opencl_config.device[thread_id].level) ? 0 : atoi(env_svm));
 #  endif
-          if (CL_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool),
-                              &c_dbcsr_acc_opencl_config.device[thread_id].unified, NULL))
-          {
-            c_dbcsr_acc_opencl_config.device[thread_id].unified = CL_FALSE;
-          }
-          if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/,
-                                0 /*platform_maxlen*/, /*cleanup*/ 1) ||
-              EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device[thread_id].uid))
-          {
-            c_dbcsr_acc_opencl_config.device[thread_id].uid = (cl_uint)-1;
+            if (CL_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool),
+                                &c_dbcsr_acc_opencl_config.device[thread_id].unified, NULL))
+            {
+              c_dbcsr_acc_opencl_config.device[thread_id].unified = CL_FALSE;
+            }
+            if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/,
+                                  0 /*platform_maxlen*/, /*cleanup*/ 1) ||
+                EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device[thread_id].uid))
+            {
+              c_dbcsr_acc_opencl_config.device[thread_id].uid = (cl_uint)-1;
+            }
+            c_dbcsr_acc_opencl_config.device[thread_id].intel = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(
+                                                                                   active_id, "intel", 0 /*use_platform_name*/));
+            c_dbcsr_acc_opencl_config.device[thread_id].nv = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(
+                                                                                active_id, "nvidia", 0 /*use_platform_name*/));
+            if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) ||
+                EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/))
+            {
+              char buffer[ACC_OPENCL_BUFFERSIZE];
+              c_dbcsr_acc_opencl_config.device[thread_id].amd = 1;
+              if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_id, buffer, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/,
+                                    0 /*platform_maxlen*/, /*cleanup*/ 1))
+              {
+                const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx");
+                if (NULL != gfxname && 90 <= atoi(gfxname + 3)) {
+                  c_dbcsr_acc_opencl_config.device[thread_id].amd = 2;
+                }
+              }
+            }
           }
-          c_dbcsr_acc_opencl_config.device[thread_id].intel =
-            (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/) ? CL_TRUE : CL_FALSE);
         }
       }
     }
@@ -1135,7 +1149,148 @@ int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, size_t* max
 }
 
 
-int c_dbcsr_acc_opencl_build_flags(const char build_params[], const char build_options[], const char try_build_options[],
+int c_dbcsr_acc_opencl_flags_atomics(cl_device_id device_id, c_dbcsr_acc_opencl_atomic_fp_t kind,
+  const c_dbcsr_acc_opencl_device_t* devinfo, const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen) {
+  int result = 0, ext1, ext2; /* result: zero if no flags are composed, otherwise the return value of LIBXSMM_SNPRINTF */
+  for (ext1 = 0; ext1 < exts_maxlen; ++ext1) /* ext1: first free slot of exts (NULL or empty string) */
+    if (NULL == exts[ext1] || '\0' == *exts[ext1]) break;
+  for (ext2 = ext1 + 1; ext2 < exts_maxlen; ++ext2) /* ext2: next free slot of exts following ext1 */
+    if (NULL == exts[ext2] || '\0' == *exts[ext2]) break;
+  if (NULL != devinfo && ext2 < exts_maxlen) { /* two free slots are required, otherwise exts and flags are left untouched */
+    const char* atomic_type = ""; /* type-definitions (-DTA, -DTA2, -DTF) matching the extension-set found below */
+    switch (kind) {
+      case c_dbcsr_acc_opencl_atomic_fp_64: { /* probe extension-sets in order (first supported set determines atomic_type) */
+        exts[ext1] = "cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics";
+        if (2 <= *devinfo->level && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) { /* checks slots [0,ext2) */
+          atomic_type = "-DTA=long -DTA2=atomic_long -DTF=atomic_double";
+        }
+        else {
+          exts[ext1] = "cl_khr_fp64 cl_khr_int64_base_atomics";
+          if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) {
+            atomic_type = "-DTA=long";
+          }
+          else { /* fallback: 32-bit integer atomics (flagged by -DATOMIC32_ADD64) */
+            exts[ext1] = "cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics";
+            if (2 <= *devinfo->level && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) {
+              atomic_type = "-DATOMIC32_ADD64 -DTA=int -DTA2=atomic_int -DTF=atomic_double";
+            }
+            else {
+              exts[ext1] = "cl_khr_fp64 cl_khr_global_int32_base_atomics";
+              if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) {
+                atomic_type = "-DATOMIC32_ADD64 -DTA=int";
+              }
+              else kind = c_dbcsr_acc_opencl_atomic_fp_no; /* none supported: no flags are composed (exts[ext1] keeps last set) */
+            }
+          }
+        }
+      } break;
+      case c_dbcsr_acc_opencl_atomic_fp_32: {
+        exts[ext1] = "cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics";
+        if (2 <= *devinfo->level && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) {
+          exts[ext2] = "cl_khr_int64_base_atomics cl_khr_int64_extended_atomics"; /* not probed here (slot ext2 is beyond the check) */
+          atomic_type = "-DTA=int -DTA2=atomic_int -DTF=atomic_float";
+        }
+        else {
+          exts[ext1] = "cl_khr_global_int32_base_atomics";
+          if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(device_id, exts, ext2)) {
+            exts[ext2] = "cl_khr_int64_base_atomics"; /* not probed here (slot ext2 is beyond the check) */
+            atomic_type = "-DTA=int";
+          }
+          else kind = c_dbcsr_acc_opencl_atomic_fp_no; /* none supported: no flags are composed */
+        }
+      } break;
+      default: assert(c_dbcsr_acc_opencl_atomic_fp_no == kind);
+    }
+    if (c_dbcsr_acc_opencl_atomic_fp_no != kind) {
+      const char *barrier_expr = NULL, *atomic_exp = NULL, *atomic_ops = "";
+      const char* const env_barrier = getenv("ACC_OPENCL_BARRIER");
+      const char* const env_atomics = getenv("ACC_OPENCL_ATOMICS");
+      if (NULL == env_barrier || '0' != *env_barrier) { /* barrier is defined unless ACC_OPENCL_BARRIER starts with '0' */
+        barrier_expr = ((2 <= *devinfo->level && (0 == devinfo->intel || (CL_DEVICE_TYPE_CPU != devinfo->type)))
+                          ? "-D\"BARRIER(A)=work_group_barrier(A,memory_scope_work_group)\""
+                          : "-D\"BARRIER(A)=barrier(A)\"");
+      }
+      else barrier_expr = ""; /* no barrier */
+      assert(NULL != barrier_expr);
+      if (NULL == env_atomics || '0' != *env_atomics) { /* atomic update unless ACC_OPENCL_ATOMICS starts with '0' */
+        /* can signal/force atomics without confirmation */
+        const int force_atomics = ((NULL == env_atomics || '\0' == *env_atomics) ? 0 : atoi(env_atomics));
+        if (NULL == env_atomics || '\0' == *env_atomics || 0 != force_atomics) { /* unset, empty, or non-zero number */
+          cl_bitfield fp_atomics; /* NOTE(review): 0x4231/0x4232 presumably query cl_ext_float_atomics capabilities - confirm */
+          if (CL_SUCCESS == clGetDeviceInfo(device_id, (cl_device_info)(c_dbcsr_acc_opencl_atomic_fp_64 == kind ? 0x4232 : 0x4231),
+                              sizeof(cl_bitfield), &fp_atomics, NULL) &&
+              0 != (/*add*/ (1 << 1) & fp_atomics))
+          {
+            exts[ext2] = "cl_ext_float_atomics";
+            atomic_exp = (c_dbcsr_acc_opencl_atomic_fp_64 == kind
+                            ? "atomic_fetch_add_explicit((GLOBAL_VOLATILE(atomic_double)*)A,B,"
+                              "memory_order_relaxed,memory_scope_work_group)"
+                            : "atomic_fetch_add_explicit((GLOBAL_VOLATILE(atomic_float)*)A,B,"
+                              "memory_order_relaxed,memory_scope_work_group)");
+          } /* below: forced, or Intel device without unified memory (uid 0x4905 is excluded) */
+          else if (0 != force_atomics || (0 != devinfo->intel && ((0x4905 != devinfo->uid && 0 == devinfo->unified)))) {
+            if ((((0 != force_atomics || (0 != devinfo->intel && ((0x0bd0 <= devinfo->uid && 0x0bdb >= devinfo->uid) ||
+                                                                   c_dbcsr_acc_opencl_atomic_fp_32 == kind))))))
+            { /* NOTE(review): uid-values presumably denote specific Intel devices - confirm against c_dbcsr_acc_opencl_device_uid */
+              if (0 == force_atomics && (0 == devinfo->intel || 0x0bd0 > devinfo->uid || 0x0bdb < devinfo->uid)) {
+                exts[ext2] = "cl_intel_global_float_atomics";
+                atomic_ops = "-Dcl_intel_global_float_atomics";
+              }
+              else { /* forced, or uid within 0x0bd0..0x0bdb: request prototypes (level depends on support-level and force_atomics) */
+                atomic_ops = ((2 > *devinfo->level && 2 > force_atomics)
+                                ? "-DATOMIC_PROTOTYPES=1"
+                                : (3 > force_atomics ? "-DATOMIC_PROTOTYPES=2" : "-DATOMIC_PROTOTYPES=3"));
+              }
+              atomic_exp = ((2 > *devinfo->level && 2 > force_atomics) ? "atomic_add(A,B)"
+                                                                       : "atomic_fetch_add_explicit((GLOBAL_VOLATILE(TF)*)A,B,"
+                                                                         "memory_order_relaxed,memory_scope_work_group)");
+            }
+            else {
+              atomic_exp = "atomic_add_global_cmpxchg(A,B)";
+              atomic_ops = "-DCMPXCHG=atom_cmpxchg";
+            }
+          }
+          else if (0 == devinfo->nv) { /* device is not flagged as NVIDIA */
+            if (1 >= devinfo->amd) { /* amd==2 is set for device names carrying "gfx" followed by a number of 90 or higher */
+              atomic_ops = (c_dbcsr_acc_opencl_atomic_fp_32 == kind ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg");
+              atomic_exp = "atomic_add_global_cmpxchg(A,B)";
+              exts[ext2] = NULL;
+            }
+            else { /* GCN */
+              atomic_exp = (c_dbcsr_acc_opencl_atomic_fp_64 == kind
+                              ? "__builtin_amdgcn_global_atomic_fadd_f64(A,B,__ATOMIC_RELAXED)"
+                              : "__builtin_amdgcn_global_atomic_fadd_f32(A,B,__ATOMIC_RELAXED)");
+            }
+          }
+          else { /* xchg */
+            assert(NULL != atomic_ops && '\0' == *atomic_ops);
+            atomic_exp = "atomic_add_global_xchg(A,B)";
+          }
+        } /* below: ACC_OPENCL_ATOMICS is a non-empty string for which atoi yields zero (and not starting with '0') */
+        else if (NULL != LIBXSMM_STRISTR(env_atomics, "cmpxchg")) {
+          atomic_ops = (c_dbcsr_acc_opencl_atomic_fp_32 == kind ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg");
+          atomic_exp = "atomic_add_global_cmpxchg(A,B)";
+          exts[ext2] = NULL;
+        }
+        else { /* xchg */
+          atomic_exp = "atomic_add_global_xchg(A,B)";
+          atomic_ops = (c_dbcsr_acc_opencl_atomic_fp_32 == kind ? "-DXCHG=atomic_xchg" : "-DXCHG=atom_xchg");
+        }
+      }
+      else { /* unsynchronized */
+        atomic_exp = "*(A)+=(B)"; /* non-atomic update */
+      }
+      assert(NULL != atomic_exp);
+      /* compose build parameters and flags (TAN carries the numeric value of kind) */
+      result = LIBXSMM_SNPRINTF(flags, flags_maxlen, "-DTAN=%i %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s", kind, atomic_type,
+        atomic_ops, atomic_exp, barrier_expr);
+    }
+  }
+  return result;
+}
+
+
+int c_dbcsr_acc_opencl_flags(const char build_params[], const char build_options[], const char try_build_options[],
   const char cl_std[], char buffer[], size_t buffer_size) {
   int result;
   if (NULL != buffer) {
@@ -1334,13 +1489,12 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha
     program = clCreateProgramWithSource(context, 1 /*nlines*/, &ext_source, NULL, &result);
     if (CL_SUCCESS == result) {
       assert(NULL != program);
-      result = c_dbcsr_acc_opencl_build_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer));
+      result = c_dbcsr_acc_opencl_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer));
       if (EXIT_SUCCESS == result) {
         result = clBuildProgram(program, 1 /*num_devices*/, &active_id, buffer, NULL /*callback*/, NULL /*user_data*/);
       }
       if (CL_SUCCESS != result && NULL != try_build_options && '\0' != *try_build_options) {
-        result = c_dbcsr_acc_opencl_build_flags(
-          build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer));
+        result = c_dbcsr_acc_opencl_flags(build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer));
         if (EXIT_SUCCESS == result) {
           ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); /* recreate below (to avoid unclean state) */
           program = clCreateProgramWithSource(context, 1 /*nlines*/, &ext_source, NULL, &result);
@@ -1428,13 +1582,12 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha
     }
     if (CL_SUCCESS == result) {
       assert(NULL != program);
-      result = c_dbcsr_acc_opencl_build_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer));
+      result = c_dbcsr_acc_opencl_flags(build_params, build_options, try_build_options, cl_std, buffer, sizeof(buffer));
       if (EXIT_SUCCESS == result) {
         result = clBuildProgram(program, 1 /*num_devices*/, &active_id, buffer, NULL /*callback*/, NULL /*user_data*/);
       }
       if (CL_SUCCESS != result && NULL != try_build_options && '\0' != *try_build_options) {
-        result = c_dbcsr_acc_opencl_build_flags(
-          build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer));
+        result = c_dbcsr_acc_opencl_flags(build_params, build_options, NULL /*try_build_options*/, cl_std, buffer, sizeof(buffer));
         if (EXIT_SUCCESS == result) {
           ACC_OPENCL_EXPECT(CL_SUCCESS == clReleaseProgram(program)); /* recreate below (to avoid unclean state) */
 #  if defined(CL_VERSION_2_1)
diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h
index 352cf8a1e93..58e1e4ea9e9 100644
--- a/src/acc/opencl/acc_opencl.h
+++ b/src/acc/opencl/acc_opencl.h
@@ -218,28 +218,32 @@
 extern "C" {
 #endif
 
-/** Enumeration of timer kinds used for built-in execution-profile. */
-typedef enum c_dbcsr_acc_opencl_timer_t {
-  c_dbcsr_acc_opencl_timer_device,
-  c_dbcsr_acc_opencl_timer_host
-} c_dbcsr_acc_opencl_timer_t;
-
 /** Settings updated during c_dbcsr_acc_set_active_device. */
 typedef struct c_dbcsr_acc_opencl_device_t {
   /** Activated device context. */
   cl_context context;
+  /** OpenCL support-level of device (major and minor as filled by c_dbcsr_acc_opencl_device_level). */
+  cl_int level[2];
+  /** Kind of device (GPU, CPU, or other). */
+  cl_device_type type;
 #if defined(CL_VERSION_2_0)
   /** Runtime SVM support. */
   cl_bool svm_interop;
 #endif
+  /** Whether host memory is unified. */
+  cl_bool unified;
   /** Device-ID. */
   cl_uint uid;
-  /** Intel device? */
-  cl_bool intel;
-  /** Whether host memory is unified or not. */
-  cl_bool unified;
+  /** Vendor flags (non-zero if matched): Intel, AMD (2 if device name carries "gfx" and a number of 90 or higher), NVIDIA. */
+  cl_int intel, amd, nv;
 } c_dbcsr_acc_opencl_device_t;
 
+/** Enumeration of timer kinds used for built-in execution-profile. */
+typedef enum c_dbcsr_acc_opencl_timer_t {
+  c_dbcsr_acc_opencl_timer_device,
+  c_dbcsr_acc_opencl_timer_host
+} c_dbcsr_acc_opencl_timer_t;
+
 /**
  * Settings discovered/setup during c_dbcsr_acc_init (independent of the device)
  * and settings updated during c_dbcsr_acc_set_active_device (devinfo).
@@ -295,7 +299,7 @@ typedef struct c_dbcsr_acc_opencl_info_hostptr_t {
 c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory);
 
 /** Determines cl_mem object and offset of memory. */
-void* c_dbcsr_acc_opencl_info_devptr(const void* memory, const size_t* amount, size_t* offset);
+void* c_dbcsr_acc_opencl_info_devptr(const void* memory, size_t elsize, const size_t* amount, size_t* offset);
 
 /** Information about streams (c_dbcsr_acc_stream_create). */
 typedef struct c_dbcsr_acc_opencl_info_stream_t {
@@ -324,7 +328,7 @@ int c_dbcsr_acc_opencl_device_uid(cl_device_id device, const char devname[], uns
 /** Based on the device-ID, return the device's UID (capture or calculate), device name, and platform name. */
 int c_dbcsr_acc_opencl_device_name(
   cl_device_id device, char name[], size_t name_maxlen, char platform[], size_t platform_maxlen, int cleanup);
-/** Return the OpenCL support level for the given device. */
+/** Return the OpenCL support-level for the given device. */
 int c_dbcsr_acc_opencl_device_level(cl_device_id device, int* level_major, int* level_minor, char cl_std[16], cl_device_type* type);
 /** Check if given device supports the extensions. */
 int c_dbcsr_acc_opencl_device_ext(cl_device_id device, const char* const extnames[], int num_exts);
@@ -334,10 +338,6 @@ int c_dbcsr_acc_opencl_create_context(int thread_id, cl_device_id device_id);
 int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id);
 /** Get preferred multiple and max. size of workgroup (kernel- or device-specific). */
 int c_dbcsr_acc_opencl_wgsize(cl_device_id device, cl_kernel kernel, size_t* max_value, size_t* preferred_multiple);
-/** Assemble various flags for calling clBuildProgram into the given buffer.*/
-/** Combines build-params and build-options, some optional flags (try_build_options), and applies language std. (cl_std). */
-int c_dbcsr_acc_opencl_build_flags(const char build_params[], const char build_options[], const char try_build_options[],
-  const char cl_std[], char buffer[], size_t buffer_size);
 /**
  * Build kernel from source with given kernel_name, build_params and build_options.
  * The build_params are meant to instantiate the kernel (-D) whereas build_options
@@ -351,6 +351,21 @@ int c_dbcsr_acc_opencl_device_synchronize(int thread_id);
 /** Create user-event if not created and sets initial state. */
 int c_dbcsr_acc_opencl_event_create(cl_event* event_p);
 
+/** Enumeration of FP-atomic kinds (numeric value is passed to kernels as -DTAN, see opencl_atomics.h). */
+typedef enum c_dbcsr_acc_opencl_atomic_fp_t {
+  c_dbcsr_acc_opencl_atomic_fp_no = 0, /* no FP-atomics (c_dbcsr_acc_opencl_flags_atomics composes no flags) */
+  c_dbcsr_acc_opencl_atomic_fp_32 = 1, /* single-precision (opencl_atomics.h defaults T to float) */
+  c_dbcsr_acc_opencl_atomic_fp_64 = 2 /* double-precision (opencl_atomics.h defaults T to double) */
+} c_dbcsr_acc_opencl_atomic_fp_t;
+
+/** Assemble flags for atomics/barrier; may fill up to two free slots of exts; returns zero if no flags were composed. */
+int c_dbcsr_acc_opencl_flags_atomics(cl_device_id device_id, c_dbcsr_acc_opencl_atomic_fp_t kind,
+  const c_dbcsr_acc_opencl_device_t* devinfo, const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen);
+
+/** Combines build-params and build-options, some optional flags (try_build_options), and applies language std. (cl_std). */
+int c_dbcsr_acc_opencl_flags(const char build_params[], const char build_options[], const char try_build_options[],
+  const char cl_std[], char buffer[], size_t buffer_size);
+
 #if defined(__cplusplus)
 }
 #endif
diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c
index fa25994fedc..f35b258e572 100644
--- a/src/acc/opencl/acc_opencl_mem.c
+++ b/src/acc/opencl/acc_opencl_mem.c
@@ -61,11 +61,11 @@ c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory)
 }
 
 
-void* c_dbcsr_acc_opencl_info_devptr(const void* memory, const size_t* amount, size_t* offset) {
+void* c_dbcsr_acc_opencl_info_devptr(const void* memory, size_t elsize, const size_t* amount, size_t* offset) {
   void* result = NULL;
 #  if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \
     defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT)
-  if (NULL != c_dbcsr_acc_opencl_config.clmems && NULL != memory) {
+  if (NULL != c_dbcsr_acc_opencl_config.clmems && NULL != memory && 0 < elsize) {
     const char* const buffer = (const char*)memory;
     const size_t n = ACC_OPENCL_HANDLES_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads;
     size_t i = c_dbcsr_acc_opencl_config.nclmems, hit = (size_t)-1;
@@ -78,20 +78,21 @@ void* c_dbcsr_acc_opencl_info_devptr(const void* memory, const size_t* amount, s
         result = handle;
         break;
       }
-      else if (NULL != mem) {
+      else if (NULL != mem && mem < buffer && NULL != offset) {
         size_t d = buffer - mem, s = 0;
-        if (d < hit && NULL != offset &&
-            (NULL == amount ||
-              (CL_SUCCESS == clGetMemObjectInfo((cl_mem)mem, CL_MEM_SIZE, sizeof(size_t), &s, NULL) && (*amount + d) <= s)))
+        if (d < hit && CL_SUCCESS == clGetMemObjectInfo((cl_mem)mem, CL_MEM_SIZE, sizeof(size_t), &s, NULL) &&
+            (1 == elsize || (0 == (d % elsize) && 0 == (s % elsize))) && (NULL == amount || (*amount + d) <= s))
         {
-          *offset = hit = d;
+          *offset = (1 == elsize ? d : (d / elsize));
           result = handle;
+          hit = d;
         }
       }
     }
   }
 #  else
   LIBXSMM_UNUSED(memory);
+  LIBXSMM_UNUSED(elsize);
   LIBXSMM_UNUSED(amount);
   LIBXSMM_UNUSED(offset);
 #  endif
@@ -311,7 +312,7 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) {
 #      pragma omp critical(c_dbcsr_acc_dev_mem_deallocate)
 #    endif
     {
-      void** handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, NULL /*amount*/, NULL /*offset*/);
+      void** handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, NULL /*amount*/, NULL /*offset*/);
       if (NULL != handle) {
         void** const pfree = c_dbcsr_acc_opencl_config.clmems[c_dbcsr_acc_opencl_config.nclmems];
         libxsmm_pfree(pfree, c_dbcsr_acc_opencl_config.clmems, &c_dbcsr_acc_opencl_config.nclmems);
@@ -383,7 +384,7 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
     defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT)
     assert(NULL != c_dbcsr_acc_opencl_config.clmems);
     {
-      void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, &nbytes, &offset);
+      void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset);
       if (NULL != handle) buffer = *(cl_mem*)handle;
 #    if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG)
       else result = EXIT_FAILURE;
@@ -432,7 +433,7 @@ int c_dbcsr_acc_memcpy_d2h(const void* dev_mem, void* host_mem, size_t nbytes, v
     defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT)
     assert(NULL != c_dbcsr_acc_opencl_config.clmems);
     {
-      void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, &nbytes, &offset);
+      void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset);
       if (NULL != handle) buffer = *(cl_mem*)handle;
 #    if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG)
       else result = EXIT_FAILURE;
@@ -489,8 +490,8 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
     defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT)
     assert(NULL != c_dbcsr_acc_opencl_config.clmems);
     {
-      void* const handle_src = c_dbcsr_acc_opencl_info_devptr(devmem_src, &nbytes, &src_offset);
-      void* const handle_dst = c_dbcsr_acc_opencl_info_devptr(devmem_dst, &nbytes, &dst_offset);
+      void* const handle_src = c_dbcsr_acc_opencl_info_devptr(devmem_src, 1 /*elsize*/, &nbytes, &src_offset);
+      void* const handle_dst = c_dbcsr_acc_opencl_info_devptr(devmem_dst, 1 /*elsize*/, &nbytes, &dst_offset);
       if (NULL != handle_src) src = *(cl_mem*)handle_src;
 #    if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG)
       else result = EXIT_FAILURE;
@@ -567,7 +568,7 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb
 #  if defined(ACC_OPENCL_MEM_OFFSET) && LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && \
     defined(ACC_OPENCL_HANDLES_MAXCOUNT) && (0 < ACC_OPENCL_HANDLES_MAXCOUNT)
     if (0 == offset && NULL != c_dbcsr_acc_opencl_config.clmems) {
-      void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, &nbytes, &offset);
+      void* const handle = c_dbcsr_acc_opencl_info_devptr(dev_mem, 1 /*elsize*/, &nbytes, &offset);
       if (NULL != handle) buffer = *(cl_mem*)handle;
 #    if !defined(NDEBUG) || defined(ACC_OPENCL_MEM_DEBUG)
       else result = EXIT_FAILURE;
diff --git a/src/acc/opencl/common/opencl_atomics.h b/src/acc/opencl/common/opencl_atomics.h
index 9d285b9a7c5..7809a5216cd 100644
--- a/src/acc/opencl/common/opencl_atomics.h
+++ b/src/acc/opencl/common/opencl_atomics.h
@@ -11,6 +11,20 @@
 
 #include "opencl_common.h"
 
+#if (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/)
+#  if !defined(T)
+#    define T double
+#  endif
+#  define ZERO 0.0
+#elif (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/)
+#  if !defined(T)
+#    define T float
+#  endif
+#  define ZERO 0.f
+#elif defined(T) /*c_dbcsr_acc_opencl_atomic_fp_no*/
+#  define ZERO 0
+#endif
+
 #define GLOBAL_VOLATILE(A) global volatile A
 
 #if defined(ATOMIC_PROTOTYPES) || defined(__opencl_c_ext_fp64_global_atomic_add)
@@ -38,7 +52,7 @@ __attribute__((overloadable)) T atomic_add(GLOBAL_VOLATILE(T) *, T);
 #define ACCUMULATE(A, B) ATOMIC_ADD_GLOBAL(A, B)
 
 
-#if !defined(cl_intel_global_float_atomics) || (1 != TN)
+#if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/)
 #  if defined(ATOMIC32_ADD64)
 __attribute__((always_inline)) inline void atomic32_add64_global(GLOBAL_VOLATILE(double) * dst, double inc) {
   *dst += inc; /* TODO */
@@ -47,7 +61,7 @@ __attribute__((always_inline)) inline void atomic32_add64_global(GLOBAL_VOLATILE
 #endif
 
 
-#if !defined(cl_intel_global_float_atomics) || (1 != TN)
+#if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/)
 #  if defined(CMPXCHG)
 __attribute__((always_inline)) inline void atomic_add_global_cmpxchg(GLOBAL_VOLATILE(T) * dst, T inc) {
 #    if !defined(ATOMIC32_ADD64)
@@ -74,8 +88,8 @@ __attribute__((always_inline)) inline void atomic_add_global_cmpxchg(GLOBAL_VOLA
 #endif
 
 
-#if !defined(cl_intel_global_float_atomics) || (1 != TN)
-#  if defined(ATOMIC_ADD2_GLOBAL) && (1 == TN)
+#if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/)
+#  if defined(ATOMIC_ADD2_GLOBAL) && (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/)
 __attribute__((always_inline)) inline void atomic_add_global_cmpxchg2(GLOBAL_VOLATILE(float) * dst, float2 inc) {
   union {
     float2 f;
@@ -97,13 +111,13 @@ __attribute__((always_inline)) inline void atomic_add_global_cmpxchg2(GLOBAL_VOL
 #endif
 
 
-#if !defined(cl_intel_global_float_atomics) || (1 != TN)
+#if !defined(cl_intel_global_float_atomics) || (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/)
 #  if defined(XCHG) || (defined(__NV_CL_C_VERSION) && !defined(CMPXCHG) && !defined(ATOMIC_PROTOTYPES))
 __attribute__((always_inline)) inline void atomic_add_global_xchg(GLOBAL_VOLATILE(T) * dst, T inc) {
 #    if !defined(ATOMIC32_ADD64)
-#      if (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (1 == TN)
+#      if (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (1 == TAN /*c_dbcsr_acc_opencl_atomic_fp_32*/)
   asm("{ .reg .f32 t; atom.global.add.f32 t, [%0], %1; }" ::"l"(dst), "f"(inc));
-#      elif (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (3 == TN)
+#      elif (defined(__NV_CL_C_VERSION) && !defined(XCHG)) && (2 == TAN /*c_dbcsr_acc_opencl_atomic_fp_64*/)
   asm("{ .reg .f64 t; atom.global.add.f64 t, [%0], %1; }" ::"l"(dst), "d"(inc));
 #      else
   union {
diff --git a/src/acc/opencl/common/opencl_common.h b/src/acc/opencl/common/opencl_common.h
index 9cf10fb4887..1b890707b76 100644
--- a/src/acc/opencl/common/opencl_common.h
+++ b/src/acc/opencl/common/opencl_common.h
@@ -30,12 +30,4 @@
 #  define UNROLL(N) UNROLL_FORCE(N)
 #endif
 
-#if (1 == TN)
-#  define ZERO 0.f
-#elif (3 == TN)
-#  define ZERO 0.0
-#else
-#  define ZERO 0
-#endif
-
 #endif /*OPENCL_COMMON_H*/
diff --git a/src/acc/opencl/smm/README.md b/src/acc/opencl/smm/README.md
index e4b8d6a470d..a34c0700942 100644
--- a/src/acc/opencl/smm/README.md
+++ b/src/acc/opencl/smm/README.md
@@ -13,7 +13,6 @@ There are two categories for the two domains in LIBSMM, i.e., matrix transpose (
 The most common settings for multiplying matrices are:
 
 * `OPENCL_LIBSMM_SMM_BUILDOPTS`: character string with build options (compile and link) supplied to the OpenCL runtime compiler.
-* `OPENCL_LIBSMM_SMM_ATOMICS`: selects the kind of atomic operation used for global memory updates (`xchg`, `cmpxchg`, `cmpxchg2`), attempts to force atomic instructions, or disables atomic instructions (`0`). The latter is for instance to quantify the impact of atomic operations.
 * `OPENCL_LIBSMM_SMM_PARAMS`: Disable embedded/auto-tuned parameters (`0`), or load CSV-file (e.g., `path/to/tune_multiply.csv`).
 * `OPENCL_LIBSMM_SMM_BS`: non-negative integer number denoting the intra-kernel (mini-)batchsize mainly used to amortize atomic updates of data in global/main memory. The remainder with respect to the "stacksize" is handled by the kernel.
 * `OPENCL_LIBSMM_SMM_BM`: non-negative integer number (less/equal than the M-extent) denoting the blocksize in M-direction.
diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c
index a446d8d39de..934d075e893 100644
--- a/src/acc/opencl/smm/opencl_libsmm.c
+++ b/src/acc/opencl/smm/opencl_libsmm.c
@@ -1160,14 +1160,11 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
       /* determine kernel-kind (mini-batch vs. mini-kernel) */
       if (1 == bs || 0 > s || (bs * s) > stack_size) kernel_idx = bs = 1;
       if (NULL == config || NULL == config->kernel[kernel_idx]) {
-        char buffer[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE];
-        char fname[ACC_OPENCL_MAXSTRLEN];
-        int cl_level_major, nchar = LIBXSMM_SNPRINTF(fname, sizeof(fname),
-                              /* kernel name are meant to be unambiguous (BLAS-typeprefix and kernelsize) */
-                              "x" OPENCL_LIBSMM_KERNELNAME_SMM "%ix%ix%i", m_max, n_max, k_max);
-        const char* extensions[] = {NULL, NULL};
+        char buffer[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE], fname[ACC_OPENCL_MAXSTRLEN];
+        int nchar = LIBXSMM_SNPRINTF(fname, sizeof(fname),
+          /* kernel names are meant to be unambiguous (BLAS-typeprefix and kernelsize) */
+          "x" OPENCL_LIBSMM_KERNELNAME_SMM "%ix%ix%i", m_max, n_max, k_max);
         cl_device_id active_device = NULL;
-        cl_device_type device_type = 0;
 #    if defined(__DBCSR_ACC)
         int routine_handle;
         c_dbcsr_timeset(LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR, LIBSMM_ACC_PROCESS_ROUTINE_NAME_LENPTR, &routine_handle);
@@ -1176,65 +1173,23 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
                     ? clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &active_device, NULL)
                     : EXIT_FAILURE);
         if (EXIT_SUCCESS == result) {
-          result = c_dbcsr_acc_opencl_device_level(
-            active_device, &cl_level_major, NULL /*level_minor*/, NULL /*cl_std*/, &device_type);
-        }
-        if (EXIT_SUCCESS == result) {
-          const char *tname = NULL, *atomic_type = "";
-          int std_c11 = 0;
+          c_dbcsr_acc_opencl_atomic_fp_t tkind = c_dbcsr_acc_opencl_atomic_fp_no;
+          const char* tname = NULL;
           switch (datatype) {
             case dbcsr_type_real_8: {
-              extensions[0] = "cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics";
+              tkind = c_dbcsr_acc_opencl_atomic_fp_64;
               tname = "double";
               fname[0] = 'd';
-              if (2 <= cl_level_major && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
-                atomic_type = "-DTA=long -DTA2=atomic_long -DTF=atomic_double";
-                std_c11 = 1;
-              }
-              else {
-                extensions[0] = "cl_khr_fp64 cl_khr_int64_base_atomics";
-                if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
-                  atomic_type = "-DTA=long";
-                }
-                else { /* fallback */
-                  extensions[0] = "cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics";
-                  if (2 <= cl_level_major && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
-                    atomic_type = "-DATOMIC32_ADD64 -DTA=int -DTA2=atomic_int -DTF=atomic_double";
-                    std_c11 = 1;
-                  }
-                  else {
-                    extensions[0] = "cl_khr_fp64 cl_khr_global_int32_base_atomics";
-                    if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
-                      atomic_type = "-DATOMIC32_ADD64 -DTA=int";
-                    }
-                    else tname = NULL;
-                  }
-                }
-              }
             } break;
             case dbcsr_type_real_4: {
-              extensions[0] = "cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics";
-              if (2 <= cl_level_major && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
-                extensions[1] = "cl_khr_int64_base_atomics cl_khr_int64_extended_atomics";
-                atomic_type = "-DTA=int -DTA2=atomic_int -DTF=atomic_float";
-                std_c11 = 1;
-                tname = "float";
-                fname[0] = 's';
-              }
-              else {
-                extensions[0] = "cl_khr_global_int32_base_atomics";
-                if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
-                  extensions[1] = "cl_khr_int64_base_atomics";
-                  atomic_type = "-DTA=int";
-                  tname = "float";
-                  fname[0] = 's';
-                }
-              }
+              tkind = c_dbcsr_acc_opencl_atomic_fp_32;
+              tname = "float";
+              fname[0] = 's';
             } break;
             default: assert(NULL == tname);
           }
           if (NULL != tname) {
-            const char* const env_devid = getenv("OPENCL_LIBSMM_SMM_DEVID");
+            const char *extensions[] = {NULL, NULL}, *const env_devid = getenv("OPENCL_LIBSMM_SMM_DEVID");
             const unsigned int devuid = (NULL == env_devid || '\0' == *env_devid) ? devinfo->uid
                                                                                   : (unsigned int)strtoul(env_devid, NULL, 0);
             size_t wgsize_max, wgsize_prf, sgs = 0;
@@ -1257,12 +1212,6 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
               const char *const env_ab = getenv("OPENCL_LIBSMM_SMM_AB"), *const env_ac = getenv("OPENCL_LIBSMM_SMM_AC");
               const char *const env_xf = getenv("OPENCL_LIBSMM_SMM_XF"), *const env_cl = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS");
               const char* const intel_xf = "-cl-intel-256-GRF-per-thread";
-              const int cl_nonv = (0 != devinfo->intel || EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(
-                                                                            active_device, "nvidia", 0 /*use_platform_name*/));
-              const int cl_noamd =
-                (0 != devinfo->intel || !cl_nonv ||
-                  (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 0 /*use_platform_name*/) &&
-                    EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "amd", 1 /*use_platform_name*/)));
               const int default_lu = (0 != devinfo->intel ? -1 : 0);
               const int unroll = LIBXSMM_MAX(-2, (NULL == env_lu || '\0' == *env_lu)
                                                    ? (0 == kernel_idx ? (NULL == config ? default_lu : config->lu) : default_lu)
@@ -1305,7 +1254,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
                 0, 1);
               new_config.al = LIBXSMM_CLMP(
                 (NULL == env_al || '\0' == *env_al)
-                  ? (cl_noamd ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) : 1)
+                  ? (0 == devinfo->amd ? (0 == kernel_idx ? (NULL == config ? /*default*/ 0 : config->al) : /*default*/ 0) : 1)
                   : atoi(env_al),
                 0, 1);
               new_config.tb = LIBXSMM_CLMP((NULL == env_tb || '\0' == *env_tb)
@@ -1336,7 +1285,9 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
                   : atoi(env_ac),
                 0, 1);
               if (NULL == env_xf || '\0' == *env_xf) {
-                if (0 == devinfo->intel || NULL == env_cl || NULL == strstr(env_cl, intel_xf)) {
+                if (0 == devinfo->intel || CL_DEVICE_TYPE_GPU != devinfo->type || NULL == env_cl ||
+                    NULL == strstr(env_cl, intel_xf))
+                {
                   new_config.flags = (NULL == config ? /*default*/ 0 : config->flags);
                 }
                 else new_config.flags = 1;
@@ -1418,143 +1369,40 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
               }
               if (new_config.wgsize[kernel_idx] <= wgsize_max) { /* SMMs can be potentially handled by device */
                 const char* const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant");
-                const char* const env_barrier = getenv("OPENCL_LIBSMM_SMM_BARRIER");
-                const char* const env_atomics = getenv("OPENCL_LIBSMM_SMM_ATOMICS");
                 const char* const env_nrepeat = getenv("SMM_NREPEAT");
                 const int typesize = OPENCL_LIBSMM_TYPESIZE(datatype);
                 const int slm_a = (1 != new_config.aa ? 0 : (LIBXSMM_ISPOT(k_max * typesize) + 1));
                 const int slm_b = (1 != new_config.ab ? 0 : (LIBXSMM_ISPOT(k_max * typesize) + 1));
                 const int slm_c = (1 != new_config.ac ? 0 : (LIBXSMM_ISPOT(m_max * typesize) + 1));
-                const char *barrier_expr = NULL, *atomic_ops = "";
-                const char *atomic_exp = NULL, *atomic_expr2 = "";
-                if (NULL == env_barrier || '0' != *env_barrier) {
-                  barrier_expr = ((0 != std_c11 && (0 == devinfo->intel || (CL_DEVICE_TYPE_CPU != device_type)))
-                                    ? "-D\"BARRIER(A)=work_group_barrier(A,memory_scope_work_group)\""
-                                    : "-D\"BARRIER(A)=barrier(A)\"");
-                }
-                else barrier_expr = ""; /* no barrier */
-                assert(NULL != barrier_expr);
-                if (NULL == env_atomics || '0' != *env_atomics) {
-                  /* atomics_force: attempt to force atomics without confirmation */
-                  const int atomics_force = ((NULL == env_atomics || '\0' == *env_atomics) ? 0 : atoi(env_atomics));
-                  if (NULL == env_atomics || '\0' == *env_atomics || 0 != atomics_force) {
-                    cl_bitfield fp_atomics;
-                    assert(dbcsr_type_real_8 == datatype || dbcsr_type_real_4 == datatype);
-                    if (CL_SUCCESS == clGetDeviceInfo(active_device,
-                                        (cl_device_info)(dbcsr_type_real_8 == datatype ? 0x4232 : 0x4231), sizeof(cl_bitfield),
-                                        &fp_atomics, NULL) &&
-                        0 != (/*add*/ (1 << 1) & fp_atomics))
-                    {
-                      extensions[1] = "cl_ext_float_atomics";
-                      atomic_exp = (dbcsr_type_real_8 == datatype
-                                      ? "atomic_fetch_add_explicit((GLOBAL_VOLATILE(atomic_double)*)A,B,"
-                                        "memory_order_relaxed,memory_scope_work_group)"
-                                      : "atomic_fetch_add_explicit((GLOBAL_VOLATILE(atomic_float)*)A,B,"
-                                        "memory_order_relaxed,memory_scope_work_group)");
-                    }
-                    else if ((0 != devinfo->intel && 0x4905 != devinfo->uid && 0 == devinfo->unified) || 0 != atomics_force) {
-                      if ((0 != devinfo->intel &&
-                            (dbcsr_type_real_4 == datatype || (0x0bd0 <= devinfo->uid && 0x0bdb >= devinfo->uid))) ||
-                          (0 != atomics_force))
-                      {
-                        if (0 == atomics_force && (0 == devinfo->intel || 0x0bd0 > devinfo->uid || 0x0bdb < devinfo->uid)) {
-                          extensions[1] = "cl_intel_global_float_atomics";
-                          atomic_ops = "-Dcl_intel_global_float_atomics";
-                        }
-                        else {
-                          atomic_ops = ((0 == std_c11 && 2 > atomics_force)
-                                          ? "-DATOMIC_PROTOTYPES=1"
-                                          : (3 > atomics_force ? "-DATOMIC_PROTOTYPES=2" : "-DATOMIC_PROTOTYPES=3"));
-                        }
-                        atomic_exp = ((0 == std_c11 && 2 > atomics_force) ? "atomic_add(A,B)"
-                                                                          : "atomic_fetch_add_explicit((GLOBAL_VOLATILE(TF)*)A,B,"
-                                                                            "memory_order_relaxed,memory_scope_work_group)");
-                      }
-                      else {
-                        atomic_exp = "atomic_add_global_cmpxchg(A,B)";
-                        atomic_ops = "-DCMPXCHG=atom_cmpxchg";
-                      }
-                    }
-                    else if (cl_nonv) {
-                      int gfx90 = 0;
-                      if (!cl_noamd && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_device, buffer, ACC_OPENCL_BUFFERSIZE,
-                                                         NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1))
-                      {
-                        const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx");
-                        if (NULL != gfxname && 90 <= atoi(gfxname + 3)) gfx90 = 1;
-                      }
-                      if (0 == gfx90) {
-                        if (NULL != extensions[1] && 1 < bs && 1 == new_config.bn && new_config.bm >= m_max && 0 == new_config.al &&
-                            (0 == (m_max & 1) || (0 == devinfo->intel /*&& cl_nonv*/)) /* TODO */
-                            && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1))
-                        {
-                          assert(dbcsr_type_real_4 == datatype);
-                          atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A,B)\"";
-                        }
-                        else {
-                          extensions[1] = NULL;
-                        }
-                        atomic_exp = "atomic_add_global_cmpxchg(A,B)";
-                        atomic_ops = (dbcsr_type_real_4 == datatype ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg");
-                      }
-                      else {
-                        atomic_exp = (dbcsr_type_real_8 == datatype
-                                        ? "__builtin_amdgcn_global_atomic_fadd_f64(A,B,__ATOMIC_RELAXED)"
-                                        : "__builtin_amdgcn_global_atomic_fadd_f32(A,B,__ATOMIC_RELAXED)");
-                      }
-                    }
-                    else {
-                      assert(NULL != atomic_ops && '\0' == *atomic_ops);
-                      atomic_exp = "atomic_add_global_xchg(A,B)";
-                    }
-                  }
-                  else if (NULL != LIBXSMM_STRISTR(env_atomics, "cmpxchg")) {
-                    if (NULL != extensions[1] && 1 < bs && 1 == new_config.bn && new_config.bm >= m_max && 0 == new_config.al &&
-                        (0 == (m_max & 1) || (0 == devinfo->intel && cl_nonv)) /* TODO */
-                        && '2' == env_atomics[strlen(env_atomics) - 1] &&
-                        EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1))
-                    {
-                      assert(dbcsr_type_real_4 == datatype);
-                      atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A,B)\"";
-                    }
-                    else {
-                      extensions[1] = NULL;
-                    }
-                    atomic_exp = "atomic_add_global_cmpxchg(A,B)";
-                    atomic_ops = (dbcsr_type_real_4 == datatype ? "-DCMPXCHG=atomic_cmpxchg" : "-DCMPXCHG=atom_cmpxchg");
-                  }
-                  else {
-                    atomic_exp = "atomic_add_global_xchg(A,B)";
-                    atomic_ops = (dbcsr_type_real_4 == datatype ? "-DXCHG=atomic_xchg" : "-DXCHG=atom_xchg");
-                  }
-                }
-                else { /* unsynchronized */
-                  assert(NULL != env_atomics);
-                  atomic_exp = "*(A)+=(B)"; /* non-atomic update */
-                }
-                assert(NULL != atomic_exp);
                 /* compose build parameters and flags */
                 nchar = LIBXSMM_SNPRINTF(build_params, sizeof(build_params),
-                  "-DMAD=fma -DINTEL=%u -DGLOBAL=%s -DSWG=%i -DSGS=%i -DFN=%s -DREPEAT=%i -DLU=%i "
-                  "-DSM=%i -DSN=%i -DSK=%i -DBS=%i -DVL=%i %s -DBM=%i -DBN=%i -DBK=%i -DT=%s -DTN=%i "
-                  "%s %s %s %s %s %s %s %s %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s %s",
-                  0 != devinfo->intel ? devinfo->uid : 0, cmem, (int)new_config.wgsize[kernel_idx], (int)sgs, fname,
+                  "-DMAD=fma -DT=%s -DINTEL=%u -DGLOBAL=%s -DSWG=%i -DSGS=%i -DFN=%s -DREPEAT=%i -DLU=%i "
+                  "-DSM=%i -DSN=%i -DSK=%i -DBS=%i -DVL=%i %s -DBM=%i -DBN=%i -DBK=%i "
+                  "%s %s %s %s %s %s %s %s ", /* space! */
+                  tname, 0 != devinfo->intel ? devinfo->uid : 0, cmem, (int)new_config.wgsize[kernel_idx], (int)sgs, fname,
                   NULL == env_nrepeat ? 1 : atoi(env_nrepeat), new_config.lu, m_max, n_max, k_max, bs, OPENCL_LIBSMM_VMIN,
-                  bs == new_config.bs ? "-DBSC" : "", new_config.bm, new_config.bn, new_config.bk, tname, datatype,
+                  bs == new_config.bs ? "-DBSC" : "", new_config.bm, new_config.bn, new_config.bk,
+                  0 == new_config.tb ? "" : "-DTRACK_B", 0 != new_config.tc ? "-DTRACK_C" : "",
                   0 == new_config.nz ? "" : "-DATOMIC_INC_NZ", 0 == new_config.al ? "" : "-DAL",
-                  0 == new_config.tb ? "" : "-DTRACK_B", 0 != new_config.tc ? "-DTRACK_C" : "", 0 == new_config.ap ? "" : "-DSLM_P",
+                  0 == new_config.ap ? "" : "-DSLM_P",
                   0 == new_config.aa ? "" : (1 == slm_a ? "-DSLM_A=1" : (0 != slm_a ? "-DSLM_A=2" : "-DREG_A")),
                   0 == new_config.ab ? "" : (1 == slm_b ? "-DSLM_B=1" : (0 != slm_b ? "-DSLM_B=2" : "-DREG_B")),
-                  0 == new_config.ac ? "" : (1 == slm_c ? "-DSLM_C=1" : "-DSLM_C=2"), atomic_type, atomic_ops, atomic_exp,
-                  atomic_expr2, barrier_expr);
+                  0 == new_config.ac ? "" : (1 == slm_c ? "-DSLM_C=1" : "-DSLM_C=2"));
+                /* apply support for FP-atomics */
+                if (0 < nchar && (int)sizeof(build_params) > nchar) {
+                  nchar = c_dbcsr_acc_opencl_flags_atomics(active_device, tkind, devinfo, extensions,
+                    sizeof(extensions) / sizeof(*extensions), build_params + nchar, sizeof(build_params) - nchar);
+                }
+                else result = EXIT_FAILURE;
                 if (0 < nchar && (int)sizeof(build_params) > nchar) {
                   const char* const cl_debug = (
 #    if !defined(NDBGDEV)
-                    (0 != devinfo->intel && CL_DEVICE_TYPE_CPU != device_type) ? "-gline-tables-only" :
+                    (0 != devinfo->intel && CL_DEVICE_TYPE_CPU != devinfo->type) ? "-gline-tables-only" :
 #    endif
-                                                                               "");
-                  nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "-cl-fast-relaxed-math -cl-denorms-are-zero %s %s %s",
-                    NULL == env_cl ? "" : env_cl, (0 == new_config.flags || 0 == devinfo->intel) ? "" : intel_xf, cl_debug);
+                                                                                 "");
+                  nchar = LIBXSMM_SNPRINTF(buffer, sizeof(buffer), "%s %s -cl-fast-relaxed-math -cl-denorms-are-zero %s",
+                    (0 == new_config.flags || 0 == devinfo->intel || CL_DEVICE_TYPE_GPU != devinfo->type) ? "" : intel_xf, cl_debug,
+                    NULL == env_cl ? "" : env_cl);
                   if (0 >= nchar || (int)sizeof(buffer) <= nchar) result = EXIT_FAILURE;
                 }
                 else result = EXIT_FAILURE;