Skip to content

Commit

Permalink
ocl: pointer-arithmetic for device-pointers
Browse files Browse the repository at this point in the history
* Fallback to main-thread's stream (c_dbcsr_acc_opencl_stream_default).
* Fixed c_dbcsr_acc_opencl_stream_default and reduced one level of indirection.
* Reworked entire memory allocation (determining offsets).
* Improved error checks and introduced more assertions.
* ACC_OPENCL_MEM_OFFSET is now mandatory.
* Tightened memory facility (locks).
* Improved locking stream facility.
* Adjusted UNROLL-control.
  • Loading branch information
hfp committed Jan 30, 2024
1 parent 00311f4 commit 5f5d059
Show file tree
Hide file tree
Showing 7 changed files with 382 additions and 366 deletions.
153 changes: 77 additions & 76 deletions src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -524,46 +524,47 @@ int c_dbcsr_acc_init(void) {
else {
result = EXIT_FAILURE;
}
c_dbcsr_acc_opencl_config.clmems = NULL;
c_dbcsr_acc_opencl_config.events = NULL;
c_dbcsr_acc_opencl_config.clmem_info = NULL;
c_dbcsr_acc_opencl_config.event_info = NULL;
c_dbcsr_acc_opencl_config.nclmems = c_dbcsr_acc_opencl_config.nevents = 0;
c_dbcsr_acc_opencl_config.clmems = c_dbcsr_acc_opencl_config.events = NULL;
c_dbcsr_acc_opencl_config.storage = NULL;
# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER && defined(ACC_OPENCL_HANDLES_MAXCOUNT) && \
(0 < ACC_OPENCL_HANDLES_MAXCOUNT)
if (EXIT_SUCCESS == result) {
const size_t nhandles = ACC_OPENCL_HANDLES_MAXCOUNT * c_dbcsr_acc_opencl_config.nthreads;
# if defined(ACC_OPENCL_MEM_OFFSET)
c_dbcsr_acc_opencl_config.nclmems = nhandles;
c_dbcsr_acc_opencl_config.clmems = (void**)malloc(sizeof(void*) * nhandles);
c_dbcsr_acc_opencl_config.storage = malloc(sizeof(void*) * (nhandles + nhandles));
if (NULL != c_dbcsr_acc_opencl_config.clmems && NULL != c_dbcsr_acc_opencl_config.storage) {
libxsmm_pmalloc_init(sizeof(void*), &c_dbcsr_acc_opencl_config.nclmems, c_dbcsr_acc_opencl_config.clmems,
(void**)c_dbcsr_acc_opencl_config.storage + nhandles);
c_dbcsr_acc_opencl_config.clmems = (c_dbcsr_acc_opencl_info_ptr_t**)malloc(
sizeof(c_dbcsr_acc_opencl_info_ptr_t*) * nhandles);
c_dbcsr_acc_opencl_config.clmem_info = (c_dbcsr_acc_opencl_info_ptr_t*)malloc(
sizeof(c_dbcsr_acc_opencl_info_ptr_t) * nhandles);
if (NULL != c_dbcsr_acc_opencl_config.clmems && NULL != c_dbcsr_acc_opencl_config.clmem_info) {
libxsmm_pmalloc_init(sizeof(c_dbcsr_acc_opencl_info_ptr_t), &c_dbcsr_acc_opencl_config.nclmems,
(void**)c_dbcsr_acc_opencl_config.clmems, c_dbcsr_acc_opencl_config.clmem_info);
}
else {
free(c_dbcsr_acc_opencl_config.clmems);
free(c_dbcsr_acc_opencl_config.clmem_info);
c_dbcsr_acc_opencl_config.clmem_info = NULL;
c_dbcsr_acc_opencl_config.clmems = NULL;
c_dbcsr_acc_opencl_config.nclmems = 0;
result = EXIT_FAILURE;
}
# else
c_dbcsr_acc_opencl_config.storage = malloc(sizeof(void*) * nhandles);
# endif
c_dbcsr_acc_opencl_config.nevents = nhandles;
c_dbcsr_acc_opencl_config.events = (void**)malloc(sizeof(void*) * nhandles);
if (NULL != c_dbcsr_acc_opencl_config.events && NULL != c_dbcsr_acc_opencl_config.storage) {
c_dbcsr_acc_opencl_config.event_info = malloc(sizeof(void*) * nhandles);
if (NULL != c_dbcsr_acc_opencl_config.events && NULL != c_dbcsr_acc_opencl_config.event_info) {
libxsmm_pmalloc_init(sizeof(void*), &c_dbcsr_acc_opencl_config.nevents, c_dbcsr_acc_opencl_config.events,
c_dbcsr_acc_opencl_config.storage);
c_dbcsr_acc_opencl_config.event_info);
}
else {
free(c_dbcsr_acc_opencl_config.events);
free(c_dbcsr_acc_opencl_config.event_info);
c_dbcsr_acc_opencl_config.event_info = NULL;
c_dbcsr_acc_opencl_config.events = NULL;
c_dbcsr_acc_opencl_config.nevents = 0;
result = EXIT_FAILURE;
}
if (EXIT_SUCCESS != result) {
free(c_dbcsr_acc_opencl_config.storage);
c_dbcsr_acc_opencl_config.storage = NULL;
}
}
# endif
if (EXIT_SUCCESS == result) {
Expand Down Expand Up @@ -649,8 +650,10 @@ int c_dbcsr_acc_finalize(void) {
}
}
/* release/reset buffers */
free(c_dbcsr_acc_opencl_config.clmems);
free(c_dbcsr_acc_opencl_config.clmem_info);
free(c_dbcsr_acc_opencl_config.events);
free(c_dbcsr_acc_opencl_config.storage);
free(c_dbcsr_acc_opencl_config.event_info);
free(c_dbcsr_acc_opencl_config.streams);
/* clear configuration */
memset(&c_dbcsr_acc_opencl_config, 0, sizeof(c_dbcsr_acc_opencl_config));
Expand Down Expand Up @@ -972,80 +975,78 @@ int c_dbcsr_acc_opencl_set_active_device(int thread_id, int device_id) {
assert(0 <= thread_id && thread_id < c_dbcsr_acc_opencl_config.nthreads);
assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_DEVICES_MAXCOUNT);
if (0 <= device_id && device_id < c_dbcsr_acc_opencl_config.ndevices) {
static volatile int lock;
assert(NULL != c_dbcsr_acc_opencl_config.device);
LIBXSMM_ATOMIC_ACQUIRE(&lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED);
active_id = c_dbcsr_acc_opencl_config.devices[device_id];
if (NULL != active_id) {
# if defined(_OPENMP)
# pragma omp critical(c_dbcsr_acc_set_active_device)
# endif
{
int inherit_id = 0;
const cl_context context = c_dbcsr_acc_opencl_device_context(active_id, &inherit_id);
const cl_context inherit = c_dbcsr_acc_opencl_config.device[inherit_id].context;
if (NULL != context) {
if (context != inherit) {
if (NULL != inherit) {
c_dbcsr_acc_opencl_config.device[inherit_id].context = NULL;
result = clReleaseContext(inherit);
}
else if (thread_id != inherit_id) {
c_dbcsr_acc_opencl_config.device[inherit_id].context = context;
result = clRetainContext(context);
}
}
}
else if (NULL == c_dbcsr_acc_opencl_config.device[thread_id].context) {
result = c_dbcsr_acc_opencl_create_context(thread_id, active_id);
if (EXIT_SUCCESS == result && NULL /*context*/ != inherit) {
c_dbcsr_acc_opencl_config.device[inherit_id].context = c_dbcsr_acc_opencl_config.device[thread_id].context;
int inherit_id = 0;
const cl_context context = c_dbcsr_acc_opencl_device_context(active_id, &inherit_id);
const cl_context inherit = c_dbcsr_acc_opencl_config.device[inherit_id].context;
if (NULL != context) {
if (context != inherit) {
if (NULL != inherit) {
c_dbcsr_acc_opencl_config.device[inherit_id].context = NULL;
result = clReleaseContext(inherit);
}
else if (thread_id != inherit_id) {
c_dbcsr_acc_opencl_config.device[inherit_id].context = context;
result = clRetainContext(context);
}
}
if (EXIT_SUCCESS == result) { /* update/cache device-specific information */
result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device[thread_id].level,
c_dbcsr_acc_opencl_config.device[thread_id].level + 1, NULL /*cl_std*/,
&c_dbcsr_acc_opencl_config.device[thread_id].type);
if (EXIT_SUCCESS == result) {
char devname[ACC_OPENCL_BUFFERSIZE];
}
else if (NULL == c_dbcsr_acc_opencl_config.device[thread_id].context) {
result = c_dbcsr_acc_opencl_create_context(thread_id, active_id);
if (EXIT_SUCCESS == result && NULL /*context*/ != inherit) {
c_dbcsr_acc_opencl_config.device[inherit_id].context = c_dbcsr_acc_opencl_config.device[thread_id].context;
result = clReleaseContext(inherit);
}
}
if (EXIT_SUCCESS == result) { /* update/cache device-specific information */
result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device[thread_id].level,
c_dbcsr_acc_opencl_config.device[thread_id].level + 1, NULL /*cl_std*/,
&c_dbcsr_acc_opencl_config.device[thread_id].type);
if (EXIT_SUCCESS == result) {
char devname[ACC_OPENCL_BUFFERSIZE];
# if defined(CL_VERSION_2_0)
const char* const env_svm = getenv("ACC_OPENCL_SVM");
c_dbcsr_acc_opencl_config.device[thread_id].svm_interop =
((NULL == env_svm || 2 > *c_dbcsr_acc_opencl_config.device[thread_id].level) ? 0 : atoi(env_svm));
const char* const env_svm = getenv("ACC_OPENCL_SVM");
c_dbcsr_acc_opencl_config.device[thread_id].svm_interop =
((NULL == env_svm || 2 > *c_dbcsr_acc_opencl_config.device[thread_id].level) ? 0 : atoi(env_svm));
# endif
if (CL_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool),
&c_dbcsr_acc_opencl_config.device[thread_id].unified, NULL))
{
c_dbcsr_acc_opencl_config.device[thread_id].unified = CL_FALSE;
}
if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/,
0 /*platform_maxlen*/, /*cleanup*/ 1) ||
EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device[thread_id].uid))
{
c_dbcsr_acc_opencl_config.device[thread_id].uid = (cl_uint)-1;
}
c_dbcsr_acc_opencl_config.device[thread_id].intel = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(
active_id, "intel", 0 /*use_platform_name*/));
c_dbcsr_acc_opencl_config.device[thread_id].nv = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(
active_id, "nvidia", 0 /*use_platform_name*/));
if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) ||
EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/))
if (CL_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool),
&c_dbcsr_acc_opencl_config.device[thread_id].unified, NULL))
{
c_dbcsr_acc_opencl_config.device[thread_id].unified = CL_FALSE;
}
if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/,
0 /*platform_maxlen*/, /*cleanup*/ 1) ||
EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device[thread_id].uid))
{
c_dbcsr_acc_opencl_config.device[thread_id].uid = (cl_uint)-1;
}
c_dbcsr_acc_opencl_config.device[thread_id].intel = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(
active_id, "intel", 0 /*use_platform_name*/));
c_dbcsr_acc_opencl_config.device[thread_id].nv = (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(
active_id, "nvidia", 0 /*use_platform_name*/));
if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) ||
EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/))
{
char buffer[ACC_OPENCL_BUFFERSIZE];
c_dbcsr_acc_opencl_config.device[thread_id].amd = 1;
if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_id, buffer, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/,
0 /*platform_maxlen*/, /*cleanup*/ 1))
{
char buffer[ACC_OPENCL_BUFFERSIZE];
c_dbcsr_acc_opencl_config.device[thread_id].amd = 1;
if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_name(active_id, buffer, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/,
0 /*platform_maxlen*/, /*cleanup*/ 1))
{
const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx");
if (NULL != gfxname && 90 <= atoi(gfxname + 3)) {
c_dbcsr_acc_opencl_config.device[thread_id].amd = 2;
}
const char* const gfxname = LIBXSMM_STRISTR(buffer, "gfx");
if (NULL != gfxname && 90 <= atoi(gfxname + 3)) {
c_dbcsr_acc_opencl_config.device[thread_id].amd = 2;
}
}
}
}
}
}
else result = EXIT_FAILURE;
LIBXSMM_ATOMIC_RELEASE(&lock, LIBXSMM_ATOMIC_RELAXED);
}
return result;
}
Expand Down
81 changes: 35 additions & 46 deletions src/acc/opencl/acc_opencl.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,18 +95,10 @@
#if !defined(ACC_OPENCL_STREAMS_MAXCOUNT)
# define ACC_OPENCL_STREAMS_MAXCOUNT 64
#endif
#if !defined(ACC_OPENCL_OVERMALLOC)
# if defined(__DBCSR_ACC) || 1
# define ACC_OPENCL_OVERMALLOC 0
# else
# define ACC_OPENCL_OVERMALLOC 8192
# endif
#endif
/* First char is CSV-separator by default (w/o spaces) */
#if !defined(ACC_OPENCL_DELIMS)
# define ACC_OPENCL_DELIMS ",;"
#endif

#if !defined(ACC_OPENCL_LAZYINIT) && (defined(__DBCSR_ACC) || 1)
# define ACC_OPENCL_LAZYINIT
#endif
Expand All @@ -123,12 +115,6 @@
#if !defined(ACC_OPENCL_STREAM_NULL) && 1
# define ACC_OPENCL_STREAM_NULL
#endif

/** Automatically determine cl_mem offset */
#if !defined(ACC_OPENCL_MEM_OFFSET) && 1
# define ACC_OPENCL_MEM_OFFSET
#endif

/** Use DBCSR's profile for detailed timings */
#if !defined(ACC_OPENCL_PROFILE) && 0
# define ACC_OPENCL_PROFILE
Expand Down Expand Up @@ -238,12 +224,32 @@ typedef struct c_dbcsr_acc_opencl_device_t {
cl_int intel, amd, nv;
} c_dbcsr_acc_opencl_device_t;

/**
 * Information about host/device-memory pointer.
 * Pairs an OpenCL memory object with a raw pointer. The same record type is
 * returned by c_dbcsr_acc_opencl_info_hostptr and c_dbcsr_acc_opencl_info_devptr.
 */
typedef struct c_dbcsr_acc_opencl_info_ptr_t {
/** OpenCL memory object this record refers to. */
cl_mem memory;
/**
 * Raw pointer associated with the memory object.
 * NOTE(review): presumably the mapped host address for host allocations and
 * the pointer handed out to the caller for device allocations - confirm
 * against the memory implementation (not visible here).
 */
void* memptr;
} c_dbcsr_acc_opencl_info_ptr_t;

/**
 * Information about streams (c_dbcsr_acc_stream_create).
 * A pointer to this record is what c_dbcsr_acc_opencl_info_stream returns.
 */
typedef struct c_dbcsr_acc_opencl_info_stream_t {
/** Untyped pointer kept with the stream. NOTE(review): exact role is not visible in this header - TODO confirm. */
void* pointer;
/** Priority of the stream. NOTE(review): presumably the value c_dbcsr_acc_opencl_stream_priority refers to - confirm. */
int priority;
/** NOTE(review): presumably the ID of the thread owning the stream (streams are partitioned by thread-ID) - confirm. */
int tid;
} c_dbcsr_acc_opencl_info_stream_t;

/**
 * Enumeration of timer kinds used for built-in execution-profile.
 * Enumerators carry no explicit values, hence device = 0 and host = 1.
 */
typedef enum c_dbcsr_acc_opencl_timer_t {
/** NOTE(review): presumably timings are taken on the device side - confirm. */
c_dbcsr_acc_opencl_timer_device,
/** NOTE(review): presumably timings are taken with a host-side timer - confirm. */
c_dbcsr_acc_opencl_timer_host
} c_dbcsr_acc_opencl_timer_t;

/**
 * Enumeration of FP-atomic kinds.
 * Passed as the "kind" argument of c_dbcsr_acc_opencl_flags_atomics.
 */
typedef enum c_dbcsr_acc_opencl_atomic_fp_t {
/** No floating-point atomics requested. */
c_dbcsr_acc_opencl_atomic_fp_no = 0,
/** NOTE(review): presumably atomics on 32-bit (single-precision) values - confirm. */
c_dbcsr_acc_opencl_atomic_fp_32 = 1,
/** NOTE(review): presumably atomics on 64-bit (double-precision) values - confirm. */
c_dbcsr_acc_opencl_atomic_fp_64 = 2
} c_dbcsr_acc_opencl_atomic_fp_t;

/**
* Settings discovered/setup during c_dbcsr_acc_init (independent of the device)
* and settings updated during c_dbcsr_acc_set_active_device (devinfo).
Expand All @@ -255,8 +261,10 @@ typedef struct c_dbcsr_acc_opencl_config_t {
c_dbcsr_acc_opencl_device_t* device;
/** Handle-counter. */
size_t nclmems, nevents;
/** All handles and related storage. */
void **clmems, **events, *storage;
/** All events and related storage. */
void **events, *event_info;
/** All clmems and related storage. */
c_dbcsr_acc_opencl_info_ptr_t **clmems, *clmem_info;
/** All created streams partitioned by thread-ID (thread-local slots). */
void** streams;
/** Kind of timer used for built-in execution-profile. */
Expand Down Expand Up @@ -290,30 +298,20 @@ extern c_dbcsr_acc_opencl_config_t c_dbcsr_acc_opencl_config;
cl_context c_dbcsr_acc_opencl_context(int* thread_id);
/** Share context for given device (start searching at optional thread_id), or return NULL. */
cl_context c_dbcsr_acc_opencl_device_context(cl_device_id device, const int* thread_id);

/** Information about host-memory pointer (c_dbcsr_acc_host_mem_allocate). */
typedef struct c_dbcsr_acc_opencl_info_hostptr_t {
cl_mem memory;
void* mapped;
} c_dbcsr_acc_opencl_info_hostptr_t;
c_dbcsr_acc_opencl_info_hostptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory);

/** Determines cl_mem object and offset of memory. */
void* c_dbcsr_acc_opencl_info_devptr(const void* memory, size_t elsize, const size_t* amount, size_t* offset);

/** Information about streams (c_dbcsr_acc_stream_create). */
typedef struct c_dbcsr_acc_opencl_info_stream_t {
void* pointer;
int priority;
int tid;
} c_dbcsr_acc_opencl_info_stream_t;
/** Determines cl_mem object and storage pointer. */
c_dbcsr_acc_opencl_info_ptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory);
/** Determines cl_mem object and memory offset (device). */
c_dbcsr_acc_opencl_info_ptr_t* c_dbcsr_acc_opencl_info_devptr(
const void* memory, size_t elsize, const size_t* amount, size_t* offset);
/** Determines information about stream. */
c_dbcsr_acc_opencl_info_stream_t* c_dbcsr_acc_opencl_info_stream(void* stream);
/** Determines a stream's priority. */
const int* c_dbcsr_acc_opencl_stream_priority(const void* stream);

/** Finds an existing stream for the given thread-ID (or NULL). */
void* c_dbcsr_acc_opencl_stream(int thread_id);
/** Determines default-stream (see ACC_OPENCL_STREAM_NULL). */
void* c_dbcsr_acc_opencl_stream_default(void);

/** Get host-pointer associated with device-memory (c_dbcsr_acc_dev_mem_allocate). */
void* c_dbcsr_acc_opencl_get_hostptr(cl_mem memory);
/** Like c_dbcsr_acc_memset_zero, but supporting an arbitrary value used as initialization pattern. */
int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nbytes, void* stream);
/** Amount of device memory; local memory is only non-zero if separate from global. */
int c_dbcsr_acc_opencl_info_devmem(cl_device_id device, size_t* mem_free, size_t* mem_total, size_t* mem_local, int* mem_unified);
Expand Down Expand Up @@ -350,18 +348,9 @@ int c_dbcsr_acc_opencl_kernel(int source_is_file, const char source[], const cha
int c_dbcsr_acc_opencl_device_synchronize(int thread_id);
/** Create user-event if not created and sets initial state. */
int c_dbcsr_acc_opencl_event_create(cl_event* event_p);

/** Enumeration of FP-atomic kinds. */
typedef enum c_dbcsr_acc_opencl_atomic_fp_t {
c_dbcsr_acc_opencl_atomic_fp_no = 0,
c_dbcsr_acc_opencl_atomic_fp_32 = 1,
c_dbcsr_acc_opencl_atomic_fp_64 = 2
} c_dbcsr_acc_opencl_atomic_fp_t;

/** Assemble flags to support atomic operations. */
int c_dbcsr_acc_opencl_flags_atomics(cl_device_id device_id, c_dbcsr_acc_opencl_atomic_fp_t kind,
const c_dbcsr_acc_opencl_device_t* devinfo, const char* exts[], int exts_maxlen, char flags[], size_t flags_maxlen);

/** Combines build-params and build-options, some optional flags (try_build_options), and applies language std. (cl_std). */
int c_dbcsr_acc_opencl_flags(const char build_params[], const char build_options[], const char try_build_options[],
const char cl_std[], char buffer[], size_t buffer_size);
Expand Down
2 changes: 1 addition & 1 deletion src/acc/opencl/acc_opencl_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# if !defined(ACC_OPENCL_EVENT_BARRIER) && 0
# define ACC_OPENCL_EVENT_BARRIER
# endif
# if !defined(ACC_OPENCL_EVENT_CREATE) && 0
# if !defined(ACC_OPENCL_EVENT_CREATE) && 1
# define ACC_OPENCL_EVENT_CREATE
# endif

Expand Down
Loading

0 comments on commit 5f5d059

Please sign in to comment.