forked from paboyle/GridBench
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Macros.h
81 lines (69 loc) · 2.44 KB
/
Macros.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#pragma once
/////////////////////////////////////////////////
// Looping constructs for OpenMP and offload
// Can we add SyCL ??
/////////////////////////////////////////////////
#ifndef __NVCC__
// Host / non-CUDA compilation path: the accelerator_* qualifiers reduce to
// plain host inlining and the loop macros expand to ordinary serial for loops.
#define accelerator_inline inline
#define accelerator_func
// thread_loop((init;cond;inc), body): expands to a plain for loop.
// NOTE(review): despite the banner mentioning OpenMP, no `#pragma omp` is
// emitted here -- presumably parallelisation happens elsewhere; confirm.
#define thread_loop( range , ... ) for range { __VA_ARGS__ ; };
// accelerator_loopN(i, num, body): serial loop i = 0 .. num-1 on the host,
// mirroring the CUDA offload version defined in the #else branch below.
#define accelerator_loopN( iterator, num, ... ) thread_loop( (int iterator = 0;iterator<num;iterator++), { __VA_ARGS__ });
#else
////////////////////
// CUDA target
////////////////////
// Compile the same inline helpers for both host and device.
#define accelerator_inline __host__ __device__ inline
#define accelerator_func __host__ __device__
// Marks a function as a CUDA kernel entry point.
#define kernel_call __global__
template<typename lambda> kernel_call
void LambdaApply(uint64_t base, uint64_t Num, lambda Lambda)
{
  // One element per CUDA thread, flat 1-D grid indexing: invokes
  // Lambda(base + tid) for tid in [0, Num).
  uint64_t tid = blockDim.x*blockIdx.x + threadIdx.x;
  // Tail guard: the grid is rounded up, so surplus threads do nothing.
  if ( tid >= Num ) return;
  Lambda(base + tid);
}
/* thread_loop((init;cond;inc), body): plain serial for loop (host side). */
#define thread_loop( range , ... ) for range { __VA_ARGS__ ; };
/* Debug variant: run the "offloaded" body serially on the host instead. */
#define accelerator_loopN_debug( iterator, num, ... ) thread_loop( (int iterator = 0;iterator<num;iterator++), { __VA_ARGS__ });
/*
 * accelerator_loopN(iterator, num, body):
 * Offload `body` to the GPU as an extended __host__ __device__ lambda,
 * launching ceil(num/gpu_threads) blocks of gpu_threads threads each
 * (gpu_threads is an externally defined global -- see its definition site).
 * Synchronises and aborts on any CUDA error; cudaGetLastError() is called
 * after the sync so it surfaces both launch-config and execution errors
 * (CUDA errors are sticky).
 *
 * Fixes vs. previous revision:
 *  - body wrapped in braces so the typedef is scoped to the expansion,
 *    allowing more than one use per enclosing scope and safe use after
 *    an unbraced `if`;
 *  - error path exits with a failure status (was exit(0), i.e. "success");
 *  - canonical cudaError_t spelling.
 */
#define accelerator_loopN( iterator, num, ... ) \
{ \
  typedef decltype(num) Iterator; \
  if ( num > 0 ) { \
    auto lambda = [=] __host__ __device__ (Iterator iterator) mutable { \
      __VA_ARGS__; \
    }; \
    Iterator base = 0; \
    Iterator num_block = (num+gpu_threads-1)/gpu_threads; \
    LambdaApply<<<num_block,gpu_threads>>>(base,num,lambda); \
    cudaDeviceSynchronize(); \
    cudaError_t err = cudaGetLastError(); \
    if ( cudaSuccess != err ) { \
      printf("Cuda error %s\n",cudaGetErrorString( err )); \
      exit(1); \
    } \
  } \
}
#endif
///////////////////////////////////////////////////////
// GPU each thread does one SIMD lane of work
// Host each thread must loop over SIMD lanes
// Use these inline routines to decide what to do according
// to host or device. By using a vector length Nsimd we get Nsimd level
// of read coalescing on the GPU.
// CPU loops over lanes provide compatibility and debug with hopefully
// some level of compiler vectorisation in good enough compilers (not holding breath)...
///////////////////////////////////////////////////////
accelerator_inline int get_my_lanes(int Nsimd)
{
  // How many SIMD lanes this execution agent must process itself.
#if defined(__CUDA_ARCH__)
  // Device compilation: each GPU thread handles exactly one lane.
  return 1;
#else
  // Host compilation: a single thread iterates over all Nsimd lanes.
  return Nsimd;
#endif
}
accelerator_inline int get_my_lane_offset(int Nsimd)
{
  // First SIMD lane this execution agent should operate on.
#if defined(__CUDA_ARCH__)
  // Device: derive the lane from the thread index within the block, so
  // consecutive threads read consecutive lanes (coalesced access).
  return (int)(threadIdx.x % Nsimd);
#else
  // Host: start at lane 0 and loop over get_my_lanes(Nsimd) lanes.
  return 0;
#endif
}