Skip to content

Commit e827635

Browse files
committed
Hipify latest CUDA updates
1 parent 22c86e9 commit e827635

File tree

9 files changed

+437
-373
lines changed

9 files changed

+437
-373
lines changed

src/hip/decode1.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ decode1(
132132
#endif
133133

134134
// launch GPU kernel
135-
hipLaunchKernelGGL(HIP_KERNEL_NAME(decode1_kernel<Scalar>), grid_size, block_size, 0, 0,
135+
decode1_kernel<Scalar><<<grid_size, block_size>>>(
136136
d_data,
137137
size[0],
138138
stride[0],

src/hip/decode2.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ decode2(
143143
#endif
144144

145145
// launch GPU kernel
146-
hipLaunchKernelGGL(HIP_KERNEL_NAME(decode2_kernel<Scalar>), grid_size, block_size, 0, 0,
146+
decode2_kernel<Scalar><<<grid_size, block_size>>>(
147147
d_data,
148148
make_size2(size[0], size[1]),
149149
make_ptrdiff2(stride[0], stride[1]),

src/hip/decode3.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ decode3(
153153
#endif
154154

155155
// launch GPU kernel
156-
hipLaunchKernelGGL(HIP_KERNEL_NAME(decode3_kernel<Scalar>), grid_size, block_size, 0, 0,
156+
decode3_kernel<Scalar><<<grid_size, block_size>>>(
157157
d_data,
158158
make_size3(size[0], size[1], size[2]),
159159
make_ptrdiff3(stride[0], stride[1], stride[2]),

src/hip/device.h

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ bool device_init()
2727
success &= error.check("zfp device init - hipMalloc");
2828

2929
// launch a dummy kernel
30-
hipLaunchKernelGGL(device_init_kernel, 1, 1, 0, 0, d_word);
30+
device_init_kernel<<<1, 1>>>(d_word);
3131
success &= error.check("zfp device init - kernel");
3232

3333
// allocate host memory
@@ -194,25 +194,22 @@ Word* setup_device_index_decompress(zfp_stream* stream)
194194
return d_index;
195195
}
196196

197-
bool setup_device_compact(size_t* chunk_size, unsigned long long** d_offsets, size_t* lcubtemp, void** d_cubtemp, uint processors)
197+
bool setup_device_compact(size_t* chunk_size, unsigned long long** d_offset, size_t* cubtmp_size, void** d_cubtmp, uint processors)
198198
{
199+
// use 1K threads per SM for high occupancy (assumes one thread per zfp block)
199200
const size_t threads_per_sm = 1024;
200-
// Assuming 1 thread = 1 ZFP block,
201-
// launching 1024 threads per SM should give a decent occupancy
202201
*chunk_size = processors * threads_per_sm;
203-
size_t size = (*chunk_size + 1) * sizeof(unsigned long long);
202+
204203
// allocate and zero-initialize offsets
205-
if (!device_calloc(d_offsets, size, "offsets"))
204+
const size_t size = (*chunk_size + 1) * sizeof(unsigned long long);
205+
if (!device_calloc(d_offset, size, "offsets"))
206206
return false;
207207

208-
// TODO : error handling for CUB
209-
// Using CUB for the prefix sum. CUB needs a bit of temp memory too
210-
size_t tempsize;
211-
hipcub::DeviceScan::InclusiveSum(nullptr, tempsize, *d_offsets, *d_offsets, *chunk_size + 1);
212-
*lcubtemp = tempsize;
213-
if (!device_malloc(d_cubtemp, tempsize, "offsets")) {
214-
device_free(d_offsets);
215-
*d_offsets = NULL;
208+
// allocate temporary memory for CUB prefix sum
209+
if (hipcub::DeviceScan::InclusiveSum(nullptr, *cubtmp_size, *d_offset, *d_offset, *chunk_size + 1) != hipSuccess ||
210+
!device_malloc(d_cubtmp, *cubtmp_size, "offsets")) {
211+
device_free(d_offset);
212+
*d_offset = NULL;
216213
return false;
217214
}
218215

@@ -268,8 +265,7 @@ void* setup_device_field_decompress(const zfp_field* field, void*& d_begin)
268265
}
269266

270267
// copy from device to host (if needed) and deallocate device memory
271-
// TODO: d_begin should be first argument, with begin = NULL as default
272-
void cleanup_device(void* begin, void* d_begin, size_t bytes = 0)
268+
void cleanup_device(void* d_begin, void* begin = 0, size_t bytes = 0)
273269
{
274270
if (d_begin != begin) {
275271
// copy data from device to host and free device memory

src/hip/encode1.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ encode1(
111111
#endif
112112

113113
// launch GPU kernel
114-
hipLaunchKernelGGL(HIP_KERNEL_NAME(encode1_kernel<Scalar>), grid_size, block_size, 0, 0,
114+
encode1_kernel<Scalar><<<grid_size, block_size>>>(
115115
d_data,
116116
size[0],
117117
stride[0],

src/hip/encode2.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ encode2(
126126
#endif
127127

128128
// launch GPU kernel
129-
hipLaunchKernelGGL(HIP_KERNEL_NAME(encode2_kernel<Scalar>), grid_size, block_size, 0, 0,
129+
encode2_kernel<Scalar><<<grid_size, block_size>>>(
130130
d_data,
131131
make_size2(size[0], size[1]),
132132
make_ptrdiff2(stride[0], stride[1]),

src/hip/encode3.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ encode3_kernel(
7373
if (block_idx >= blocks)
7474
return;
7575

76-
// logical position in 2d array
76+
// logical position in 3d array
7777
size_t pos = block_idx;
7878
const ptrdiff_t x = (pos % bx) * 4; pos /= bx;
7979
const ptrdiff_t y = (pos % by) * 4; pos /= by;
@@ -139,7 +139,7 @@ encode3(
139139
#endif
140140

141141
// launch GPU kernel
142-
hipLaunchKernelGGL(HIP_KERNEL_NAME(encode3_kernel<Scalar>), grid_size, block_size, 0, 0,
142+
encode3_kernel<Scalar><<<grid_size, block_size>>>(
143143
d_data,
144144
make_size3(size[0], size[1], size[2]),
145145
make_ptrdiff3(stride[0], stride[1], stride[2]),

src/hip/interface.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,12 @@ zfp_internal_hip_compress(zfp_stream* stream, const zfp_field* field)
119119
if (d_index) {
120120
const size_t size = zfp_field_blocks(field) * sizeof(ushort);
121121
// TODO: assumes index stores block sizes
122-
zfp::hip::internal::cleanup_device(stream->index ? stream->index->data : NULL, d_index, size);
122+
zfp::hip::internal::cleanup_device(d_index, stream->index ? stream->index->data : NULL, size);
123123
}
124124

125125
// copy stream from device to host if needed and free temporary buffers
126-
zfp::hip::internal::cleanup_device(stream->stream->begin, d_stream, stream_bytes);
127-
zfp::hip::internal::cleanup_device(zfp_field_begin(field), d_begin);
126+
zfp::hip::internal::cleanup_device(d_stream, stream->stream->begin, stream_bytes);
127+
zfp::hip::internal::cleanup_device(d_begin, zfp_field_begin(field));
128128

129129
// update bit stream to point just past produced data
130130
if (bits_written)
@@ -220,10 +220,10 @@ zfp_internal_hip_decompress(zfp_stream* stream, zfp_field* field)
220220

221221
// copy field from device to host if needed and free temporary buffers
222222
size_t field_bytes = zfp_field_size_bytes(field);
223-
zfp::hip::internal::cleanup_device(zfp_field_begin(field), d_begin, field_bytes);
224-
zfp::hip::internal::cleanup_device(stream->stream->begin, d_stream);
223+
zfp::hip::internal::cleanup_device(d_begin, zfp_field_begin(field), field_bytes);
224+
zfp::hip::internal::cleanup_device(d_stream, stream->stream->begin);
225225
if (d_index)
226-
zfp::hip::internal::cleanup_device(stream->index->data, d_index);
226+
zfp::hip::internal::cleanup_device(d_index, stream->index->data);
227227

228228
// update bit stream to point just past consumed data
229229
if (bits_read)

0 commit comments

Comments
 (0)