@@ -27,7 +27,7 @@ bool device_init()
27
27
success &= error.check (" zfp device init - hipMalloc" );
28
28
29
29
// launch a dummy kernel
30
- hipLaunchKernelGGL ( device_init_kernel, 1 , 1 , 0 , 0 , d_word);
30
+ device_init_kernel<<< 1 , 1 >>>( d_word);
31
31
success &= error.check (" zfp device init - kernel" );
32
32
33
33
// allocate host memory
@@ -194,25 +194,22 @@ Word* setup_device_index_decompress(zfp_stream* stream)
194
194
return d_index;
195
195
}
196
196
197
- bool setup_device_compact (size_t * chunk_size, unsigned long long ** d_offsets , size_t * lcubtemp , void ** d_cubtemp , uint processors)
197
+ bool setup_device_compact (size_t * chunk_size, unsigned long long ** d_offset , size_t * cubtmp_size , void ** d_cubtmp , uint processors)
198
198
{
199
+ // use 1K threads per SM for high occupancy (assumes one thread per zfp block)
199
200
const size_t threads_per_sm = 1024 ;
200
- // Assuming 1 thread = 1 ZFP block,
201
- // launching 1024 threads per SM should give a decent occupancy
202
201
*chunk_size = processors * threads_per_sm;
203
- size_t size = (*chunk_size + 1 ) * sizeof ( unsigned long long );
202
+
204
203
// allocate and zero-initialize offsets
205
- if (!device_calloc (d_offsets, size, " offsets" ))
204
+ const size_t size = (*chunk_size + 1 ) * sizeof (unsigned long long );
205
+ if (!device_calloc (d_offset, size, " offsets" ))
206
206
return false ;
207
207
208
- // TODO : error handling for CUB
209
- // Using CUB for the prefix sum. CUB needs a bit of temp memory too
210
- size_t tempsize;
211
- hipcub::DeviceScan::InclusiveSum (nullptr , tempsize, *d_offsets, *d_offsets, *chunk_size + 1 );
212
- *lcubtemp = tempsize;
213
- if (!device_malloc (d_cubtemp, tempsize, " offsets" )) {
214
- device_free (d_offsets);
215
- *d_offsets = NULL ;
208
+ // allocate temporary memory for CUB prefix sum
209
+ if (hipcub::DeviceScan::InclusiveSum (nullptr , *cubtmp_size, *d_offset, *d_offset, *chunk_size + 1 ) != hipSuccess ||
210
+ !device_malloc (d_cubtmp, *cubtmp_size, " offsets" )) {
211
+ device_free (d_offset);
212
+ *d_offset = NULL ;
216
213
return false ;
217
214
}
218
215
@@ -268,8 +265,7 @@ void* setup_device_field_decompress(const zfp_field* field, void*& d_begin)
268
265
}
269
266
270
267
// copy from device to host (if needed) and deallocate device memory
271
- // TODO: d_begin should be first argument, with begin = NULL as default
272
- void cleanup_device (void * begin, void * d_begin, size_t bytes = 0 )
268
+ void cleanup_device (void * d_begin, void * begin = 0 , size_t bytes = 0 )
273
269
{
274
270
if (d_begin != begin) {
275
271
// copy data from device to host and free device memory
0 commit comments