Skip to content

Commit 22c86e9

Browse files
committed
variable.cuh clean-up: fourth pass; further bug fixes
- fix number of thread blocks to launch (caused HIP version to hang)
- change cleanup_device() signature so only device pointer can be passed
- clean up comments
1 parent aec4786 commit 22c86e9

File tree

4 files changed

+60
-58
lines changed

4 files changed

+60
-58
lines changed

src/cuda/device.cuh

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -193,25 +193,22 @@ Word* setup_device_index_decompress(zfp_stream* stream)
193193
return d_index;
194194
}
195195

196-
bool setup_device_compact(size_t* chunk_size, unsigned long long** d_offsets, size_t* lcubtemp, void** d_cubtemp, uint processors)
196+
bool setup_device_compact(size_t* chunk_size, unsigned long long** d_offset, size_t* cubtmp_size, void** d_cubtmp, uint processors)
197197
{
198+
// use 1K threads per SM for high occupancy (assumes one thread per zfp block)
198199
const size_t threads_per_sm = 1024;
199-
// Assuming 1 thread = 1 ZFP block,
200-
// launching 1024 threads per SM should give a decent occupancy
201200
*chunk_size = processors * threads_per_sm;
202-
size_t size = (*chunk_size + 1) * sizeof(unsigned long long);
201+
203202
// allocate and zero-initialize offsets
204-
if (!device_calloc(d_offsets, size, "offsets"))
203+
const size_t size = (*chunk_size + 1) * sizeof(unsigned long long);
204+
if (!device_calloc(d_offset, size, "offsets"))
205205
return false;
206206

207-
// TODO : error handling for CUB
208-
// Using CUB for the prefix sum. CUB needs a bit of temp memory too
209-
size_t tempsize;
210-
cub::DeviceScan::InclusiveSum(nullptr, tempsize, *d_offsets, *d_offsets, *chunk_size + 1);
211-
*lcubtemp = tempsize;
212-
if (!device_malloc(d_cubtemp, tempsize, "offsets")) {
213-
device_free(d_offsets);
214-
*d_offsets = NULL;
207+
// allocate temporary memory for CUB prefix sum
208+
if (cub::DeviceScan::InclusiveSum(nullptr, *cubtmp_size, *d_offset, *d_offset, *chunk_size + 1) != cudaSuccess ||
209+
!device_malloc(d_cubtmp, *cubtmp_size, "offsets")) {
210+
device_free(d_offset);
211+
*d_offset = NULL;
215212
return false;
216213
}
217214

@@ -267,8 +264,7 @@ void* setup_device_field_decompress(const zfp_field* field, void*& d_begin)
267264
}
268265

269266
// copy from device to host (if needed) and deallocate device memory
270-
// TODO: d_begin should be first argument, with begin = NULL as default
271-
void cleanup_device(void* begin, void* d_begin, size_t bytes = 0)
267+
void cleanup_device(void* d_begin, void* begin = 0, size_t bytes = 0)
272268
{
273269
if (d_begin != begin) {
274270
// copy data from device to host and free device memory

src/cuda/encode3.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ encode3_kernel(
7272
if (block_idx >= blocks)
7373
return;
7474

75-
// logical position in 2d array
75+
// logical position in 3d array
7676
size_t pos = block_idx;
7777
const ptrdiff_t x = (pos % bx) * 4; pos /= bx;
7878
const ptrdiff_t y = (pos % by) * 4; pos /= by;

src/cuda/interface.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,12 @@ zfp_internal_cuda_compress(zfp_stream* stream, const zfp_field* field)
119119
if (d_index) {
120120
const size_t size = zfp_field_blocks(field) * sizeof(ushort);
121121
// TODO: assumes index stores block sizes
122-
zfp::cuda::internal::cleanup_device(stream->index ? stream->index->data : NULL, d_index, size);
122+
zfp::cuda::internal::cleanup_device(d_index, stream->index ? stream->index->data : NULL, size);
123123
}
124124

125125
// copy stream from device to host if needed and free temporary buffers
126-
zfp::cuda::internal::cleanup_device(stream->stream->begin, d_stream, stream_bytes);
127-
zfp::cuda::internal::cleanup_device(zfp_field_begin(field), d_begin);
126+
zfp::cuda::internal::cleanup_device(d_stream, stream->stream->begin, stream_bytes);
127+
zfp::cuda::internal::cleanup_device(d_begin, zfp_field_begin(field));
128128

129129
// update bit stream to point just past produced data
130130
if (bits_written)
@@ -220,10 +220,10 @@ zfp_internal_cuda_decompress(zfp_stream* stream, zfp_field* field)
220220

221221
// copy field from device to host if needed and free temporary buffers
222222
size_t field_bytes = zfp_field_size_bytes(field);
223-
zfp::cuda::internal::cleanup_device(zfp_field_begin(field), d_begin, field_bytes);
224-
zfp::cuda::internal::cleanup_device(stream->stream->begin, d_stream);
223+
zfp::cuda::internal::cleanup_device(d_begin, zfp_field_begin(field), field_bytes);
224+
zfp::cuda::internal::cleanup_device(d_stream, stream->stream->begin);
225225
if (d_index)
226-
zfp::cuda::internal::cleanup_device(stream->index->data, d_index);
226+
zfp::cuda::internal::cleanup_device(d_index, stream->index->data);
227227

228228
// update bit stream to point just past consumed data
229229
if (bits_read)

src/cuda/variable.cuh

Lines changed: 42 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ void
1818
copy_length_kernel(
1919
unsigned long long* d_offset, // block offsets; first is base of prefix sum
2020
const ushort* d_length, // block lengths in bits
21-
size_t blocks_per_chunk // number of blocks in chunk to process
21+
uint blocks_per_chunk // number of blocks in chunk to process
2222
)
2323
{
24-
size_t block = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
24+
uint block = threadIdx.x + blockIdx.x * blockDim.x;
2525
if (block < blocks_per_chunk)
2626
d_offset[block + 1] = d_length[block];
2727
}
@@ -31,7 +31,7 @@ void
3131
copy_length_launch(
3232
unsigned long long* d_offset, // block offsets; first is base of prefix sum
3333
const ushort* d_length, // block lengths in bits
34-
size_t blocks_per_chunk // number of blocks in chunk to process
34+
uint blocks_per_chunk // number of blocks in chunk to process
3535
)
3636
{
3737
dim3 blocks((int)count_up(blocks_per_chunk, 1024), 1, 1);
@@ -206,23 +206,32 @@ compact_stream_kernel(
206206
// block. The caller also allocates shared memory for sm_in and sm_out.
207207

208208
cg::grid_group grid = cg::this_grid();
209-
extern __shared__ uint32 sm_in[]; // sm_in[num_tiles * words_per_slot]
210-
uint32* sm_out = sm_in + num_tiles * words_per_slot; // sm_out[num_tiles * words_per_slot + 2]
211-
212-
const uint tid = threadIdx.x + threadIdx.y * tile_size; // thread within thread block
213-
const uint blocks_per_group = gridDim.x * num_tiles; // number of blocks per group
214-
const uint first_subchunk_block = blockIdx.x * num_tiles; // first block in this subchunk
215-
216-
// zero-initialize compacted shared-memory buffer (also done in process())
209+
// sm_in[num_tiles * words_per_slot]
210+
extern __shared__ uint32 sm_in[];
211+
// sm_out[num_tiles * words_per_slot + 2]
212+
uint32* sm_out = sm_in + num_tiles * words_per_slot;
213+
// thread within thread block
214+
const uint tid = threadIdx.x + threadIdx.y * tile_size;
215+
// number of blocks per group
216+
const uint blocks_per_group = gridDim.x * num_tiles;
217+
// first block in this subchunk
218+
const uint first_subchunk_block = blockIdx.x * num_tiles;
219+
220+
// zero-initialize compacted buffer (also done in store_subchunk())
217221
for (uint i = tid; i < num_tiles * words_per_slot + 2; i += num_tiles * tile_size)
218222
sm_out[i] = 0;
219223

220224
// compact chunk one group at a time
221225
for (uint i = 0; i < blocks_per_chunk; i += blocks_per_group) {
222-
const uint base_block = first_subchunk_block + i; // first block in this subchunk
223-
const uint block = base_block + threadIdx.y; // block assigned to this thread
226+
// first block in this subchunk
227+
const uint base_block = first_subchunk_block + i;
228+
// block assigned to this thread
229+
const uint block = base_block + threadIdx.y;
230+
// is this thread block assigned any compressed blocks?
224231
const bool active_thread_block = (base_block < blocks_per_chunk);
225-
const bool valid_block = (block < blocks_per_chunk); // is thread assigned to valid block?
232+
// is this thread assigned to valid block?
233+
const bool valid_block = (block < blocks_per_chunk);
234+
// destination offset to beginning of subchunk in compacted stream
226235
const unsigned long long base_offset = active_thread_block ? d_offset[base_block] : 0;
227236

228237
unsigned long long offset_out = 0;
@@ -277,29 +286,28 @@ compact_stream_launch(
277286
uint processors // number of device multiprocessors
278287
)
279288
{
280-
// Increase the number of threads per zfp block ("tile") as bits_per_slot increases
281-
// Compromise between coalescing, inactive threads and shared memory size <= 48KB
282-
// Total shared memory used = (2 * num_tiles * words_per_slot + 2) x 32-bit dynamic shared memory
283-
// and num_tiles x 32-bit static shared memory.
284-
// The extra 2 elements of dynamic shared memory are needed to handle unaligned output data
285-
// and potential zero-padding to the next multiple of 64 bits.
286-
// Block sizes set so that the shared memory stays < 48KB.
289+
// Assign number of threads ("tile_size") per zfp block in proportion to
290+
// bits_per_slot. Compromise between coalescing, keeping threads active,
291+
// and limiting shared memory usage. The total dynamic shared memory used
292+
// equals (2 * num_tiles * words_per_slot + 2) 32-bit words. The extra
293+
// two words of shared memory are needed to handle output data that is not
294+
// aligned on 32-bit words. The number of zfp blocks per thread block
295+
// ("num_tiles") is set to ensure that shared memory is at most 48 KB.
287296

288297
const uint words_per_slot = count_up(bits_per_slot, 32);
289298
const size_t shmem = (2 * num_tiles * words_per_slot + 2) * sizeof(uint32);
290299

291300
// compute number of blocks to process concurrently
292-
int max_blocks = 0;
301+
int thread_blocks = 0;
293302
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
294-
&max_blocks,
303+
&thread_blocks,
295304
compact_stream_kernel<tile_size, num_tiles>,
296305
tile_size * num_tiles,
297306
shmem
298307
);
299-
max_blocks *= processors;
300-
max_blocks = min(max_blocks, blocks_per_chunk);
308+
thread_blocks *= processors;
309+
thread_blocks = min(thread_blocks, (int)count_up(blocks_per_chunk, num_tiles));
301310

302-
const dim3 threads(tile_size, num_tiles, 1);
303311
void* kernel_args[] = {
304312
(void *)&d_stream,
305313
(void *)&d_offset,
@@ -311,8 +319,8 @@ compact_stream_launch(
311319

312320
return cudaLaunchCooperativeKernel(
313321
(void *)compact_stream_kernel<tile_size, num_tiles>,
314-
dim3(max_blocks, 1, 1),
315-
threads,
322+
dim3(thread_blocks, 1, 1),
323+
dim3(tile_size, num_tiles, 1),
316324
kernel_args,
317325
shmem,
318326
0
@@ -381,14 +389,12 @@ compact_stream(
381389
bool success = true;
382390
unsigned long long* d_offset;
383391
size_t chunk_size;
384-
size_t lcubtemp;
385-
void* d_cubtemp;
392+
size_t cubtmp_size;
393+
void* d_cubtmp;
386394

387-
if (!setup_device_compact(&chunk_size, &d_offset, &lcubtemp, &d_cubtemp, processors))
395+
if (!setup_device_compact(&chunk_size, &d_offset, &cubtmp_size, &d_cubtmp, processors))
388396
return 0;
389397

390-
printf("chunk_size=%zu\n", chunk_size);
391-
392398
// perform compaction one chunk of blocks at a time
393399
for (size_t block = 0; block < blocks && success; block += chunk_size) {
394400
// determine chunk size
@@ -398,7 +404,7 @@ printf("chunk_size=%zu\n", chunk_size);
398404
copy_length_launch(d_offset, d_length + block, blocks_per_chunk);
399405

400406
// compute prefix sum to turn block lengths into offsets
401-
cub::DeviceScan::InclusiveSum(d_cubtemp, lcubtemp, d_offset, d_offset, blocks_per_chunk + 1);
407+
cub::DeviceScan::InclusiveSum(d_cubtmp, cubtmp_size, d_offset, d_offset, blocks_per_chunk + 1);
402408

403409
// compact the stream in place
404410
if (!compact_stream_chunk((uint32*)d_stream, d_offset, block, blocks_per_chunk, bits_per_slot, processors))
@@ -413,8 +419,8 @@ printf("chunk_size=%zu\n", chunk_size);
413419
}
414420

415421
// free temporary buffers
416-
cleanup_device(NULL, d_offset);
417-
cleanup_device(NULL, d_cubtemp);
422+
cleanup_device(d_offset);
423+
cleanup_device(d_cubtmp);
418424

419425
return bits_written;
420426
}

0 commit comments

Comments (0)