copy_length_kernel(
  unsigned long long* d_offset, // block offsets; first is base of prefix sum
  const ushort* d_length,       // block lengths in bits
- size_t blocks_per_chunk       // number of blocks in chunk to process
+ uint blocks_per_chunk         // number of blocks in chunk to process
)
{
- size_t block = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
+ uint block = threadIdx.x + blockIdx.x * blockDim.x;
  if (block < blocks_per_chunk)
    d_offset[block + 1] = d_length[block];
}

copy_length_launch(
  unsigned long long* d_offset, // block offsets; first is base of prefix sum
  const ushort* d_length,       // block lengths in bits
- size_t blocks_per_chunk       // number of blocks in chunk to process
+ uint blocks_per_chunk         // number of blocks in chunk to process
)
{
  dim3 blocks((int)count_up(blocks_per_chunk, 1024), 1, 1);
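Note: count_up is used here and again in compact_stream_launch below for grid
sizing. A minimal sketch of its assumed ceiling-division definition (not taken
from this commit):

  // smallest number of size-y groups that covers x items (assumed definition)
  template <typename T>
  inline __host__ __device__ T count_up(T x, T y) { return (x + y - 1) / y; }

  // example: count_up(1000000, 1024) = 977, so one million zfp blocks
  // map to 977 thread blocks of 1024 threads each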
@@ -206,23 +206,32 @@ compact_stream_kernel(
  // block. The caller also allocates shared memory for sm_in and sm_out.

  cg::grid_group grid = cg::this_grid();
- extern __shared__ uint32 sm_in[]; // sm_in[num_tiles * words_per_slot]
- uint32* sm_out = sm_in + num_tiles * words_per_slot; // sm_out[num_tiles * words_per_slot + 2]
-
- const uint tid = threadIdx.x + threadIdx.y * tile_size; // thread within thread block
- const uint blocks_per_group = gridDim.x * num_tiles; // number of blocks per group
- const uint first_subchunk_block = blockIdx.x * num_tiles; // first block in this subchunk
-
- // zero-initialize compacted shared-memory buffer (also done in process())
+ // sm_in[num_tiles * words_per_slot]
+ extern __shared__ uint32 sm_in[];
+ // sm_out[num_tiles * words_per_slot + 2]
+ uint32* sm_out = sm_in + num_tiles * words_per_slot;
+ // thread within thread block
+ const uint tid = threadIdx.x + threadIdx.y * tile_size;
+ // number of blocks per group
+ const uint blocks_per_group = gridDim.x * num_tiles;
+ // first block in this subchunk
+ const uint first_subchunk_block = blockIdx.x * num_tiles;
+
+ // zero-initialize compacted buffer (also done in store_subchunk())
  for (uint i = tid; i < num_tiles * words_per_slot + 2; i += num_tiles * tile_size)
    sm_out[i] = 0;

  // compact chunk one group at a time
  for (uint i = 0; i < blocks_per_chunk; i += blocks_per_group) {
-   const uint base_block = first_subchunk_block + i; // first block in this subchunk
-   const uint block = base_block + threadIdx.y; // block assigned to this thread
+   // first block in this subchunk
+   const uint base_block = first_subchunk_block + i;
+   // block assigned to this thread
+   const uint block = base_block + threadIdx.y;
+   // is this thread block assigned any compressed blocks?
    const bool active_thread_block = (base_block < blocks_per_chunk);
-   const bool valid_block = (block < blocks_per_chunk); // is thread assigned to valid block?
+   // is this thread assigned to a valid block?
+   const bool valid_block = (block < blocks_per_chunk);
+   // destination offset to beginning of subchunk in compacted stream
    const unsigned long long base_offset = active_thread_block ? d_offset[base_block] : 0;

    unsigned long long offset_out = 0;
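Note: to make the sm_in/sm_out layout concrete, a worked size calculation under
hypothetical parameters (the num_tiles and bits_per_slot values below are
illustrative, not from this commit):

  // assume bits_per_slot = 1024 -> words_per_slot = count_up(1024, 32) = 32
  // assume num_tiles = 32 zfp blocks per thread block
  // sm_in  : num_tiles * words_per_slot     = 1024 uint32 words
  // sm_out : num_tiles * words_per_slot + 2 = 1026 uint32 words
  // total  : (2 * 32 * 32 + 2) * sizeof(uint32) = 8200 bytes of dynamic shared
  //          memory, comfortably under the 48 KB limit cited in compact_stream_launch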
@@ -277,29 +286,28 @@ compact_stream_launch(
  uint processors // number of device multiprocessors
)
{
- // Increase the number of threads per zfp block ("tile") as bits_per_slot increases
- // Compromise between coalescing, inactive threads and shared memory size <= 48KB
- // Total shared memory used = (2 * num_tiles * words_per_slot + 2) x 32-bit dynamic shared memory
- // and num_tiles x 32-bit static shared memory.
- // The extra 2 elements of dynamic shared memory are needed to handle unaligned output data
- // and potential zero-padding to the next multiple of 64 bits.
- // Block sizes set so that the shared memory stays < 48KB.
+ // Assign number of threads ("tile_size") per zfp block in proportion to
+ // bits_per_slot. Compromise between coalescing, keeping threads active,
+ // and limiting shared memory usage. The total dynamic shared memory used
+ // equals (2 * num_tiles * words_per_slot + 2) 32-bit words. The extra
+ // two words of shared memory are needed to handle output data that is not
+ // aligned on 32-bit words. The number of zfp blocks per thread block
+ // ("num_tiles") is set to ensure that shared memory is at most 48 KB.

  const uint words_per_slot = count_up(bits_per_slot, 32);
  const size_t shmem = (2 * num_tiles * words_per_slot + 2) * sizeof(uint32);

  // compute number of blocks to process concurrently
- int max_blocks = 0;
+ int thread_blocks = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-   &max_blocks,
+   &thread_blocks,
    compact_stream_kernel<tile_size, num_tiles>,
    tile_size * num_tiles,
    shmem
  );
- max_blocks *= processors;
- max_blocks = min(max_blocks, blocks_per_chunk);
+ thread_blocks *= processors;
+ thread_blocks = min(thread_blocks, (int)count_up(blocks_per_chunk, num_tiles));

- const dim3 threads(tile_size, num_tiles, 1);
  void* kernel_args[] = {
    (void*)&d_stream,
    (void*)&d_offset,
@@ -311,8 +319,8 @@ compact_stream_launch(

  return cudaLaunchCooperativeKernel(
    (void*)compact_stream_kernel<tile_size, num_tiles>,
-   dim3(max_blocks, 1, 1),
-   threads,
+   dim3(thread_blocks, 1, 1),
+   dim3(tile_size, num_tiles, 1),
    kernel_args,
    shmem,
    0
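Note: the grid sizing above is the standard occupancy-driven pattern for
cooperative launches, where every thread block must be resident on the device
at once. A self-contained sketch of the same pattern (kernel and names are
placeholders, not zfp's):

  #include <algorithm>
  #include <cuda_runtime.h>

  __global__ void kernel(int* data) { /* cooperative kernel body */ }

  cudaError_t launch(int* d_data, int work_items, int threads_per_block, size_t shmem)
  {
    // max resident thread blocks per SM for this kernel and configuration
    int blocks_per_sm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel,
                                                  threads_per_block, shmem);

    // cooperative kernels must fit on the device in a single wave, so the grid
    // is capped at occupancy times SM count, then clamped to the work available
    int sm_count = 0;
    cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, 0);
    int grid = std::min(blocks_per_sm * sm_count,
                        (work_items + threads_per_block - 1) / threads_per_block);

    void* args[] = { (void*)&d_data };
    return cudaLaunchCooperativeKernel((void*)kernel, dim3(grid, 1, 1),
                                       dim3(threads_per_block, 1, 1), args, shmem, 0);
  }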
@@ -381,14 +389,12 @@ compact_stream(
  bool success = true;
  unsigned long long* d_offset;
  size_t chunk_size;
- size_t lcubtemp;
- void* d_cubtemp;
+ size_t cubtmp_size;
+ void* d_cubtmp;

- if (!setup_device_compact(&chunk_size, &d_offset, &lcubtemp, &d_cubtemp, processors))
+ if (!setup_device_compact(&chunk_size, &d_offset, &cubtmp_size, &d_cubtmp, processors))
    return 0;

- printf("chunk_size=%zu\n", chunk_size);
-
  // perform compaction one chunk of blocks at a time
  for (size_t block = 0; block < blocks && success; block += chunk_size) {
    // determine chunk size
@@ -398,7 +404,7 @@ printf("chunk_size=%zu\n", chunk_size);
    copy_length_launch(d_offset, d_length + block, blocks_per_chunk);

    // compute prefix sum to turn block lengths into offsets
-   cub::DeviceScan::InclusiveSum(d_cubtemp, lcubtemp, d_offset, d_offset, blocks_per_chunk + 1);
+   cub::DeviceScan::InclusiveSum(d_cubtmp, cubtmp_size, d_offset, d_offset, blocks_per_chunk + 1);

    // compact the stream in place
    if (!compact_stream_chunk((uint32*)d_stream, d_offset, block, blocks_per_chunk, bits_per_slot, processors))
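Note: the InclusiveSum call follows CUB's two-phase convention: invoked with a
null temporary buffer it only reports the scratch size required, and a second
call performs the scan (presumably the allocation is what setup_device_compact
handles). A minimal standalone sketch of that pattern:

  #include <cub/cub.cuh>

  void prefix_sum(unsigned long long* d_offset, int n)
  {
    // phase 1: null temporary storage; only queries the required scratch size
    void* d_temp = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_offset, d_offset, n);

    // phase 2: allocate scratch space and run the scan in place
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_offset, d_offset, n);
    cudaFree(d_temp);
  }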
@@ -413,8 +419,8 @@ printf("chunk_size=%zu\n", chunk_size);
  }

  // free temporary buffers
- cleanup_device(NULL, d_offset);
- cleanup_device(NULL, d_cubtemp);
+ cleanup_device(d_offset);
+ cleanup_device(d_cubtmp);

  return bits_written;
}
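Note: a worked illustration of the offset layout these kernels maintain (the
lengths are hypothetical, in bits). copy_length_kernel writes block lengths at
index block + 1 while d_offset[0] carries the running base, so the inclusive
scan over blocks_per_chunk + 1 elements yields absolute offsets:

  // before scan: d_offset = { 1000, 96, 128, 64 }     // base, then 3 block lengths
  // after scan:  d_offset = { 1000, 1096, 1224, 1288 }
  // d_offset[b]   : bit offset where compacted block b begins
  // d_offset[b+1] : bit offset where block b ends
  // the last entry seeds the base for the next chunk's prefix sum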