Skip to content

Commit 99ee483

Browse files
committed
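Fixed the CUDA and HIP `make test` targets to run the example binaries from $(BUILD_DIR), and removed the slow warp-optimized histogram kernel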
1 parent a9d73df commit 99ee483

File tree

4 files changed

+30
-63
lines changed

4 files changed

+30
-63
lines changed

modules/module2/examples/Makefile

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -202,15 +202,15 @@ test_cuda: cuda
202202
@if command -v nvidia-smi > /dev/null; then \
203203
echo "=== Testing Advanced Memory Management Examples ==="; \
204204
echo "1. Shared Memory Transpose..."; \
205-
./01_shared_memory_transpose_cuda || echo "✗ Shared memory transpose failed"; \
205+
$(BUILD_DIR)/01_shared_memory_transpose_cuda || echo "✗ Shared memory transpose failed"; \
206206
echo "2. Memory Coalescing Analysis..."; \
207-
./02_memory_coalescing_cuda || echo "✗ Memory coalescing failed"; \
207+
$(BUILD_DIR)/02_memory_coalescing_cuda || echo "✗ Memory coalescing failed"; \
208208
echo "3. Texture Memory Examples..."; \
209-
./03_texture_memory_cuda || echo "✗ Texture memory failed"; \
209+
$(BUILD_DIR)/03_texture_memory_cuda || echo "✗ Texture memory failed"; \
210210
echo "4. Unified Memory Examples..."; \
211-
./04_unified_memory_cuda || echo "✗ Unified memory failed"; \
211+
$(BUILD_DIR)/04_unified_memory_cuda || echo "✗ Unified memory failed"; \
212212
echo "5. Bandwidth Optimization..."; \
213-
./05_memory_bandwidth_optimization_cuda || echo "✗ Bandwidth optimization failed"; \
213+
$(BUILD_DIR)/05_memory_bandwidth_optimization_cuda || echo "✗ Bandwidth optimization failed"; \
214214
echo "✓ Module 2 CUDA tests completed"; \
215215
else \
216216
echo "No NVIDIA GPU detected, skipping CUDA tests"; \
@@ -221,9 +221,9 @@ test_hip: hip
221221
@if command -v rocm-smi > /dev/null || command -v nvidia-smi > /dev/null; then \
222222
echo "=== Testing HIP Memory Examples ==="; \
223223
echo "1. Shared Memory Transpose..."; \
224-
./01_shared_memory_transpose_hip || echo "✗ HIP shared memory transpose failed"; \
224+
$(BUILD_DIR)/01_shared_memory_transpose_hip || echo "✗ HIP shared memory transpose failed"; \
225225
echo "2. Memory Coalescing Analysis..."; \
226-
./02_memory_coalescing_hip || echo "✗ HIP memory coalescing failed"; \
226+
$(BUILD_DIR)/02_memory_coalescing_hip || echo "✗ HIP memory coalescing failed"; \
227227
echo "✓ Module 2 HIP tests completed"; \
228228
else \
229229
echo "No compatible GPU detected, skipping HIP tests"; \

modules/module3/examples/Makefile

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -203,19 +203,19 @@ test_cuda: cuda
203203
@if command -v nvidia-smi > /dev/null; then \
204204
echo "=== Testing Advanced Algorithm Examples ==="; \
205205
echo "1. Reduction Algorithms..."; \
206-
./01_reduction_algorithms_cuda || echo "✗ Reduction algorithms failed"; \
206+
$(BUILD_DIR)/01_reduction_algorithms_cuda || echo "✗ Reduction algorithms failed"; \
207207
echo "2. Scan (Prefix Sum)..."; \
208-
./02_scan_prefix_sum_cuda || echo "✗ Scan algorithms failed"; \
208+
$(BUILD_DIR)/02_scan_prefix_sum_cuda || echo "✗ Scan algorithms failed"; \
209209
echo "3. Sorting Algorithms..."; \
210-
./03_sorting_algorithms_cuda || echo "✗ Sorting algorithms failed"; \
210+
$(BUILD_DIR)/03_sorting_algorithms_cuda || echo "✗ Sorting algorithms failed"; \
211211
echo "4. Convolution/Stencil..."; \
212-
./04_convolution_stencil_cuda || echo "✗ Convolution failed"; \
212+
$(BUILD_DIR)/04_convolution_stencil_cuda || echo "✗ Convolution failed"; \
213213
echo "5. Matrix Operations..."; \
214-
./05_matrix_operations_cuda || echo "✗ Matrix operations failed"; \
214+
$(BUILD_DIR)/05_matrix_operations_cuda || echo "✗ Matrix operations failed"; \
215215
echo "6. Graph Algorithms..."; \
216-
./06_graph_algorithms_cuda || echo "✗ Graph algorithms failed"; \
216+
$(BUILD_DIR)/06_graph_algorithms_cuda || echo "✗ Graph algorithms failed"; \
217217
echo "7. Cooperative Groups..."; \
218-
./07_cooperative_groups_cuda || echo "✗ Cooperative groups failed"; \
218+
$(BUILD_DIR)/07_cooperative_groups_cuda || echo "✗ Cooperative groups failed"; \
219219
echo "✓ Module 3 CUDA tests completed"; \
220220
else \
221221
echo "No NVIDIA GPU detected, skipping CUDA tests"; \
@@ -226,13 +226,13 @@ test_hip: hip
226226
@if command -v rocm-smi > /dev/null || command -v nvidia-smi > /dev/null; then \
227227
echo "=== Testing HIP Algorithm Examples ==="; \
228228
echo "1. Reduction Algorithms..."; \
229-
./01_reduction_algorithms_hip || echo "✗ HIP reduction algorithms failed"; \
229+
$(BUILD_DIR)/01_reduction_algorithms_hip || echo "✗ HIP reduction algorithms failed"; \
230230
echo "2. Scan (Prefix Sum)..."; \
231-
./02_scan_prefix_sum_hip || echo "✗ HIP scan algorithms failed"; \
231+
$(BUILD_DIR)/02_scan_prefix_sum_hip || echo "✗ HIP scan algorithms failed"; \
232232
echo "3. Sorting Algorithms..."; \
233-
./03_sorting_algorithms_hip || echo "✗ HIP sorting algorithms failed"; \
233+
$(BUILD_DIR)/03_sorting_algorithms_hip || echo "✗ HIP sorting algorithms failed"; \
234234
echo "4. Convolution/Stencil..."; \
235-
./04_convolution_stencil_hip || echo "✗ HIP convolution failed"; \
235+
$(BUILD_DIR)/04_convolution_stencil_hip || echo "✗ HIP convolution failed"; \
236236
echo "✓ Module 3 HIP tests completed"; \
237237
else \
238238
echo "No compatible GPU detected, skipping HIP tests"; \

modules/module4/examples/Makefile

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -254,15 +254,15 @@ test_cuda: cuda
254254
@if command -v nvidia-smi > /dev/null; then \
255255
echo "=== Testing Advanced GPU Programming Examples ==="; \
256256
echo "1. CUDA Streams..."; \
257-
./01_cuda_streams_basics || echo "✗ CUDA Streams failed"; \
257+
$(BUILD_DIR)/01_cuda_streams_basics || echo "✗ CUDA Streams failed"; \
258258
echo "2. Multi-GPU Programming..."; \
259-
./02_multi_gpu_programming || echo "✗ Multi-GPU Programming failed"; \
259+
$(BUILD_DIR)/02_multi_gpu_programming || echo "✗ Multi-GPU Programming failed"; \
260260
echo "3. Unified Memory..."; \
261-
./03_unified_memory || echo "✗ Unified Memory failed"; \
261+
$(BUILD_DIR)/03_unified_memory || echo "✗ Unified Memory failed"; \
262262
echo "4. Peer-to-Peer Communication..."; \
263-
./04_peer_to_peer_communication || echo "✗ P2P Communication failed"; \
263+
$(BUILD_DIR)/04_peer_to_peer_communication || echo "✗ P2P Communication failed"; \
264264
echo "5. Dynamic Parallelism (requires compute capability 3.5+)..."; \
265-
./05_dynamic_parallelism || echo "✗ Dynamic Parallelism failed (may require newer GPU)"; \
265+
$(BUILD_DIR)/05_dynamic_parallelism || echo "✗ Dynamic Parallelism failed (may require newer GPU)"; \
266266
echo "✓ Module 4 tests completed"; \
267267
else \
268268
echo "No NVIDIA GPU detected, skipping GPU tests"; \
@@ -274,13 +274,13 @@ test_hip: hip
274274
@if command -v rocm-smi > /dev/null 2>&1 || command -v nvidia-smi > /dev/null 2>&1; then \
275275
echo "=== Testing HIP Advanced GPU Programming Examples ==="; \
276276
echo "1. HIP Streams..."; \
277-
./01_hip_streams_basics || echo "✗ HIP Streams failed"; \
277+
$(BUILD_DIR)/01_hip_streams_basics || echo "✗ HIP Streams failed"; \
278278
echo "2. HIP Multi-GPU Programming..."; \
279-
./02_hip_multi_gpu_programming || echo "✗ HIP Multi-GPU Programming failed"; \
279+
$(BUILD_DIR)/02_hip_multi_gpu_programming || echo "✗ HIP Multi-GPU Programming failed"; \
280280
echo "3. HIP Unified Memory..."; \
281-
./03_hip_unified_memory || echo "✗ HIP Unified Memory failed"; \
281+
$(BUILD_DIR)/03_hip_unified_memory || echo "✗ HIP Unified Memory failed"; \
282282
echo "4. HIP Peer-to-Peer Communication..."; \
283-
./04_hip_peer_to_peer_communication || echo "✗ HIP P2P Communication failed"; \
283+
$(BUILD_DIR)/04_hip_peer_to_peer_communication || echo "✗ HIP P2P Communication failed"; \
284284
echo "✓ HIP Module 4 tests completed"; \
285285
else \
286286
echo "No GPU detected (ROCm or CUDA), skipping HIP tests"; \

modules/module6/examples/03_histogram_cuda.cu

Lines changed: 3 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -183,43 +183,10 @@ __global__ void histogram_warp_aggregated(unsigned char *input, int *histogram,
183183

184184
/**
185185
* Optimized warp-aggregated histogram using ballot and popc
186+
* Note: This kernel (histogram_warp_optimized) has been removed due to performance issues:
187+
* every thread looped over all NUM_BINS bins and issued a warp ballot for each one.
188+
* Warp aggregation works better when only selected bins are processed than with an exhaustive search over every bin.
186189
*/
187-
__global__ void histogram_warp_optimized(unsigned char *input, int *histogram, int n) {
188-
extern __shared__ int private_hist[];
189-
190-
int tid = threadIdx.x;
191-
int idx = blockIdx.x * blockDim.x + threadIdx.x;
192-
int lane_id = threadIdx.x % 32;
193-
194-
// Initialize private histogram
195-
for (int bin = tid; bin < NUM_BINS; bin += blockDim.x) {
196-
private_hist[bin] = 0;
197-
}
198-
__syncthreads();
199-
200-
// Process input with optimized warp aggregation
201-
if (idx < n) {
202-
int bin = input[idx];
203-
204-
// Use ballot to find threads with same bin value
205-
for (int target_bin = 0; target_bin < NUM_BINS; target_bin++) {
206-
unsigned int ballot = __ballot_sync(0xffffffff, bin == target_bin);
207-
int count = __popc(ballot);
208-
209-
if (count > 0 && lane_id == __ffs(ballot) - 1) {
210-
atomicAdd(&private_hist[target_bin], count);
211-
}
212-
}
213-
}
214-
__syncthreads();
215-
216-
// Merge private histogram to global histogram
217-
for (int bin = tid; bin < NUM_BINS; bin += blockDim.x) {
218-
if (private_hist[bin] > 0) {
219-
atomicAdd(&histogram[bin], private_hist[bin]);
220-
}
221-
}
222-
}
223190

224191
/**
225192
* Multi-pass histogram for very large datasets

0 commit comments

Comments
 (0)