
Commit 023cc63

Madhav Madhusoodanan authored and committed

feat: Created the rust-cuda version of the matrix multiplication sample
from the `Nvidia/cuda-samples` repo.
1 parent b5fbb79 commit 023cc63

File tree

8 files changed, +313 -8 lines changed

Cargo.lock

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,8 @@ members = [

     "samples/introduction/async_api",
     "samples/introduction/async_api/kernels",
+    "samples/introduction/matmul",
+    "samples/introduction/matmul/kernels",

     "tests/compiletests",
     "tests/compiletests/deps-helper",

samples/introduction/README.md

Lines changed: 0 additions & 8 deletions
This file was deleted.

samples/introduction/async_api/src/main.rs

Lines changed: 7 additions & 0 deletions
@@ -1,3 +1,10 @@
+/* This example demonstrates two key capabilities of CUDA events: measuring GPU execution time and enabling concurrent CPU-GPU operations.
+ *
+ * 1. Events are recorded at specific points within a CUDA stream to mark the beginning and end of GPU operations.
+ * 2. Because CUDA stream operations execute asynchronously, the CPU remains free to perform other work while the GPU processes tasks (including memory transfers between host and device).
+ * 3. The CPU can query these events to check whether the GPU has finished its work, allowing coordination between the two processors without blocking the CPU.
+ */
+
 use cust::device::Device;
 use cust::event::{Event, EventFlags};
 use cust::function::{BlockSize, GridSize};
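
The concurrency described in point 3 (querying an event instead of blocking on it) is not exercised by this sample's timing code. A minimal sketch of that pattern, assuming cust's Event::query / EventStatus API (do_cpu_work is a hypothetical placeholder, not part of this commit):

use cust::event::{Event, EventFlags, EventStatus};
use cust::stream::{Stream, StreamFlags};

fn do_cpu_work() {
    // hypothetical placeholder for useful host-side work
}

fn wait_without_blocking() -> Result<(), cust::error::CudaError> {
    let _ctx = cust::quick_init()?;
    let stream = Stream::new(StreamFlags::NON_BLOCKING, None)?;
    let done = Event::new(EventFlags::DEFAULT)?;

    // ... enqueue kernel launches / async copies on `stream` here ...

    // Mark the point in the stream we want to observe.
    done.record(&stream)?;

    // Poll the event; while the GPU is still working, keep the CPU busy.
    while !matches!(done.query()?, EventStatus::Ready) {
        do_cpu_work();
    }
    Ok(())
}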
samples/introduction/matmul/Cargo.toml

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+[package]
+name = "matmul"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cust = { path = "../../../crates/cust" }
+cuda_std = { path = "../../../crates/cuda_std" }
+
+[build-dependencies]
+cuda_builder = { workspace = true, default-features = false }
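
One caveat worth flagging: src/main.rs below includes kernels.ptx from OUT_DIR, yet this commit adds no build.rs for the matmul crate. Given the cuda_builder build-dependency declared above, a build script along these lines is presumably expected (a sketch following cuda_builder's usual pattern; not part of this commit):

use std::env;
use std::path::PathBuf;

use cuda_builder::CudaBuilder;

fn main() {
    println!("cargo:rerun-if-changed=kernels");

    // Compile the `kernels` crate to PTX and place it in OUT_DIR,
    // where main.rs picks it up via include_str!.
    let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap());
    CudaBuilder::new("kernels")
        .copy_to(out_dir.join("kernels.ptx"))
        .build()
        .unwrap();
}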
samples/introduction/matmul/kernels/Cargo.toml

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
+[package]
+name = "kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../../crates/cuda_std" }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
samples/introduction/matmul/kernels/src/lib.rs

Lines changed: 80 additions & 0 deletions

@@ -0,0 +1,80 @@
+use core::mem::MaybeUninit;
+use cuda_std::*;
+
+#[kernel]
+#[allow(improper_ctypes_definitions)]
+#[allow(clippy::needless_range_loop)]
+/// # Safety
+///
+/// This function is unsafe because it dereferences raw pointers.
+pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize) {
+    let bx: usize = cuda_std::thread::block_idx().x as usize;
+    let by: usize = cuda_std::thread::block_idx().y as usize;
+
+    let tx: usize = cuda_std::thread::thread_idx().x as usize;
+    let ty: usize = cuda_std::thread::thread_idx().y as usize;
+
+    const BLOCK_SIZE: usize = 32;
+    let a_begin = wa * BLOCK_SIZE * by;
+    let a_end = a_begin + wa - 1;
+    let a_step = BLOCK_SIZE;
+
+    let b_begin = BLOCK_SIZE * bx;
+    let b_step = BLOCK_SIZE * wb;
+
+    let mut c_sub: f32 = 0.0;
+    let mut kahan_correction_factor = 0.0f32;
+    let mut bi = b_begin;
+
+    for ai in (a_begin..=a_end).step_by(a_step) {
+        // The equivalent CUDA C++ code for the below is:
+        // ```
+        // __shared__ float A_MATRIX[BLOCK_SIZE][BLOCK_SIZE];
+        // ```
+        // This memory space is shared between threads of the same block.
+        #[address_space(shared)]
+        static mut A_MATRIX: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
+            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
+        #[address_space(shared)]
+        static mut B_MATRIX: [[MaybeUninit<f32>; BLOCK_SIZE]; BLOCK_SIZE] =
+            [[const { MaybeUninit::uninit() }; BLOCK_SIZE]; BLOCK_SIZE];
+
+        // Load the sub-matrices of A and B into shared memory.
+        // Each thread loads one element of each matrix; since `a` and `b`
+        // are slices here, plain indexing replaces C's pointer arithmetic.
+        unsafe {
+            A_MATRIX[ty][tx].write(a[ai + wa * ty + tx]);
+            B_MATRIX[ty][tx].write(b[bi + wb * ty + tx]);
+        }
+
+        // Synchronize to make sure the matrices are loaded.
+        cuda_std::thread::sync_threads();
+        for k in 0..BLOCK_SIZE {
+            // Typically, this would be a simple calculation:
+            // ```
+            // c_sub += A_MATRIX[ty][k] * B_MATRIX[k][tx];
+            // ```
+            // However, to improve numerical stability, we use Kahan summation here
+            // so that rounding error is isolated instead of accumulating in c_sub.
+            let input = unsafe { A_MATRIX[ty][k].assume_init() * B_MATRIX[k][tx].assume_init() };
+            let y = input - kahan_correction_factor;
+            let sum = c_sub + y;
+
+            // Algebraically this correction factor would be zero; with f32 rounding,
+            // it captures the low-order bits that would otherwise be lost from c_sub.
+            kahan_correction_factor = (sum - c_sub) - y;
+            c_sub = sum;
+        }
+
+        // Synchronize to make sure the preceding computation is done before loading two new sub-matrices of A and B in the next iteration.
+        cuda_std::thread::sync_threads();
+
+        bi += b_step;
+    }
+
+    let ci = wb * BLOCK_SIZE * by + BLOCK_SIZE * bx;
+    unsafe {
+        *c.add(ci + wb * ty + tx) = c_sub;
+    }
+}
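
The effect of the compensated loop above is easy to reproduce on the host: 0.01 has no exact binary representation, so naively accumulating dims_a.x = 1280 products of 1.0 * 0.01 in f32 drifts away from the true 12.8, while Kahan summation keeps the rounding error isolated. A standalone sketch in plain Rust, mirroring the kernel's inner loop:

fn main() {
    const WA: usize = 1280; // dims_a.x in the host code below

    // Naive accumulation: every addition rounds, and the error compounds.
    let mut naive = 0.0f32;
    for _ in 0..WA {
        naive += 1.0f32 * 0.01f32;
    }

    // Kahan (compensated) summation, as in the kernel's inner loop.
    let mut sum = 0.0f32;
    let mut correction = 0.0f32;
    for _ in 0..WA {
        let input = 1.0f32 * 0.01f32;
        let y = input - correction;
        let t = sum + y;
        // Algebraically zero; in f32 it recovers the bits lost by `sum + y`.
        correction = (t - sum) - y;
        sum = t;
    }

    println!("expected = 12.8, naive = {naive}, kahan = {sum}");
}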
samples/introduction/matmul/src/main.rs

Lines changed: 187 additions & 0 deletions

@@ -0,0 +1,187 @@
+/* This example demonstrates an implementation of matrix multiplication.
+ *
+ * 1. The matrices are first created on the host side and then copied to the device.
+ * 2. A shared piece of block-specific memory is created (on the device side) so that the summation can be done very quickly.
+ * 3. The result is copied back to the host, where any accumulated error becomes visible.
+ * 4. Extra: the error that accumulates during the summation is reduced (in the kernel itself) using the [Kahan summation algorithm](https://en.wikipedia.org/wiki/Kahan_summation_algorithm).
+ */
+
+use cuda_std::glam::USizeVec2;
+use cust::device::Device;
+use cust::event::{Event, EventFlags};
+use cust::function::{BlockSize, GridSize};
+use cust::launch;
+use cust::memory::{AsyncCopyDestination, DeviceBuffer, LockedBuffer};
+use cust::module::Module;
+use cust::stream::{Stream, StreamFlags};
+
+static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));
+
+fn matrix_multiply(
+    block_size: usize,
+    dims_a: USizeVec2,
+    dims_b: USizeVec2,
+) -> Result<(), cust::error::CudaError> {
+    let dims_c = USizeVec2::new(dims_b.x, dims_a.y);
+    let size_a = dims_a.x * dims_a.y;
+    let h_a = LockedBuffer::new(&1.0f32, size_a).expect("host array couldn't be initialized!");
+
+    let size_b = dims_b.x * dims_b.y;
+    let h_b = LockedBuffer::new(&0.01f32, size_b).expect("host array couldn't be initialized!");
+
+    let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");
+
+    let size_c = dims_b.x * dims_a.y;
+    let mut h_c = LockedBuffer::new(&0.0f32, size_c).expect("host array couldn't be initialized!");
+
+    let start_event = Event::new(EventFlags::DEFAULT)?;
+    let stop_event = Event::new(EventFlags::DEFAULT)?;
+
+    let d_a =
+        DeviceBuffer::from_slice(h_a.as_slice()).expect("device array couldn't be initialized!");
+    let d_b =
+        DeviceBuffer::from_slice(h_b.as_slice()).expect("device array couldn't be initialized!");
+    let d_c =
+        DeviceBuffer::from_slice(h_c.as_slice()).expect("device array couldn't be initialized!");
+
+    stream.synchronize().expect("Stream couldn't synchronize!");
+    let threads = BlockSize::xy(block_size as u32, block_size as u32);
+    let grid = GridSize::xy(
+        (dims_b.x / (threads.x as usize)).try_into().unwrap(),
+        (dims_a.y / (threads.y as usize)).try_into().unwrap(),
+    );
+
+    println!("Computing result using CUDA Kernel...");
+
+    let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
+    let matrix_mul_cuda = module
+        .get_function("matrix_mul_cuda")
+        .expect("Kernel function not found!");
+
+    unsafe {
+        // The function definition of the kernel is:
+        // ```
+        // pub unsafe fn matrix_mul_cuda(c: *mut f32, a: &[f32], b: &[f32], wa: usize, wb: usize)
+        // ```
+        // For parameters of type `*mut T` or `*const T`, only the device pointer is passed.
+        // For parameters of type `&[T]`, the device pointer as well as the length of the slice must be passed.
+        launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
+            d_c.as_device_ptr(),
+            d_a.as_device_ptr(),
+            d_a.len(),
+            d_b.as_device_ptr(),
+            d_b.len(),
+            dims_a.x,
+            dims_b.x
+        ))?;
+    }
+
+    println!("Done!");
+    stream.synchronize().expect("Stream couldn't synchronize!");
+
+    start_event
+        .record(&stream)
+        .expect("Failed to record start_event in the CUDA stream!");
+
+    const N_ITER: u32 = 300;
+
+    for _ in 0..N_ITER {
+        unsafe {
+            launch!(matrix_mul_cuda<<<grid, threads, 0, stream>>>(
+                d_c.as_device_ptr(),
+                d_a.as_device_ptr(),
+                d_a.len(),
+                d_b.as_device_ptr(),
+                d_b.len(),
+                dims_a.x,
+                dims_b.x,
+            ))?;
+        }
+    }
+
+    stop_event
+        .record(&stream)
+        .expect("Failed to record stop_event in the CUDA stream!");
+
+    stop_event
+        .synchronize()
+        .expect("stop_event couldn't synchronize!");
+
+    let gpu_time: u128 = stop_event
+        .elapsed(&start_event)
+        .expect("Failed to calculate duration of GPU operations!")
+        .as_micros();
+
+    let avg_time = gpu_time as f32 / N_ITER as f32;
+    println!(
+        "Average time spent executing by the GPU: {} microseconds",
+        avg_time
+    );
+    let flops_per_matrix_mul = 2.0 * (dims_a.x as f32) * (dims_a.y as f32) * (dims_b.x as f32);
+    let giga_flops = (flops_per_matrix_mul / avg_time) / 1000.0; // flop/us divided by 1000 is GFlop/s
+    println!("Performance = {} GFlop/s", giga_flops);
+
+    unsafe {
+        d_c.async_copy_to(&mut h_c, &stream)
+            .expect("Could not copy from device to host!");
+    }
+    stream.synchronize().expect("Stream couldn't synchronize!");
+
+    // Check the computed result: test the relative error by the formula
+    //     |<x, y>_cpu - <x, y>_gpu| / |<x, y>_cpu|
+    let machine_epsilon = f32::EPSILON; // 1.1920929e-7
+    let mut correct = true;
+
+    for i in 0..(dims_c.x * dims_c.y) {
+        let abs_err = (h_c[i] - (dims_a.x as f32 * 0.01f32)).abs();
+        let dot_length = (dims_a.x as f32).abs();
+        let abs_val = h_c[i].abs();
+        let rel_err = abs_err / abs_val.max(dot_length * machine_epsilon);
+
+        if rel_err > 1e-6 {
+            println!(
+                "Error at index {}: CPU = {}, GPU = {}, rel_err = {}",
+                i,
+                dims_a.x as f32 * 0.01f32,
+                h_c[i],
+                rel_err
+            );
+            correct = false;
+        }
+    }
+
+    if correct {
+        println!("Result = PASS");
+        println!(
+            "NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled."
+        );
+    } else {
+        println!("Result = FAIL");
+        return Err(cust::error::CudaError::UnknownError);
+    }
+
+    Ok(())
+}
+
+fn main() -> Result<(), cust::error::CudaError> {
+    // Set up the CUDA context and pick the first device.
+    let _ctx = cust::quick_init()?;
+    let device = Device::get_device(0).expect("Couldn't find a CUDA-capable device!");
+    println!("Device Name: {}", device.name().unwrap());
+
+    let block_size: u32 = 32;
+
+    // Larger dimensions (compared to the original sample) were chosen to demonstrate
+    // the pitfalls of floating-point arithmetic and the need for the Kahan summation
+    // algorithm (within the kernel) to reduce the error that accumulates during the
+    // summation in matrix multiplication.
+    let dims_a = USizeVec2::new(40 * block_size as usize, 40 * block_size as usize);
+    let dims_b = USizeVec2::new(80 * block_size as usize, 40 * block_size as usize);
+
+    if dims_a.x != dims_b.y {
+        panic!("Matrix multiplication not possible with the given dimensions!");
+    }
+
+    matrix_multiply(block_size as usize, dims_a, dims_b)?;
+    Ok(())
+}
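
For the dimensions chosen in main, the numbers work out as follows; this quick host-side check (an illustration, not part of the commit) shows where the grid size, the expected element value, and the FLOP count behind the GFlop/s report come from:

fn main() {
    let block_size = 32usize;
    let (wa, ha) = (40 * block_size, 40 * block_size); // A is 1280 x 1280
    let (wb, _hb) = (80 * block_size, 40 * block_size); // B is 2560 x 1280

    // One 32 x 32 thread block computes one 32 x 32 tile of C.
    let grid = (wb / block_size, ha / block_size);
    assert_eq!(grid, (80, 40));

    // Every element of C sums wa products of 1.0 * 0.01.
    let expected = wa as f64 * 0.01; // 12.8 in real arithmetic

    // Two flops (one multiply, one add) per term, per output element.
    let flops = 2.0 * (wa as f64) * (ha as f64) * (wb as f64);
    println!("grid = {grid:?}, expected = {expected}, flops per matmul = {flops:.3e}");
}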
