diff --git a/crates/blastoff/Cargo.toml b/crates/blastoff/Cargo.toml index 23340f1b..869bbfca 100644 --- a/crates/blastoff/Cargo.toml +++ b/crates/blastoff/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "blastoff" version = "0.1.0" -edition = "2021" +edition = "2024" authors = ["Riccardo D'Ambrosio "] repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/blastoff/src/context.rs b/crates/blastoff/src/context.rs index 83c53aa5..277e6be9 100644 --- a/crates/blastoff/src/context.rs +++ b/crates/blastoff/src/context.rs @@ -320,9 +320,11 @@ impl CublasContext { /// /// The callback must not panic and unwind. pub unsafe fn set_logger_callback(callback: Option) { - cublas_sys::cublasSetLoggerCallback(callback) - .to_result() - .unwrap(); + unsafe { + cublas_sys::cublasSetLoggerCallback(callback) + .to_result() + .unwrap(); + } } /// Gets the logger callback that was previously set. diff --git a/crates/blastoff/src/error.rs b/crates/blastoff/src/error.rs index b4b16420..35c0691f 100644 --- a/crates/blastoff/src/error.rs +++ b/crates/blastoff/src/error.rs @@ -38,8 +38,8 @@ pub trait ToResult { impl ToResult for cublas_sys::cublasStatus_t { fn to_result(self) -> Result<(), CublasError> { - use cust_raw::cublas_sys::cublasStatus_t::*; use CublasError::*; + use cust_raw::cublas_sys::cublasStatus_t::*; Err(match self { CUBLAS_STATUS_SUCCESS => return Ok(()), @@ -58,8 +58,8 @@ impl ToResult for cublas_sys::cublasStatus_t { impl CublasError { pub fn into_raw(self) -> cublas_sys::cublasStatus_t { - use cust_raw::cublas_sys::cublasStatus_t::*; use CublasError::*; + use cust_raw::cublas_sys::cublasStatus_t::*; match self { NotInitialized => CUBLAS_STATUS_NOT_INITIALIZED, diff --git a/crates/blastoff/src/level1.rs b/crates/blastoff/src/level1.rs index 40a1bc87..01dfd643 100644 --- a/crates/blastoff/src/level1.rs +++ b/crates/blastoff/src/level1.rs @@ -1,10 +1,10 @@ //! Scalar and Vector-based operations use crate::{ + BlasDatatype, Float, context::CublasContext, error::{Error, ToResult}, raw::{ComplexLevel1, FloatLevel1, Level1}, - BlasDatatype, Float, }; use cust::memory::{GpuBox, GpuBuffer}; use cust::stream::Stream; diff --git a/crates/blastoff/src/level3.rs b/crates/blastoff/src/level3.rs index ce08e6c0..a8ff81de 100644 --- a/crates/blastoff/src/level3.rs +++ b/crates/blastoff/src/level3.rs @@ -1,8 +1,8 @@ use crate::{ + GemmDatatype, MatrixOp, context::CublasContext, error::{Error, ToResult}, raw::GemmOps, - GemmDatatype, MatrixOp, }; use cust::memory::{GpuBox, GpuBuffer}; use cust::stream::Stream; diff --git a/crates/blastoff/src/raw/level1.rs b/crates/blastoff/src/raw/level1.rs index 7268d642..73103e74 100644 --- a/crates/blastoff/src/raw/level1.rs +++ b/crates/blastoff/src/raw/level1.rs @@ -103,7 +103,7 @@ impl Level1 for f32 { incx: c_int, result: *mut c_int, ) -> cublasStatus_t { - cublasIsamax(handle, n, x, incx, result) + unsafe { cublasIsamax(handle, n, x, incx, result) } } unsafe fn amin( handle: cublasHandle_t, @@ -112,7 +112,7 @@ impl Level1 for f32 { incx: c_int, result: *mut c_int, ) -> cublasStatus_t { - cublasIsamin(handle, n, x, incx, result) + unsafe { cublasIsamin(handle, n, x, incx, result) } } unsafe fn axpy( handle: cublasHandle_t, @@ -123,7 +123,7 @@ impl Level1 for f32 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasSaxpy(handle, n, alpha, x, incx, y, incy) + unsafe { cublasSaxpy(handle, n, alpha, x, incx, y, incy) } } unsafe fn copy( handle: cublasHandle_t, @@ -133,7 +133,7 @@ impl Level1 for f32 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasScopy(handle, n, x, incx, y, incy) + unsafe { cublasScopy(handle, n, x, incx, y, incy) } } unsafe fn nrm2( handle: cublasHandle_t, @@ -142,7 +142,7 @@ impl Level1 for f32 { incx: c_int, result: *mut Self::FloatTy, ) -> cublasStatus_t { - cublasSnrm2(handle, n, x, incx, result) + unsafe { cublasSnrm2(handle, n, x, incx, result) } } unsafe fn rot( handle: cublasHandle_t, @@ -154,7 +154,7 @@ impl Level1 for f32 { c: *const Self::FloatTy, s: *const Self, ) -> cublasStatus_t { - cublasSrot(handle, n, x, incx, y, incy, c, s) + unsafe { cublasSrot(handle, n, x, incx, y, incy, c, s) } } unsafe fn rotg( handle: cublasHandle_t, @@ -163,7 +163,7 @@ impl Level1 for f32 { c: *mut Self::FloatTy, s: *mut Self, ) -> cublasStatus_t { - cublasSrotg(handle, a, b, c, s) + unsafe { cublasSrotg(handle, a, b, c, s) } } unsafe fn rotm( handle: cublasHandle_t, @@ -174,7 +174,7 @@ impl Level1 for f32 { incy: c_int, param: *const Self::FloatTy, ) -> cublasStatus_t { - cublasSrotm(handle, n, x, incx, y, incy, param) + unsafe { cublasSrotm(handle, n, x, incx, y, incy, param) } } unsafe fn rotmg( handle: cublasHandle_t, @@ -184,7 +184,7 @@ impl Level1 for f32 { y1: *const Self, param: *mut Self, ) -> cublasStatus_t { - cublasSrotmg(handle, d1, d2, x1, y1, param) + unsafe { cublasSrotmg(handle, d1, d2, x1, y1, param) } } unsafe fn scal( handle: cublasHandle_t, @@ -193,7 +193,7 @@ impl Level1 for f32 { x: *mut Self, incx: c_int, ) -> cublasStatus_t { - cublasSscal(handle, n, alpha, x, incx) + unsafe { cublasSscal(handle, n, alpha, x, incx) } } unsafe fn swap( handle: cublasHandle_t, @@ -203,7 +203,7 @@ impl Level1 for f32 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasSswap(handle, n, x, incx, y, incy) + unsafe { cublasSswap(handle, n, x, incx, y, incy) } } } @@ -215,7 +215,7 @@ impl Level1 for f64 { incx: c_int, result: *mut c_int, ) -> cublasStatus_t { - cublasIdamax(handle, n, x, incx, result) + unsafe { cublasIdamax(handle, n, x, incx, result) } } unsafe fn amin( handle: cublasHandle_t, @@ -224,7 +224,7 @@ impl Level1 for f64 { incx: c_int, result: *mut c_int, ) -> cublasStatus_t { - cublasIdamin(handle, n, x, incx, result) + unsafe { cublasIdamin(handle, n, x, incx, result) } } unsafe fn axpy( handle: cublasHandle_t, @@ -235,7 +235,7 @@ impl Level1 for f64 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasDaxpy(handle, n, alpha, x, incx, y, incy) + unsafe { cublasDaxpy(handle, n, alpha, x, incx, y, incy) } } unsafe fn copy( handle: cublasHandle_t, @@ -245,7 +245,7 @@ impl Level1 for f64 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasDcopy(handle, n, x, incx, y, incy) + unsafe { cublasDcopy(handle, n, x, incx, y, incy) } } unsafe fn nrm2( handle: cublasHandle_t, @@ -254,7 +254,7 @@ impl Level1 for f64 { incx: c_int, result: *mut Self::FloatTy, ) -> cublasStatus_t { - cublasDnrm2(handle, n, x, incx, result) + unsafe { cublasDnrm2(handle, n, x, incx, result) } } unsafe fn rot( handle: cublasHandle_t, @@ -266,7 +266,7 @@ impl Level1 for f64 { c: *const Self::FloatTy, s: *const Self, ) -> cublasStatus_t { - cublasDrot(handle, n, x, incx, y, incy, c, s) + unsafe { cublasDrot(handle, n, x, incx, y, incy, c, s) } } unsafe fn rotg( handle: cublasHandle_t, @@ -275,7 +275,7 @@ impl Level1 for f64 { c: *mut Self::FloatTy, s: *mut Self, ) -> cublasStatus_t { - cublasDrotg(handle, a, b, c, s) + unsafe { cublasDrotg(handle, a, b, c, s) } } unsafe fn rotm( handle: cublasHandle_t, @@ -286,7 +286,7 @@ impl Level1 for f64 { incy: c_int, param: *const Self::FloatTy, ) -> cublasStatus_t { - cublasDrotm(handle, n, x, incx, y, incy, param) + unsafe { cublasDrotm(handle, n, x, incx, y, incy, param) } } unsafe fn rotmg( handle: cublasHandle_t, @@ -296,7 +296,7 @@ impl Level1 for f64 { y1: *const Self, param: *mut Self, ) -> cublasStatus_t { - cublasDrotmg(handle, d1, d2, x1, y1, param) + unsafe { cublasDrotmg(handle, d1, d2, x1, y1, param) } } unsafe fn scal( handle: cublasHandle_t, @@ -305,7 +305,7 @@ impl Level1 for f64 { x: *mut Self, incx: c_int, ) -> cublasStatus_t { - cublasDscal(handle, n, alpha, x, incx) + unsafe { cublasDscal(handle, n, alpha, x, incx) } } unsafe fn swap( handle: cublasHandle_t, @@ -315,7 +315,7 @@ impl Level1 for f64 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasDswap(handle, n, x, incx, y, incy) + unsafe { cublasDswap(handle, n, x, incx, y, incy) } } } @@ -327,7 +327,7 @@ impl Level1 for Complex32 { incx: c_int, result: *mut c_int, ) -> cublasStatus_t { - cublasIcamax(handle, n, x.cast(), incx, result) + unsafe { cublasIcamax(handle, n, x.cast(), incx, result) } } unsafe fn amin( handle: cublasHandle_t, @@ -336,7 +336,7 @@ impl Level1 for Complex32 { incx: c_int, result: *mut c_int, ) -> cublasStatus_t { - cublasIcamin(handle, n, x.cast(), incx, result) + unsafe { cublasIcamin(handle, n, x.cast(), incx, result) } } unsafe fn axpy( handle: cublasHandle_t, @@ -347,7 +347,7 @@ impl Level1 for Complex32 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasCaxpy(handle, n, alpha.cast(), x.cast(), incx, y.cast(), incy) + unsafe { cublasCaxpy(handle, n, alpha.cast(), x.cast(), incx, y.cast(), incy) } } unsafe fn copy( handle: cublasHandle_t, @@ -357,7 +357,7 @@ impl Level1 for Complex32 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasCcopy(handle, n, x.cast(), incx, y.cast(), incy) + unsafe { cublasCcopy(handle, n, x.cast(), incx, y.cast(), incy) } } unsafe fn nrm2( handle: cublasHandle_t, @@ -366,7 +366,7 @@ impl Level1 for Complex32 { incx: c_int, result: *mut Self::FloatTy, ) -> cublasStatus_t { - cublasScnrm2(handle, n, x.cast(), incx, result) + unsafe { cublasScnrm2(handle, n, x.cast(), incx, result) } } unsafe fn rot( handle: cublasHandle_t, @@ -378,7 +378,7 @@ impl Level1 for Complex32 { c: *const Self::FloatTy, s: *const Self::FloatTy, ) -> cublasStatus_t { - cublasCsrot(handle, n, x.cast(), incx, y.cast(), incy, c, s) + unsafe { cublasCsrot(handle, n, x.cast(), incx, y.cast(), incy, c, s) } } unsafe fn rotg( handle: cublasHandle_t, @@ -387,7 +387,7 @@ impl Level1 for Complex32 { c: *mut Self::FloatTy, s: *mut Self, ) -> cublasStatus_t { - cublasCrotg(handle, a.cast(), b.cast(), c, s.cast()) + unsafe { cublasCrotg(handle, a.cast(), b.cast(), c, s.cast()) } } unsafe fn rotm( _handle: cublasHandle_t, @@ -417,7 +417,7 @@ impl Level1 for Complex32 { x: *mut Self, incx: c_int, ) -> cublasStatus_t { - cublasCscal(handle, n, alpha.cast(), x.cast(), incx) + unsafe { cublasCscal(handle, n, alpha.cast(), x.cast(), incx) } } unsafe fn swap( handle: cublasHandle_t, @@ -427,7 +427,7 @@ impl Level1 for Complex32 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasCswap(handle, n, x.cast(), incx, y.cast(), incy) + unsafe { cublasCswap(handle, n, x.cast(), incx, y.cast(), incy) } } } @@ -439,7 +439,7 @@ impl Level1 for Complex64 { incx: c_int, result: *mut c_int, ) -> cublasStatus_t { - cublasIzamax(handle, n, x.cast(), incx, result) + unsafe { cublasIzamax(handle, n, x.cast(), incx, result) } } unsafe fn amin( handle: cublasHandle_t, @@ -448,7 +448,7 @@ impl Level1 for Complex64 { incx: c_int, result: *mut c_int, ) -> cublasStatus_t { - cublasIzamin(handle, n, x.cast(), incx, result) + unsafe { cublasIzamin(handle, n, x.cast(), incx, result) } } unsafe fn axpy( handle: cublasHandle_t, @@ -459,7 +459,7 @@ impl Level1 for Complex64 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasZaxpy(handle, n, alpha.cast(), x.cast(), incx, y.cast(), incy) + unsafe { cublasZaxpy(handle, n, alpha.cast(), x.cast(), incx, y.cast(), incy) } } unsafe fn copy( handle: cublasHandle_t, @@ -469,7 +469,7 @@ impl Level1 for Complex64 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasZcopy(handle, n, x.cast(), incx, y.cast(), incy) + unsafe { cublasZcopy(handle, n, x.cast(), incx, y.cast(), incy) } } unsafe fn nrm2( handle: cublasHandle_t, @@ -478,7 +478,7 @@ impl Level1 for Complex64 { incx: c_int, result: *mut Self::FloatTy, ) -> cublasStatus_t { - cublasDznrm2(handle, n, x.cast(), incx, result) + unsafe { cublasDznrm2(handle, n, x.cast(), incx, result) } } unsafe fn rot( handle: cublasHandle_t, @@ -490,7 +490,7 @@ impl Level1 for Complex64 { c: *const Self::FloatTy, s: *const Self::FloatTy, ) -> cublasStatus_t { - cublasZdrot(handle, n, x.cast(), incx, y.cast(), incy, c, s) + unsafe { cublasZdrot(handle, n, x.cast(), incx, y.cast(), incy, c, s) } } unsafe fn rotg( handle: cublasHandle_t, @@ -499,7 +499,7 @@ impl Level1 for Complex64 { c: *mut Self::FloatTy, s: *mut Self, ) -> cublasStatus_t { - cublasZrotg(handle, a.cast(), b.cast(), c, s.cast()) + unsafe { cublasZrotg(handle, a.cast(), b.cast(), c, s.cast()) } } unsafe fn rotm( _handle: cublasHandle_t, @@ -529,7 +529,7 @@ impl Level1 for Complex64 { x: *mut Self, incx: c_int, ) -> cublasStatus_t { - cublasZscal(handle, n, alpha.cast(), x.cast(), incx) + unsafe { cublasZscal(handle, n, alpha.cast(), x.cast(), incx) } } unsafe fn swap( handle: cublasHandle_t, @@ -539,7 +539,7 @@ impl Level1 for Complex64 { y: *mut Self, incy: c_int, ) -> cublasStatus_t { - cublasZswap(handle, n, x.cast(), incx, y.cast(), incy) + unsafe { cublasZswap(handle, n, x.cast(), incx, y.cast(), incy) } } } @@ -575,7 +575,7 @@ impl ComplexLevel1 for Complex32 { incy: c_int, result: *mut Self, ) -> cublasStatus_t { - cublasCdotu(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) + unsafe { cublasCdotu(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) } } unsafe fn dotc( handle: cublasHandle_t, @@ -586,7 +586,7 @@ impl ComplexLevel1 for Complex32 { incy: c_int, result: *mut Self, ) -> cublasStatus_t { - cublasCdotc(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) + unsafe { cublasCdotc(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) } } } @@ -600,7 +600,7 @@ impl ComplexLevel1 for Complex64 { incy: c_int, result: *mut Self, ) -> cublasStatus_t { - cublasZdotu(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) + unsafe { cublasZdotu(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) } } unsafe fn dotc( handle: cublasHandle_t, @@ -611,7 +611,7 @@ impl ComplexLevel1 for Complex64 { incy: c_int, result: *mut Self, ) -> cublasStatus_t { - cublasZdotc(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) + unsafe { cublasZdotc(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) } } } @@ -638,7 +638,7 @@ impl FloatLevel1 for f32 { incy: c_int, result: *mut Self, ) -> cublasStatus_t { - cublasSdot(handle, n, x, incx, y, incy, result) + unsafe { cublasSdot(handle, n, x, incx, y, incy, result) } } } @@ -652,6 +652,6 @@ impl FloatLevel1 for f64 { incy: c_int, result: *mut Self, ) -> cublasStatus_t { - cublasDdot(handle, n, x, incx, y, incy, result) + unsafe { cublasDdot(handle, n, x, incx, y, incy, result) } } } diff --git a/crates/blastoff/src/raw/level3.rs b/crates/blastoff/src/raw/level3.rs index 3e770a29..855a8558 100644 --- a/crates/blastoff/src/raw/level3.rs +++ b/crates/blastoff/src/raw/level3.rs @@ -44,7 +44,7 @@ impl GemmOps for half::f16 { ) -> cublasStatus_t { // for some weird reason cublas only defines Hgemm if __cplusplus is defined, no idea why // but for now we just link against it manually, in the future we should figure out why - extern "C" { + unsafe extern "C" { fn cublasHgemm( handle: cublasHandle_t, transa: cublasOperation_t, @@ -62,9 +62,11 @@ impl GemmOps for half::f16 { ldc: c_int, ) -> cublasStatus_t; } - cublasHgemm( - handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - ) + unsafe { + cublasHgemm( + handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + ) + } } } @@ -85,9 +87,11 @@ impl GemmOps for f32 { c: *mut Self, ldc: c_int, ) -> cublasStatus_t { - cublasSgemm( - handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - ) + unsafe { + cublasSgemm( + handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + ) + } } } @@ -108,9 +112,11 @@ impl GemmOps for f64 { c: *mut Self, ldc: c_int, ) -> cublasStatus_t { - cublasDgemm( - handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, - ) + unsafe { + cublasDgemm( + handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, + ) + } } } @@ -131,22 +137,24 @@ impl GemmOps for Complex32 { c: *mut Self, ldc: c_int, ) -> cublasStatus_t { - cublasCgemm( - handle, - transa, - transb, - m, - n, - k, - alpha.cast(), - a.cast(), - lda, - b.cast(), - ldb, - beta.cast(), - c.cast(), - ldc, - ) + unsafe { + cublasCgemm( + handle, + transa, + transb, + m, + n, + k, + alpha.cast(), + a.cast(), + lda, + b.cast(), + ldb, + beta.cast(), + c.cast(), + ldc, + ) + } } } @@ -167,21 +175,23 @@ impl GemmOps for Complex64 { c: *mut Self, ldc: c_int, ) -> cublasStatus_t { - cublasCgemm( - handle, - transa, - transb, - m, - n, - k, - alpha.cast(), - a.cast(), - lda, - b.cast(), - ldb, - beta.cast(), - c.cast(), - ldc, - ) + unsafe { + cublasCgemm( + handle, + transa, + transb, + m, + n, + k, + alpha.cast(), + a.cast(), + lda, + b.cast(), + ldb, + beta.cast(), + c.cast(), + ldc, + ) + } } } diff --git a/crates/cuda_builder/Cargo.toml b/crates/cuda_builder/Cargo.toml index 51fb8828..fd0466ba 100644 --- a/crates/cuda_builder/Cargo.toml +++ b/crates/cuda_builder/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "cuda_builder" version = "0.3.0" -edition = "2021" +edition = "2024" authors = ["Riccardo D'Ambrosio ", "The rust-gpu Authors"] license = "MIT OR Apache-2.0" description = "Builder for easily building rustc_codegen_nvvm crates" diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs index f54775c9..6441b827 100644 --- a/crates/cuda_builder/src/lib.rs +++ b/crates/cuda_builder/src/lib.rs @@ -469,12 +469,11 @@ fn find_in_dir(dir: &Path, filename: &str) -> Option { continue; } - if let Some(name) = path.file_name().and_then(|s| s.to_str()) { - if (name == filename) - || (name.starts_with(&hashed_prefix) && name.ends_with(dll_suffix)) - { - return Some(path); - } + if let Some(name) = path.file_name().and_then(|s| s.to_str()) + && (name == filename + || (name.starts_with(&hashed_prefix) && name.ends_with(dll_suffix))) + { + return Some(path); } } } @@ -580,12 +579,11 @@ fn workspace_root_dir() -> Option { loop { let candidate = path.join("Cargo.toml"); - if candidate.is_file() { - if let Ok(contents) = fs::read_to_string(&candidate) { - if contents.contains("[workspace]") { - return Some(path.clone()); - } - } + if candidate.is_file() + && let Ok(contents) = fs::read_to_string(&candidate) + && contents.contains("[workspace]") + { + return Some(path.clone()); } if !path.pop() { diff --git a/crates/cuda_std/Cargo.toml b/crates/cuda_std/Cargo.toml index 42c06ab2..dfbfcb40 100644 --- a/crates/cuda_std/Cargo.toml +++ b/crates/cuda_std/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "cuda_std" version = "0.2.2" -edition = "2018" +edition = "2024" license = "MIT OR Apache-2.0" description = "Standard library for CUDA with rustc_codegen_nvvm" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/cuda_std/src/atomic/intrinsics.rs b/crates/cuda_std/src/atomic/intrinsics.rs index 7a4fc2a8..bd0c012e 100644 --- a/crates/cuda_std/src/atomic/intrinsics.rs +++ b/crates/cuda_std/src/atomic/intrinsics.rs @@ -1256,15 +1256,9 @@ atomic_fetch_op_4_reg! { #[allow(unused_macros)] macro_rules! negation { - (u32, $val:ident) => {{ - -($val as i32) - }}; - (u64, $val:ident) => {{ - -($val as i64) - }}; - ($type:ty, $val:ident) => {{ - -$val - }}; + (u32, $val:ident) => {{ -($val as i32) }}; + (u64, $val:ident) => {{ -($val as i64) }}; + ($type:ty, $val:ident) => {{ -$val }}; } // atomic sub is a little special, nvcc implements it as an atomic add with a negated operand. PTX diff --git a/crates/cuda_std/src/float.rs b/crates/cuda_std/src/float.rs index 805ab6d2..e29e41cd 100644 --- a/crates/cuda_std/src/float.rs +++ b/crates/cuda_std/src/float.rs @@ -225,11 +225,7 @@ impl GpuFloat for f32 { #[inline] fn rem_euclid(self, rhs: f32) -> f32 { let r = self % rhs; - if r < 0.0 { - r + rhs.abs() - } else { - r - } + if r < 0.0 { r + rhs.abs() } else { r } } /// Raises a number to an integer power. @@ -557,11 +553,7 @@ impl GpuFloat for f64 { #[inline] fn rem_euclid(self, rhs: f64) -> f64 { let r = self % rhs; - if r < 0.0 { - r + rhs.abs() - } else { - r - } + if r < 0.0 { r + rhs.abs() } else { r } } /// Raises a number to an integer power. diff --git a/crates/cuda_std/src/intrinsics.rs b/crates/cuda_std/src/intrinsics.rs index ece3ebb2..1a1bc1b6 100644 --- a/crates/cuda_std/src/intrinsics.rs +++ b/crates/cuda_std/src/intrinsics.rs @@ -14,7 +14,7 @@ // Generated file, do not edit by hand, see scripts/gen_intrinsics.py -extern "C" { +unsafe extern "C" { #[link_name = "__nv_abs"] #[doc = "Determine the absolute value of the 32-bit signed integer x. diff --git a/crates/cuda_std/src/io.rs b/crates/cuda_std/src/io.rs index 54ed81de..53569c30 100644 --- a/crates/cuda_std/src/io.rs +++ b/crates/cuda_std/src/io.rs @@ -21,7 +21,7 @@ //! This does NOT include exiting the program, however, because rust uses RAII, unless you leak the //! context, output will always be flushed. -extern "C" { +unsafe extern "C" { // CUDA syscalls implicitly defined by nvvm you can link to. #[doc(hidden)] diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs index 0cb671bf..06ad4cc5 100644 --- a/crates/cuda_std/src/lib.rs +++ b/crates/cuda_std/src/lib.rs @@ -125,7 +125,7 @@ fn panic(_info: &core::panic::PanicInfo) -> ! { // // crate::println!("{}", msg); - extern "C" { + unsafe extern "C" { fn __nvvm_trap() -> !; } diff --git a/crates/cuda_std/src/mem.rs b/crates/cuda_std/src/mem.rs index d1c9626b..249727ff 100644 --- a/crates/cuda_std/src/mem.rs +++ b/crates/cuda_std/src/mem.rs @@ -7,7 +7,7 @@ use alloc::alloc::*; use core::ffi::c_void; #[cfg(target_arch = "nvptx64")] -extern "C" { +unsafe extern "C" { // implicitly defined by cuda. pub fn malloc(size: usize) -> *mut c_void; @@ -19,10 +19,12 @@ pub struct CUDAAllocator; #[cfg(target_arch = "nvptx64")] unsafe impl GlobalAlloc for CUDAAllocator { unsafe fn alloc(&self, layout: Layout) -> *mut u8 { - malloc(layout.size()) as *mut u8 + unsafe { malloc(layout.size()) as *mut u8 } } unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { - free(ptr as *mut _); + unsafe { + free(ptr as *mut _); + } } } diff --git a/crates/cuda_std/src/ptr.rs b/crates/cuda_std/src/ptr.rs index 3d1cac60..5bb0eb44 100644 --- a/crates/cuda_std/src/ptr.rs +++ b/crates/cuda_std/src/ptr.rs @@ -28,19 +28,22 @@ pub enum AddressSpace { #[gpu_only] pub unsafe fn is_in_address_space(ptr: *const T, address_space: AddressSpace) -> bool { let ret: u32; - // create a predicate register to store the result of the isspacep into. - asm!(".reg .pred p;"); + unsafe { + // Create a predicate register to store the result of the isspacep into. + asm!(".reg .pred p;"); - // perform the actual isspacep operation, and store the result in the predicate register we made. - match address_space { - AddressSpace::Global => asm!("isspacep.global p, {}", in(reg64) ptr), - AddressSpace::Shared => asm!("isspacep.shared p, {}", in(reg64) ptr), - AddressSpace::Constant => asm!("isspacep.const p, {}", in(reg64) ptr), - AddressSpace::Local => asm!("isspacep.local p, {}", in(reg64) ptr), - } + // Perform the actual isspacep operation, and store the result in the predicate register we + // made. + match address_space { + AddressSpace::Global => asm!("isspacep.global p, {}", in(reg64) ptr), + AddressSpace::Shared => asm!("isspacep.shared p, {}", in(reg64) ptr), + AddressSpace::Constant => asm!("isspacep.const p, {}", in(reg64) ptr), + AddressSpace::Local => asm!("isspacep.local p, {}", in(reg64) ptr), + } - // finally, use the predicate register to write out a value. - asm!("selp.u32 {}, 1, 0, p;", out(reg32) ret); + // Finally, use the predicate register to write out a value. + asm!("selp.u32 {}, 1, 0, p;", out(reg32) ret); + } ret != 0 } @@ -59,27 +62,29 @@ pub unsafe fn convert_generic_to_specific_address_space( ) -> *const T { let ret: *const T; - match address_space { - AddressSpace::Global => asm!( - "cvta.to.global.u64 {}, {}", - out(reg64) ret, - in(reg64) ptr - ), - AddressSpace::Shared => asm!( - "cvta.to.shared.u64 {}, {}", - out(reg64) ret, - in(reg64) ptr - ), - AddressSpace::Constant => asm!( - "cvta.to.const.u64 {}, {}", - out(reg64) ret, - in(reg64) ptr - ), - AddressSpace::Local => asm!( - "cvta.to.local.u64 {}, {}", - out(reg64) ret, - in(reg64) ptr - ), + unsafe { + match address_space { + AddressSpace::Global => asm!( + "cvta.to.global.u64 {}, {}", + out(reg64) ret, + in(reg64) ptr + ), + AddressSpace::Shared => asm!( + "cvta.to.shared.u64 {}, {}", + out(reg64) ret, + in(reg64) ptr + ), + AddressSpace::Constant => asm!( + "cvta.to.const.u64 {}, {}", + out(reg64) ret, + in(reg64) ptr + ), + AddressSpace::Local => asm!( + "cvta.to.local.u64 {}, {}", + out(reg64) ret, + in(reg64) ptr + ), + } } ret @@ -99,27 +104,29 @@ pub unsafe fn convert_specific_address_space_to_generic( ) -> *const T { let ret: *const T; - match address_space { - AddressSpace::Global => asm!( - "cvta.global.u64 {}, {}", - out(reg64) ret, - in(reg64) ptr - ), - AddressSpace::Shared => asm!( - "cvta.shared.u64 {}, {}", - out(reg64) ret, - in(reg64) ptr - ), - AddressSpace::Constant => asm!( - "cvta.const.u64 {}, {}", - out(reg64) ret, - in(reg64) ptr - ), - AddressSpace::Local => asm!( - "cvta.local.u64 {}, {}", - out(reg64) ret, - in(reg64) ptr - ), + unsafe { + match address_space { + AddressSpace::Global => asm!( + "cvta.global.u64 {}, {}", + out(reg64) ret, + in(reg64) ptr + ), + AddressSpace::Shared => asm!( + "cvta.shared.u64 {}, {}", + out(reg64) ret, + in(reg64) ptr + ), + AddressSpace::Constant => asm!( + "cvta.const.u64 {}, {}", + out(reg64) ret, + in(reg64) ptr + ), + AddressSpace::Local => asm!( + "cvta.local.u64 {}, {}", + out(reg64) ret, + in(reg64) ptr + ), + } } ret diff --git a/crates/cuda_std/src/shared.rs b/crates/cuda_std/src/shared.rs index ddf4be5b..3bfab456 100644 --- a/crates/cuda_std/src/shared.rs +++ b/crates/cuda_std/src/shared.rs @@ -13,7 +13,7 @@ pub fn dynamic_shared_mem() -> *mut T { // it is unclear whether an alignment of 16 is actually required for correctness, however, // it seems like nvcc always generates the global with .align 16 no matter the type, so we just copy // nvcc's behavior for now. - extern "C" { + unsafe extern "C" { // need to use nvvm_internal and not address_space because address_space only parses // static definitions, not extern static definitions. #[nvvm_internal::addrspace(3)] diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs index 80d79f8b..8df6f1b8 100644 --- a/crates/cuda_std/src/thread.rs +++ b/crates/cuda_std/src/thread.rs @@ -69,7 +69,7 @@ use cuda_std_macros::gpu_only; use glam::{UVec2, UVec3}; // different calling conventions dont exist in nvptx, so we just use C as a placeholder. -extern "C" { +unsafe extern "C" { // defined in libintrinsics.ll fn __nvvm_warp_size() -> u32; @@ -286,7 +286,7 @@ pub fn sync_threads() { #[gpu_only] #[inline(always)] pub fn sync_threads_count(predicate: u32) -> u32 { - extern "C" { + unsafe extern "C" { #[link_name = "llvm.nvvm.barrier0.popc"] fn __nvvm_sync_threads_count(predicate: u32) -> u32; } @@ -299,7 +299,7 @@ pub fn sync_threads_count(predicate: u32) -> u32 { #[gpu_only] #[inline(always)] pub fn sync_threads_and(predicate: u32) -> u32 { - extern "C" { + unsafe extern "C" { #[link_name = "llvm.nvvm.barrier0.and"] fn __nvvm_sync_threads_and(predicate: u32) -> u32; } @@ -313,7 +313,7 @@ pub fn sync_threads_and(predicate: u32) -> u32 { #[gpu_only] #[inline(always)] pub fn sync_threads_or(predicate: u32) -> u32 { - extern "C" { + unsafe extern "C" { #[link_name = "llvm.nvvm.barrier0.or"] fn __nvvm_sync_threads_or(predicate: u32) -> u32; } diff --git a/crates/cuda_std/src/warp.rs b/crates/cuda_std/src/warp.rs index 146e2abb..94adec0a 100644 --- a/crates/cuda_std/src/warp.rs +++ b/crates/cuda_std/src/warp.rs @@ -21,12 +21,14 @@ use half::{bf16, f16}; #[gpu_only] #[inline(always)] pub unsafe fn sync_warp(mask: u32) { - extern "C" { + unsafe extern "C" { #[link_name = "llvm.nvvm.bar.warp.sync"] fn sync(mask: u32); } - sync(mask); + unsafe { + sync(mask); + } } /// Returns the thread's lane within its warp. This value ranges from `0` to `WARP_SIZE - 1` (`WARP_SIZE` is 32 on all @@ -72,7 +74,7 @@ pub fn activemask() -> u32 { #[gpu_only] #[inline(always)] pub unsafe fn warp_reduce(mask: u32, value: T, op: WarpReductionOp) -> T { - T::reduce(mask, value, op) + unsafe { T::reduce(mask, value, op) } } /// The type of operation to apply in a warp reduction. @@ -98,7 +100,7 @@ macro_rules! impl_reduce { paste::paste! { impl WarpReduceValue for $type { unsafe fn reduce(mask: u32, value: Self, op: WarpReductionOp) -> Self { - [](mask, value, op) + unsafe { [](mask, value, op) } } } } @@ -114,32 +116,34 @@ impl_reduce! { #[gpu_only] unsafe fn warp_reduce_32(mask: u32, value: u32, op: WarpReductionOp) -> u32 { let out; - match op { - WarpReductionOp::And => { - asm!( - "redux.sync.and.b32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); - } - WarpReductionOp::Or => { - asm!( - "redux.sync.or.b32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); - } - WarpReductionOp::Xor => { - asm!( - "redux.sync.xor.b32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); + unsafe { + match op { + WarpReductionOp::And => { + asm!( + "redux.sync.and.b32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + WarpReductionOp::Or => { + asm!( + "redux.sync.or.b32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + WarpReductionOp::Xor => { + asm!( + "redux.sync.xor.b32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + _ => unreachable!(), } - _ => unreachable!(), } out } @@ -147,32 +151,34 @@ unsafe fn warp_reduce_32(mask: u32, value: u32, op: WarpReductionOp) -> u32 { #[gpu_only] unsafe fn warp_reduce_u32(mask: u32, value: u32, op: WarpReductionOp) -> u32 { let out; - match op { - WarpReductionOp::Add => { - asm!( - "redux.sync.add.u32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); - } - WarpReductionOp::Min => { - asm!( - "redux.sync.min.u32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); - } - WarpReductionOp::Max => { - asm!( - "redux.sync.max.u32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); + unsafe { + match op { + WarpReductionOp::Add => { + asm!( + "redux.sync.add.u32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + WarpReductionOp::Min => { + asm!( + "redux.sync.min.u32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + WarpReductionOp::Max => { + asm!( + "redux.sync.max.u32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + _ => out = warp_reduce_32(mask, value, op), } - _ => out = warp_reduce_32(mask, value, op), } out } @@ -180,32 +186,34 @@ unsafe fn warp_reduce_u32(mask: u32, value: u32, op: WarpReductionOp) -> u32 { #[gpu_only] unsafe fn warp_reduce_i32(mask: u32, value: i32, op: WarpReductionOp) -> i32 { let out; - match op { - WarpReductionOp::Add => { - asm!( - "redux.sync.add.s32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); - } - WarpReductionOp::Min => { - asm!( - "redux.sync.min.s32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); - } - WarpReductionOp::Max => { - asm!( - "redux.sync.max.s32 {}, {}, {};", - out(reg32) out, - in(reg32) value, - in(reg32) mask - ); + unsafe { + match op { + WarpReductionOp::Add => { + asm!( + "redux.sync.add.s32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + WarpReductionOp::Min => { + asm!( + "redux.sync.min.s32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + WarpReductionOp::Max => { + asm!( + "redux.sync.max.s32 {}, {}, {};", + out(reg32) out, + in(reg32) value, + in(reg32) mask + ); + } + _ => out = warp_reduce_32(mask, value as u32, op) as i32, } - _ => out = warp_reduce_32(mask, value as u32, op) as i32, } out } @@ -225,7 +233,7 @@ unsafe fn warp_reduce_i32(mask: u32, value: i32, op: WarpReductionOp) -> i32 { #[gpu_only] #[inline(always)] pub unsafe fn warp_match_any(mask: u32, value: T) -> u32 { - T::match_any(mask, value) + unsafe { T::match_any(mask, value) } } /// Synchronizes threads in a warp and performs a broadcast-and-compare operation between them. @@ -243,7 +251,7 @@ pub unsafe fn warp_match_any(mask: u32, value: T) -> u32 { #[gpu_only] #[inline(always)] pub unsafe fn warp_match_all(mask: u32, value: T) -> Option { - T::match_all(mask, value) + unsafe { T::match_all(mask, value) } } /// A value that can be used inside of a warp match. @@ -260,10 +268,10 @@ macro_rules! impl_match { paste::paste! { impl WarpMatchValue for $type { unsafe fn match_any(mask: u32, value: Self) -> u32 { - [](mask, value as []) + unsafe { [](mask, value as []) } } unsafe fn match_all(mask: u32, value: Self) -> Option { - let (val, pred) = [](mask, value as []); + let (val, pred) = unsafe { [](mask, value as []) }; pred.then(|| val) } } @@ -284,41 +292,41 @@ impl_match! { #[gpu_only] #[inline(always)] unsafe fn match_any_32(mask: u32, value: u32) -> u32 { - extern "C" { + unsafe extern "C" { #[link_name = "llvm.nvvm.match.any.sync.i32"] fn __nvvm_warp_match_any_32(mask: u32, value: u32) -> u32; } - __nvvm_warp_match_any_32(mask, value) + unsafe { __nvvm_warp_match_any_32(mask, value) } } #[gpu_only] #[inline(always)] unsafe fn match_any_64(mask: u32, value: u64) -> u32 { - extern "C" { + unsafe extern "C" { #[link_name = "llvm.nvvm.match.any.sync.i64"] fn __nvvm_warp_match_any_64(mask: u32, value: u64) -> u32; } - __nvvm_warp_match_any_64(mask, value) + unsafe { __nvvm_warp_match_any_64(mask, value) } } #[gpu_only] #[inline(always)] unsafe fn match_all_32(mask: u32, value: u32) -> (u32, bool) { - extern "C" { + unsafe extern "C" { #[allow(improper_ctypes)] fn __nvvm_warp_match_all_32(mask: u32, value: u32) -> (u32, bool); } - __nvvm_warp_match_all_32(mask, value) + unsafe { __nvvm_warp_match_all_32(mask, value) } } #[gpu_only] #[inline(always)] unsafe fn match_all_64(mask: u32, value: u64) -> (u32, bool) { - extern "C" { + unsafe extern "C" { #[allow(improper_ctypes)] fn __nvvm_warp_match_all_64(mask: u32, value: u64) -> (u32, bool); } - __nvvm_warp_match_all_64(mask, value) + unsafe { __nvvm_warp_match_all_64(mask, value) } } /// Synchronizes a subset of threads in a warp then performs a reduce-and-broadcast @@ -337,19 +345,19 @@ unsafe fn match_all_64(mask: u32, value: u64) -> (u32, bool) { #[gpu_only] pub unsafe fn warp_vote_all(mask: u32, predicate: bool) -> bool { let mut out: u32; - - asm!( - "{{", - ".reg .pred %p<3>;", - "setp.eq.u32 %p1, {}, 1;", - "vote.sync.all.pred %p2, %p1, {};", - "selp.u32 {}, 0, 1, %p2;", - "}}", - in(reg32) predicate as u32, - in(reg32) mask, - out(reg32) out - ); - + unsafe { + asm!( + "{{", + ".reg .pred %p<3>;", + "setp.eq.u32 %p1, {}, 1;", + "vote.sync.all.pred %p2, %p1, {};", + "selp.u32 {}, 0, 1, %p2;", + "}}", + in(reg32) predicate as u32, + in(reg32) mask, + out(reg32) out + ); + } out != 0 } @@ -369,19 +377,19 @@ pub unsafe fn warp_vote_all(mask: u32, predicate: bool) -> bool { #[gpu_only] pub unsafe fn warp_vote_any(mask: u32, predicate: bool) -> bool { let mut out: u32; - - asm!( - "{{", - ".reg .pred %p<3>;", - "setp.eq.u32 %p1, {}, 1;", - "vote.sync.any.pred %p2, %p1, {};", - "selp.u32 {}, 0, 1, %p2;", - "}}", - in(reg32) predicate as u32, - in(reg32) mask, - out(reg32) out - ); - + unsafe { + asm!( + "{{", + ".reg .pred %p<3>;", + "setp.eq.u32 %p1, {}, 1;", + "vote.sync.any.pred %p2, %p1, {};", + "selp.u32 {}, 0, 1, %p2;", + "}}", + in(reg32) predicate as u32, + in(reg32) mask, + out(reg32) out + ); + } out != 0 } @@ -402,18 +410,18 @@ pub unsafe fn warp_vote_any(mask: u32, predicate: bool) -> bool { #[gpu_only] pub unsafe fn warp_vote_ballot(mask: u32, predicate: bool) -> u32 { let mut out: u32; - - asm!( - "{{", - ".reg .pred %p1;", - "setp.eq.u32 %p1, {}, 1;", - "vote.sync.ballot.b32 {}, %p1, {};", - "}}", - in(reg32) predicate as u32, - out(reg32) out, - in(reg32) mask, - ); - + unsafe { + asm!( + "{{", + ".reg .pred %p1;", + "setp.eq.u32 %p1, {}, 1;", + "vote.sync.ballot.b32 {}, %p1, {};", + "}}", + in(reg32) predicate as u32, + out(reg32) out, + in(reg32) mask, + ); + } out } @@ -465,7 +473,7 @@ pub unsafe fn warp_shuffle_down( delta: u32, width: u32, ) -> (T, bool) { - T::shuffle(WarpShuffleMode::Down, mask, value, delta, width) + unsafe { T::shuffle(WarpShuffleMode::Down, mask, value, delta, width) } } /// Waits for threads in a warp to reach this point and shuffles a value across the @@ -516,7 +524,7 @@ pub unsafe fn warp_shuffle_up( delta: u32, width: u32, ) -> (T, bool) { - T::shuffle(WarpShuffleMode::Up, mask, value, delta, width) + unsafe { T::shuffle(WarpShuffleMode::Up, mask, value, delta, width) } } /// Waits for threads in a warp to reach this point and shuffles a value across the @@ -568,7 +576,7 @@ pub unsafe fn warp_shuffle_idx( idx: u32, width: u32, ) -> (T, bool) { - T::shuffle(WarpShuffleMode::Idx, mask, value, idx, width) + unsafe { T::shuffle(WarpShuffleMode::Idx, mask, value, idx, width) } } /// Waits for threads in a warp to reach this point and shuffles a value across the @@ -619,7 +627,7 @@ pub unsafe fn warp_shuffle_xor( lane_mask: u32, width: u32, ) -> (T, bool) { - T::shuffle(WarpShuffleMode::Xor, mask, value, lane_mask, width) + unsafe { T::shuffle(WarpShuffleMode::Xor, mask, value, lane_mask, width) } } /// A value that can be used in a warp shuffle @@ -647,7 +655,7 @@ macro_rules! impl_shuffle { b: u32, width: u32, ) -> (Self, bool) { - let (res, oob) = [](mode, mask, value as [], b, width); + let (res, oob) = unsafe { [](mode, mask, value as [], b, width) }; (res as $type, oob) } } @@ -679,7 +687,7 @@ impl WarpShuffleValue for f32 { b: u32, width: u32, ) -> (Self, bool) { - let (res, oob) = warp_shuffle_32(mode, mask, value.to_bits(), b, width); + let (res, oob) = unsafe { warp_shuffle_32(mode, mask, value.to_bits(), b, width) }; (f32::from_bits(res), oob) } } @@ -692,7 +700,7 @@ impl WarpShuffleValue for f64 { b: u32, width: u32, ) -> (Self, bool) { - let (res, oob) = warp_shuffle_64(mode, mask, value.to_bits(), b, width); + let (res, oob) = unsafe { warp_shuffle_64(mode, mask, value.to_bits(), b, width) }; (f64::from_bits(res), oob) } } @@ -705,7 +713,7 @@ impl WarpShuffleValue for f16 { b: u32, width: u32, ) -> (Self, bool) { - let (res, oob) = warp_shuffle_16(mode, mask, value.to_bits(), b, width); + let (res, oob) = unsafe { warp_shuffle_16(mode, mask, value.to_bits(), b, width) }; (f16::from_bits(res), oob) } } @@ -718,7 +726,7 @@ impl WarpShuffleValue for bf16 { b: u32, width: u32, ) -> (Self, bool) { - let (res, oob) = warp_shuffle_16(mode, mask, value.to_bits(), b, width); + let (res, oob) = unsafe { warp_shuffle_16(mode, mask, value.to_bits(), b, width) }; (bf16::from_bits(res), oob) } } @@ -751,7 +759,7 @@ unsafe fn warp_shuffle_32( b: u32, width: u32, ) -> (u32, bool) { - extern "C" { + unsafe extern "C" { // see libintrinsics.ll // Returns {i32, i8} in LLVM IR, which maps to our WarpShuffleResult struct fn __nvvm_warp_shuffle(mask: u32, mode: u32, a: u32, b: u32, c: u32) -> WarpShuffleResult; @@ -767,7 +775,7 @@ unsafe fn warp_shuffle_32( c |= 0b11111; c |= (32 - width) << 8; - let result = __nvvm_warp_shuffle(mask, mode as u32, value, b, c); + let result = unsafe { __nvvm_warp_shuffle(mask, mode as u32, value, b, c) }; (result.value, result.predicate != 0) } @@ -782,8 +790,8 @@ unsafe fn warp_shuffle_128( let second_half = (value >> 64) as u64; // shuffle the first and second half of the value then recombine them // this will perform 4 shuffles in total (4 32-bit shuffles) - let (new_first_half, oob) = warp_shuffle_64(mode, mask, first_half, b, width); - let (new_second_half, _) = warp_shuffle_64(mode, mask, second_half, b, width); + let (new_first_half, oob) = unsafe { warp_shuffle_64(mode, mask, first_half, b, width) }; + let (new_second_half, _) = unsafe { warp_shuffle_64(mode, mask, second_half, b, width) }; ( ((new_second_half as u128) << 64) | (new_first_half as u128), oob, @@ -800,8 +808,8 @@ unsafe fn warp_shuffle_64( let first_half = value as u32; let second_half = (value >> 32) as u32; // shuffle the first and second half of the value then recombine them - let (new_first_half, oob) = warp_shuffle_32(mode, mask, first_half, b, width); - let (new_second_half, _) = warp_shuffle_32(mode, mask, second_half, b, width); + let (new_first_half, oob) = unsafe { warp_shuffle_32(mode, mask, first_half, b, width) }; + let (new_second_half, _) = unsafe { warp_shuffle_32(mode, mask, second_half, b, width) }; ( ((new_second_half as u64) << 32) | (new_first_half as u64), oob, @@ -815,7 +823,7 @@ unsafe fn warp_shuffle_16( b: u32, width: u32, ) -> (u16, bool) { - let (value, oob) = warp_shuffle_32(mode, mask, value as u32, b, width); + let (value, oob) = unsafe { warp_shuffle_32(mode, mask, value as u32, b, width) }; ((value as u16), oob) } @@ -826,6 +834,6 @@ unsafe fn warp_shuffle_8( b: u32, width: u32, ) -> (u8, bool) { - let (value, oob) = warp_shuffle_32(mode, mask, value as u32, b, width); + let (value, oob) = unsafe { warp_shuffle_32(mode, mask, value as u32, b, width) }; ((value as u8), oob) } diff --git a/crates/cuda_std_macros/Cargo.toml b/crates/cuda_std_macros/Cargo.toml index 4557dcff..1fc98d11 100644 --- a/crates/cuda_std_macros/Cargo.toml +++ b/crates/cuda_std_macros/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "cuda_std_macros" version = "0.2.0" -edition = "2018" +edition = "2024" license = "MIT OR Apache-2.0" description = "Macros for cuda_std" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/cuda_std_macros/src/lib.rs b/crates/cuda_std_macros/src/lib.rs index 26eb47d9..467a6319 100644 --- a/crates/cuda_std_macros/src/lib.rs +++ b/crates/cuda_std_macros/src/lib.rs @@ -1,9 +1,9 @@ use proc_macro::TokenStream; use proc_macro2::Span; -use quote::{quote_spanned, ToTokens}; +use quote::{ToTokens, quote_spanned}; use syn::{ - parse::Parse, parse_macro_input, parse_quote, punctuated::Punctuated, spanned::Spanned, Error, - FnArg, Ident, ItemFn, ReturnType, Stmt, Token, + Error, FnArg, Ident, ItemFn, ReturnType, Stmt, Token, parse::Parse, parse_macro_input, + parse_quote, punctuated::Punctuated, spanned::Spanned, }; /// Registers a function as a gpu kernel. @@ -25,7 +25,7 @@ pub fn kernel(input: proc_macro::TokenStream, item: proc_macro::TokenStream) -> let _ = parse_macro_input!(input as KernelHints); let input = parse_macro_input!(cloned as proc_macro2::TokenStream); let mut item = parse_macro_input!(item as ItemFn); - let no_mangle = parse_quote!(#[no_mangle]); + let no_mangle = parse_quote!(#[unsafe(no_mangle)]); item.attrs.push(no_mangle); let internal = parse_quote!(#[cfg_attr(target_arch="nvptx64", nvvm_internal::kernel(#input))]); item.attrs.push(internal); diff --git a/crates/cudnn/Cargo.toml b/crates/cudnn/Cargo.toml index 05de479f..99a8f388 100644 --- a/crates/cudnn/Cargo.toml +++ b/crates/cudnn/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = ["frjnn "] -edition = "2021" +edition = "2024" name = "cudnn" version = "0.1.0" diff --git a/crates/cudnn/src/activation/mod.rs b/crates/cudnn/src/activation/mod.rs index c5193204..b51d2799 100644 --- a/crates/cudnn/src/activation/mod.rs +++ b/crates/cudnn/src/activation/mod.rs @@ -1,7 +1,7 @@ use cust::memory::GpuBuffer; use crate::{ - private, CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor, + CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor, private, }; mod activation_descriptor; diff --git a/crates/cudnn/src/attention/seq_data_descriptor.rs b/crates/cudnn/src/attention/seq_data_descriptor.rs index 9c47e53f..4dd007e4 100644 --- a/crates/cudnn/src/attention/seq_data_descriptor.rs +++ b/crates/cudnn/src/attention/seq_data_descriptor.rs @@ -1,6 +1,6 @@ use std::{marker::PhantomData, mem::MaybeUninit}; -use crate::{private, CudnnError, DataType, IntoResult, SeqDataAxis}; +use crate::{CudnnError, DataType, IntoResult, SeqDataAxis, private}; /// Specifies the allowed types for the sequential data buffer. /// diff --git a/crates/cudnn/src/backend/conv_bwd_data.rs b/crates/cudnn/src/backend/conv_bwd_data.rs index 793e8659..cff56be3 100644 --- a/crates/cudnn/src/backend/conv_bwd_data.rs +++ b/crates/cudnn/src/backend/conv_bwd_data.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor}, CudnnError, DataType, IntoResult, + backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor}, }; pub struct ConvBwdDataBuilder { diff --git a/crates/cudnn/src/backend/conv_bwd_filter.rs b/crates/cudnn/src/backend/conv_bwd_filter.rs index 5ddce6f8..18b49617 100644 --- a/crates/cudnn/src/backend/conv_bwd_filter.rs +++ b/crates/cudnn/src/backend/conv_bwd_filter.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor}, CudnnError, DataType, IntoResult, + backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor}, }; pub struct ConvBwdFilterBuilder { diff --git a/crates/cudnn/src/backend/conv_cfg.rs b/crates/cudnn/src/backend/conv_cfg.rs index f89d99d1..cb5b5a6e 100644 --- a/crates/cudnn/src/backend/conv_cfg.rs +++ b/crates/cudnn/src/backend/conv_cfg.rs @@ -1,4 +1,4 @@ -use crate::{backend::Descriptor, ConvMode, CudnnError, DataType, IntoResult}; +use crate::{ConvMode, CudnnError, DataType, IntoResult, backend::Descriptor}; #[derive(Default, Clone, PartialEq, Eq, Hash, Debug)] pub struct ConvCfgBuilder<'a> { diff --git a/crates/cudnn/src/backend/conv_fwd.rs b/crates/cudnn/src/backend/conv_fwd.rs index ffde1de2..873165ab 100644 --- a/crates/cudnn/src/backend/conv_fwd.rs +++ b/crates/cudnn/src/backend/conv_fwd.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor}, CudnnError, DataType, IntoResult, + backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor}, }; pub struct ConvFwdBuilder { diff --git a/crates/cudnn/src/backend/engine.rs b/crates/cudnn/src/backend/engine.rs index 9eb3d726..43e1c421 100644 --- a/crates/cudnn/src/backend/engine.rs +++ b/crates/cudnn/src/backend/engine.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, Graph}, CudnnError, IntoResult, + backend::{Descriptor, Graph}, }; #[derive(Default, Debug, PartialEq)] diff --git a/crates/cudnn/src/backend/engine_cfg.rs b/crates/cudnn/src/backend/engine_cfg.rs index 9a8b39a1..8531e12c 100644 --- a/crates/cudnn/src/backend/engine_cfg.rs +++ b/crates/cudnn/src/backend/engine_cfg.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, Engine}, CudnnError, IntoResult, + backend::{Descriptor, Engine}, }; #[derive(Default, PartialEq, Debug)] diff --git a/crates/cudnn/src/backend/engine_heuristic.rs b/crates/cudnn/src/backend/engine_heuristic.rs index ac7c4fe0..ebdda555 100644 --- a/crates/cudnn/src/backend/engine_heuristic.rs +++ b/crates/cudnn/src/backend/engine_heuristic.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, EngineCfgBuilder, Graph}, CudnnContext, CudnnError, IntoResult, + backend::{Descriptor, EngineCfgBuilder, Graph}, }; pub enum HeuristicMode { diff --git a/crates/cudnn/src/backend/execution_plan.rs b/crates/cudnn/src/backend/execution_plan.rs index 14f313cf..4dffc922 100644 --- a/crates/cudnn/src/backend/execution_plan.rs +++ b/crates/cudnn/src/backend/execution_plan.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, EngineCfg}, CudnnContext, CudnnError, IntoResult, + backend::{Descriptor, EngineCfg}, }; #[derive(Default, PartialEq, Debug)] diff --git a/crates/cudnn/src/backend/graph.rs b/crates/cudnn/src/backend/graph.rs index a30af78f..c070600c 100644 --- a/crates/cudnn/src/backend/graph.rs +++ b/crates/cudnn/src/backend/graph.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, Operation}, CudnnContext, CudnnError, + backend::{Descriptor, Operation}, }; #[derive(Default, PartialEq, Debug)] diff --git a/crates/cudnn/src/backend/matmul.rs b/crates/cudnn/src/backend/matmul.rs index 14badc8b..f4e55ef8 100644 --- a/crates/cudnn/src/backend/matmul.rs +++ b/crates/cudnn/src/backend/matmul.rs @@ -1,8 +1,8 @@ use cust::memory::bytemuck::Contiguous; use crate::{ - backend::{Descriptor, MatMulCfg, Operation, Tensor}, CudnnError, DataType, IntoResult, + backend::{Descriptor, MatMulCfg, Operation, Tensor}, }; #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)] diff --git a/crates/cudnn/src/backend/matmul_cfg.rs b/crates/cudnn/src/backend/matmul_cfg.rs index 3fba7107..d8314d76 100644 --- a/crates/cudnn/src/backend/matmul_cfg.rs +++ b/crates/cudnn/src/backend/matmul_cfg.rs @@ -1,4 +1,4 @@ -use crate::{backend::Descriptor, CudnnError, DataType, IntoResult}; +use crate::{CudnnError, DataType, IntoResult, backend::Descriptor}; #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)] pub struct MatMulCfgBuilder { diff --git a/crates/cudnn/src/backend/pointwise.rs b/crates/cudnn/src/backend/pointwise.rs index baa0347e..9ca3c60c 100644 --- a/crates/cudnn/src/backend/pointwise.rs +++ b/crates/cudnn/src/backend/pointwise.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, FloatDataType, Operation, PointwiseCfg, PointwiseMode, Real, Tensor}, CudnnError, DataType, IntoResult, NanPropagation, + backend::{Descriptor, FloatDataType, Operation, PointwiseCfg, PointwiseMode, Real, Tensor}, }; #[derive(Clone, Default, Debug, PartialEq)] diff --git a/crates/cudnn/src/backend/pointwise_cfg.rs b/crates/cudnn/src/backend/pointwise_cfg.rs index bae13e72..20a09475 100644 --- a/crates/cudnn/src/backend/pointwise_cfg.rs +++ b/crates/cudnn/src/backend/pointwise_cfg.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, PointwiseMode}, CudnnError, DataType, IntoResult, NanPropagation, + backend::{Descriptor, PointwiseMode}, }; #[derive(Clone, Default, PartialEq, Debug)] diff --git a/crates/cudnn/src/backend/reduction.rs b/crates/cudnn/src/backend/reduction.rs index 3a05b767..d5794192 100644 --- a/crates/cudnn/src/backend/reduction.rs +++ b/crates/cudnn/src/backend/reduction.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, Operation, ReductionCfg, Tensor}, CudnnError, IntoResult, + backend::{Descriptor, Operation, ReductionCfg, Tensor}, }; #[derive(Default, Clone, Debug, PartialEq, Eq, Hash)] diff --git a/crates/cudnn/src/backend/reduction_cfg.rs b/crates/cudnn/src/backend/reduction_cfg.rs index bb1689e3..292d1ecc 100644 --- a/crates/cudnn/src/backend/reduction_cfg.rs +++ b/crates/cudnn/src/backend/reduction_cfg.rs @@ -1,6 +1,6 @@ use crate::{ - backend::{Descriptor, ReductionMode}, CudnnError, DataType, IntoResult, + backend::{Descriptor, ReductionMode}, }; #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)] diff --git a/crates/cudnn/src/backend/tensor.rs b/crates/cudnn/src/backend/tensor.rs index 1b9c7e88..b1fcdacc 100644 --- a/crates/cudnn/src/backend/tensor.rs +++ b/crates/cudnn/src/backend/tensor.rs @@ -1,4 +1,4 @@ -use crate::{backend::Descriptor, CudnnError, DataType, IntoResult}; +use crate::{CudnnError, DataType, IntoResult, backend::Descriptor}; #[derive(Clone, Default, Debug, PartialEq, Eq, Hash)] pub struct TensorBuilder<'a> { diff --git a/crates/cudnn/src/convolution/convolution_config.rs b/crates/cudnn/src/convolution/convolution_config.rs index c5186e53..ea3c2a37 100644 --- a/crates/cudnn/src/convolution/convolution_config.rs +++ b/crates/cudnn/src/convolution/convolution_config.rs @@ -1,4 +1,4 @@ -use crate::{private, DataType}; +use crate::{DataType, private}; /// Supported data types configurations for convolution operations. /// diff --git a/crates/cudnn/src/op/op_tensor_descriptor.rs b/crates/cudnn/src/op/op_tensor_descriptor.rs index 11e1a89a..80819b7f 100644 --- a/crates/cudnn/src/op/op_tensor_descriptor.rs +++ b/crates/cudnn/src/op/op_tensor_descriptor.rs @@ -15,12 +15,12 @@ unsafe fn init_raw_op_descriptor( ) -> Result { let mut raw = MaybeUninit::uninit(); - cudnn_sys::cudnnCreateOpTensorDescriptor(raw.as_mut_ptr()).into_result()?; - - let raw = raw.assume_init(); - - cudnn_sys::cudnnSetOpTensorDescriptor(raw, op, T::into_raw(), nan_opt).into_result()?; - Ok(raw) + unsafe { + cudnn_sys::cudnnCreateOpTensorDescriptor(raw.as_mut_ptr()).into_result()?; + let raw = raw.assume_init(); + cudnn_sys::cudnnSetOpTensorDescriptor(raw, op, T::into_raw(), nan_opt).into_result()?; + Ok(raw) + } } /// The description of a unary Tensor Core operation. diff --git a/crates/cudnn/src/pooling/mod.rs b/crates/cudnn/src/pooling/mod.rs index 5953d2b7..2b46139c 100644 --- a/crates/cudnn/src/pooling/mod.rs +++ b/crates/cudnn/src/pooling/mod.rs @@ -1,7 +1,7 @@ use cust::memory::GpuBuffer; use crate::{ - private, CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor, + CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor, private, }; mod pooling_descriptor; diff --git a/crates/cudnn/src/rnn/rnn_data_descriptor.rs b/crates/cudnn/src/rnn/rnn_data_descriptor.rs index 696f8d8b..4c46dfae 100644 --- a/crates/cudnn/src/rnn/rnn_data_descriptor.rs +++ b/crates/cudnn/src/rnn/rnn_data_descriptor.rs @@ -1,6 +1,6 @@ use std::{marker::PhantomData, mem::MaybeUninit}; -use crate::{private, CudnnError, DataType, IntoResult, RnnDataLayout}; +use crate::{CudnnError, DataType, IntoResult, RnnDataLayout, private}; /// Specifies the allowed types for the recurrent neural network inputs and outputs. /// diff --git a/crates/cudnn/src/softmax/mod.rs b/crates/cudnn/src/softmax/mod.rs index ed979fc6..1ae6c422 100644 --- a/crates/cudnn/src/softmax/mod.rs +++ b/crates/cudnn/src/softmax/mod.rs @@ -1,6 +1,6 @@ use cust::memory::GpuBuffer; -use crate::{private, CudnnContext, CudnnError, DataType, IntoResult, TensorDescriptor}; +use crate::{CudnnContext, CudnnError, DataType, IntoResult, TensorDescriptor, private}; mod softmax_algo; mod softmax_mode; diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml index 370fe331..3e59f386 100644 --- a/crates/cust/Cargo.toml +++ b/crates/cust/Cargo.toml @@ -6,7 +6,7 @@ authors = [ "Riccardo D'Ambrosio ", "Brook Heisler ", ] -edition = "2021" +edition = "2024" license = "MIT OR Apache-2.0" description = "High level bindings to the CUDA Driver API" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/cust/src/context/legacy.rs b/crates/cust/src/context/legacy.rs index 3e39ce47..f4d332f1 100644 --- a/crates/cust/src/context/legacy.rs +++ b/crates/cust/src/context/legacy.rs @@ -121,11 +121,11 @@ use std::ptr; use cust_raw::driver_sys; use cust_raw::driver_sys::CUcontext; +use crate::CudaApiVersion; use crate::context::ContextHandle; use crate::device::Device; use crate::error::{CudaResult, DropResult, ToResult}; use crate::private::Sealed; -use crate::CudaApiVersion; /// This enumeration represents configuration settings for devices which share hardware resources /// between L1 cache and shared memory. diff --git a/crates/cust/src/context/mod.rs b/crates/cust/src/context/mod.rs index e52104f2..69fb2cdb 100644 --- a/crates/cust/src/context/mod.rs +++ b/crates/cust/src/context/mod.rs @@ -34,7 +34,7 @@ //! with the [`legacy`] module. use std::{ - mem::{self, transmute, MaybeUninit}, + mem::{self, MaybeUninit, transmute}, ptr, }; @@ -45,10 +45,10 @@ pub mod legacy; use crate::context::legacy::StreamPriorityRange; use crate::{ + CudaApiVersion, device::Device, error::{CudaResult, DropResult, ToResult}, private::Sealed, - CudaApiVersion, }; pub trait ContextHandle: Sealed { @@ -215,7 +215,7 @@ impl Context { /// Nothing else should be using the primary context for this device, otherwise, /// spurious errors or segfaults will occur. pub unsafe fn reset(device: &Device) -> CudaResult<()> { - driver_sys::cuDevicePrimaryCtxReset(device.as_raw()).to_result() + unsafe { driver_sys::cuDevicePrimaryCtxReset(device.as_raw()).to_result() } } /// Sets the flags for the device context, these flags will apply to any user of the primary diff --git a/crates/cust/src/event.rs b/crates/cust/src/event.rs index efa8130c..61bc0ff0 100644 --- a/crates/cust/src/event.rs +++ b/crates/cust/src/event.rs @@ -18,8 +18,8 @@ use std::ptr; use std::time::Duration; use cust_raw::driver_sys::{ - cuEventCreate, cuEventDestroy, cuEventElapsedTime, cuEventQuery, cuEventRecord, - cuEventSynchronize, CUevent, + CUevent, cuEventCreate, cuEventDestroy, cuEventElapsedTime, cuEventQuery, cuEventRecord, + cuEventSynchronize, }; use crate::error::{CudaError, CudaResult, DropResult, ToResult}; diff --git a/crates/cust/src/external.rs b/crates/cust/src/external.rs index a634645f..44faa8a6 100644 --- a/crates/cust/src/external.rs +++ b/crates/cust/src/external.rs @@ -22,15 +22,17 @@ impl ExternalMemory { let mut memory: driver_sys::CUexternalMemory = std::ptr::null_mut(); - driver_sys::cuImportExternalMemory(&mut memory, &desc) - .to_result() - .map(|_| ExternalMemory(memory)) + unsafe { + driver_sys::cuImportExternalMemory(&mut memory, &desc) + .to_result() + .map(|_| ExternalMemory(memory)) + } } #[allow(clippy::missing_safety_doc)] pub unsafe fn reimport(&mut self, fd: i32, size: usize) -> CudaResult<()> { // import new memory - this will call drop to destroy the old one - *self = ExternalMemory::import(fd, size)?; + *self = unsafe { ExternalMemory::import(fd, size)? }; Ok(()) } diff --git a/crates/cust/src/function.rs b/crates/cust/src/function.rs index b01e889e..67a5fb84 100644 --- a/crates/cust/src/function.rs +++ b/crates/cust/src/function.rs @@ -1,7 +1,7 @@ //! Functions and types for working with CUDA kernels. use std::marker::PhantomData; -use std::mem::{transmute, MaybeUninit}; +use std::mem::{MaybeUninit, transmute}; use cust_raw::driver_sys; use cust_raw::driver_sys::CUfunction; diff --git a/crates/cust/src/graph.rs b/crates/cust/src/graph.rs index 3db77dd5..9f04329e 100644 --- a/crates/cust/src/graph.rs +++ b/crates/cust/src/graph.rs @@ -281,7 +281,10 @@ impl Graph { .enumerate() .position(|(cur_idx, x)| x == node && cur_idx != idx) { - panic!("Duplicate dependency found in call to `{}`, the first instance is at index {}, the second instance is at index {}", func_name, idx, pos); + panic!( + "Duplicate dependency found in call to `{}`, the first instance is at index {}, the second instance is at index {}", + func_name, idx, pos + ); } assert!( diff --git a/crates/cust/src/memory/array.rs b/crates/cust/src/memory/array.rs index 40b5ba06..36525b70 100644 --- a/crates/cust/src/memory/array.rs +++ b/crates/cust/src/memory/array.rs @@ -5,19 +5,19 @@ use std::ffi::c_void; use std::mem; -use std::mem::zeroed; use std::mem::ManuallyDrop; use std::mem::MaybeUninit; +use std::mem::zeroed; use std::os::raw::c_uint; use std::panic; use std::ptr::null; use std::ptr::null_mut; use cust_raw::driver_sys; +use cust_raw::driver_sys::CUDA_MEMCPY2D; use cust_raw::driver_sys::cuMemcpy2D; use cust_raw::driver_sys::cuMemcpyAtoH; use cust_raw::driver_sys::cuMemcpyHtoA; -use cust_raw::driver_sys::CUDA_MEMCPY2D; use cust_raw::driver_sys::{CUarray, CUarray_format, CUarray_format_enum}; use crate::context::CurrentContext; diff --git a/crates/cust/src/memory/device/device_box.rs b/crates/cust/src/memory/device/device_box.rs index edccd3da..4b3ab825 100644 --- a/crates/cust/src/memory/device/device_box.rs +++ b/crates/cust/src/memory/device/device_box.rs @@ -5,11 +5,11 @@ use std::os::raw::c_void; use cust_raw::driver_sys; use crate::error::{CudaResult, DropResult, ToResult}; +use crate::memory::DevicePointer; use crate::memory::device::AsyncCopyDestination; use crate::memory::device::CopyDestination; use crate::memory::malloc::{cuda_free, cuda_malloc}; -use crate::memory::DevicePointer; -use crate::memory::{cuda_free_async, cuda_malloc_async, DeviceCopy}; +use crate::memory::{DeviceCopy, cuda_free_async, cuda_malloc_async}; use crate::stream::Stream; /// A pointer type for heap-allocation in CUDA device memory. @@ -86,9 +86,11 @@ impl DeviceBox { /// # Ok(()) /// # } pub unsafe fn new_async(val: &T, stream: &Stream) -> CudaResult { - let mut dev_box = DeviceBox::uninitialized_async(stream)?; - dev_box.async_copy_from(val, stream)?; - Ok(dev_box) + unsafe { + let mut dev_box = DeviceBox::uninitialized_async(stream)?; + dev_box.async_copy_from(val, stream)?; + Ok(dev_box) + } } /// Enqueues an operation to free the memory backed by this [`DeviceBox`] on a @@ -204,14 +206,16 @@ impl DeviceBox { /// ``` #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))] pub unsafe fn zeroed_async(stream: &Stream) -> CudaResult { - let new_box = DeviceBox::uninitialized_async(stream)?; + let new_box = unsafe { DeviceBox::uninitialized_async(stream)? }; if mem::size_of::() != 0 { - driver_sys::cuMemsetD8Async( - new_box.as_device_ptr().as_raw(), - 0, - mem::size_of::(), - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemsetD8Async( + new_box.as_device_ptr().as_raw(), + 0, + mem::size_of::(), + stream.as_inner(), + ) + } .to_result()?; } Ok(new_box) @@ -243,7 +247,7 @@ impl DeviceBox { ptr: DevicePointer::null(), }) } else { - let ptr = cuda_malloc(1)?; + let ptr = unsafe { cuda_malloc(1)? }; Ok(DeviceBox { ptr }) } } @@ -266,7 +270,7 @@ impl DeviceBox { ptr: DevicePointer::null(), }) } else { - let ptr = cuda_malloc_async(stream, 1)?; + let ptr = unsafe { cuda_malloc_async(stream, 1)? }; Ok(DeviceBox { ptr }) } } @@ -473,12 +477,14 @@ impl AsyncCopyDestination for DeviceBox { unsafe fn async_copy_from(&mut self, val: &T, stream: &Stream) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { - driver_sys::cuMemcpyHtoDAsync( - self.ptr.as_raw(), - val as *const _ as *const c_void, - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemcpyHtoDAsync( + self.ptr.as_raw(), + val as *const _ as *const c_void, + size, + stream.as_inner(), + ) + } .to_result()? } Ok(()) @@ -487,12 +493,14 @@ impl AsyncCopyDestination for DeviceBox { unsafe fn async_copy_to(&self, val: &mut T, stream: &Stream) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { - driver_sys::cuMemcpyDtoHAsync( - val as *mut _ as *mut c_void, - self.ptr.as_raw(), - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemcpyDtoHAsync( + val as *mut _ as *mut c_void, + self.ptr.as_raw(), + size, + stream.as_inner(), + ) + } .to_result()? } Ok(()) @@ -502,12 +510,14 @@ impl AsyncCopyDestination> for DeviceBox { unsafe fn async_copy_from(&mut self, val: &DeviceBox, stream: &Stream) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { - driver_sys::cuMemcpyDtoDAsync( - self.ptr.as_raw(), - val.ptr.as_raw(), - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemcpyDtoDAsync( + self.ptr.as_raw(), + val.ptr.as_raw(), + size, + stream.as_inner(), + ) + } .to_result()? } Ok(()) @@ -516,12 +526,14 @@ impl AsyncCopyDestination> for DeviceBox { unsafe fn async_copy_to(&self, val: &mut DeviceBox, stream: &Stream) -> CudaResult<()> { let size = mem::size_of::(); if size != 0 { - driver_sys::cuMemcpyDtoDAsync( - val.ptr.as_raw(), - self.ptr.as_raw(), - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemcpyDtoDAsync( + val.ptr.as_raw(), + self.ptr.as_raw(), + size, + stream.as_inner(), + ) + } .to_result()? } Ok(()) diff --git a/crates/cust/src/memory/device/device_buffer.rs b/crates/cust/src/memory/device/device_buffer.rs index eae9fb4b..80a80e01 100644 --- a/crates/cust/src/memory/device/device_buffer.rs +++ b/crates/cust/src/memory/device/device_buffer.rs @@ -1,4 +1,4 @@ -use std::mem::{self, align_of, size_of, transmute, ManuallyDrop}; +use std::mem::{self, ManuallyDrop, align_of, size_of, transmute}; use std::ops::{Deref, DerefMut}; #[cfg(feature = "bytemuck")] @@ -10,8 +10,8 @@ use cust_raw::driver_sys; use crate::error::{CudaResult, DropResult, ToResult}; use crate::memory::device::{AsyncCopyDestination, CopyDestination, DeviceSlice}; use crate::memory::malloc::{cuda_free, cuda_malloc}; -use crate::memory::{cuda_free_async, DevicePointer}; -use crate::memory::{cuda_malloc_async, DeviceCopy}; +use crate::memory::{DeviceCopy, cuda_malloc_async}; +use crate::memory::{DevicePointer, cuda_free_async}; use crate::stream::Stream; /// Fixed-size device-side buffer. Provides basic access to device memory. @@ -49,7 +49,7 @@ impl DeviceBuffer { /// ``` pub unsafe fn uninitialized(size: usize) -> CudaResult { let ptr = if size > 0 && size_of::() > 0 { - cuda_malloc(size)? + unsafe { cuda_malloc(size)? } } else { // FIXME (AL): Do we /really/ want to allow creating an invalid buffer? DevicePointer::null() @@ -74,7 +74,7 @@ impl DeviceBuffer { /// You can synchronize the stream to ensure the memory allocation operation is complete. pub unsafe fn uninitialized_async(size: usize, stream: &Stream) -> CudaResult { let ptr = if size > 0 && size_of::() > 0 { - cuda_malloc_async(stream, size)? + unsafe { cuda_malloc_async(stream, size)? } } else { DevicePointer::null() }; @@ -272,14 +272,16 @@ impl DeviceBuffer { /// ``` #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))] pub unsafe fn zeroed_async(size: usize, stream: &Stream) -> CudaResult { - let new_buf = DeviceBuffer::uninitialized_async(size, stream)?; + let new_buf = unsafe { DeviceBuffer::uninitialized_async(size, stream)? }; if size_of::() != 0 { - driver_sys::cuMemsetD8Async( - new_buf.as_device_ptr().as_raw(), - 0, - size_of::() * size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemsetD8Async( + new_buf.as_device_ptr().as_raw(), + 0, + size_of::() * size, + stream.as_inner(), + ) + } .to_result()?; } Ok(new_buf) @@ -394,9 +396,11 @@ impl DeviceBuffer { /// } /// ``` pub unsafe fn from_slice_async(slice: &[T], stream: &Stream) -> CudaResult { - let mut uninit = DeviceBuffer::uninitialized_async(slice.len(), stream)?; - uninit.async_copy_from(slice, stream)?; - Ok(uninit) + unsafe { + let mut uninit = DeviceBuffer::uninitialized_async(slice.len(), stream)?; + uninit.async_copy_from(slice, stream)?; + Ok(uninit) + } } /// Explicitly creates a [`DeviceSlice`] from this buffer. diff --git a/crates/cust/src/memory/device/device_slice.rs b/crates/cust/src/memory/device/device_slice.rs index ff0db3dd..1c30b205 100644 --- a/crates/cust/src/memory/device/device_slice.rs +++ b/crates/cust/src/memory/device/device_slice.rs @@ -12,9 +12,9 @@ use bytemuck::{Pod, Zeroable}; use cust_raw::driver_sys; use crate::error::{CudaResult, ToResult}; +use crate::memory::DevicePointer; use crate::memory::device::AsyncCopyDestination; use crate::memory::device::{CopyDestination, DeviceBuffer}; -use crate::memory::DevicePointer; use crate::memory::{DeviceCopy, DeviceMemory}; use crate::stream::Stream; @@ -204,7 +204,7 @@ impl DeviceSlice { /// ``` #[allow(clippy::needless_pass_by_value)] pub unsafe fn from_raw_parts<'a>(ptr: DevicePointer, len: usize) -> &'a DeviceSlice { - &*(slice_from_raw_parts(ptr.as_ptr(), len) as *const DeviceSlice) + unsafe { &*(slice_from_raw_parts(ptr.as_ptr(), len) as *const DeviceSlice) } } /// Performs the same functionality as `from_raw_parts`, except that a @@ -226,7 +226,7 @@ impl DeviceSlice { ptr: DevicePointer, len: usize, ) -> &'a mut DeviceSlice { - &mut *(slice_from_raw_parts_mut(ptr.as_mut_ptr(), len) as *mut DeviceSlice) + unsafe { &mut *(slice_from_raw_parts_mut(ptr.as_mut_ptr(), len) as *mut DeviceSlice) } } } @@ -268,12 +268,14 @@ impl DeviceSlice { return Ok(()); } - driver_sys::cuMemsetD8Async( - self.as_raw_ptr(), - value, - self.size_in_bytes(), - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemsetD8Async( + self.as_raw_ptr(), + value, + self.size_in_bytes(), + stream.as_inner(), + ) + } .to_result() } @@ -331,8 +333,10 @@ impl DeviceSlice { 0, "Buffer pointer is not aligned to at least 2 bytes!" ); - driver_sys::cuMemsetD16Async(self.as_raw_ptr(), value, data_len / 2, stream.as_inner()) - .to_result() + unsafe { + driver_sys::cuMemsetD16Async(self.as_raw_ptr(), value, data_len / 2, stream.as_inner()) + } + .to_result() } /// Sets the memory range of this buffer to contiguous `32-bit` values of `value`. @@ -389,8 +393,10 @@ impl DeviceSlice { 0, "Buffer pointer is not aligned to at least 4 bytes!" ); - driver_sys::cuMemsetD32Async(self.as_raw_ptr(), value, data_len / 4, stream.as_inner()) - .to_result() + unsafe { + driver_sys::cuMemsetD32Async(self.as_raw_ptr(), value, data_len / 4, stream.as_inner()) + } + .to_result() } } @@ -415,11 +421,13 @@ impl DeviceSlice { pub unsafe fn set_zero_async(&mut self, stream: &Stream) -> CudaResult<()> { // SAFETY: this is fine because Zeroable guarantees a zero byte-pattern is safe // for this type. And a slice of bytes can represent any type. - let erased = DeviceSlice::from_raw_parts_mut( - self.as_device_ptr().cast::(), - self.size_in_bytes(), - ); - erased.set_8_async(0, stream) + unsafe { + let erased = DeviceSlice::from_raw_parts_mut( + self.as_device_ptr().cast::(), + self.size_in_bytes(), + ); + erased.set_8_async(0, stream) + } } } @@ -471,14 +479,14 @@ fn slice_end_index_overflow_fail() -> ! { impl DeviceSliceIndex for usize { unsafe fn get_unchecked(self, slice: &DeviceSlice) -> &DeviceSlice { - (self..self + 1).get_unchecked(slice) + unsafe { (self..self + 1).get_unchecked(slice) } } fn index(self, slice: &DeviceSlice) -> &DeviceSlice { slice.index(self..self + 1) } unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { - (self..self + 1).get_unchecked_mut(slice) + unsafe { (self..self + 1).get_unchecked_mut(slice) } } fn index_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { slice.index_mut(self..self + 1) @@ -487,7 +495,12 @@ impl DeviceSliceIndex for usize { impl DeviceSliceIndex for Range { unsafe fn get_unchecked(self, slice: &DeviceSlice) -> &DeviceSlice { - DeviceSlice::from_raw_parts(slice.as_device_ptr().add(self.start), self.end - self.start) + unsafe { + DeviceSlice::from_raw_parts( + slice.as_device_ptr().add(self.start), + self.end - self.start, + ) + } } fn index(self, slice: &DeviceSlice) -> &DeviceSlice { if self.start > self.end { @@ -500,10 +513,12 @@ impl DeviceSliceIndex for Range { } unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { - DeviceSlice::from_raw_parts_mut( - slice.as_device_ptr().add(self.start), - self.end - self.start, - ) + unsafe { + DeviceSlice::from_raw_parts_mut( + slice.as_device_ptr().add(self.start), + self.end - self.start, + ) + } } fn index_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { if self.start > self.end { @@ -518,14 +533,14 @@ impl DeviceSliceIndex for Range { impl DeviceSliceIndex for RangeTo { unsafe fn get_unchecked(self, slice: &DeviceSlice) -> &DeviceSlice { - (0..self.end).get_unchecked(slice) + unsafe { (0..self.end).get_unchecked(slice) } } fn index(self, slice: &DeviceSlice) -> &DeviceSlice { (0..self.end).index(slice) } unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { - (0..self.end).get_unchecked_mut(slice) + unsafe { (0..self.end).get_unchecked_mut(slice) } } fn index_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { (0..self.end).index_mut(slice) @@ -534,7 +549,7 @@ impl DeviceSliceIndex for RangeTo { impl DeviceSliceIndex for RangeFrom { unsafe fn get_unchecked(self, slice: &DeviceSlice) -> &DeviceSlice { - (self.start..slice.len()).get_unchecked(slice) + unsafe { (self.start..slice.len()).get_unchecked(slice) } } fn index(self, slice: &DeviceSlice) -> &DeviceSlice { if self.start > slice.len() { @@ -545,7 +560,7 @@ impl DeviceSliceIndex for RangeFrom { } unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { - (self.start..slice.len()).get_unchecked_mut(slice) + unsafe { (self.start..slice.len()).get_unchecked_mut(slice) } } fn index_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { if self.start > slice.len() { @@ -584,7 +599,7 @@ fn into_slice_range(range: RangeInclusive) -> Range { impl DeviceSliceIndex for RangeInclusive { unsafe fn get_unchecked(self, slice: &DeviceSlice) -> &DeviceSlice { - into_slice_range(self).get_unchecked(slice) + unsafe { into_slice_range(self).get_unchecked(slice) } } fn index(self, slice: &DeviceSlice) -> &DeviceSlice { if *self.end() == usize::MAX { @@ -594,7 +609,7 @@ impl DeviceSliceIndex for RangeInclusive { } unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { - into_slice_range(self).get_unchecked_mut(slice) + unsafe { into_slice_range(self).get_unchecked_mut(slice) } } fn index_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { if *self.end() == usize::MAX { @@ -606,14 +621,14 @@ impl DeviceSliceIndex for RangeInclusive { impl DeviceSliceIndex for RangeToInclusive { unsafe fn get_unchecked(self, slice: &DeviceSlice) -> &DeviceSlice { - (0..=self.end).get_unchecked(slice) + unsafe { (0..=self.end).get_unchecked(slice) } } fn index(self, slice: &DeviceSlice) -> &DeviceSlice { (0..=self.end).index(slice) } unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { - (0..=self.end).get_unchecked_mut(slice) + unsafe { (0..=self.end).get_unchecked_mut(slice) } } fn index_mut(self, slice: &mut DeviceSlice) -> &mut DeviceSlice { (0..=self.end).index_mut(slice) @@ -717,12 +732,14 @@ impl + AsMut<[T]> + ?Sized> AsyncCopyDestination ); let size = self.size_in_bytes(); if size != 0 { - driver_sys::cuMemcpyHtoDAsync( - self.as_raw_ptr(), - val.as_ptr() as *const c_void, - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemcpyHtoDAsync( + self.as_raw_ptr(), + val.as_ptr() as *const c_void, + size, + stream.as_inner(), + ) + } .to_result()? } Ok(()) @@ -736,12 +753,14 @@ impl + AsMut<[T]> + ?Sized> AsyncCopyDestination ); let size = self.size_in_bytes(); if size != 0 { - driver_sys::cuMemcpyDtoHAsync( - val.as_mut_ptr() as *mut c_void, - self.as_raw_ptr(), - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemcpyDtoHAsync( + val.as_mut_ptr() as *mut c_void, + self.as_raw_ptr(), + size, + stream.as_inner(), + ) + } .to_result()? } Ok(()) @@ -755,12 +774,14 @@ impl AsyncCopyDestination> for DeviceSlice { ); let size = self.size_in_bytes(); if size != 0 { - driver_sys::cuMemcpyDtoDAsync( - self.as_raw_ptr(), - val.as_raw_ptr(), - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemcpyDtoDAsync( + self.as_raw_ptr(), + val.as_raw_ptr(), + size, + stream.as_inner(), + ) + } .to_result()? } Ok(()) @@ -773,12 +794,14 @@ impl AsyncCopyDestination> for DeviceSlice { ); let size = self.size_in_bytes(); if size != 0 { - driver_sys::cuMemcpyDtoDAsync( - val.as_raw_ptr(), - self.as_raw_ptr(), - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemcpyDtoDAsync( + val.as_raw_ptr(), + self.as_raw_ptr(), + size, + stream.as_inner(), + ) + } .to_result()? } Ok(()) @@ -786,10 +809,10 @@ impl AsyncCopyDestination> for DeviceSlice { } impl AsyncCopyDestination> for DeviceSlice { unsafe fn async_copy_from(&mut self, val: &DeviceBuffer, stream: &Stream) -> CudaResult<()> { - self.async_copy_from(val as &DeviceSlice, stream) + unsafe { self.async_copy_from(val as &DeviceSlice, stream) } } unsafe fn async_copy_to(&self, val: &mut DeviceBuffer, stream: &Stream) -> CudaResult<()> { - self.async_copy_to(val as &mut DeviceSlice, stream) + unsafe { self.async_copy_to(val as &mut DeviceSlice, stream) } } } diff --git a/crates/cust/src/memory/device/device_variable.rs b/crates/cust/src/memory/device/device_variable.rs index 14eb52bf..33c80d14 100644 --- a/crates/cust/src/memory/device/device_variable.rs +++ b/crates/cust/src/memory/device/device_variable.rs @@ -1,6 +1,6 @@ use crate::error::CudaResult; -use crate::memory::device::CopyDestination; use crate::memory::DeviceCopy; +use crate::memory::device::CopyDestination; use crate::memory::{DeviceBox, DevicePointer}; use std::ops::{Deref, DerefMut}; diff --git a/crates/cust/src/memory/locked/locked_box.rs b/crates/cust/src/memory/locked/locked_box.rs index 0af28245..eb19cd38 100644 --- a/crates/cust/src/memory/locked/locked_box.rs +++ b/crates/cust/src/memory/locked/locked_box.rs @@ -6,7 +6,7 @@ use std::{ use crate::{ error::CudaResult, - memory::{cuda_free_locked, cuda_malloc_locked, DeviceCopy}, + memory::{DeviceCopy, cuda_free_locked, cuda_malloc_locked}, }; /// Page-locked box in host memory. @@ -45,7 +45,7 @@ impl LockedBox { if mem::size_of::() == 0 { Ok(LockedBox { ptr: null_mut() }) } else { - let ptr = cuda_malloc_locked(1)?; + let ptr = unsafe { cuda_malloc_locked(1)? }; Ok(LockedBox { ptr }) } } diff --git a/crates/cust/src/memory/locked/locked_buffer.rs b/crates/cust/src/memory/locked/locked_buffer.rs index 14fd321d..e310c619 100644 --- a/crates/cust/src/memory/locked/locked_buffer.rs +++ b/crates/cust/src/memory/locked/locked_buffer.rs @@ -1,6 +1,6 @@ use crate::error::*; -use crate::memory::malloc::{cuda_free_locked, cuda_malloc_locked}; use crate::memory::DeviceCopy; +use crate::memory::malloc::{cuda_free_locked, cuda_malloc_locked}; use std::mem; use std::ops; use std::ptr; @@ -98,7 +98,7 @@ impl LockedBuffer { /// ``` pub unsafe fn uninitialized(size: usize) -> CudaResult { let ptr: *mut T = if size > 0 && mem::size_of::() > 0 { - cuda_malloc_locked(size)? + unsafe { cuda_malloc_locked(size)? } } else { ptr::NonNull::dangling().as_ptr() }; diff --git a/crates/cust/src/memory/malloc.rs b/crates/cust/src/memory/malloc.rs index 6255778c..83b17c63 100644 --- a/crates/cust/src/memory/malloc.rs +++ b/crates/cust/src/memory/malloc.rs @@ -48,7 +48,7 @@ pub unsafe fn cuda_malloc(count: usize) -> CudaResult( } let mut ptr: *mut c_void = ptr::null_mut(); - driver_sys::cuMemAllocAsync( - &mut ptr as *mut *mut c_void as *mut u64, - size, - stream.as_inner(), - ) + unsafe { + driver_sys::cuMemAllocAsync( + &mut ptr as *mut *mut c_void as *mut u64, + size, + stream.as_inner(), + ) + } .to_result()?; let ptr = ptr as *mut T; Ok(DevicePointer::from_raw(ptr as driver_sys::CUdeviceptr)) @@ -97,7 +99,7 @@ pub unsafe fn cuda_free_async( return Err(CudaError::InvalidMemoryAllocation); } - driver_sys::cuMemFreeAsync(p.as_raw(), stream.as_inner()).to_result() + unsafe { driver_sys::cuMemFreeAsync(p.as_raw(), stream.as_inner()) }.to_result() } /// Unsafe wrapper around the `cuMemAllocManaged` function, which allocates some unified memory and @@ -140,14 +142,16 @@ pub unsafe fn cuda_malloc_unified(count: usize) -> CudaResult( let mut ptr = 0; let mut pitch = 0; - driver_sys::cuMemAllocPitch(&mut ptr, &mut pitch, width_bytes, height, element_size) + unsafe { driver_sys::cuMemAllocPitch(&mut ptr, &mut pitch, width_bytes, height, element_size) } .to_result()?; Ok((DevicePointer::from_raw(ptr), pitch)) } @@ -236,7 +240,7 @@ pub unsafe fn cuda_free(ptr: DevicePointer) -> CudaResult<()> return Err(CudaError::InvalidMemoryAllocation); } - driver_sys::cuMemFree(ptr.as_raw()).to_result()?; + unsafe { driver_sys::cuMemFree(ptr.as_raw()).to_result()? }; Ok(()) } @@ -269,7 +273,7 @@ pub unsafe fn cuda_free_unified(mut p: UnifiedPointer) -> Cuda return Err(CudaError::InvalidMemoryAllocation); } - driver_sys::cuMemFree(ptr as u64).to_result()?; + unsafe { driver_sys::cuMemFree(ptr as u64) }.to_result()?; Ok(()) } @@ -311,7 +315,7 @@ pub unsafe fn cuda_malloc_locked(count: usize) -> CudaResult<*mut T> { } let mut ptr: *mut c_void = ptr::null_mut(); - driver_sys::cuMemAllocHost(&mut ptr as *mut *mut c_void, size).to_result()?; + unsafe { driver_sys::cuMemAllocHost(&mut ptr as *mut *mut c_void, size) }.to_result()?; let ptr = ptr as *mut T; Ok(ptr) } @@ -344,7 +348,7 @@ pub unsafe fn cuda_free_locked(ptr: *mut T) -> CudaResult<()> { return Err(CudaError::InvalidMemoryAllocation); } - driver_sys::cuMemFreeHost(ptr as *mut c_void).to_result()?; + unsafe { driver_sys::cuMemFreeHost(ptr as *mut c_void) }.to_result()?; Ok(()) } diff --git a/crates/cust/src/memory/mod.rs b/crates/cust/src/memory/mod.rs index aa349145..11b487d5 100644 --- a/crates/cust/src/memory/mod.rs +++ b/crates/cust/src/memory/mod.rs @@ -212,7 +212,7 @@ pub unsafe fn memcpy_htod( src_ptr: *const c_void, size: usize, ) -> CudaResult<()> { - driver_sys::cuMemcpyHtoD(d_ptr, src_ptr, size).to_result()?; + unsafe { driver_sys::cuMemcpyHtoD(d_ptr, src_ptr, size).to_result()? }; Ok(()) } @@ -223,7 +223,7 @@ pub unsafe fn memcpy_dtoh( src_ptr: driver_sys::CUdeviceptr, size: usize, ) -> CudaResult<()> { - driver_sys::cuMemcpyDtoH(d_ptr, src_ptr, size).to_result()?; + unsafe { driver_sys::cuMemcpyDtoH(d_ptr, src_ptr, size).to_result()? }; Ok(()) } @@ -309,7 +309,7 @@ pub unsafe fn memcpy_2d_htod( Height: height, }; - driver_sys::cuMemcpy2D(&pcopy).to_result()?; + unsafe { driver_sys::cuMemcpy2D(&pcopy).to_result()? }; Ok(()) } @@ -395,7 +395,7 @@ pub unsafe fn memcpy_2d_dtoh( Height: height, }; - driver_sys::cuMemcpy2D(&pcopy).to_result()?; + unsafe { driver_sys::cuMemcpy2D(&pcopy).to_result()? }; Ok(()) } diff --git a/crates/cust/src/memory/pointer.rs b/crates/cust/src/memory/pointer.rs index b22ff5d7..d40beed0 100644 --- a/crates/cust/src/memory/pointer.rs +++ b/crates/cust/src/memory/pointer.rs @@ -214,7 +214,7 @@ impl DevicePointer { where T: Sized, { - self.offset(count as isize) + unsafe { self.offset(count as isize) } } /// Calculates the offset from a pointer (convenience for @@ -256,7 +256,7 @@ impl DevicePointer { where T: Sized, { - self.offset((count as isize).wrapping_neg()) + unsafe { self.offset((count as isize).wrapping_neg()) } } /// Calculates the offset from a pointer using wrapping arithmetic. @@ -481,7 +481,7 @@ impl UnifiedPointer { where T: Sized, { - Self::wrap(self.0.offset(count)) + unsafe { Self::wrap(self.0.offset(count)) } } /// Calculates the offset from a unified pointer using wrapping arithmetic. @@ -559,7 +559,7 @@ impl UnifiedPointer { where T: Sized, { - self.offset(count as isize) + unsafe { self.offset(count as isize) } } /// Calculates the offset from a pointer (convenience for @@ -600,7 +600,7 @@ impl UnifiedPointer { where T: Sized, { - self.offset((count as isize).wrapping_neg()) + unsafe { self.offset((count as isize).wrapping_neg()) } } /// Calculates the offset from a pointer using wrapping arithmetic. diff --git a/crates/cust/src/memory/unified.rs b/crates/cust/src/memory/unified.rs index d833a62d..c45f5ce3 100644 --- a/crates/cust/src/memory/unified.rs +++ b/crates/cust/src/memory/unified.rs @@ -15,8 +15,8 @@ use crate::device::Device; #[allow(unused_imports)] use crate::device::DeviceAttribute; use crate::error::*; -use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified}; use crate::memory::UnifiedPointer; +use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified}; use crate::prelude::Stream; /// A pointer type for heap-allocation in CUDA unified memory. @@ -88,7 +88,7 @@ impl UnifiedBox { ptr: UnifiedPointer::null(), }) } else { - let ptr = cuda_malloc_unified(1)?; + let ptr = unsafe { cuda_malloc_unified(1)? }; Ok(UnifiedBox { ptr }) } } @@ -117,7 +117,7 @@ impl UnifiedBox { /// ``` pub unsafe fn from_raw(ptr: *mut T) -> Self { UnifiedBox { - ptr: UnifiedPointer::wrap(ptr), + ptr: unsafe { UnifiedPointer::wrap(ptr) }, } } @@ -420,9 +420,9 @@ impl UnifiedBuffer { /// ``` pub unsafe fn uninitialized(size: usize) -> CudaResult { let ptr = if size > 0 && mem::size_of::() > 0 { - cuda_malloc_unified(size)? + unsafe { cuda_malloc_unified(size)? } } else { - UnifiedPointer::wrap(ptr::NonNull::dangling().as_ptr()) + unsafe { UnifiedPointer::wrap(ptr::NonNull::dangling().as_ptr()) } }; Ok(UnifiedBuffer { buf: ptr, diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs index b7efd93c..124a67fd 100644 --- a/crates/cust/src/module.rs +++ b/crates/cust/src/module.rs @@ -1,6 +1,6 @@ //! Functions and types for working with CUDA modules. -use std::ffi::{c_void, CStr, CString}; +use std::ffi::{CStr, CString, c_void}; use std::fmt; use std::marker::PhantomData; use std::mem; @@ -63,7 +63,7 @@ pub enum ModuleJitOption { /// architecture. Fallback(JitFallback), /// Generates debug info in the compiled binary. - GenenerateDebugInfo(bool), + GenerateDebugInfo(bool), /// Generates line info in the compiled binary. GenerateLineInfo(bool), } @@ -103,13 +103,13 @@ impl ModuleJitOption { raw_opts.push(driver_sys::CUjit_option::CU_JIT_FALLBACK_STRATEGY); raw_vals.push(*fallback as usize as *mut c_void); } - Self::GenenerateDebugInfo(gen) => { + Self::GenerateDebugInfo(gen_) => { raw_opts.push(driver_sys::CUjit_option::CU_JIT_GENERATE_DEBUG_INFO); - raw_vals.push(*gen as usize as *mut c_void); + raw_vals.push(*gen_ as usize as *mut c_void); } - Self::GenerateLineInfo(gen) => { + Self::GenerateLineInfo(gen_) => { raw_opts.push(driver_sys::CUjit_option::CU_JIT_GENERATE_LINE_INFO); - raw_vals.push(*gen as usize as *mut c_void) + raw_vals.push(*gen_ as usize as *mut c_void) } } } @@ -231,13 +231,15 @@ impl Module { inner: ptr::null_mut(), }; let (mut options, mut option_values) = ModuleJitOption::into_raw(options); - driver_sys::cuModuleLoadDataEx( - &mut module.inner as *mut driver_sys::CUmodule, - image, - options.len() as c_uint, - options.as_mut_ptr(), - option_values.as_mut_ptr(), - ) + unsafe { + driver_sys::cuModuleLoadDataEx( + &mut module.inner as *mut driver_sys::CUmodule, + image, + options.len() as c_uint, + options.as_mut_ptr(), + option_values.as_mut_ptr(), + ) + } .to_result()?; Ok(module) } diff --git a/crates/cust/src/prelude.rs b/crates/cust/src/prelude.rs index 559a55ba..41758f3e 100644 --- a/crates/cust/src/prelude.rs +++ b/crates/cust/src/prelude.rs @@ -3,6 +3,7 @@ //! This allows the user to `use cust::prelude::*;` and have the most commonly-used types //! available quickly. +pub use crate::CudaFlags; pub use crate::context::{Context, ContextFlags}; pub use crate::device::Device; pub use crate::event::{Event, EventFlags, EventStatus}; @@ -15,4 +16,3 @@ pub use crate::memory::{ pub use crate::module::Module; pub use crate::stream::{Stream, StreamFlags}; pub use crate::util::*; -pub use crate::CudaFlags; diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs index 3ee5c5f1..2cbbab84 100644 --- a/crates/cust/src/stream.rs +++ b/crates/cust/src/stream.rs @@ -16,7 +16,7 @@ use std::panic; use std::ptr; use cust_raw::driver_sys; -use cust_raw::driver_sys::{cudaError_enum, CUstream}; +use cust_raw::driver_sys::{CUstream, cudaError_enum}; use crate::error::{CudaResult, DropResult, ToResult}; use crate::event::Event; @@ -271,19 +271,21 @@ impl Stream { let grid_size: GridSize = grid_size.into(); let block_size: BlockSize = block_size.into(); - driver_sys::cuLaunchKernel( - func.to_raw(), - grid_size.x, - grid_size.y, - grid_size.z, - block_size.x, - block_size.y, - block_size.z, - shared_mem_bytes, - self.inner, - args.as_ptr() as *mut _, - ptr::null_mut(), - ) + unsafe { + driver_sys::cuLaunchKernel( + func.to_raw(), + grid_size.x, + grid_size.y, + grid_size.z, + block_size.x, + block_size.y, + block_size.z, + shared_mem_bytes, + self.inner, + args.as_ptr() as *mut _, + ptr::null_mut(), + ) + } .to_result() } @@ -357,7 +359,7 @@ unsafe extern "C" fn callback_wrapper( { // Stop panics from unwinding across the FFI let _ = panic::catch_unwind(|| { - let callback: Box = Box::from_raw(callback as *mut T); + let callback: Box = unsafe { Box::from_raw(callback as *mut T) }; callback(status.to_result()); }); } diff --git a/crates/cust/src/surface.rs b/crates/cust/src/surface.rs index 181b9411..79c31a94 100644 --- a/crates/cust/src/surface.rs +++ b/crates/cust/src/surface.rs @@ -4,8 +4,8 @@ use std::{ }; use cust_raw::driver_sys::{ - cuSurfObjectCreate, cuSurfObjectDestroy, cuSurfObjectGetResourceDesc, CUsurfObject, - CUDA_RESOURCE_DESC, + CUDA_RESOURCE_DESC, CUsurfObject, cuSurfObjectCreate, cuSurfObjectDestroy, + cuSurfObjectGetResourceDesc, }; use crate::{ @@ -75,8 +75,10 @@ impl Surface { unsafe fn resource_desc(&mut self) -> CudaResult> { let raw = { let mut uninit = MaybeUninit::::uninit(); - cuSurfObjectGetResourceDesc(uninit.as_mut_ptr(), self.handle).to_result()?; - uninit.assume_init() + unsafe { + cuSurfObjectGetResourceDesc(uninit.as_mut_ptr(), self.handle).to_result()?; + uninit.assume_init() + } }; Ok(ManuallyDrop::new(ResourceDescriptor::from_raw(raw))) } diff --git a/crates/cust/src/texture.rs b/crates/cust/src/texture.rs index f8019b26..60a918dd 100644 --- a/crates/cust/src/texture.rs +++ b/crates/cust/src/texture.rs @@ -1,15 +1,15 @@ -use std::mem::transmute; use std::mem::ManuallyDrop; use std::mem::MaybeUninit; +use std::mem::transmute; use std::os::raw::c_ulonglong; use std::os::raw::{c_float, c_uint}; use std::ptr; use cust_raw::driver_sys; use cust_raw::driver_sys::{ - cuTexObjectCreate, cuTexObjectDestroy, cuTexObjectGetResourceDesc, - CUDA_RESOURCE_DESC_st__bindgen_ty_1, CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1, - CUresourcetype, CUtexObject, CUDA_RESOURCE_DESC, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC, + CUDA_RESOURCE_DESC, CUDA_RESOURCE_DESC_st__bindgen_ty_1, + CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC, + CUresourcetype, CUtexObject, cuTexObjectCreate, cuTexObjectDestroy, cuTexObjectGetResourceDesc, }; use crate::error::CudaResult; @@ -491,8 +491,10 @@ impl Texture { unsafe fn resource_desc(&mut self) -> CudaResult> { let raw = { let mut uninit = MaybeUninit::::uninit(); - cuTexObjectGetResourceDesc(uninit.as_mut_ptr(), self.handle).to_result()?; - uninit.assume_init() + unsafe { + cuTexObjectGetResourceDesc(uninit.as_mut_ptr(), self.handle).to_result()?; + uninit.assume_init() + } }; Ok(ManuallyDrop::new(ResourceDescriptor::from_raw(raw))) } diff --git a/crates/cust/src/util.rs b/crates/cust/src/util.rs index 160622b9..066bca45 100644 --- a/crates/cust/src/util.rs +++ b/crates/cust/src/util.rs @@ -1,8 +1,8 @@ use crate::{ error::CudaResult, memory::{ - array::{ArrayObject, ArrayPrimitive}, DeviceBox, DeviceCopy, UnifiedBuffer, + array::{ArrayObject, ArrayPrimitive}, }, prelude::DeviceBuffer, surface::Surface, diff --git a/crates/cust_core/Cargo.toml b/crates/cust_core/Cargo.toml index 60e2d17f..87826a56 100644 --- a/crates/cust_core/Cargo.toml +++ b/crates/cust_core/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "cust_core" version = "0.1.1" -edition = "2021" +edition = "2024" license = "MIT OR Apache-2.0" description = "Core library for cust that can be shared across CPU and GPU" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/cust_core/src/lib.rs b/crates/cust_core/src/lib.rs index 40d4819e..0afb8750 100644 --- a/crates/cust_core/src/lib.rs +++ b/crates/cust_core/src/lib.rs @@ -108,36 +108,36 @@ pub mod _hidden { { } unsafe impl< - A: DeviceCopy, - B: DeviceCopy, - C: DeviceCopy, - D: DeviceCopy, - E: DeviceCopy, - F: DeviceCopy, - > DeviceCopy for (A, B, C, D, E, F) + A: DeviceCopy, + B: DeviceCopy, + C: DeviceCopy, + D: DeviceCopy, + E: DeviceCopy, + F: DeviceCopy, + > DeviceCopy for (A, B, C, D, E, F) { } unsafe impl< - A: DeviceCopy, - B: DeviceCopy, - C: DeviceCopy, - D: DeviceCopy, - E: DeviceCopy, - F: DeviceCopy, - G: DeviceCopy, - > DeviceCopy for (A, B, C, D, E, F, G) + A: DeviceCopy, + B: DeviceCopy, + C: DeviceCopy, + D: DeviceCopy, + E: DeviceCopy, + F: DeviceCopy, + G: DeviceCopy, + > DeviceCopy for (A, B, C, D, E, F, G) { } unsafe impl< - A: DeviceCopy, - B: DeviceCopy, - C: DeviceCopy, - D: DeviceCopy, - E: DeviceCopy, - F: DeviceCopy, - G: DeviceCopy, - H: DeviceCopy, - > DeviceCopy for (A, B, C, D, E, F, G, H) + A: DeviceCopy, + B: DeviceCopy, + C: DeviceCopy, + D: DeviceCopy, + E: DeviceCopy, + F: DeviceCopy, + G: DeviceCopy, + H: DeviceCopy, + > DeviceCopy for (A, B, C, D, E, F, G, H) { } diff --git a/crates/cust_derive/Cargo.toml b/crates/cust_derive/Cargo.toml index 207f7207..9d21ba0c 100644 --- a/crates/cust_derive/Cargo.toml +++ b/crates/cust_derive/Cargo.toml @@ -2,7 +2,7 @@ name = "cust_derive" version = "0.2.0" authors = ["Brook Heisler ", "Riccardo D'Ambrosio "] -edition = "2018" +edition = "2024" license = "MIT OR Apache-2.0" description = "Macros for cust" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/cust_derive/src/lib.rs b/crates/cust_derive/src/lib.rs index ef9f7d21..5a3a4011 100644 --- a/crates/cust_derive/src/lib.rs +++ b/crates/cust_derive/src/lib.rs @@ -6,21 +6,21 @@ extern crate syn; use proc_macro2::{Ident, Span, TokenStream}; use syn::{ - parse_str, Data, DataEnum, DataStruct, DataUnion, DeriveInput, Field, Fields, Generics, - TypeParamBound, + Data, DataEnum, DataStruct, DataUnion, DeriveInput, Field, Fields, Generics, TypeParamBound, + parse_str, }; #[proc_macro_derive(DeviceCopyCore)] pub fn device_copy_core(input: BaseTokenStream) -> BaseTokenStream { let ast = syn::parse(input).unwrap(); - let gen = impl_device_copy(&ast, quote!(::cust_core::DeviceCopy)); - BaseTokenStream::from(gen) + let code = impl_device_copy(&ast, quote!(::cust_core::DeviceCopy)); + BaseTokenStream::from(code) } #[proc_macro_derive(DeviceCopy)] pub fn device_copy(input: BaseTokenStream) -> BaseTokenStream { let ast = syn::parse(input).unwrap(); - let gen = impl_device_copy(&ast, quote!(::cust::memory::DeviceCopy)); - BaseTokenStream::from(gen) + let code = impl_device_copy(&ast, quote!(::cust::memory::DeviceCopy)); + BaseTokenStream::from(code) } use proc_macro::TokenStream as BaseTokenStream; @@ -53,11 +53,11 @@ fn impl_device_copy(input: &DeriveInput, import: TokenStream) -> TokenStream { // Finally, generate the unsafe impl and the type-checking function. let generated_code = quote! { - unsafe impl#impl_generics #import for #input_type#type_generics #where_clause {} + unsafe impl #impl_generics #import for #input_type #type_generics #where_clause {} #[doc(hidden)] #[allow(all)] - fn #type_test_func_ident#impl_generics(value: &#input_type#type_generics) #where_clause { + fn #type_test_func_ident #impl_generics(value: &#input_type #type_generics) #where_clause { fn assert_impl() {} #check_types_code } diff --git a/crates/cust_raw/Cargo.toml b/crates/cust_raw/Cargo.toml index f5fe1f86..c930adce 100644 --- a/crates/cust_raw/Cargo.toml +++ b/crates/cust_raw/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "cust_raw" version = "0.11.3" -edition = "2021" +edition = "2024" license = "MIT OR Apache-2.0" description = "Low level bindings to the CUDA Driver API" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/cust_raw/build/main.rs b/crates/cust_raw/build/main.rs index 212d5510..0c0d1175 100644 --- a/crates/cust_raw/build/main.rs +++ b/crates/cust_raw/build/main.rs @@ -154,6 +154,7 @@ fn create_cuda_driver_bindings( .size_t_is_usize(true) .layout_tests(true) .must_use_type("CUresult") + .wrap_unsafe_ops(true) // The CUDA docs have lots of malformed Doxygen directives, e.g. // // \sa @@ -217,6 +218,7 @@ fn create_cuda_runtime_bindings( .size_t_is_usize(true) .layout_tests(true) .must_use_type("cudaError_t") + .wrap_unsafe_ops(true) // See the comment on `generate_comments` in `create_cuda_runtime_bindings`. .generate_comments(false) .generate() @@ -270,6 +272,7 @@ fn create_cublas_bindings(sdk: &cuda_sdk::CudaSdk, outdir: &path::Path, manifest .size_t_is_usize(true) .layout_tests(true) .must_use_type("cublasStatus_t") + .wrap_unsafe_ops(true) // See the comment on `generate_comments` in `create_cuda_runtime_bindings`. .generate_comments(false) .generate() @@ -312,6 +315,7 @@ fn create_nvptx_compiler_bindings( .size_t_is_usize(true) .layout_tests(true) .must_use_type("nvPTXCompileResult") + .wrap_unsafe_ops(true) // See the comment on `generate_comments` in `create_cuda_runtime_bindings`. .generate_comments(false) .generate() @@ -347,6 +351,7 @@ fn create_nvvm_bindings(sdk: &cuda_sdk::CudaSdk, outdir: &path::Path, manifest_d .size_t_is_usize(true) .layout_tests(true) .must_use_type("nvvmResult") + .wrap_unsafe_ops(true) // See the comment on `generate_comments` in `create_cuda_runtime_bindings`. .generate_comments(false) .generate() diff --git a/crates/gpu_rand/Cargo.toml b/crates/gpu_rand/Cargo.toml index f8ede974..6f2cee1b 100644 --- a/crates/gpu_rand/Cargo.toml +++ b/crates/gpu_rand/Cargo.toml @@ -3,7 +3,7 @@ name = "gpu_rand" version = "0.1.3" authors = ["The Rand Project Developers", "The Rust CUDA Project Developers"] license = "MIT OR Apache-2.0" -edition = "2021" +edition = "2024" description = "GPU-friendly random number generators for the Rust CUDA Project" repository = "https://github.com/Rust-GPU/rust-cuda" readme = "../../README.md" diff --git a/crates/gpu_rand/src/lib.rs b/crates/gpu_rand/src/lib.rs index 9ddeccaf..d8189cc7 100644 --- a/crates/gpu_rand/src/lib.rs +++ b/crates/gpu_rand/src/lib.rs @@ -5,7 +5,7 @@ //! assembly. However, it is supposed to also work on the CPU, allowing you to reuse the same random states across CPU and GPU. //! //! A lot of the initial code is taken from the [rust-random project](https://github.com/rust-random) and modified to make it able to -//! pass to the GPU, as well as cleaning up certain things and updating it to edition 2021. +//! pass to the GPU, as well as cleaning up certain things and updating it to edition 2024. //! The following generators are implemented: //! diff --git a/crates/gpu_rand/src/xoroshiro/mod.rs b/crates/gpu_rand/src/xoroshiro/mod.rs index 7fcda15f..79113942 100644 --- a/crates/gpu_rand/src/xoroshiro/mod.rs +++ b/crates/gpu_rand/src/xoroshiro/mod.rs @@ -75,11 +75,11 @@ mod xoshiro512starstar; pub use common::Seed512; pub use rand_core; pub use splitmix64::SplitMix64; +pub use xoroshiro64star::Xoroshiro64Star; +pub use xoroshiro64starstar::Xoroshiro64StarStar; pub use xoroshiro128plus::Xoroshiro128Plus; pub use xoroshiro128plusplus::Xoroshiro128PlusPlus; pub use xoroshiro128starstar::Xoroshiro128StarStar; -pub use xoroshiro64star::Xoroshiro64Star; -pub use xoroshiro64starstar::Xoroshiro64StarStar; pub use xoshiro128plus::Xoshiro128Plus; pub use xoshiro128plusplus::Xoshiro128PlusPlus; pub use xoshiro128starstar::Xoshiro128StarStar; diff --git a/crates/nvvm/Cargo.toml b/crates/nvvm/Cargo.toml index 22cf71d2..3498a286 100644 --- a/crates/nvvm/Cargo.toml +++ b/crates/nvvm/Cargo.toml @@ -2,7 +2,7 @@ name = "nvvm" version = "0.1.1" authors = ["Riccardo D'Ambrosio "] -edition = "2018" +edition = "2024" license = "MIT OR Apache-2.0" description = "High level bindings to libnvvm" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs index c6706ea0..9a460c6c 100644 --- a/crates/nvvm/src/lib.rs +++ b/crates/nvvm/src/lib.rs @@ -746,8 +746,8 @@ impl NvvmProgram { #[cfg(test)] mod tests { use super::*; - use std::str::FromStr; use NvvmArch::*; + use std::str::FromStr; #[test] fn nvvm_arch_capability_value() { @@ -836,7 +836,9 @@ mod tests { assert_eq!( Compute70.all_target_features(), - vec![Compute50, Compute52, Compute53, Compute60, Compute61, Compute62, Compute70] + vec![ + Compute50, Compute52, Compute53, Compute60, Compute61, Compute62, Compute70 + ] ); assert_eq!( diff --git a/crates/optix/examples/path_tracer/Cargo.toml b/crates/optix/examples/path_tracer/Cargo.toml index 4b1bf7aa..ed25bf73 100644 --- a/crates/optix/examples/path_tracer/Cargo.toml +++ b/crates/optix/examples/path_tracer/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "path-tracer" version = "0.1.0" -edition = "2018" +edition = "2024" [dependencies] glam = { version = "0.30", features = ["bytemuck", "cuda", "mint"] } diff --git a/crates/optix/examples/path_tracer/kernels/Cargo.toml b/crates/optix/examples/path_tracer/kernels/Cargo.toml index 1f3c34ea..87f93da1 100644 --- a/crates/optix/examples/path_tracer/kernels/Cargo.toml +++ b/crates/optix/examples/path_tracer/kernels/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "path-tracer-kernels" version = "0.1.0" -edition = "2018" +edition = "2024" [dependencies] cuda_std = { version = "0.2", path = "../../../../cuda_std" } diff --git a/crates/ptx/Cargo.toml b/crates/ptx/Cargo.toml index eff4da3c..9ce27d8d 100644 --- a/crates/ptx/Cargo.toml +++ b/crates/ptx/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "ptx" version = "0.1.0" -edition = "2018" +edition = "2024" license = "MIT OR Apache-2.0" description = "PTX parser and analyzer" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/ptx/src/lexer.rs b/crates/ptx/src/lexer.rs index 440e5b5f..354c1c26 100644 --- a/crates/ptx/src/lexer.rs +++ b/crates/ptx/src/lexer.rs @@ -107,10 +107,10 @@ impl<'src> Lexer<'src> { Some(Ok(match cur { AsciiChar::Percent => { let peek = self.peek(); - if let Some(peeked) = peek { - if is_ident_continue(peeked) { - return Some(Ok(self.opcode_or_ident())); - } + if let Some(peeked) = peek + && is_ident_continue(peeked) + { + return Some(Ok(self.opcode_or_ident())); } self.eat_and_ret_token(1, TokenKind::Modulo) } @@ -440,14 +440,14 @@ impl<'src> Lexer<'src> { let cur = self.cur; let ident = self.eat_until(|c, _| is_ident_continue(c)); // check if its an instruction - if ident.chars().all(|c| c.is_ascii_alphanumeric()) { - if let Ok(kind) = InstructionKind::from_str(ident.as_str()) { - *self.values.last_mut().unwrap() = Some(TokenValue::Instruction(kind)); - return Token { - kind: TokenKind::Instruction, - range: cur..self.cur, - }; - } + if ident.chars().all(|c| c.is_ascii_alphanumeric()) + && let Ok(kind) = InstructionKind::from_str(ident.as_str()) + { + *self.values.last_mut().unwrap() = Some(TokenValue::Instruction(kind)); + return Token { + kind: TokenKind::Instruction, + range: cur..self.cur, + }; } *self.values.last_mut().unwrap() = @@ -468,7 +468,7 @@ impl<'src> Lexer<'src> { return Ok(Token { kind: TokenKind::Dot, range: cur..self.cur, - }) + }); } Some(c) if !is_ident_continue(c) => { return Ok(Token { diff --git a/crates/ptx_compiler/Cargo.toml b/crates/ptx_compiler/Cargo.toml index fc6c2e28..dfc9c66e 100644 --- a/crates/ptx_compiler/Cargo.toml +++ b/crates/ptx_compiler/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "ptx_compiler" version = "0.1.1" -edition = "2021" +edition = "2024" license = "MIT OR Apache-2.0" description = "High level bindings to CUDA's ptx compilation APIs" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/crates/rustc_codegen_nvvm_macros/Cargo.toml b/crates/rustc_codegen_nvvm_macros/Cargo.toml index 9d78a324..2cac50cc 100644 --- a/crates/rustc_codegen_nvvm_macros/Cargo.toml +++ b/crates/rustc_codegen_nvvm_macros/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "rustc_codegen_nvvm_macros" version = "0.1.0" -edition = "2021" +edition = "2024" license = "MIT OR Apache-2.0" description = "Macros for rustc_codegen_nvvm" repository = "https://github.com/Rust-GPU/rust-cuda" diff --git a/tests/compiletests/Cargo.toml b/tests/compiletests/Cargo.toml index 989726d6..4c212d73 100644 --- a/tests/compiletests/Cargo.toml +++ b/tests/compiletests/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "compiletests" version = "0.1.0" -edition = "2021" +edition = "2024" [[bin]] name = "compiletests" diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs index b6888e95..8672854b 100644 --- a/tests/compiletests/src/main.rs +++ b/tests/compiletests/src/main.rs @@ -109,7 +109,7 @@ impl Runner { .iter() .map(|dir| format!("-L dependency={}", dir.display())) .fold(String::new(), |a, b| b + " " + &a), - "--edition 2021", + "--edition 2024", &*format!("--extern noprelude:core={}", deps.core.display()), &*format!( "--extern noprelude:compiler_builtins={}", @@ -458,7 +458,9 @@ fn setup_windows_dll_path(codegen_backend_path: &Path) { } else { format!("{dir_str}{separator}{existing_path}") }; - env::set_var(lib_path_var, new_path); + unsafe { + env::set_var(lib_path_var, new_path); + } } } @@ -477,24 +479,24 @@ fn setup_windows_dll_path(codegen_backend_path: &Path) { ]; for llvm_config in &llvm_config_paths { - if let Ok(output) = Command::new(llvm_config).arg("--bindir").output() { - if output.status.success() { - if let Ok(bindir) = String::from_utf8(output.stdout) { - let bindir = bindir.trim(); - let bindir_path = Path::new(bindir); - if bindir_path.exists() { - add_to_dylib_path(bindir_path); - // Also add the lib directory which might contain DLLs - if let Some(parent) = bindir_path.parent() { - let libdir = parent.join("lib"); - if libdir.exists() { - add_to_dylib_path(&libdir); - } + if let Ok(output) = Command::new(llvm_config).arg("--bindir").output() + && output.status.success() + { + if let Ok(bindir) = String::from_utf8(output.stdout) { + let bindir = bindir.trim(); + let bindir_path = Path::new(bindir); + if bindir_path.exists() { + add_to_dylib_path(bindir_path); + // Also add the lib directory which might contain DLLs + if let Some(parent) = bindir_path.parent() { + let libdir = parent.join("lib"); + if libdir.exists() { + add_to_dylib_path(&libdir); } } } - break; } + break; } } @@ -629,6 +631,8 @@ fn setup_cuda_environment() { format!("{new_paths}{separator}{existing_path}") }; - env::set_var(lib_path_var, new_lib_path); + unsafe { + env::set_var(lib_path_var, new_lib_path); + } } } diff --git a/tests/compiletests/ui/atomic/atomic_operations.rs b/tests/compiletests/ui/atomic/atomic_operations.rs index 48ae8d9c..812a0adb 100644 --- a/tests/compiletests/ui/atomic/atomic_operations.rs +++ b/tests/compiletests/ui/atomic/atomic_operations.rs @@ -17,7 +17,7 @@ pub unsafe fn test_cuda_atomic_floats() { // Block-scoped atomic float let block_atomic = BlockAtomicF32::new(1.5); - let _old = block_atomic.fetch_add(0.5, Ordering::Relaxed); + let _old = unsafe { block_atomic.fetch_add(0.5, Ordering::Relaxed) }; // System-scoped atomic float let system_atomic = SystemAtomicF32::new(0.0); @@ -29,7 +29,7 @@ pub unsafe fn test_cuda_atomic_floats() { // Test block-scoped f64 let block_f64 = BlockAtomicF64::new(2.718); - let _old = block_f64.fetch_sub(0.5, Ordering::Relaxed); + let _old = unsafe { block_f64.fetch_sub(0.5, Ordering::Relaxed) }; // Test bitwise operations on floats let _old = atomic_f32.fetch_and(3.14, Ordering::Relaxed); diff --git a/tests/compiletests/ui/core/ops/range_contains.rs b/tests/compiletests/ui/core/ops/range_contains.rs index f7f733b7..0a9c5b25 100644 --- a/tests/compiletests/ui/core/ops/range_contains.rs +++ b/tests/compiletests/ui/core/ops/range_contains.rs @@ -8,5 +8,7 @@ fn has_two_decimal_digits(x: u32) -> bool { #[kernel] pub unsafe fn main(i: u32, o: *mut u32) { - *o = has_two_decimal_digits(i) as u32; + unsafe { + *o = has_two_decimal_digits(i) as u32; + } } diff --git a/tests/compiletests/ui/dis/simple_add.rs b/tests/compiletests/ui/dis/simple_add.rs index 15b6b179..47f526cc 100644 --- a/tests/compiletests/ui/dis/simple_add.rs +++ b/tests/compiletests/ui/dis/simple_add.rs @@ -7,6 +7,8 @@ use cuda_std::kernel; #[kernel] pub unsafe fn simple_add_kernel(a: *const f32, b: *const f32, c: *mut f32) { - let sum = *a + *b; - *c = sum; + unsafe { + let sum = *a + *b; + *c = sum; + } } diff --git a/tests/compiletests/ui/dis/simple_add.stderr b/tests/compiletests/ui/dis/simple_add.stderr index 0386db78..5aa92e63 100644 --- a/tests/compiletests/ui/dis/simple_add.stderr +++ b/tests/compiletests/ui/dis/simple_add.stderr @@ -19,17 +19,17 @@ $L__tmp0: cvta.to.global.u64 %rd4, %rd3; cvta.to.global.u64 %rd5, %rd2; cvta.to.global.u64 %rd6, %rd1; - .loc 1 10 15 + .loc 1 11 19 ld.global.f32 %f1, [%rd6]; - .loc 1 10 20 + .loc 1 11 24 ld.global.f32 %f2, [%rd5]; - .loc 1 10 15 + .loc 1 11 19 add.f32 %f3, %f1, %f2; $L__tmp1: - .loc 1 11 5 + .loc 1 12 9 st.global.f32 [%rd4], %f3; $L__tmp2: - .loc 1 12 2 + .loc 1 14 2 ret; $L__tmp3: $L__func_end0: diff --git a/tests/compiletests/ui/glam/mat3_vec3_multiply.rs b/tests/compiletests/ui/glam/mat3_vec3_multiply.rs index adda6e82..de2e0d17 100644 --- a/tests/compiletests/ui/glam/mat3_vec3_multiply.rs +++ b/tests/compiletests/ui/glam/mat3_vec3_multiply.rs @@ -7,5 +7,7 @@ use cuda_std::kernel; #[kernel] pub unsafe fn mat3_vec3_multiply(input: glam::Mat3, output: *mut glam::Vec3) { let vector = input * glam::Vec3::new(1.0, 2.0, 3.0); - *output = vector; + unsafe { + *output = vector; + } } diff --git a/tests/compiletests/ui/glam/mat4_operations.rs b/tests/compiletests/ui/glam/mat4_operations.rs index e54d2d10..d15c5b28 100644 --- a/tests/compiletests/ui/glam/mat4_operations.rs +++ b/tests/compiletests/ui/glam/mat4_operations.rs @@ -16,15 +16,21 @@ pub unsafe fn mat4_transform_operations( ) { // Transform a 3D point (w=1 implied) let transformed_point = matrix.transform_point3(point); - *result_point = transformed_point; + unsafe { + *result_point = transformed_point; + } // Transform a 4D vector let transformed_vector = matrix * vector; - *result_vector = transformed_vector; + unsafe { + *result_vector = transformed_vector; + } // Calculate determinant let det = matrix.determinant(); - *result_determinant = det; + unsafe { + *result_determinant = det; + } } #[kernel] @@ -39,13 +45,19 @@ pub unsafe fn mat4_construction( ) { // Create translation matrix let trans_mat = Mat4::from_translation(translation); - *result_translation = trans_mat; + unsafe { + *result_translation = trans_mat; + } // Create scale matrix let scale_mat = Mat4::from_scale(scale); - *result_scale = scale_mat; + unsafe { + *result_scale = scale_mat; + } // Create rotation matrix let rot_mat = Mat4::from_axis_angle(axis, angle_radians); - *result_rotation = rot_mat; + unsafe { + *result_rotation = rot_mat; + } } diff --git a/tests/compiletests/ui/glam/vec3_operations.rs b/tests/compiletests/ui/glam/vec3_operations.rs index 9ab43513..2c33c87e 100644 --- a/tests/compiletests/ui/glam/vec3_operations.rs +++ b/tests/compiletests/ui/glam/vec3_operations.rs @@ -15,15 +15,21 @@ pub unsafe fn vec3_basic_ops( ) { // Vector addition let sum = a + b; - *result_add = sum; + unsafe { + *result_add = sum; + } // Dot product let dot = a.dot(b); - *result_dot = dot; + unsafe { + *result_dot = dot; + } // Cross product let cross = a.cross(b); - *result_cross = cross; + unsafe { + *result_cross = cross; + } } #[kernel] @@ -34,9 +40,13 @@ pub unsafe fn vec3_normalization( ) { // Get length let len = input.length(); - *result_length = len; + unsafe { + *result_length = len; + } // Normalize let normalized = input.normalize(); - *result_normalized = normalized; + unsafe { + *result_normalized = normalized; + } } diff --git a/tests/compiletests/ui/hello_world.rs b/tests/compiletests/ui/hello_world.rs index 82174c88..ae093134 100644 --- a/tests/compiletests/ui/hello_world.rs +++ b/tests/compiletests/ui/hello_world.rs @@ -4,5 +4,7 @@ use cuda_std::kernel; #[kernel] pub unsafe fn add_one(x: *mut f32) { - *x = *x + 1.0; + unsafe { + *x = *x + 1.0; + } } diff --git a/tests/compiletests/ui/lang/consts/constant_memory_overflow.rs b/tests/compiletests/ui/lang/consts/constant_memory_overflow.rs index ad38b46b..c3993952 100644 --- a/tests/compiletests/ui/lang/consts/constant_memory_overflow.rs +++ b/tests/compiletests/ui/lang/consts/constant_memory_overflow.rs @@ -17,5 +17,5 @@ static BIG_ARRAY_3: [u32; ARRAY_SIZE] = [333u32; ARRAY_SIZE]; #[kernel] pub unsafe fn test_kernel(out: *mut u32) { - *out = BIG_ARRAY_1[0] + BIG_ARRAY_2[0] + BIG_ARRAY_3[0]; + unsafe { *out = BIG_ARRAY_1[0] + BIG_ARRAY_2[0] + BIG_ARRAY_3[0] }; } diff --git a/tests/compiletests/ui/lang/consts/shallow-ref.rs b/tests/compiletests/ui/lang/consts/shallow-ref.rs index 0065d47f..7d9051c0 100644 --- a/tests/compiletests/ui/lang/consts/shallow-ref.rs +++ b/tests/compiletests/ui/lang/consts/shallow-ref.rs @@ -20,7 +20,9 @@ pub unsafe fn test_shallow_ref( bool_out: *mut u32, vec_out: *mut Vec2, ) { - *scalar_out = scalar_load(&123); - *bool_out = (vec_in == Vec2::ZERO) as u32; - *vec_out = ROT90.transpose() * vec_in; + unsafe { + *scalar_out = scalar_load(&123); + *bool_out = (vec_in == Vec2::ZERO) as u32; + *vec_out = ROT90.transpose() * vec_in; + } } diff --git a/tests/compiletests/ui/lang/core/array/init_array_i16.rs b/tests/compiletests/ui/lang/core/array/init_array_i16.rs index 61eb4b6b..ede3b2cd 100644 --- a/tests/compiletests/ui/lang/core/array/init_array_i16.rs +++ b/tests/compiletests/ui/lang/core/array/init_array_i16.rs @@ -6,5 +6,7 @@ use cuda_std::kernel; #[kernel] pub unsafe fn test_init_array_i16(o: *mut i16) { let array = [0i16; 4]; - *o = array[1]; + unsafe { + *o = array[1]; + } } diff --git a/tests/compiletests/ui/lang/core/array/init_array_i32.rs b/tests/compiletests/ui/lang/core/array/init_array_i32.rs index 4efb24f7..59c5850c 100644 --- a/tests/compiletests/ui/lang/core/array/init_array_i32.rs +++ b/tests/compiletests/ui/lang/core/array/init_array_i32.rs @@ -6,5 +6,7 @@ use cuda_std::kernel; #[kernel] pub unsafe fn test_init_array_i32(o: *mut i32) { let array = [0i32; 4]; - *o = array[1]; + unsafe { + *o = array[1]; + } } diff --git a/tests/compiletests/ui/lang/core/array/init_array_i64.rs b/tests/compiletests/ui/lang/core/array/init_array_i64.rs index 0c161014..7625cac5 100644 --- a/tests/compiletests/ui/lang/core/array/init_array_i64.rs +++ b/tests/compiletests/ui/lang/core/array/init_array_i64.rs @@ -6,5 +6,7 @@ use cuda_std::kernel; #[kernel] pub unsafe fn test_init_array_i64(o: *mut i64) { let array = [0i64; 4]; - *o = array[1]; + unsafe { + *o = array[1]; + } } diff --git a/tests/compiletests/ui/lang/core/array/init_array_i8.rs b/tests/compiletests/ui/lang/core/array/init_array_i8.rs index b39b274f..a443bb7f 100644 --- a/tests/compiletests/ui/lang/core/array/init_array_i8.rs +++ b/tests/compiletests/ui/lang/core/array/init_array_i8.rs @@ -6,5 +6,7 @@ use cuda_std::kernel; #[kernel] pub unsafe fn test_init_array_i8(o: *mut i8) { let array = [0i8; 4]; - *o = array[1]; + unsafe { + *o = array[1]; + } } diff --git a/tests/compiletests/ui/lang/core/intrinsics/log10.rs b/tests/compiletests/ui/lang/core/intrinsics/log10.rs index a133c7e8..f4f3c72c 100644 --- a/tests/compiletests/ui/lang/core/intrinsics/log10.rs +++ b/tests/compiletests/ui/lang/core/intrinsics/log10.rs @@ -10,5 +10,7 @@ use cuda_std::GpuFloat; #[kernel] pub unsafe fn test_log10(input: *const f32, output: *mut f32) { - *output = (*input).log10(); + unsafe { + *output = (*input).log10(); + } } diff --git a/tests/compiletests/ui/lang/core/unwrap_or.rs b/tests/compiletests/ui/lang/core/unwrap_or.rs index 1deb6858..a6b54c7f 100644 --- a/tests/compiletests/ui/lang/core/unwrap_or.rs +++ b/tests/compiletests/ui/lang/core/unwrap_or.rs @@ -8,5 +8,7 @@ use cuda_std::kernel; #[kernel] pub unsafe fn test_unwrap_or(out: *mut u32) { - *out = None.unwrap_or(15); + unsafe { + *out = None.unwrap_or(15); + } } diff --git a/tests/compiletests/ui/lang/f32/signum.rs b/tests/compiletests/ui/lang/f32/signum.rs index 09beaa23..8f7d6e02 100644 --- a/tests/compiletests/ui/lang/f32/signum.rs +++ b/tests/compiletests/ui/lang/f32/signum.rs @@ -5,5 +5,7 @@ use cuda_std::kernel; #[kernel] pub unsafe fn test_signum(i: f32, o: *mut f32) { - *o = i.signum(); + unsafe { + *o = i.signum(); + } } diff --git a/tests/compiletests/ui/lang/u32/bit_reverse.rs b/tests/compiletests/ui/lang/u32/bit_reverse.rs index f8a8379f..188319f7 100644 --- a/tests/compiletests/ui/lang/u32/bit_reverse.rs +++ b/tests/compiletests/ui/lang/u32/bit_reverse.rs @@ -6,40 +6,56 @@ use cuda_std::kernel; #[kernel] pub unsafe fn reverse_bits_u8(buffer: *const u8, out: *mut u8) { - *out = (*buffer).reverse_bits(); + unsafe { + *out = (*buffer).reverse_bits(); + } } #[kernel] pub unsafe fn reverse_bits_u16(buffer: *const u16, out: *mut u16) { - *out = (*buffer).reverse_bits(); + unsafe { + *out = (*buffer).reverse_bits(); + } } #[kernel] pub unsafe fn reverse_bits_u32(buffer: *const u32, out: *mut u32) { - *out = (*buffer).reverse_bits(); + unsafe { + *out = (*buffer).reverse_bits(); + } } #[kernel] pub unsafe fn reverse_bits_u64(buffer: *const u64, out: *mut u64) { - *out = (*buffer).reverse_bits(); + unsafe { + *out = (*buffer).reverse_bits(); + } } #[kernel] pub unsafe fn reverse_bits_i8(buffer: *const i8, out: *mut i8) { - *out = (*buffer).reverse_bits(); + unsafe { + *out = (*buffer).reverse_bits(); + } } #[kernel] pub unsafe fn reverse_bits_i16(buffer: *const i16, out: *mut i16) { - *out = (*buffer).reverse_bits(); + unsafe { + *out = (*buffer).reverse_bits(); + } } #[kernel] pub unsafe fn reverse_bits_i32(buffer: *const i32, out: *mut i32) { - *out = (*buffer).reverse_bits(); + unsafe { + *out = (*buffer).reverse_bits(); + } } #[kernel] pub unsafe fn reverse_bits_i64(buffer: *const i64, out: *mut i64) { - *out = (*buffer).reverse_bits(); + unsafe { + *out = (*buffer).reverse_bits(); + } } diff --git a/tests/compiletests/ui/lang/u32/count_ones.rs b/tests/compiletests/ui/lang/u32/count_ones.rs index cbb36fcc..310eb508 100644 --- a/tests/compiletests/ui/lang/u32/count_ones.rs +++ b/tests/compiletests/ui/lang/u32/count_ones.rs @@ -6,40 +6,56 @@ use cuda_std::kernel; #[kernel] pub unsafe fn count_ones_u8(buffer: *const u8, out: *mut u32) { - *out = (*buffer).count_ones(); + unsafe { + *out = (*buffer).count_ones(); + } } #[kernel] pub unsafe fn count_ones_u16(buffer: *const u16, out: *mut u32) { - *out = (*buffer).count_ones(); + unsafe { + *out = (*buffer).count_ones(); + } } #[kernel] pub unsafe fn count_ones_u32(buffer: *const u32, out: *mut u32) { - *out = (*buffer).count_ones(); + unsafe { + *out = (*buffer).count_ones(); + } } #[kernel] pub unsafe fn count_ones_u64(buffer: *const u64, out: *mut u32) { - *out = (*buffer).count_ones(); + unsafe { + *out = (*buffer).count_ones(); + } } #[kernel] pub unsafe fn count_ones_i8(buffer: *const i8, out: *mut u32) { - *out = (*buffer).count_ones(); + unsafe { + *out = (*buffer).count_ones(); + } } #[kernel] pub unsafe fn count_ones_i16(buffer: *const i16, out: *mut u32) { - *out = (*buffer).count_ones(); + unsafe { + *out = (*buffer).count_ones(); + } } #[kernel] pub unsafe fn count_ones_i32(buffer: *const i32, out: *mut u32) { - *out = (*buffer).count_ones(); + unsafe { + *out = (*buffer).count_ones(); + } } #[kernel] pub unsafe fn count_ones_i64(buffer: *const i64, out: *mut u32) { - *out = (*buffer).count_ones(); + unsafe { + *out = (*buffer).count_ones(); + } } diff --git a/tests/compiletests/ui/shared/shared_memory.rs b/tests/compiletests/ui/shared/shared_memory.rs index 545930be..a9ed3b7b 100644 --- a/tests/compiletests/ui/shared/shared_memory.rs +++ b/tests/compiletests/ui/shared/shared_memory.rs @@ -16,13 +16,13 @@ pub unsafe fn test_static_shared_memory() { let tid = thread::thread_idx_x() as usize; // Write to shared memory - SHARED_DATA[tid].write(tid as i32); + unsafe { SHARED_DATA[tid] }.write(tid as i32); // Synchronize threads before reading thread::sync_threads(); // Read from shared memory - let _value = SHARED_DATA[tid].assume_init(); + let _value = unsafe { SHARED_DATA[tid].assume_init() }; } #[kernel] diff --git a/tests/compiletests/ui/warp/warp_functions.rs b/tests/compiletests/ui/warp/warp_functions.rs index 32af2bec..2c0eea27 100644 --- a/tests/compiletests/ui/warp/warp_functions.rs +++ b/tests/compiletests/ui/warp/warp_functions.rs @@ -6,15 +6,17 @@ use cuda_std::warp; #[kernel] pub unsafe fn test_warp_functions() { - // Test lane ID function - let _lane = warp::lane_id(); + unsafe { + // Test lane ID function + let _lane = warp::lane_id(); - // Test active mask function - let _mask = warp::activemask(); + // Test active mask function + let _mask = warp::activemask(); - // Test warp sync with full mask - warp::sync_warp(0xFFFFFFFF); + // Test warp sync with full mask + warp::sync_warp(0xFFFFFFFF); - // Test warp sync with partial mask - warp::sync_warp(0x0000FFFF); + // Test warp sync with partial mask + warp::sync_warp(0x0000FFFF); + } } diff --git a/tests/compiletests/ui/warp/warp_shuffle.rs b/tests/compiletests/ui/warp/warp_shuffle.rs index dca08649..bb25f530 100644 --- a/tests/compiletests/ui/warp/warp_shuffle.rs +++ b/tests/compiletests/ui/warp/warp_shuffle.rs @@ -10,7 +10,7 @@ pub unsafe fn test_warp_shuffle_functions() { let width = 32_u32; // Full warp width // Test warp_shuffle_xor with various types - { + unsafe { // 8-bit types let val_i8: i8 = 42; let (res_i8, pred_i8) = warp::warp_shuffle_xor(mask, val_i8, 1, width); @@ -54,7 +54,7 @@ pub unsafe fn test_warp_shuffle_functions() { } // Test warp_shuffle_down with various types - { + unsafe { let delta = 1_u32; let val_i32: i32 = 42; @@ -77,7 +77,7 @@ pub unsafe fn test_warp_shuffle_functions() { } // Test warp_shuffle_up with various types - { + unsafe { let delta = 1_u32; let val_i32: i32 = 42; @@ -100,7 +100,7 @@ pub unsafe fn test_warp_shuffle_functions() { } // Test warp_shuffle_idx with various types - { + unsafe { let idx = 5_u32; let val_i32: i32 = 42; @@ -123,14 +123,14 @@ pub unsafe fn test_warp_shuffle_functions() { } // Test with different mask values - { + unsafe { let partial_mask = 0x0000FFFF_u32; // Lower 16 lanes let val: i32 = 123; let (res, pred) = warp::warp_shuffle_xor(partial_mask, val, 1, width); } // Test with different width values (must be power of 2 and <= 32) - { + unsafe { let val: i32 = 456; let lane_mask = 1_u32; @@ -149,7 +149,7 @@ pub unsafe fn test_warp_shuffle_functions() { // Test with half-precision floating point types (if available) #[cfg(feature = "half")] - { + unsafe { use half::{bf16, f16}; let val_f16 = f16::from_f32(1.5); @@ -166,31 +166,31 @@ pub unsafe fn test_warp_shuffle_edge_cases() { let mask = 0xFFFFFFFF_u32; // Test with lane_mask = 0 (should shuffle with same lane) - { + unsafe { let val: i32 = 999; let (res, pred) = warp::warp_shuffle_xor(mask, val, 0, 32); } // Test with maximum lane_mask - { + unsafe { let val: i32 = 888; let (res, pred) = warp::warp_shuffle_xor(mask, val, 31, 32); } // Test shuffle_down with delta = 0 - { + unsafe { let val: i32 = 777; let (res, pred) = warp::warp_shuffle_down(mask, val, 0, 32); } // Test shuffle_up with delta = 0 - { + unsafe { let val: i32 = 666; let (res, pred) = warp::warp_shuffle_up(mask, val, 0, 32); } // Test shuffle_idx with idx = 0 and idx = 31 - { + unsafe { let val: i32 = 555; let (res0, pred0) = warp::warp_shuffle_idx(mask, val, 0, 32); let (res31, pred31) = warp::warp_shuffle_idx(mask, val, 31, 32); @@ -204,7 +204,7 @@ pub unsafe fn test_warp_shuffle_practical() { let mask = 0xFFFFFFFF_u32; // Butterfly reduction pattern using XOR shuffle - { + unsafe { let mut val = lane_id as i32; // Stage 1: XOR with distance 16 @@ -229,19 +229,19 @@ pub unsafe fn test_warp_shuffle_practical() { } // Broadcast from lane 0 using shuffle_idx - { + unsafe { let my_val = lane_id * 10; let (broadcast_val, is_valid) = warp::warp_shuffle_idx(mask, my_val, 0, 32); } // Shift pattern using shuffle_down - { + unsafe { let my_val = lane_id as f32; let (shifted_val, is_valid) = warp::warp_shuffle_down(mask, my_val, 1, 32); } // Reverse shift using shuffle_up - { + unsafe { let my_val = (31 - lane_id) as f32; let (shifted_val, is_valid) = warp::warp_shuffle_up(mask, my_val, 1, 32); } diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 2e8fff28..7467646b 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "xtask" version = "0.0.0" -edition = "2021" +edition = "2024" license = "MIT" [dependencies]