diff --git a/crates/blastoff/Cargo.toml b/crates/blastoff/Cargo.toml
index 23340f1b..869bbfca 100644
--- a/crates/blastoff/Cargo.toml
+++ b/crates/blastoff/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "blastoff"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 authors = ["Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
 repository = "https://github.com/Rust-GPU/rust-cuda"
 
diff --git a/crates/blastoff/src/context.rs b/crates/blastoff/src/context.rs
index 83c53aa5..277e6be9 100644
--- a/crates/blastoff/src/context.rs
+++ b/crates/blastoff/src/context.rs
@@ -320,9 +320,11 @@ impl CublasContext {
     ///
     /// The callback must not panic and unwind.
     pub unsafe fn set_logger_callback(callback: Option<unsafe extern "C" fn(*const c_char)>) {
-        cublas_sys::cublasSetLoggerCallback(callback)
-            .to_result()
-            .unwrap();
+        unsafe {
+            cublas_sys::cublasSetLoggerCallback(callback)
+                .to_result()
+                .unwrap();
+        }
     }
 
     /// Gets the logger callback that was previously set.
diff --git a/crates/blastoff/src/error.rs b/crates/blastoff/src/error.rs
index b4b16420..35c0691f 100644
--- a/crates/blastoff/src/error.rs
+++ b/crates/blastoff/src/error.rs
@@ -38,8 +38,8 @@ pub trait ToResult {
 
 impl ToResult for cublas_sys::cublasStatus_t {
     fn to_result(self) -> Result<(), CublasError> {
-        use cust_raw::cublas_sys::cublasStatus_t::*;
         use CublasError::*;
+        use cust_raw::cublas_sys::cublasStatus_t::*;
 
         Err(match self {
             CUBLAS_STATUS_SUCCESS => return Ok(()),
@@ -58,8 +58,8 @@ impl ToResult for cublas_sys::cublasStatus_t {
 
 impl CublasError {
     pub fn into_raw(self) -> cublas_sys::cublasStatus_t {
-        use cust_raw::cublas_sys::cublasStatus_t::*;
         use CublasError::*;
+        use cust_raw::cublas_sys::cublasStatus_t::*;
 
         match self {
             NotInitialized => CUBLAS_STATUS_NOT_INITIALIZED,
diff --git a/crates/blastoff/src/level1.rs b/crates/blastoff/src/level1.rs
index 40a1bc87..01dfd643 100644
--- a/crates/blastoff/src/level1.rs
+++ b/crates/blastoff/src/level1.rs
@@ -1,10 +1,10 @@
 //! Scalar and Vector-based operations
 
 use crate::{
+    BlasDatatype, Float,
     context::CublasContext,
     error::{Error, ToResult},
     raw::{ComplexLevel1, FloatLevel1, Level1},
-    BlasDatatype, Float,
 };
 use cust::memory::{GpuBox, GpuBuffer};
 use cust::stream::Stream;
diff --git a/crates/blastoff/src/level3.rs b/crates/blastoff/src/level3.rs
index ce08e6c0..a8ff81de 100644
--- a/crates/blastoff/src/level3.rs
+++ b/crates/blastoff/src/level3.rs
@@ -1,8 +1,8 @@
 use crate::{
+    GemmDatatype, MatrixOp,
     context::CublasContext,
     error::{Error, ToResult},
     raw::GemmOps,
-    GemmDatatype, MatrixOp,
 };
 use cust::memory::{GpuBox, GpuBuffer};
 use cust::stream::Stream;
diff --git a/crates/blastoff/src/raw/level1.rs b/crates/blastoff/src/raw/level1.rs
index 7268d642..73103e74 100644
--- a/crates/blastoff/src/raw/level1.rs
+++ b/crates/blastoff/src/raw/level1.rs
@@ -103,7 +103,7 @@ impl Level1 for f32 {
         incx: c_int,
         result: *mut c_int,
     ) -> cublasStatus_t {
-        cublasIsamax(handle, n, x, incx, result)
+        unsafe { cublasIsamax(handle, n, x, incx, result) }
     }
     unsafe fn amin(
         handle: cublasHandle_t,
@@ -112,7 +112,7 @@ impl Level1 for f32 {
         incx: c_int,
         result: *mut c_int,
     ) -> cublasStatus_t {
-        cublasIsamin(handle, n, x, incx, result)
+        unsafe { cublasIsamin(handle, n, x, incx, result) }
     }
     unsafe fn axpy(
         handle: cublasHandle_t,
@@ -123,7 +123,7 @@ impl Level1 for f32 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasSaxpy(handle, n, alpha, x, incx, y, incy)
+        unsafe { cublasSaxpy(handle, n, alpha, x, incx, y, incy) }
     }
     unsafe fn copy(
         handle: cublasHandle_t,
@@ -133,7 +133,7 @@ impl Level1 for f32 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasScopy(handle, n, x, incx, y, incy)
+        unsafe { cublasScopy(handle, n, x, incx, y, incy) }
     }
     unsafe fn nrm2(
         handle: cublasHandle_t,
@@ -142,7 +142,7 @@ impl Level1 for f32 {
         incx: c_int,
         result: *mut Self::FloatTy,
     ) -> cublasStatus_t {
-        cublasSnrm2(handle, n, x, incx, result)
+        unsafe { cublasSnrm2(handle, n, x, incx, result) }
     }
     unsafe fn rot(
         handle: cublasHandle_t,
@@ -154,7 +154,7 @@ impl Level1 for f32 {
         c: *const Self::FloatTy,
         s: *const Self,
     ) -> cublasStatus_t {
-        cublasSrot(handle, n, x, incx, y, incy, c, s)
+        unsafe { cublasSrot(handle, n, x, incx, y, incy, c, s) }
     }
     unsafe fn rotg(
         handle: cublasHandle_t,
@@ -163,7 +163,7 @@ impl Level1 for f32 {
         c: *mut Self::FloatTy,
         s: *mut Self,
     ) -> cublasStatus_t {
-        cublasSrotg(handle, a, b, c, s)
+        unsafe { cublasSrotg(handle, a, b, c, s) }
     }
     unsafe fn rotm(
         handle: cublasHandle_t,
@@ -174,7 +174,7 @@ impl Level1 for f32 {
         incy: c_int,
         param: *const Self::FloatTy,
     ) -> cublasStatus_t {
-        cublasSrotm(handle, n, x, incx, y, incy, param)
+        unsafe { cublasSrotm(handle, n, x, incx, y, incy, param) }
     }
     unsafe fn rotmg(
         handle: cublasHandle_t,
@@ -184,7 +184,7 @@ impl Level1 for f32 {
         y1: *const Self,
         param: *mut Self,
     ) -> cublasStatus_t {
-        cublasSrotmg(handle, d1, d2, x1, y1, param)
+        unsafe { cublasSrotmg(handle, d1, d2, x1, y1, param) }
     }
     unsafe fn scal(
         handle: cublasHandle_t,
@@ -193,7 +193,7 @@ impl Level1 for f32 {
         x: *mut Self,
         incx: c_int,
     ) -> cublasStatus_t {
-        cublasSscal(handle, n, alpha, x, incx)
+        unsafe { cublasSscal(handle, n, alpha, x, incx) }
     }
     unsafe fn swap(
         handle: cublasHandle_t,
@@ -203,7 +203,7 @@ impl Level1 for f32 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasSswap(handle, n, x, incx, y, incy)
+        unsafe { cublasSswap(handle, n, x, incx, y, incy) }
     }
 }
 
@@ -215,7 +215,7 @@ impl Level1 for f64 {
         incx: c_int,
         result: *mut c_int,
     ) -> cublasStatus_t {
-        cublasIdamax(handle, n, x, incx, result)
+        unsafe { cublasIdamax(handle, n, x, incx, result) }
     }
     unsafe fn amin(
         handle: cublasHandle_t,
@@ -224,7 +224,7 @@ impl Level1 for f64 {
         incx: c_int,
         result: *mut c_int,
     ) -> cublasStatus_t {
-        cublasIdamin(handle, n, x, incx, result)
+        unsafe { cublasIdamin(handle, n, x, incx, result) }
     }
     unsafe fn axpy(
         handle: cublasHandle_t,
@@ -235,7 +235,7 @@ impl Level1 for f64 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasDaxpy(handle, n, alpha, x, incx, y, incy)
+        unsafe { cublasDaxpy(handle, n, alpha, x, incx, y, incy) }
     }
     unsafe fn copy(
         handle: cublasHandle_t,
@@ -245,7 +245,7 @@ impl Level1 for f64 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasDcopy(handle, n, x, incx, y, incy)
+        unsafe { cublasDcopy(handle, n, x, incx, y, incy) }
     }
     unsafe fn nrm2(
         handle: cublasHandle_t,
@@ -254,7 +254,7 @@ impl Level1 for f64 {
         incx: c_int,
         result: *mut Self::FloatTy,
     ) -> cublasStatus_t {
-        cublasDnrm2(handle, n, x, incx, result)
+        unsafe { cublasDnrm2(handle, n, x, incx, result) }
     }
     unsafe fn rot(
         handle: cublasHandle_t,
@@ -266,7 +266,7 @@ impl Level1 for f64 {
         c: *const Self::FloatTy,
         s: *const Self,
     ) -> cublasStatus_t {
-        cublasDrot(handle, n, x, incx, y, incy, c, s)
+        unsafe { cublasDrot(handle, n, x, incx, y, incy, c, s) }
     }
     unsafe fn rotg(
         handle: cublasHandle_t,
@@ -275,7 +275,7 @@ impl Level1 for f64 {
         c: *mut Self::FloatTy,
         s: *mut Self,
     ) -> cublasStatus_t {
-        cublasDrotg(handle, a, b, c, s)
+        unsafe { cublasDrotg(handle, a, b, c, s) }
     }
     unsafe fn rotm(
         handle: cublasHandle_t,
@@ -286,7 +286,7 @@ impl Level1 for f64 {
         incy: c_int,
         param: *const Self::FloatTy,
     ) -> cublasStatus_t {
-        cublasDrotm(handle, n, x, incx, y, incy, param)
+        unsafe { cublasDrotm(handle, n, x, incx, y, incy, param) }
     }
     unsafe fn rotmg(
         handle: cublasHandle_t,
@@ -296,7 +296,7 @@ impl Level1 for f64 {
         y1: *const Self,
         param: *mut Self,
     ) -> cublasStatus_t {
-        cublasDrotmg(handle, d1, d2, x1, y1, param)
+        unsafe { cublasDrotmg(handle, d1, d2, x1, y1, param) }
     }
     unsafe fn scal(
         handle: cublasHandle_t,
@@ -305,7 +305,7 @@ impl Level1 for f64 {
         x: *mut Self,
         incx: c_int,
     ) -> cublasStatus_t {
-        cublasDscal(handle, n, alpha, x, incx)
+        unsafe { cublasDscal(handle, n, alpha, x, incx) }
     }
     unsafe fn swap(
         handle: cublasHandle_t,
@@ -315,7 +315,7 @@ impl Level1 for f64 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasDswap(handle, n, x, incx, y, incy)
+        unsafe { cublasDswap(handle, n, x, incx, y, incy) }
     }
 }
 
@@ -327,7 +327,7 @@ impl Level1 for Complex32 {
         incx: c_int,
         result: *mut c_int,
     ) -> cublasStatus_t {
-        cublasIcamax(handle, n, x.cast(), incx, result)
+        unsafe { cublasIcamax(handle, n, x.cast(), incx, result) }
     }
     unsafe fn amin(
         handle: cublasHandle_t,
@@ -336,7 +336,7 @@ impl Level1 for Complex32 {
         incx: c_int,
         result: *mut c_int,
     ) -> cublasStatus_t {
-        cublasIcamin(handle, n, x.cast(), incx, result)
+        unsafe { cublasIcamin(handle, n, x.cast(), incx, result) }
     }
     unsafe fn axpy(
         handle: cublasHandle_t,
@@ -347,7 +347,7 @@ impl Level1 for Complex32 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasCaxpy(handle, n, alpha.cast(), x.cast(), incx, y.cast(), incy)
+        unsafe { cublasCaxpy(handle, n, alpha.cast(), x.cast(), incx, y.cast(), incy) }
     }
     unsafe fn copy(
         handle: cublasHandle_t,
@@ -357,7 +357,7 @@ impl Level1 for Complex32 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasCcopy(handle, n, x.cast(), incx, y.cast(), incy)
+        unsafe { cublasCcopy(handle, n, x.cast(), incx, y.cast(), incy) }
     }
     unsafe fn nrm2(
         handle: cublasHandle_t,
@@ -366,7 +366,7 @@ impl Level1 for Complex32 {
         incx: c_int,
         result: *mut Self::FloatTy,
     ) -> cublasStatus_t {
-        cublasScnrm2(handle, n, x.cast(), incx, result)
+        unsafe { cublasScnrm2(handle, n, x.cast(), incx, result) }
     }
     unsafe fn rot(
         handle: cublasHandle_t,
@@ -378,7 +378,7 @@ impl Level1 for Complex32 {
         c: *const Self::FloatTy,
         s: *const Self::FloatTy,
     ) -> cublasStatus_t {
-        cublasCsrot(handle, n, x.cast(), incx, y.cast(), incy, c, s)
+        unsafe { cublasCsrot(handle, n, x.cast(), incx, y.cast(), incy, c, s) }
     }
     unsafe fn rotg(
         handle: cublasHandle_t,
@@ -387,7 +387,7 @@ impl Level1 for Complex32 {
         c: *mut Self::FloatTy,
         s: *mut Self,
     ) -> cublasStatus_t {
-        cublasCrotg(handle, a.cast(), b.cast(), c, s.cast())
+        unsafe { cublasCrotg(handle, a.cast(), b.cast(), c, s.cast()) }
     }
     unsafe fn rotm(
         _handle: cublasHandle_t,
@@ -417,7 +417,7 @@ impl Level1 for Complex32 {
         x: *mut Self,
         incx: c_int,
     ) -> cublasStatus_t {
-        cublasCscal(handle, n, alpha.cast(), x.cast(), incx)
+        unsafe { cublasCscal(handle, n, alpha.cast(), x.cast(), incx) }
     }
     unsafe fn swap(
         handle: cublasHandle_t,
@@ -427,7 +427,7 @@ impl Level1 for Complex32 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasCswap(handle, n, x.cast(), incx, y.cast(), incy)
+        unsafe { cublasCswap(handle, n, x.cast(), incx, y.cast(), incy) }
     }
 }
 
@@ -439,7 +439,7 @@ impl Level1 for Complex64 {
         incx: c_int,
         result: *mut c_int,
     ) -> cublasStatus_t {
-        cublasIzamax(handle, n, x.cast(), incx, result)
+        unsafe { cublasIzamax(handle, n, x.cast(), incx, result) }
     }
     unsafe fn amin(
         handle: cublasHandle_t,
@@ -448,7 +448,7 @@ impl Level1 for Complex64 {
         incx: c_int,
         result: *mut c_int,
     ) -> cublasStatus_t {
-        cublasIzamin(handle, n, x.cast(), incx, result)
+        unsafe { cublasIzamin(handle, n, x.cast(), incx, result) }
     }
     unsafe fn axpy(
         handle: cublasHandle_t,
@@ -459,7 +459,7 @@ impl Level1 for Complex64 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasZaxpy(handle, n, alpha.cast(), x.cast(), incx, y.cast(), incy)
+        unsafe { cublasZaxpy(handle, n, alpha.cast(), x.cast(), incx, y.cast(), incy) }
     }
     unsafe fn copy(
         handle: cublasHandle_t,
@@ -469,7 +469,7 @@ impl Level1 for Complex64 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasZcopy(handle, n, x.cast(), incx, y.cast(), incy)
+        unsafe { cublasZcopy(handle, n, x.cast(), incx, y.cast(), incy) }
     }
     unsafe fn nrm2(
         handle: cublasHandle_t,
@@ -478,7 +478,7 @@ impl Level1 for Complex64 {
         incx: c_int,
         result: *mut Self::FloatTy,
     ) -> cublasStatus_t {
-        cublasDznrm2(handle, n, x.cast(), incx, result)
+        unsafe { cublasDznrm2(handle, n, x.cast(), incx, result) }
     }
     unsafe fn rot(
         handle: cublasHandle_t,
@@ -490,7 +490,7 @@ impl Level1 for Complex64 {
         c: *const Self::FloatTy,
         s: *const Self::FloatTy,
     ) -> cublasStatus_t {
-        cublasZdrot(handle, n, x.cast(), incx, y.cast(), incy, c, s)
+        unsafe { cublasZdrot(handle, n, x.cast(), incx, y.cast(), incy, c, s) }
     }
     unsafe fn rotg(
         handle: cublasHandle_t,
@@ -499,7 +499,7 @@ impl Level1 for Complex64 {
         c: *mut Self::FloatTy,
         s: *mut Self,
     ) -> cublasStatus_t {
-        cublasZrotg(handle, a.cast(), b.cast(), c, s.cast())
+        unsafe { cublasZrotg(handle, a.cast(), b.cast(), c, s.cast()) }
     }
     unsafe fn rotm(
         _handle: cublasHandle_t,
@@ -529,7 +529,7 @@ impl Level1 for Complex64 {
         x: *mut Self,
         incx: c_int,
     ) -> cublasStatus_t {
-        cublasZscal(handle, n, alpha.cast(), x.cast(), incx)
+        unsafe { cublasZscal(handle, n, alpha.cast(), x.cast(), incx) }
     }
     unsafe fn swap(
         handle: cublasHandle_t,
@@ -539,7 +539,7 @@ impl Level1 for Complex64 {
         y: *mut Self,
         incy: c_int,
     ) -> cublasStatus_t {
-        cublasZswap(handle, n, x.cast(), incx, y.cast(), incy)
+        unsafe { cublasZswap(handle, n, x.cast(), incx, y.cast(), incy) }
     }
 }
 
@@ -575,7 +575,7 @@ impl ComplexLevel1 for Complex32 {
         incy: c_int,
         result: *mut Self,
     ) -> cublasStatus_t {
-        cublasCdotu(handle, n, x.cast(), incx, y.cast(), incy, result.cast())
+        unsafe { cublasCdotu(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) }
     }
     unsafe fn dotc(
         handle: cublasHandle_t,
@@ -586,7 +586,7 @@ impl ComplexLevel1 for Complex32 {
         incy: c_int,
         result: *mut Self,
     ) -> cublasStatus_t {
-        cublasCdotc(handle, n, x.cast(), incx, y.cast(), incy, result.cast())
+        unsafe { cublasCdotc(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) }
     }
 }
 
@@ -600,7 +600,7 @@ impl ComplexLevel1 for Complex64 {
         incy: c_int,
         result: *mut Self,
     ) -> cublasStatus_t {
-        cublasZdotu(handle, n, x.cast(), incx, y.cast(), incy, result.cast())
+        unsafe { cublasZdotu(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) }
     }
     unsafe fn dotc(
         handle: cublasHandle_t,
@@ -611,7 +611,7 @@ impl ComplexLevel1 for Complex64 {
         incy: c_int,
         result: *mut Self,
     ) -> cublasStatus_t {
-        cublasZdotc(handle, n, x.cast(), incx, y.cast(), incy, result.cast())
+        unsafe { cublasZdotc(handle, n, x.cast(), incx, y.cast(), incy, result.cast()) }
     }
 }
 
@@ -638,7 +638,7 @@ impl FloatLevel1 for f32 {
         incy: c_int,
         result: *mut Self,
     ) -> cublasStatus_t {
-        cublasSdot(handle, n, x, incx, y, incy, result)
+        unsafe { cublasSdot(handle, n, x, incx, y, incy, result) }
     }
 }
 
@@ -652,6 +652,6 @@ impl FloatLevel1 for f64 {
         incy: c_int,
         result: *mut Self,
     ) -> cublasStatus_t {
-        cublasDdot(handle, n, x, incx, y, incy, result)
+        unsafe { cublasDdot(handle, n, x, incx, y, incy, result) }
     }
 }
diff --git a/crates/blastoff/src/raw/level3.rs b/crates/blastoff/src/raw/level3.rs
index 3e770a29..855a8558 100644
--- a/crates/blastoff/src/raw/level3.rs
+++ b/crates/blastoff/src/raw/level3.rs
@@ -44,7 +44,7 @@ impl GemmOps for half::f16 {
     ) -> cublasStatus_t {
         // for some weird reason cublas only defines Hgemm if __cplusplus is defined, no idea why
         // but for now we just link against it manually, in the future we should figure out why
-        extern "C" {
+        unsafe extern "C" {
             fn cublasHgemm(
                 handle: cublasHandle_t,
                 transa: cublasOperation_t,
@@ -62,9 +62,11 @@ impl GemmOps for half::f16 {
                 ldc: c_int,
             ) -> cublasStatus_t;
         }
-        cublasHgemm(
-            handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        )
+        unsafe {
+            cublasHgemm(
+                handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+            )
+        }
     }
 }
 
@@ -85,9 +87,11 @@ impl GemmOps for f32 {
         c: *mut Self,
         ldc: c_int,
     ) -> cublasStatus_t {
-        cublasSgemm(
-            handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        )
+        unsafe {
+            cublasSgemm(
+                handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+            )
+        }
     }
 }
 
@@ -108,9 +112,11 @@ impl GemmOps for f64 {
         c: *mut Self,
         ldc: c_int,
     ) -> cublasStatus_t {
-        cublasDgemm(
-            handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        )
+        unsafe {
+            cublasDgemm(
+                handle, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+            )
+        }
     }
 }
 
@@ -131,22 +137,24 @@ impl GemmOps for Complex32 {
         c: *mut Self,
         ldc: c_int,
     ) -> cublasStatus_t {
-        cublasCgemm(
-            handle,
-            transa,
-            transb,
-            m,
-            n,
-            k,
-            alpha.cast(),
-            a.cast(),
-            lda,
-            b.cast(),
-            ldb,
-            beta.cast(),
-            c.cast(),
-            ldc,
-        )
+        unsafe {
+            cublasCgemm(
+                handle,
+                transa,
+                transb,
+                m,
+                n,
+                k,
+                alpha.cast(),
+                a.cast(),
+                lda,
+                b.cast(),
+                ldb,
+                beta.cast(),
+                c.cast(),
+                ldc,
+            )
+        }
     }
 }
 
@@ -167,21 +175,23 @@ impl GemmOps for Complex64 {
         c: *mut Self,
         ldc: c_int,
     ) -> cublasStatus_t {
-        cublasCgemm(
-            handle,
-            transa,
-            transb,
-            m,
-            n,
-            k,
-            alpha.cast(),
-            a.cast(),
-            lda,
-            b.cast(),
-            ldb,
-            beta.cast(),
-            c.cast(),
-            ldc,
-        )
+        unsafe {
+            cublasCgemm(
+                handle,
+                transa,
+                transb,
+                m,
+                n,
+                k,
+                alpha.cast(),
+                a.cast(),
+                lda,
+                b.cast(),
+                ldb,
+                beta.cast(),
+                c.cast(),
+                ldc,
+            )
+        }
     }
 }
diff --git a/crates/cuda_builder/Cargo.toml b/crates/cuda_builder/Cargo.toml
index 51fb8828..fd0466ba 100644
--- a/crates/cuda_builder/Cargo.toml
+++ b/crates/cuda_builder/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cuda_builder"
 version = "0.3.0"
-edition = "2021"
+edition = "2024"
 authors = ["Riccardo D'Ambrosio <rdambrosio016@gmail.com>", "The rust-gpu Authors"]
 license = "MIT OR Apache-2.0"
 description = "Builder for easily building rustc_codegen_nvvm crates"
diff --git a/crates/cuda_builder/src/lib.rs b/crates/cuda_builder/src/lib.rs
index f54775c9..6441b827 100644
--- a/crates/cuda_builder/src/lib.rs
+++ b/crates/cuda_builder/src/lib.rs
@@ -469,12 +469,11 @@ fn find_in_dir(dir: &Path, filename: &str) -> Option<PathBuf> {
                 continue;
             }
 
-            if let Some(name) = path.file_name().and_then(|s| s.to_str()) {
-                if (name == filename)
-                    || (name.starts_with(&hashed_prefix) && name.ends_with(dll_suffix))
-                {
-                    return Some(path);
-                }
+            if let Some(name) = path.file_name().and_then(|s| s.to_str())
+                && (name == filename
+                    || (name.starts_with(&hashed_prefix) && name.ends_with(dll_suffix)))
+            {
+                return Some(path);
             }
         }
     }
@@ -580,12 +579,11 @@ fn workspace_root_dir() -> Option<PathBuf> {
 
     loop {
         let candidate = path.join("Cargo.toml");
-        if candidate.is_file() {
-            if let Ok(contents) = fs::read_to_string(&candidate) {
-                if contents.contains("[workspace]") {
-                    return Some(path.clone());
-                }
-            }
+        if candidate.is_file()
+            && let Ok(contents) = fs::read_to_string(&candidate)
+            && contents.contains("[workspace]")
+        {
+            return Some(path.clone());
         }
 
         if !path.pop() {
diff --git a/crates/cuda_std/Cargo.toml b/crates/cuda_std/Cargo.toml
index 42c06ab2..dfbfcb40 100644
--- a/crates/cuda_std/Cargo.toml
+++ b/crates/cuda_std/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cuda_std"
 version = "0.2.2"
-edition = "2018"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "Standard library for CUDA with rustc_codegen_nvvm"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/cuda_std/src/atomic/intrinsics.rs b/crates/cuda_std/src/atomic/intrinsics.rs
index 7a4fc2a8..bd0c012e 100644
--- a/crates/cuda_std/src/atomic/intrinsics.rs
+++ b/crates/cuda_std/src/atomic/intrinsics.rs
@@ -1256,15 +1256,9 @@ atomic_fetch_op_4_reg! {
 
 #[allow(unused_macros)]
 macro_rules! negation {
-    (u32, $val:ident) => {{
-        -($val as i32)
-    }};
-    (u64, $val:ident) => {{
-        -($val as i64)
-    }};
-    ($type:ty, $val:ident) => {{
-        -$val
-    }};
+    (u32, $val:ident) => {{ -($val as i32) }};
+    (u64, $val:ident) => {{ -($val as i64) }};
+    ($type:ty, $val:ident) => {{ -$val }};
 }
 
 // atomic sub is a little special, nvcc implements it as an atomic add with a negated operand. PTX
diff --git a/crates/cuda_std/src/float.rs b/crates/cuda_std/src/float.rs
index 805ab6d2..e29e41cd 100644
--- a/crates/cuda_std/src/float.rs
+++ b/crates/cuda_std/src/float.rs
@@ -225,11 +225,7 @@ impl GpuFloat for f32 {
     #[inline]
     fn rem_euclid(self, rhs: f32) -> f32 {
         let r = self % rhs;
-        if r < 0.0 {
-            r + rhs.abs()
-        } else {
-            r
-        }
+        if r < 0.0 { r + rhs.abs() } else { r }
     }
 
     /// Raises a number to an integer power.
@@ -557,11 +553,7 @@ impl GpuFloat for f64 {
     #[inline]
     fn rem_euclid(self, rhs: f64) -> f64 {
         let r = self % rhs;
-        if r < 0.0 {
-            r + rhs.abs()
-        } else {
-            r
-        }
+        if r < 0.0 { r + rhs.abs() } else { r }
     }
 
     /// Raises a number to an integer power.
diff --git a/crates/cuda_std/src/intrinsics.rs b/crates/cuda_std/src/intrinsics.rs
index ece3ebb2..1a1bc1b6 100644
--- a/crates/cuda_std/src/intrinsics.rs
+++ b/crates/cuda_std/src/intrinsics.rs
@@ -14,7 +14,7 @@
 
 // Generated file, do not edit by hand, see scripts/gen_intrinsics.py
 
-extern "C" {
+unsafe extern "C" {
     #[link_name = "__nv_abs"]
     #[doc = "Determine the absolute value of the 32-bit signed integer x.
 
diff --git a/crates/cuda_std/src/io.rs b/crates/cuda_std/src/io.rs
index 54ed81de..53569c30 100644
--- a/crates/cuda_std/src/io.rs
+++ b/crates/cuda_std/src/io.rs
@@ -21,7 +21,7 @@
 //! This does NOT include exiting the program, however, because rust uses RAII, unless you leak the
 //! context, output will always be flushed.
 
-extern "C" {
+unsafe extern "C" {
     // CUDA syscalls implicitly defined by nvvm you can link to.
 
     #[doc(hidden)]
diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
index 0cb671bf..06ad4cc5 100644
--- a/crates/cuda_std/src/lib.rs
+++ b/crates/cuda_std/src/lib.rs
@@ -125,7 +125,7 @@ fn panic(_info: &core::panic::PanicInfo) -> ! {
 
     // // crate::println!("{}", msg);
 
-    extern "C" {
+    unsafe extern "C" {
         fn __nvvm_trap() -> !;
     }
 
diff --git a/crates/cuda_std/src/mem.rs b/crates/cuda_std/src/mem.rs
index d1c9626b..249727ff 100644
--- a/crates/cuda_std/src/mem.rs
+++ b/crates/cuda_std/src/mem.rs
@@ -7,7 +7,7 @@ use alloc::alloc::*;
 use core::ffi::c_void;
 
 #[cfg(target_arch = "nvptx64")]
-extern "C" {
+unsafe extern "C" {
     // implicitly defined by cuda.
     pub fn malloc(size: usize) -> *mut c_void;
 
@@ -19,10 +19,12 @@ pub struct CUDAAllocator;
 #[cfg(target_arch = "nvptx64")]
 unsafe impl GlobalAlloc for CUDAAllocator {
     unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
-        malloc(layout.size()) as *mut u8
+        unsafe { malloc(layout.size()) as *mut u8 }
     }
     unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) {
-        free(ptr as *mut _);
+        unsafe {
+            free(ptr as *mut _);
+        }
     }
 }
 
diff --git a/crates/cuda_std/src/ptr.rs b/crates/cuda_std/src/ptr.rs
index 3d1cac60..5bb0eb44 100644
--- a/crates/cuda_std/src/ptr.rs
+++ b/crates/cuda_std/src/ptr.rs
@@ -28,19 +28,22 @@ pub enum AddressSpace {
 #[gpu_only]
 pub unsafe fn is_in_address_space<T>(ptr: *const T, address_space: AddressSpace) -> bool {
     let ret: u32;
-    // create a predicate register to store the result of the isspacep into.
-    asm!(".reg .pred p;");
+    unsafe {
+        // Create a predicate register to store the result of the isspacep into.
+        asm!(".reg .pred p;");
 
-    // perform the actual isspacep operation, and store the result in the predicate register we made.
-    match address_space {
-        AddressSpace::Global => asm!("isspacep.global p, {}", in(reg64) ptr),
-        AddressSpace::Shared => asm!("isspacep.shared p, {}", in(reg64) ptr),
-        AddressSpace::Constant => asm!("isspacep.const p, {}", in(reg64) ptr),
-        AddressSpace::Local => asm!("isspacep.local p, {}", in(reg64) ptr),
-    }
+        // Perform the actual isspacep operation, and store the result in the predicate register we
+        // made.
+        match address_space {
+            AddressSpace::Global => asm!("isspacep.global p, {}", in(reg64) ptr),
+            AddressSpace::Shared => asm!("isspacep.shared p, {}", in(reg64) ptr),
+            AddressSpace::Constant => asm!("isspacep.const p, {}", in(reg64) ptr),
+            AddressSpace::Local => asm!("isspacep.local p, {}", in(reg64) ptr),
+        }
 
-    // finally, use the predicate register to write out a value.
-    asm!("selp.u32 {}, 1, 0, p;", out(reg32) ret);
+        // Finally, use the predicate register to write out a value.
+        asm!("selp.u32 {}, 1, 0, p;", out(reg32) ret);
+    }
 
     ret != 0
 }
@@ -59,27 +62,29 @@ pub unsafe fn convert_generic_to_specific_address_space<T>(
 ) -> *const T {
     let ret: *const T;
 
-    match address_space {
-        AddressSpace::Global => asm!(
-            "cvta.to.global.u64 {}, {}",
-            out(reg64) ret,
-            in(reg64) ptr
-        ),
-        AddressSpace::Shared => asm!(
-            "cvta.to.shared.u64 {}, {}",
-            out(reg64) ret,
-            in(reg64) ptr
-        ),
-        AddressSpace::Constant => asm!(
-            "cvta.to.const.u64 {}, {}",
-            out(reg64) ret,
-            in(reg64) ptr
-        ),
-        AddressSpace::Local => asm!(
-            "cvta.to.local.u64 {}, {}",
-            out(reg64) ret,
-            in(reg64) ptr
-        ),
+    unsafe {
+        match address_space {
+            AddressSpace::Global => asm!(
+                "cvta.to.global.u64 {}, {}",
+                out(reg64) ret,
+                in(reg64) ptr
+            ),
+            AddressSpace::Shared => asm!(
+                "cvta.to.shared.u64 {}, {}",
+                out(reg64) ret,
+                in(reg64) ptr
+            ),
+            AddressSpace::Constant => asm!(
+                "cvta.to.const.u64 {}, {}",
+                out(reg64) ret,
+                in(reg64) ptr
+            ),
+            AddressSpace::Local => asm!(
+                "cvta.to.local.u64 {}, {}",
+                out(reg64) ret,
+                in(reg64) ptr
+            ),
+        }
     }
 
     ret
@@ -99,27 +104,29 @@ pub unsafe fn convert_specific_address_space_to_generic<T>(
 ) -> *const T {
     let ret: *const T;
 
-    match address_space {
-        AddressSpace::Global => asm!(
-            "cvta.global.u64 {}, {}",
-            out(reg64) ret,
-            in(reg64) ptr
-        ),
-        AddressSpace::Shared => asm!(
-            "cvta.shared.u64 {}, {}",
-            out(reg64) ret,
-            in(reg64) ptr
-        ),
-        AddressSpace::Constant => asm!(
-            "cvta.const.u64 {}, {}",
-            out(reg64) ret,
-            in(reg64) ptr
-        ),
-        AddressSpace::Local => asm!(
-            "cvta.local.u64 {}, {}",
-            out(reg64) ret,
-            in(reg64) ptr
-        ),
+    unsafe {
+        match address_space {
+            AddressSpace::Global => asm!(
+                "cvta.global.u64 {}, {}",
+                out(reg64) ret,
+                in(reg64) ptr
+            ),
+            AddressSpace::Shared => asm!(
+                "cvta.shared.u64 {}, {}",
+                out(reg64) ret,
+                in(reg64) ptr
+            ),
+            AddressSpace::Constant => asm!(
+                "cvta.const.u64 {}, {}",
+                out(reg64) ret,
+                in(reg64) ptr
+            ),
+            AddressSpace::Local => asm!(
+                "cvta.local.u64 {}, {}",
+                out(reg64) ret,
+                in(reg64) ptr
+            ),
+        }
     }
 
     ret
diff --git a/crates/cuda_std/src/shared.rs b/crates/cuda_std/src/shared.rs
index ddf4be5b..3bfab456 100644
--- a/crates/cuda_std/src/shared.rs
+++ b/crates/cuda_std/src/shared.rs
@@ -13,7 +13,7 @@ pub fn dynamic_shared_mem<T>() -> *mut T {
     // it is unclear whether an alignment of 16 is actually required for correctness, however,
     // it seems like nvcc always generates the global with .align 16 no matter the type, so we just copy
     // nvcc's behavior for now.
-    extern "C" {
+    unsafe extern "C" {
         // need to use nvvm_internal and not address_space because address_space only parses
         // static definitions, not extern static definitions.
         #[nvvm_internal::addrspace(3)]
diff --git a/crates/cuda_std/src/thread.rs b/crates/cuda_std/src/thread.rs
index 80d79f8b..8df6f1b8 100644
--- a/crates/cuda_std/src/thread.rs
+++ b/crates/cuda_std/src/thread.rs
@@ -69,7 +69,7 @@ use cuda_std_macros::gpu_only;
 use glam::{UVec2, UVec3};
 
 // different calling conventions dont exist in nvptx, so we just use C as a placeholder.
-extern "C" {
+unsafe extern "C" {
     // defined in libintrinsics.ll
     fn __nvvm_warp_size() -> u32;
 
@@ -286,7 +286,7 @@ pub fn sync_threads() {
 #[gpu_only]
 #[inline(always)]
 pub fn sync_threads_count(predicate: u32) -> u32 {
-    extern "C" {
+    unsafe extern "C" {
         #[link_name = "llvm.nvvm.barrier0.popc"]
         fn __nvvm_sync_threads_count(predicate: u32) -> u32;
     }
@@ -299,7 +299,7 @@ pub fn sync_threads_count(predicate: u32) -> u32 {
 #[gpu_only]
 #[inline(always)]
 pub fn sync_threads_and(predicate: u32) -> u32 {
-    extern "C" {
+    unsafe extern "C" {
         #[link_name = "llvm.nvvm.barrier0.and"]
         fn __nvvm_sync_threads_and(predicate: u32) -> u32;
     }
@@ -313,7 +313,7 @@ pub fn sync_threads_and(predicate: u32) -> u32 {
 #[gpu_only]
 #[inline(always)]
 pub fn sync_threads_or(predicate: u32) -> u32 {
-    extern "C" {
+    unsafe extern "C" {
         #[link_name = "llvm.nvvm.barrier0.or"]
         fn __nvvm_sync_threads_or(predicate: u32) -> u32;
     }
diff --git a/crates/cuda_std/src/warp.rs b/crates/cuda_std/src/warp.rs
index 146e2abb..94adec0a 100644
--- a/crates/cuda_std/src/warp.rs
+++ b/crates/cuda_std/src/warp.rs
@@ -21,12 +21,14 @@ use half::{bf16, f16};
 #[gpu_only]
 #[inline(always)]
 pub unsafe fn sync_warp(mask: u32) {
-    extern "C" {
+    unsafe extern "C" {
         #[link_name = "llvm.nvvm.bar.warp.sync"]
         fn sync(mask: u32);
     }
 
-    sync(mask);
+    unsafe {
+        sync(mask);
+    }
 }
 
 /// Returns the thread's lane within its warp. This value ranges from `0` to `WARP_SIZE - 1` (`WARP_SIZE` is 32 on all
@@ -72,7 +74,7 @@ pub fn activemask() -> u32 {
 #[gpu_only]
 #[inline(always)]
 pub unsafe fn warp_reduce<T: WarpReduceValue>(mask: u32, value: T, op: WarpReductionOp) -> T {
-    T::reduce(mask, value, op)
+    unsafe { T::reduce(mask, value, op) }
 }
 
 /// The type of operation to apply in a warp reduction.
@@ -98,7 +100,7 @@ macro_rules! impl_reduce {
             paste::paste! {
                 impl WarpReduceValue for $type {
                     unsafe fn reduce(mask: u32, value: Self, op: WarpReductionOp) -> Self {
-                        [<warp_reduce_ $type>](mask, value, op)
+                        unsafe { [<warp_reduce_ $type>](mask, value, op) }
                     }
                 }
             }
@@ -114,32 +116,34 @@ impl_reduce! {
 #[gpu_only]
 unsafe fn warp_reduce_32(mask: u32, value: u32, op: WarpReductionOp) -> u32 {
     let out;
-    match op {
-        WarpReductionOp::And => {
-            asm!(
-                "redux.sync.and.b32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
-        }
-        WarpReductionOp::Or => {
-            asm!(
-                "redux.sync.or.b32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
-        }
-        WarpReductionOp::Xor => {
-            asm!(
-                "redux.sync.xor.b32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
+    unsafe {
+        match op {
+            WarpReductionOp::And => {
+                asm!(
+                    "redux.sync.and.b32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            WarpReductionOp::Or => {
+                asm!(
+                    "redux.sync.or.b32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            WarpReductionOp::Xor => {
+                asm!(
+                    "redux.sync.xor.b32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            _ => unreachable!(),
         }
-        _ => unreachable!(),
     }
     out
 }
@@ -147,32 +151,34 @@ unsafe fn warp_reduce_32(mask: u32, value: u32, op: WarpReductionOp) -> u32 {
 #[gpu_only]
 unsafe fn warp_reduce_u32(mask: u32, value: u32, op: WarpReductionOp) -> u32 {
     let out;
-    match op {
-        WarpReductionOp::Add => {
-            asm!(
-                "redux.sync.add.u32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
-        }
-        WarpReductionOp::Min => {
-            asm!(
-                "redux.sync.min.u32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
-        }
-        WarpReductionOp::Max => {
-            asm!(
-                "redux.sync.max.u32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
+    unsafe {
+        match op {
+            WarpReductionOp::Add => {
+                asm!(
+                    "redux.sync.add.u32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            WarpReductionOp::Min => {
+                asm!(
+                    "redux.sync.min.u32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            WarpReductionOp::Max => {
+                asm!(
+                    "redux.sync.max.u32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            _ => out = warp_reduce_32(mask, value, op),
         }
-        _ => out = warp_reduce_32(mask, value, op),
     }
     out
 }
@@ -180,32 +186,34 @@ unsafe fn warp_reduce_u32(mask: u32, value: u32, op: WarpReductionOp) -> u32 {
 #[gpu_only]
 unsafe fn warp_reduce_i32(mask: u32, value: i32, op: WarpReductionOp) -> i32 {
     let out;
-    match op {
-        WarpReductionOp::Add => {
-            asm!(
-                "redux.sync.add.s32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
-        }
-        WarpReductionOp::Min => {
-            asm!(
-                "redux.sync.min.s32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
-        }
-        WarpReductionOp::Max => {
-            asm!(
-                "redux.sync.max.s32 {}, {}, {};",
-                out(reg32) out,
-                in(reg32) value,
-                in(reg32) mask
-            );
+    unsafe {
+        match op {
+            WarpReductionOp::Add => {
+                asm!(
+                    "redux.sync.add.s32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            WarpReductionOp::Min => {
+                asm!(
+                    "redux.sync.min.s32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            WarpReductionOp::Max => {
+                asm!(
+                    "redux.sync.max.s32 {}, {}, {};",
+                    out(reg32) out,
+                    in(reg32) value,
+                    in(reg32) mask
+                );
+            }
+            _ => out = warp_reduce_32(mask, value as u32, op) as i32,
         }
-        _ => out = warp_reduce_32(mask, value as u32, op) as i32,
     }
     out
 }
@@ -225,7 +233,7 @@ unsafe fn warp_reduce_i32(mask: u32, value: i32, op: WarpReductionOp) -> i32 {
 #[gpu_only]
 #[inline(always)]
 pub unsafe fn warp_match_any<T: WarpMatchValue>(mask: u32, value: T) -> u32 {
-    T::match_any(mask, value)
+    unsafe { T::match_any(mask, value) }
 }
 
 /// Synchronizes threads in a warp and performs a broadcast-and-compare operation between them.
@@ -243,7 +251,7 @@ pub unsafe fn warp_match_any<T: WarpMatchValue>(mask: u32, value: T) -> u32 {
 #[gpu_only]
 #[inline(always)]
 pub unsafe fn warp_match_all<T: WarpMatchValue>(mask: u32, value: T) -> Option<u32> {
-    T::match_all(mask, value)
+    unsafe { T::match_all(mask, value) }
 }
 
 /// A value that can be used inside of a warp match.
@@ -260,10 +268,10 @@ macro_rules! impl_match {
             paste::paste! {
                 impl WarpMatchValue for $type {
                     unsafe fn match_any(mask: u32, value: Self) -> u32 {
-                        [<match_any_ $width>](mask, value as [<u $width>])
+                        unsafe { [<match_any_ $width>](mask, value as [<u $width>]) }
                     }
                     unsafe fn match_all(mask: u32, value: Self) -> Option<u32> {
-                        let (val, pred) = [<match_all_ $width>](mask, value as [<u $width>]);
+                        let (val, pred) = unsafe { [<match_all_ $width>](mask, value as [<u $width>]) };
                         pred.then(|| val)
                     }
                 }
@@ -284,41 +292,41 @@ impl_match! {
 #[gpu_only]
 #[inline(always)]
 unsafe fn match_any_32(mask: u32, value: u32) -> u32 {
-    extern "C" {
+    unsafe extern "C" {
         #[link_name = "llvm.nvvm.match.any.sync.i32"]
         fn __nvvm_warp_match_any_32(mask: u32, value: u32) -> u32;
     }
-    __nvvm_warp_match_any_32(mask, value)
+    unsafe { __nvvm_warp_match_any_32(mask, value) }
 }
 
 #[gpu_only]
 #[inline(always)]
 unsafe fn match_any_64(mask: u32, value: u64) -> u32 {
-    extern "C" {
+    unsafe extern "C" {
         #[link_name = "llvm.nvvm.match.any.sync.i64"]
         fn __nvvm_warp_match_any_64(mask: u32, value: u64) -> u32;
     }
-    __nvvm_warp_match_any_64(mask, value)
+    unsafe { __nvvm_warp_match_any_64(mask, value) }
 }
 
 #[gpu_only]
 #[inline(always)]
 unsafe fn match_all_32(mask: u32, value: u32) -> (u32, bool) {
-    extern "C" {
+    unsafe extern "C" {
         #[allow(improper_ctypes)]
         fn __nvvm_warp_match_all_32(mask: u32, value: u32) -> (u32, bool);
     }
-    __nvvm_warp_match_all_32(mask, value)
+    unsafe { __nvvm_warp_match_all_32(mask, value) }
 }
 
 #[gpu_only]
 #[inline(always)]
 unsafe fn match_all_64(mask: u32, value: u64) -> (u32, bool) {
-    extern "C" {
+    unsafe extern "C" {
         #[allow(improper_ctypes)]
         fn __nvvm_warp_match_all_64(mask: u32, value: u64) -> (u32, bool);
     }
-    __nvvm_warp_match_all_64(mask, value)
+    unsafe { __nvvm_warp_match_all_64(mask, value) }
 }
 
 /// Synchronizes a subset of threads in a warp then performs a reduce-and-broadcast
@@ -337,19 +345,19 @@ unsafe fn match_all_64(mask: u32, value: u64) -> (u32, bool) {
 #[gpu_only]
 pub unsafe fn warp_vote_all(mask: u32, predicate: bool) -> bool {
     let mut out: u32;
-
-    asm!(
-        "{{",
-        ".reg .pred %p<3>;",
-        "setp.eq.u32 %p1, {}, 1;",
-        "vote.sync.all.pred %p2, %p1, {};",
-        "selp.u32 {}, 0, 1, %p2;",
-        "}}",
-        in(reg32) predicate as u32,
-        in(reg32) mask,
-        out(reg32) out
-    );
-
+    unsafe {
+        asm!(
+            "{{",
+            ".reg .pred %p<3>;",
+            "setp.eq.u32 %p1, {}, 1;",
+            "vote.sync.all.pred %p2, %p1, {};",
+            "selp.u32 {}, 0, 1, %p2;",
+            "}}",
+            in(reg32) predicate as u32,
+            in(reg32) mask,
+            out(reg32) out
+        );
+    }
     out != 0
 }
 
@@ -369,19 +377,19 @@ pub unsafe fn warp_vote_all(mask: u32, predicate: bool) -> bool {
 #[gpu_only]
 pub unsafe fn warp_vote_any(mask: u32, predicate: bool) -> bool {
     let mut out: u32;
-
-    asm!(
-        "{{",
-        ".reg .pred %p<3>;",
-        "setp.eq.u32 %p1, {}, 1;",
-        "vote.sync.any.pred %p2, %p1, {};",
-        "selp.u32 {}, 0, 1, %p2;",
-        "}}",
-        in(reg32) predicate as u32,
-        in(reg32) mask,
-        out(reg32) out
-    );
-
+    unsafe {
+        asm!(
+            "{{",
+            ".reg .pred %p<3>;",
+            "setp.eq.u32 %p1, {}, 1;",
+            "vote.sync.any.pred %p2, %p1, {};",
+            "selp.u32 {}, 0, 1, %p2;",
+            "}}",
+            in(reg32) predicate as u32,
+            in(reg32) mask,
+            out(reg32) out
+        );
+    }
     out != 0
 }
 
@@ -402,18 +410,18 @@ pub unsafe fn warp_vote_any(mask: u32, predicate: bool) -> bool {
 #[gpu_only]
 pub unsafe fn warp_vote_ballot(mask: u32, predicate: bool) -> u32 {
     let mut out: u32;
-
-    asm!(
-        "{{",
-        ".reg .pred %p1;",
-        "setp.eq.u32 %p1, {}, 1;",
-        "vote.sync.ballot.b32 {}, %p1, {};",
-        "}}",
-        in(reg32) predicate as u32,
-        out(reg32) out,
-        in(reg32) mask,
-    );
-
+    unsafe {
+        asm!(
+            "{{",
+            ".reg .pred %p1;",
+            "setp.eq.u32 %p1, {}, 1;",
+            "vote.sync.ballot.b32 {}, %p1, {};",
+            "}}",
+            in(reg32) predicate as u32,
+            out(reg32) out,
+            in(reg32) mask,
+        );
+    }
     out
 }
 
@@ -465,7 +473,7 @@ pub unsafe fn warp_shuffle_down<T: WarpShuffleValue>(
     delta: u32,
     width: u32,
 ) -> (T, bool) {
-    T::shuffle(WarpShuffleMode::Down, mask, value, delta, width)
+    unsafe { T::shuffle(WarpShuffleMode::Down, mask, value, delta, width) }
 }
 
 /// Waits for threads in a warp to reach this point and shuffles a value across the
@@ -516,7 +524,7 @@ pub unsafe fn warp_shuffle_up<T: WarpShuffleValue>(
     delta: u32,
     width: u32,
 ) -> (T, bool) {
-    T::shuffle(WarpShuffleMode::Up, mask, value, delta, width)
+    unsafe { T::shuffle(WarpShuffleMode::Up, mask, value, delta, width) }
 }
 
 /// Waits for threads in a warp to reach this point and shuffles a value across the
@@ -568,7 +576,7 @@ pub unsafe fn warp_shuffle_idx<T: WarpShuffleValue>(
     idx: u32,
     width: u32,
 ) -> (T, bool) {
-    T::shuffle(WarpShuffleMode::Idx, mask, value, idx, width)
+    unsafe { T::shuffle(WarpShuffleMode::Idx, mask, value, idx, width) }
 }
 
 /// Waits for threads in a warp to reach this point and shuffles a value across the
@@ -619,7 +627,7 @@ pub unsafe fn warp_shuffle_xor<T: WarpShuffleValue>(
     lane_mask: u32,
     width: u32,
 ) -> (T, bool) {
-    T::shuffle(WarpShuffleMode::Xor, mask, value, lane_mask, width)
+    unsafe { T::shuffle(WarpShuffleMode::Xor, mask, value, lane_mask, width) }
 }
 
 /// A value that can be used in a warp shuffle
@@ -647,7 +655,7 @@ macro_rules! impl_shuffle {
                         b: u32,
                         width: u32,
                     ) -> (Self, bool) {
-                        let (res, oob) = [<warp_shuffle_ $width>](mode, mask, value as [<u $width>], b, width);
+                        let (res, oob) = unsafe { [<warp_shuffle_ $width>](mode, mask, value as [<u $width>], b, width) };
                         (res as $type, oob)
                     }
                 }
@@ -679,7 +687,7 @@ impl WarpShuffleValue for f32 {
         b: u32,
         width: u32,
     ) -> (Self, bool) {
-        let (res, oob) = warp_shuffle_32(mode, mask, value.to_bits(), b, width);
+        let (res, oob) = unsafe { warp_shuffle_32(mode, mask, value.to_bits(), b, width) };
         (f32::from_bits(res), oob)
     }
 }
@@ -692,7 +700,7 @@ impl WarpShuffleValue for f64 {
         b: u32,
         width: u32,
     ) -> (Self, bool) {
-        let (res, oob) = warp_shuffle_64(mode, mask, value.to_bits(), b, width);
+        let (res, oob) = unsafe { warp_shuffle_64(mode, mask, value.to_bits(), b, width) };
         (f64::from_bits(res), oob)
     }
 }
@@ -705,7 +713,7 @@ impl WarpShuffleValue for f16 {
         b: u32,
         width: u32,
     ) -> (Self, bool) {
-        let (res, oob) = warp_shuffle_16(mode, mask, value.to_bits(), b, width);
+        let (res, oob) = unsafe { warp_shuffle_16(mode, mask, value.to_bits(), b, width) };
         (f16::from_bits(res), oob)
     }
 }
@@ -718,7 +726,7 @@ impl WarpShuffleValue for bf16 {
         b: u32,
         width: u32,
     ) -> (Self, bool) {
-        let (res, oob) = warp_shuffle_16(mode, mask, value.to_bits(), b, width);
+        let (res, oob) = unsafe { warp_shuffle_16(mode, mask, value.to_bits(), b, width) };
         (bf16::from_bits(res), oob)
     }
 }
@@ -751,7 +759,7 @@ unsafe fn warp_shuffle_32(
     b: u32,
     width: u32,
 ) -> (u32, bool) {
-    extern "C" {
+    unsafe extern "C" {
         // see libintrinsics.ll
         // Returns {i32, i8} in LLVM IR, which maps to our WarpShuffleResult struct
         fn __nvvm_warp_shuffle(mask: u32, mode: u32, a: u32, b: u32, c: u32) -> WarpShuffleResult;
@@ -767,7 +775,7 @@ unsafe fn warp_shuffle_32(
     c |= 0b11111;
     c |= (32 - width) << 8;
 
-    let result = __nvvm_warp_shuffle(mask, mode as u32, value, b, c);
+    let result = unsafe { __nvvm_warp_shuffle(mask, mode as u32, value, b, c) };
     (result.value, result.predicate != 0)
 }
 
@@ -782,8 +790,8 @@ unsafe fn warp_shuffle_128(
     let second_half = (value >> 64) as u64;
     // shuffle the first and second half of the value then recombine them
     // this will perform 4 shuffles in total (4 32-bit shuffles)
-    let (new_first_half, oob) = warp_shuffle_64(mode, mask, first_half, b, width);
-    let (new_second_half, _) = warp_shuffle_64(mode, mask, second_half, b, width);
+    let (new_first_half, oob) = unsafe { warp_shuffle_64(mode, mask, first_half, b, width) };
+    let (new_second_half, _) = unsafe { warp_shuffle_64(mode, mask, second_half, b, width) };
     (
         ((new_second_half as u128) << 64) | (new_first_half as u128),
         oob,
@@ -800,8 +808,8 @@ unsafe fn warp_shuffle_64(
     let first_half = value as u32;
     let second_half = (value >> 32) as u32;
     // shuffle the first and second half of the value then recombine them
-    let (new_first_half, oob) = warp_shuffle_32(mode, mask, first_half, b, width);
-    let (new_second_half, _) = warp_shuffle_32(mode, mask, second_half, b, width);
+    let (new_first_half, oob) = unsafe { warp_shuffle_32(mode, mask, first_half, b, width) };
+    let (new_second_half, _) = unsafe { warp_shuffle_32(mode, mask, second_half, b, width) };
     (
         ((new_second_half as u64) << 32) | (new_first_half as u64),
         oob,
@@ -815,7 +823,7 @@ unsafe fn warp_shuffle_16(
     b: u32,
     width: u32,
 ) -> (u16, bool) {
-    let (value, oob) = warp_shuffle_32(mode, mask, value as u32, b, width);
+    let (value, oob) = unsafe { warp_shuffle_32(mode, mask, value as u32, b, width) };
     ((value as u16), oob)
 }
 
@@ -826,6 +834,6 @@ unsafe fn warp_shuffle_8(
     b: u32,
     width: u32,
 ) -> (u8, bool) {
-    let (value, oob) = warp_shuffle_32(mode, mask, value as u32, b, width);
+    let (value, oob) = unsafe { warp_shuffle_32(mode, mask, value as u32, b, width) };
     ((value as u8), oob)
 }
diff --git a/crates/cuda_std_macros/Cargo.toml b/crates/cuda_std_macros/Cargo.toml
index 4557dcff..1fc98d11 100644
--- a/crates/cuda_std_macros/Cargo.toml
+++ b/crates/cuda_std_macros/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cuda_std_macros"
 version = "0.2.0"
-edition = "2018"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "Macros for cuda_std"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/cuda_std_macros/src/lib.rs b/crates/cuda_std_macros/src/lib.rs
index 26eb47d9..467a6319 100644
--- a/crates/cuda_std_macros/src/lib.rs
+++ b/crates/cuda_std_macros/src/lib.rs
@@ -1,9 +1,9 @@
 use proc_macro::TokenStream;
 use proc_macro2::Span;
-use quote::{quote_spanned, ToTokens};
+use quote::{ToTokens, quote_spanned};
 use syn::{
-    parse::Parse, parse_macro_input, parse_quote, punctuated::Punctuated, spanned::Spanned, Error,
-    FnArg, Ident, ItemFn, ReturnType, Stmt, Token,
+    Error, FnArg, Ident, ItemFn, ReturnType, Stmt, Token, parse::Parse, parse_macro_input,
+    parse_quote, punctuated::Punctuated, spanned::Spanned,
 };
 
 /// Registers a function as a gpu kernel.
@@ -25,7 +25,7 @@ pub fn kernel(input: proc_macro::TokenStream, item: proc_macro::TokenStream) ->
     let _ = parse_macro_input!(input as KernelHints);
     let input = parse_macro_input!(cloned as proc_macro2::TokenStream);
     let mut item = parse_macro_input!(item as ItemFn);
-    let no_mangle = parse_quote!(#[no_mangle]);
+    let no_mangle = parse_quote!(#[unsafe(no_mangle)]);
     item.attrs.push(no_mangle);
     let internal = parse_quote!(#[cfg_attr(target_arch="nvptx64", nvvm_internal::kernel(#input))]);
     item.attrs.push(internal);
diff --git a/crates/cudnn/Cargo.toml b/crates/cudnn/Cargo.toml
index 05de479f..99a8f388 100644
--- a/crates/cudnn/Cargo.toml
+++ b/crates/cudnn/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 authors = ["frjnn <f.iannelli.francesco229@gmail.com>"]
-edition = "2021"
+edition = "2024"
 name = "cudnn"
 version = "0.1.0"
 
diff --git a/crates/cudnn/src/activation/mod.rs b/crates/cudnn/src/activation/mod.rs
index c5193204..b51d2799 100644
--- a/crates/cudnn/src/activation/mod.rs
+++ b/crates/cudnn/src/activation/mod.rs
@@ -1,7 +1,7 @@
 use cust::memory::GpuBuffer;
 
 use crate::{
-    private, CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor,
+    CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor, private,
 };
 
 mod activation_descriptor;
diff --git a/crates/cudnn/src/attention/seq_data_descriptor.rs b/crates/cudnn/src/attention/seq_data_descriptor.rs
index 9c47e53f..4dd007e4 100644
--- a/crates/cudnn/src/attention/seq_data_descriptor.rs
+++ b/crates/cudnn/src/attention/seq_data_descriptor.rs
@@ -1,6 +1,6 @@
 use std::{marker::PhantomData, mem::MaybeUninit};
 
-use crate::{private, CudnnError, DataType, IntoResult, SeqDataAxis};
+use crate::{CudnnError, DataType, IntoResult, SeqDataAxis, private};
 
 /// Specifies the allowed types for the sequential data buffer.
 ///
diff --git a/crates/cudnn/src/backend/conv_bwd_data.rs b/crates/cudnn/src/backend/conv_bwd_data.rs
index 793e8659..cff56be3 100644
--- a/crates/cudnn/src/backend/conv_bwd_data.rs
+++ b/crates/cudnn/src/backend/conv_bwd_data.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor},
     CudnnError, DataType, IntoResult,
+    backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor},
 };
 
 pub struct ConvBwdDataBuilder {
diff --git a/crates/cudnn/src/backend/conv_bwd_filter.rs b/crates/cudnn/src/backend/conv_bwd_filter.rs
index 5ddce6f8..18b49617 100644
--- a/crates/cudnn/src/backend/conv_bwd_filter.rs
+++ b/crates/cudnn/src/backend/conv_bwd_filter.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor},
     CudnnError, DataType, IntoResult,
+    backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor},
 };
 
 pub struct ConvBwdFilterBuilder {
diff --git a/crates/cudnn/src/backend/conv_cfg.rs b/crates/cudnn/src/backend/conv_cfg.rs
index f89d99d1..cb5b5a6e 100644
--- a/crates/cudnn/src/backend/conv_cfg.rs
+++ b/crates/cudnn/src/backend/conv_cfg.rs
@@ -1,4 +1,4 @@
-use crate::{backend::Descriptor, ConvMode, CudnnError, DataType, IntoResult};
+use crate::{ConvMode, CudnnError, DataType, IntoResult, backend::Descriptor};
 
 #[derive(Default, Clone, PartialEq, Eq, Hash, Debug)]
 pub struct ConvCfgBuilder<'a> {
diff --git a/crates/cudnn/src/backend/conv_fwd.rs b/crates/cudnn/src/backend/conv_fwd.rs
index ffde1de2..873165ab 100644
--- a/crates/cudnn/src/backend/conv_fwd.rs
+++ b/crates/cudnn/src/backend/conv_fwd.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor},
     CudnnError, DataType, IntoResult,
+    backend::{ConvCfg, Descriptor, FloatDataType, Operation, Real, Tensor},
 };
 
 pub struct ConvFwdBuilder {
diff --git a/crates/cudnn/src/backend/engine.rs b/crates/cudnn/src/backend/engine.rs
index 9eb3d726..43e1c421 100644
--- a/crates/cudnn/src/backend/engine.rs
+++ b/crates/cudnn/src/backend/engine.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, Graph},
     CudnnError, IntoResult,
+    backend::{Descriptor, Graph},
 };
 
 #[derive(Default, Debug, PartialEq)]
diff --git a/crates/cudnn/src/backend/engine_cfg.rs b/crates/cudnn/src/backend/engine_cfg.rs
index 9a8b39a1..8531e12c 100644
--- a/crates/cudnn/src/backend/engine_cfg.rs
+++ b/crates/cudnn/src/backend/engine_cfg.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, Engine},
     CudnnError, IntoResult,
+    backend::{Descriptor, Engine},
 };
 
 #[derive(Default, PartialEq, Debug)]
diff --git a/crates/cudnn/src/backend/engine_heuristic.rs b/crates/cudnn/src/backend/engine_heuristic.rs
index ac7c4fe0..ebdda555 100644
--- a/crates/cudnn/src/backend/engine_heuristic.rs
+++ b/crates/cudnn/src/backend/engine_heuristic.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, EngineCfgBuilder, Graph},
     CudnnContext, CudnnError, IntoResult,
+    backend::{Descriptor, EngineCfgBuilder, Graph},
 };
 
 pub enum HeuristicMode {
diff --git a/crates/cudnn/src/backend/execution_plan.rs b/crates/cudnn/src/backend/execution_plan.rs
index 14f313cf..4dffc922 100644
--- a/crates/cudnn/src/backend/execution_plan.rs
+++ b/crates/cudnn/src/backend/execution_plan.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, EngineCfg},
     CudnnContext, CudnnError, IntoResult,
+    backend::{Descriptor, EngineCfg},
 };
 
 #[derive(Default, PartialEq, Debug)]
diff --git a/crates/cudnn/src/backend/graph.rs b/crates/cudnn/src/backend/graph.rs
index a30af78f..c070600c 100644
--- a/crates/cudnn/src/backend/graph.rs
+++ b/crates/cudnn/src/backend/graph.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, Operation},
     CudnnContext, CudnnError,
+    backend::{Descriptor, Operation},
 };
 
 #[derive(Default, PartialEq, Debug)]
diff --git a/crates/cudnn/src/backend/matmul.rs b/crates/cudnn/src/backend/matmul.rs
index 14badc8b..f4e55ef8 100644
--- a/crates/cudnn/src/backend/matmul.rs
+++ b/crates/cudnn/src/backend/matmul.rs
@@ -1,8 +1,8 @@
 use cust::memory::bytemuck::Contiguous;
 
 use crate::{
-    backend::{Descriptor, MatMulCfg, Operation, Tensor},
     CudnnError, DataType, IntoResult,
+    backend::{Descriptor, MatMulCfg, Operation, Tensor},
 };
 
 #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)]
diff --git a/crates/cudnn/src/backend/matmul_cfg.rs b/crates/cudnn/src/backend/matmul_cfg.rs
index 3fba7107..d8314d76 100644
--- a/crates/cudnn/src/backend/matmul_cfg.rs
+++ b/crates/cudnn/src/backend/matmul_cfg.rs
@@ -1,4 +1,4 @@
-use crate::{backend::Descriptor, CudnnError, DataType, IntoResult};
+use crate::{CudnnError, DataType, IntoResult, backend::Descriptor};
 
 #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)]
 pub struct MatMulCfgBuilder {
diff --git a/crates/cudnn/src/backend/pointwise.rs b/crates/cudnn/src/backend/pointwise.rs
index baa0347e..9ca3c60c 100644
--- a/crates/cudnn/src/backend/pointwise.rs
+++ b/crates/cudnn/src/backend/pointwise.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, FloatDataType, Operation, PointwiseCfg, PointwiseMode, Real, Tensor},
     CudnnError, DataType, IntoResult, NanPropagation,
+    backend::{Descriptor, FloatDataType, Operation, PointwiseCfg, PointwiseMode, Real, Tensor},
 };
 
 #[derive(Clone, Default, Debug, PartialEq)]
diff --git a/crates/cudnn/src/backend/pointwise_cfg.rs b/crates/cudnn/src/backend/pointwise_cfg.rs
index bae13e72..20a09475 100644
--- a/crates/cudnn/src/backend/pointwise_cfg.rs
+++ b/crates/cudnn/src/backend/pointwise_cfg.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, PointwiseMode},
     CudnnError, DataType, IntoResult, NanPropagation,
+    backend::{Descriptor, PointwiseMode},
 };
 
 #[derive(Clone, Default, PartialEq, Debug)]
diff --git a/crates/cudnn/src/backend/reduction.rs b/crates/cudnn/src/backend/reduction.rs
index 3a05b767..d5794192 100644
--- a/crates/cudnn/src/backend/reduction.rs
+++ b/crates/cudnn/src/backend/reduction.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, Operation, ReductionCfg, Tensor},
     CudnnError, IntoResult,
+    backend::{Descriptor, Operation, ReductionCfg, Tensor},
 };
 
 #[derive(Default, Clone, Debug, PartialEq, Eq, Hash)]
diff --git a/crates/cudnn/src/backend/reduction_cfg.rs b/crates/cudnn/src/backend/reduction_cfg.rs
index bb1689e3..292d1ecc 100644
--- a/crates/cudnn/src/backend/reduction_cfg.rs
+++ b/crates/cudnn/src/backend/reduction_cfg.rs
@@ -1,6 +1,6 @@
 use crate::{
-    backend::{Descriptor, ReductionMode},
     CudnnError, DataType, IntoResult,
+    backend::{Descriptor, ReductionMode},
 };
 
 #[derive(Clone, Default, PartialEq, Eq, Hash, Debug)]
diff --git a/crates/cudnn/src/backend/tensor.rs b/crates/cudnn/src/backend/tensor.rs
index 1b9c7e88..b1fcdacc 100644
--- a/crates/cudnn/src/backend/tensor.rs
+++ b/crates/cudnn/src/backend/tensor.rs
@@ -1,4 +1,4 @@
-use crate::{backend::Descriptor, CudnnError, DataType, IntoResult};
+use crate::{CudnnError, DataType, IntoResult, backend::Descriptor};
 
 #[derive(Clone, Default, Debug, PartialEq, Eq, Hash)]
 pub struct TensorBuilder<'a> {
diff --git a/crates/cudnn/src/convolution/convolution_config.rs b/crates/cudnn/src/convolution/convolution_config.rs
index c5186e53..ea3c2a37 100644
--- a/crates/cudnn/src/convolution/convolution_config.rs
+++ b/crates/cudnn/src/convolution/convolution_config.rs
@@ -1,4 +1,4 @@
-use crate::{private, DataType};
+use crate::{DataType, private};
 
 /// Supported data types configurations for convolution operations.
 ///
diff --git a/crates/cudnn/src/op/op_tensor_descriptor.rs b/crates/cudnn/src/op/op_tensor_descriptor.rs
index 11e1a89a..80819b7f 100644
--- a/crates/cudnn/src/op/op_tensor_descriptor.rs
+++ b/crates/cudnn/src/op/op_tensor_descriptor.rs
@@ -15,12 +15,12 @@ unsafe fn init_raw_op_descriptor<T: DataType>(
 ) -> Result<cudnn_sys::cudnnOpTensorDescriptor_t, CudnnError> {
     let mut raw = MaybeUninit::uninit();
 
-    cudnn_sys::cudnnCreateOpTensorDescriptor(raw.as_mut_ptr()).into_result()?;
-
-    let raw = raw.assume_init();
-
-    cudnn_sys::cudnnSetOpTensorDescriptor(raw, op, T::into_raw(), nan_opt).into_result()?;
-    Ok(raw)
+    unsafe {
+        cudnn_sys::cudnnCreateOpTensorDescriptor(raw.as_mut_ptr()).into_result()?;
+        let raw = raw.assume_init();
+        cudnn_sys::cudnnSetOpTensorDescriptor(raw, op, T::into_raw(), nan_opt).into_result()?;
+        Ok(raw)
+    }
 }
 
 /// The description of a unary Tensor Core operation.
diff --git a/crates/cudnn/src/pooling/mod.rs b/crates/cudnn/src/pooling/mod.rs
index 5953d2b7..2b46139c 100644
--- a/crates/cudnn/src/pooling/mod.rs
+++ b/crates/cudnn/src/pooling/mod.rs
@@ -1,7 +1,7 @@
 use cust::memory::GpuBuffer;
 
 use crate::{
-    private, CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor,
+    CudnnContext, CudnnError, DataType, IntoResult, ScalingDataType, TensorDescriptor, private,
 };
 
 mod pooling_descriptor;
diff --git a/crates/cudnn/src/rnn/rnn_data_descriptor.rs b/crates/cudnn/src/rnn/rnn_data_descriptor.rs
index 696f8d8b..4c46dfae 100644
--- a/crates/cudnn/src/rnn/rnn_data_descriptor.rs
+++ b/crates/cudnn/src/rnn/rnn_data_descriptor.rs
@@ -1,6 +1,6 @@
 use std::{marker::PhantomData, mem::MaybeUninit};
 
-use crate::{private, CudnnError, DataType, IntoResult, RnnDataLayout};
+use crate::{CudnnError, DataType, IntoResult, RnnDataLayout, private};
 
 /// Specifies the allowed types for the recurrent neural network inputs and outputs.
 ///
diff --git a/crates/cudnn/src/softmax/mod.rs b/crates/cudnn/src/softmax/mod.rs
index ed979fc6..1ae6c422 100644
--- a/crates/cudnn/src/softmax/mod.rs
+++ b/crates/cudnn/src/softmax/mod.rs
@@ -1,6 +1,6 @@
 use cust::memory::GpuBuffer;
 
-use crate::{private, CudnnContext, CudnnError, DataType, IntoResult, TensorDescriptor};
+use crate::{CudnnContext, CudnnError, DataType, IntoResult, TensorDescriptor, private};
 
 mod softmax_algo;
 mod softmax_mode;
diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
index 370fe331..3e59f386 100644
--- a/crates/cust/Cargo.toml
+++ b/crates/cust/Cargo.toml
@@ -6,7 +6,7 @@ authors = [
     "Riccardo D'Ambrosio <rdambrosio016@gmail.com>",
     "Brook Heisler <brookheisler@gmail.com>",
 ]
-edition = "2021"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to the CUDA Driver API"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/cust/src/context/legacy.rs b/crates/cust/src/context/legacy.rs
index 3e39ce47..f4d332f1 100644
--- a/crates/cust/src/context/legacy.rs
+++ b/crates/cust/src/context/legacy.rs
@@ -121,11 +121,11 @@ use std::ptr;
 use cust_raw::driver_sys;
 use cust_raw::driver_sys::CUcontext;
 
+use crate::CudaApiVersion;
 use crate::context::ContextHandle;
 use crate::device::Device;
 use crate::error::{CudaResult, DropResult, ToResult};
 use crate::private::Sealed;
-use crate::CudaApiVersion;
 
 /// This enumeration represents configuration settings for devices which share hardware resources
 /// between L1 cache and shared memory.
diff --git a/crates/cust/src/context/mod.rs b/crates/cust/src/context/mod.rs
index e52104f2..69fb2cdb 100644
--- a/crates/cust/src/context/mod.rs
+++ b/crates/cust/src/context/mod.rs
@@ -34,7 +34,7 @@
 //! with the [`legacy`] module.
 
 use std::{
-    mem::{self, transmute, MaybeUninit},
+    mem::{self, MaybeUninit, transmute},
     ptr,
 };
 
@@ -45,10 +45,10 @@ pub mod legacy;
 
 use crate::context::legacy::StreamPriorityRange;
 use crate::{
+    CudaApiVersion,
     device::Device,
     error::{CudaResult, DropResult, ToResult},
     private::Sealed,
-    CudaApiVersion,
 };
 
 pub trait ContextHandle: Sealed {
@@ -215,7 +215,7 @@ impl Context {
     /// Nothing else should be using the primary context for this device, otherwise,
     /// spurious errors or segfaults will occur.
     pub unsafe fn reset(device: &Device) -> CudaResult<()> {
-        driver_sys::cuDevicePrimaryCtxReset(device.as_raw()).to_result()
+        unsafe { driver_sys::cuDevicePrimaryCtxReset(device.as_raw()).to_result() }
     }
 
     /// Sets the flags for the device context, these flags will apply to any user of the primary
diff --git a/crates/cust/src/event.rs b/crates/cust/src/event.rs
index efa8130c..61bc0ff0 100644
--- a/crates/cust/src/event.rs
+++ b/crates/cust/src/event.rs
@@ -18,8 +18,8 @@ use std::ptr;
 use std::time::Duration;
 
 use cust_raw::driver_sys::{
-    cuEventCreate, cuEventDestroy, cuEventElapsedTime, cuEventQuery, cuEventRecord,
-    cuEventSynchronize, CUevent,
+    CUevent, cuEventCreate, cuEventDestroy, cuEventElapsedTime, cuEventQuery, cuEventRecord,
+    cuEventSynchronize,
 };
 
 use crate::error::{CudaError, CudaResult, DropResult, ToResult};
diff --git a/crates/cust/src/external.rs b/crates/cust/src/external.rs
index a634645f..44faa8a6 100644
--- a/crates/cust/src/external.rs
+++ b/crates/cust/src/external.rs
@@ -22,15 +22,17 @@ impl ExternalMemory {
 
         let mut memory: driver_sys::CUexternalMemory = std::ptr::null_mut();
 
-        driver_sys::cuImportExternalMemory(&mut memory, &desc)
-            .to_result()
-            .map(|_| ExternalMemory(memory))
+        unsafe {
+            driver_sys::cuImportExternalMemory(&mut memory, &desc)
+                .to_result()
+                .map(|_| ExternalMemory(memory))
+        }
     }
 
     #[allow(clippy::missing_safety_doc)]
     pub unsafe fn reimport(&mut self, fd: i32, size: usize) -> CudaResult<()> {
         // import new memory - this will call drop to destroy the old one
-        *self = ExternalMemory::import(fd, size)?;
+        *self = unsafe { ExternalMemory::import(fd, size)? };
 
         Ok(())
     }
diff --git a/crates/cust/src/function.rs b/crates/cust/src/function.rs
index b01e889e..67a5fb84 100644
--- a/crates/cust/src/function.rs
+++ b/crates/cust/src/function.rs
@@ -1,7 +1,7 @@
 //! Functions and types for working with CUDA kernels.
 
 use std::marker::PhantomData;
-use std::mem::{transmute, MaybeUninit};
+use std::mem::{MaybeUninit, transmute};
 
 use cust_raw::driver_sys;
 use cust_raw::driver_sys::CUfunction;
diff --git a/crates/cust/src/graph.rs b/crates/cust/src/graph.rs
index 3db77dd5..9f04329e 100644
--- a/crates/cust/src/graph.rs
+++ b/crates/cust/src/graph.rs
@@ -281,7 +281,10 @@ impl Graph {
                 .enumerate()
                 .position(|(cur_idx, x)| x == node && cur_idx != idx)
             {
-                panic!("Duplicate dependency found in call to `{}`, the first instance is at index {}, the second instance is at index {}", func_name, idx, pos);
+                panic!(
+                    "Duplicate dependency found in call to `{}`, the first instance is at index {}, the second instance is at index {}",
+                    func_name, idx, pos
+                );
             }
 
             assert!(
diff --git a/crates/cust/src/memory/array.rs b/crates/cust/src/memory/array.rs
index 40b5ba06..36525b70 100644
--- a/crates/cust/src/memory/array.rs
+++ b/crates/cust/src/memory/array.rs
@@ -5,19 +5,19 @@
 
 use std::ffi::c_void;
 use std::mem;
-use std::mem::zeroed;
 use std::mem::ManuallyDrop;
 use std::mem::MaybeUninit;
+use std::mem::zeroed;
 use std::os::raw::c_uint;
 use std::panic;
 use std::ptr::null;
 use std::ptr::null_mut;
 
 use cust_raw::driver_sys;
+use cust_raw::driver_sys::CUDA_MEMCPY2D;
 use cust_raw::driver_sys::cuMemcpy2D;
 use cust_raw::driver_sys::cuMemcpyAtoH;
 use cust_raw::driver_sys::cuMemcpyHtoA;
-use cust_raw::driver_sys::CUDA_MEMCPY2D;
 use cust_raw::driver_sys::{CUarray, CUarray_format, CUarray_format_enum};
 
 use crate::context::CurrentContext;
diff --git a/crates/cust/src/memory/device/device_box.rs b/crates/cust/src/memory/device/device_box.rs
index edccd3da..4b3ab825 100644
--- a/crates/cust/src/memory/device/device_box.rs
+++ b/crates/cust/src/memory/device/device_box.rs
@@ -5,11 +5,11 @@ use std::os::raw::c_void;
 use cust_raw::driver_sys;
 
 use crate::error::{CudaResult, DropResult, ToResult};
+use crate::memory::DevicePointer;
 use crate::memory::device::AsyncCopyDestination;
 use crate::memory::device::CopyDestination;
 use crate::memory::malloc::{cuda_free, cuda_malloc};
-use crate::memory::DevicePointer;
-use crate::memory::{cuda_free_async, cuda_malloc_async, DeviceCopy};
+use crate::memory::{DeviceCopy, cuda_free_async, cuda_malloc_async};
 use crate::stream::Stream;
 
 /// A pointer type for heap-allocation in CUDA device memory.
@@ -86,9 +86,11 @@ impl<T: DeviceCopy> DeviceBox<T> {
     /// # Ok(())
     /// # }
     pub unsafe fn new_async(val: &T, stream: &Stream) -> CudaResult<Self> {
-        let mut dev_box = DeviceBox::uninitialized_async(stream)?;
-        dev_box.async_copy_from(val, stream)?;
-        Ok(dev_box)
+        unsafe {
+            let mut dev_box = DeviceBox::uninitialized_async(stream)?;
+            dev_box.async_copy_from(val, stream)?;
+            Ok(dev_box)
+        }
     }
 
     /// Enqueues an operation to free the memory backed by this [`DeviceBox`] on a
@@ -204,14 +206,16 @@ impl<T: DeviceCopy + bytemuck::Zeroable> DeviceBox<T> {
     /// ```
     #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
     pub unsafe fn zeroed_async(stream: &Stream) -> CudaResult<Self> {
-        let new_box = DeviceBox::uninitialized_async(stream)?;
+        let new_box = unsafe { DeviceBox::uninitialized_async(stream)? };
         if mem::size_of::<T>() != 0 {
-            driver_sys::cuMemsetD8Async(
-                new_box.as_device_ptr().as_raw(),
-                0,
-                mem::size_of::<T>(),
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemsetD8Async(
+                    new_box.as_device_ptr().as_raw(),
+                    0,
+                    mem::size_of::<T>(),
+                    stream.as_inner(),
+                )
+            }
             .to_result()?;
         }
         Ok(new_box)
@@ -243,7 +247,7 @@ impl<T: DeviceCopy> DeviceBox<T> {
                 ptr: DevicePointer::null(),
             })
         } else {
-            let ptr = cuda_malloc(1)?;
+            let ptr = unsafe { cuda_malloc(1)? };
             Ok(DeviceBox { ptr })
         }
     }
@@ -266,7 +270,7 @@ impl<T: DeviceCopy> DeviceBox<T> {
                 ptr: DevicePointer::null(),
             })
         } else {
-            let ptr = cuda_malloc_async(stream, 1)?;
+            let ptr = unsafe { cuda_malloc_async(stream, 1)? };
             Ok(DeviceBox { ptr })
         }
     }
@@ -473,12 +477,14 @@ impl<T: DeviceCopy> AsyncCopyDestination<T> for DeviceBox<T> {
     unsafe fn async_copy_from(&mut self, val: &T, stream: &Stream) -> CudaResult<()> {
         let size = mem::size_of::<T>();
         if size != 0 {
-            driver_sys::cuMemcpyHtoDAsync(
-                self.ptr.as_raw(),
-                val as *const _ as *const c_void,
-                size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemcpyHtoDAsync(
+                    self.ptr.as_raw(),
+                    val as *const _ as *const c_void,
+                    size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?
         }
         Ok(())
@@ -487,12 +493,14 @@ impl<T: DeviceCopy> AsyncCopyDestination<T> for DeviceBox<T> {
     unsafe fn async_copy_to(&self, val: &mut T, stream: &Stream) -> CudaResult<()> {
         let size = mem::size_of::<T>();
         if size != 0 {
-            driver_sys::cuMemcpyDtoHAsync(
-                val as *mut _ as *mut c_void,
-                self.ptr.as_raw(),
-                size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemcpyDtoHAsync(
+                    val as *mut _ as *mut c_void,
+                    self.ptr.as_raw(),
+                    size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?
         }
         Ok(())
@@ -502,12 +510,14 @@ impl<T: DeviceCopy> AsyncCopyDestination<DeviceBox<T>> for DeviceBox<T> {
     unsafe fn async_copy_from(&mut self, val: &DeviceBox<T>, stream: &Stream) -> CudaResult<()> {
         let size = mem::size_of::<T>();
         if size != 0 {
-            driver_sys::cuMemcpyDtoDAsync(
-                self.ptr.as_raw(),
-                val.ptr.as_raw(),
-                size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemcpyDtoDAsync(
+                    self.ptr.as_raw(),
+                    val.ptr.as_raw(),
+                    size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?
         }
         Ok(())
@@ -516,12 +526,14 @@ impl<T: DeviceCopy> AsyncCopyDestination<DeviceBox<T>> for DeviceBox<T> {
     unsafe fn async_copy_to(&self, val: &mut DeviceBox<T>, stream: &Stream) -> CudaResult<()> {
         let size = mem::size_of::<T>();
         if size != 0 {
-            driver_sys::cuMemcpyDtoDAsync(
-                val.ptr.as_raw(),
-                self.ptr.as_raw(),
-                size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemcpyDtoDAsync(
+                    val.ptr.as_raw(),
+                    self.ptr.as_raw(),
+                    size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?
         }
         Ok(())
diff --git a/crates/cust/src/memory/device/device_buffer.rs b/crates/cust/src/memory/device/device_buffer.rs
index eae9fb4b..80a80e01 100644
--- a/crates/cust/src/memory/device/device_buffer.rs
+++ b/crates/cust/src/memory/device/device_buffer.rs
@@ -1,4 +1,4 @@
-use std::mem::{self, align_of, size_of, transmute, ManuallyDrop};
+use std::mem::{self, ManuallyDrop, align_of, size_of, transmute};
 use std::ops::{Deref, DerefMut};
 
 #[cfg(feature = "bytemuck")]
@@ -10,8 +10,8 @@ use cust_raw::driver_sys;
 use crate::error::{CudaResult, DropResult, ToResult};
 use crate::memory::device::{AsyncCopyDestination, CopyDestination, DeviceSlice};
 use crate::memory::malloc::{cuda_free, cuda_malloc};
-use crate::memory::{cuda_free_async, DevicePointer};
-use crate::memory::{cuda_malloc_async, DeviceCopy};
+use crate::memory::{DeviceCopy, cuda_malloc_async};
+use crate::memory::{DevicePointer, cuda_free_async};
 use crate::stream::Stream;
 
 /// Fixed-size device-side buffer. Provides basic access to device memory.
@@ -49,7 +49,7 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
     /// ```
     pub unsafe fn uninitialized(size: usize) -> CudaResult<Self> {
         let ptr = if size > 0 && size_of::<T>() > 0 {
-            cuda_malloc(size)?
+            unsafe { cuda_malloc(size)? }
         } else {
             // FIXME (AL): Do we /really/ want to allow creating an invalid buffer?
             DevicePointer::null()
@@ -74,7 +74,7 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
     /// You can synchronize the stream to ensure the memory allocation operation is complete.
     pub unsafe fn uninitialized_async(size: usize, stream: &Stream) -> CudaResult<Self> {
         let ptr = if size > 0 && size_of::<T>() > 0 {
-            cuda_malloc_async(stream, size)?
+            unsafe { cuda_malloc_async(stream, size)? }
         } else {
             DevicePointer::null()
         };
@@ -272,14 +272,16 @@ impl<T: DeviceCopy + Zeroable> DeviceBuffer<T> {
     /// ```
     #[cfg_attr(docsrs, doc(cfg(feature = "bytemuck")))]
     pub unsafe fn zeroed_async(size: usize, stream: &Stream) -> CudaResult<Self> {
-        let new_buf = DeviceBuffer::uninitialized_async(size, stream)?;
+        let new_buf = unsafe { DeviceBuffer::uninitialized_async(size, stream)? };
         if size_of::<T>() != 0 {
-            driver_sys::cuMemsetD8Async(
-                new_buf.as_device_ptr().as_raw(),
-                0,
-                size_of::<T>() * size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemsetD8Async(
+                    new_buf.as_device_ptr().as_raw(),
+                    0,
+                    size_of::<T>() * size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?;
         }
         Ok(new_buf)
@@ -394,9 +396,11 @@ impl<T: DeviceCopy> DeviceBuffer<T> {
     /// }
     /// ```
     pub unsafe fn from_slice_async(slice: &[T], stream: &Stream) -> CudaResult<Self> {
-        let mut uninit = DeviceBuffer::uninitialized_async(slice.len(), stream)?;
-        uninit.async_copy_from(slice, stream)?;
-        Ok(uninit)
+        unsafe {
+            let mut uninit = DeviceBuffer::uninitialized_async(slice.len(), stream)?;
+            uninit.async_copy_from(slice, stream)?;
+            Ok(uninit)
+        }
     }
 
     /// Explicitly creates a [`DeviceSlice`] from this buffer.
diff --git a/crates/cust/src/memory/device/device_slice.rs b/crates/cust/src/memory/device/device_slice.rs
index ff0db3dd..1c30b205 100644
--- a/crates/cust/src/memory/device/device_slice.rs
+++ b/crates/cust/src/memory/device/device_slice.rs
@@ -12,9 +12,9 @@ use bytemuck::{Pod, Zeroable};
 use cust_raw::driver_sys;
 
 use crate::error::{CudaResult, ToResult};
+use crate::memory::DevicePointer;
 use crate::memory::device::AsyncCopyDestination;
 use crate::memory::device::{CopyDestination, DeviceBuffer};
-use crate::memory::DevicePointer;
 use crate::memory::{DeviceCopy, DeviceMemory};
 use crate::stream::Stream;
 
@@ -204,7 +204,7 @@ impl<T: DeviceCopy> DeviceSlice<T> {
     /// ```
     #[allow(clippy::needless_pass_by_value)]
     pub unsafe fn from_raw_parts<'a>(ptr: DevicePointer<T>, len: usize) -> &'a DeviceSlice<T> {
-        &*(slice_from_raw_parts(ptr.as_ptr(), len) as *const DeviceSlice<T>)
+        unsafe { &*(slice_from_raw_parts(ptr.as_ptr(), len) as *const DeviceSlice<T>) }
     }
 
     /// Performs the same functionality as `from_raw_parts`, except that a
@@ -226,7 +226,7 @@ impl<T: DeviceCopy> DeviceSlice<T> {
         ptr: DevicePointer<T>,
         len: usize,
     ) -> &'a mut DeviceSlice<T> {
-        &mut *(slice_from_raw_parts_mut(ptr.as_mut_ptr(), len) as *mut DeviceSlice<T>)
+        unsafe { &mut *(slice_from_raw_parts_mut(ptr.as_mut_ptr(), len) as *mut DeviceSlice<T>) }
     }
 }
 
@@ -268,12 +268,14 @@ impl<T: DeviceCopy + Pod> DeviceSlice<T> {
             return Ok(());
         }
 
-        driver_sys::cuMemsetD8Async(
-            self.as_raw_ptr(),
-            value,
-            self.size_in_bytes(),
-            stream.as_inner(),
-        )
+        unsafe {
+            driver_sys::cuMemsetD8Async(
+                self.as_raw_ptr(),
+                value,
+                self.size_in_bytes(),
+                stream.as_inner(),
+            )
+        }
         .to_result()
     }
 
@@ -331,8 +333,10 @@ impl<T: DeviceCopy + Pod> DeviceSlice<T> {
             0,
             "Buffer pointer is not aligned to at least 2 bytes!"
         );
-        driver_sys::cuMemsetD16Async(self.as_raw_ptr(), value, data_len / 2, stream.as_inner())
-            .to_result()
+        unsafe {
+            driver_sys::cuMemsetD16Async(self.as_raw_ptr(), value, data_len / 2, stream.as_inner())
+        }
+        .to_result()
     }
 
     /// Sets the memory range of this buffer to contiguous `32-bit` values of `value`.
@@ -389,8 +393,10 @@ impl<T: DeviceCopy + Pod> DeviceSlice<T> {
             0,
             "Buffer pointer is not aligned to at least 4 bytes!"
         );
-        driver_sys::cuMemsetD32Async(self.as_raw_ptr(), value, data_len / 4, stream.as_inner())
-            .to_result()
+        unsafe {
+            driver_sys::cuMemsetD32Async(self.as_raw_ptr(), value, data_len / 4, stream.as_inner())
+        }
+        .to_result()
     }
 }
 
@@ -415,11 +421,13 @@ impl<T: DeviceCopy + Zeroable> DeviceSlice<T> {
     pub unsafe fn set_zero_async(&mut self, stream: &Stream) -> CudaResult<()> {
         // SAFETY: this is fine because Zeroable guarantees a zero byte-pattern is safe
         // for this type. And a slice of bytes can represent any type.
-        let erased = DeviceSlice::from_raw_parts_mut(
-            self.as_device_ptr().cast::<u8>(),
-            self.size_in_bytes(),
-        );
-        erased.set_8_async(0, stream)
+        unsafe {
+            let erased = DeviceSlice::from_raw_parts_mut(
+                self.as_device_ptr().cast::<u8>(),
+                self.size_in_bytes(),
+            );
+            erased.set_8_async(0, stream)
+        }
     }
 }
 
@@ -471,14 +479,14 @@ fn slice_end_index_overflow_fail() -> ! {
 
 impl<T: DeviceCopy> DeviceSliceIndex<T> for usize {
     unsafe fn get_unchecked(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
-        (self..self + 1).get_unchecked(slice)
+        unsafe { (self..self + 1).get_unchecked(slice) }
     }
     fn index(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
         slice.index(self..self + 1)
     }
 
     unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
-        (self..self + 1).get_unchecked_mut(slice)
+        unsafe { (self..self + 1).get_unchecked_mut(slice) }
     }
     fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
         slice.index_mut(self..self + 1)
@@ -487,7 +495,12 @@ impl<T: DeviceCopy> DeviceSliceIndex<T> for usize {
 
 impl<T: DeviceCopy> DeviceSliceIndex<T> for Range<usize> {
     unsafe fn get_unchecked(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
-        DeviceSlice::from_raw_parts(slice.as_device_ptr().add(self.start), self.end - self.start)
+        unsafe {
+            DeviceSlice::from_raw_parts(
+                slice.as_device_ptr().add(self.start),
+                self.end - self.start,
+            )
+        }
     }
     fn index(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
         if self.start > self.end {
@@ -500,10 +513,12 @@ impl<T: DeviceCopy> DeviceSliceIndex<T> for Range<usize> {
     }
 
     unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
-        DeviceSlice::from_raw_parts_mut(
-            slice.as_device_ptr().add(self.start),
-            self.end - self.start,
-        )
+        unsafe {
+            DeviceSlice::from_raw_parts_mut(
+                slice.as_device_ptr().add(self.start),
+                self.end - self.start,
+            )
+        }
     }
     fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
         if self.start > self.end {
@@ -518,14 +533,14 @@ impl<T: DeviceCopy> DeviceSliceIndex<T> for Range<usize> {
 
 impl<T: DeviceCopy> DeviceSliceIndex<T> for RangeTo<usize> {
     unsafe fn get_unchecked(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
-        (0..self.end).get_unchecked(slice)
+        unsafe { (0..self.end).get_unchecked(slice) }
     }
     fn index(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
         (0..self.end).index(slice)
     }
 
     unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
-        (0..self.end).get_unchecked_mut(slice)
+        unsafe { (0..self.end).get_unchecked_mut(slice) }
     }
     fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
         (0..self.end).index_mut(slice)
@@ -534,7 +549,7 @@ impl<T: DeviceCopy> DeviceSliceIndex<T> for RangeTo<usize> {
 
 impl<T: DeviceCopy> DeviceSliceIndex<T> for RangeFrom<usize> {
     unsafe fn get_unchecked(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
-        (self.start..slice.len()).get_unchecked(slice)
+        unsafe { (self.start..slice.len()).get_unchecked(slice) }
     }
     fn index(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
         if self.start > slice.len() {
@@ -545,7 +560,7 @@ impl<T: DeviceCopy> DeviceSliceIndex<T> for RangeFrom<usize> {
     }
 
     unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
-        (self.start..slice.len()).get_unchecked_mut(slice)
+        unsafe { (self.start..slice.len()).get_unchecked_mut(slice) }
     }
     fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
         if self.start > slice.len() {
@@ -584,7 +599,7 @@ fn into_slice_range(range: RangeInclusive<usize>) -> Range<usize> {
 
 impl<T: DeviceCopy> DeviceSliceIndex<T> for RangeInclusive<usize> {
     unsafe fn get_unchecked(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
-        into_slice_range(self).get_unchecked(slice)
+        unsafe { into_slice_range(self).get_unchecked(slice) }
     }
     fn index(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
         if *self.end() == usize::MAX {
@@ -594,7 +609,7 @@ impl<T: DeviceCopy> DeviceSliceIndex<T> for RangeInclusive<usize> {
     }
 
     unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
-        into_slice_range(self).get_unchecked_mut(slice)
+        unsafe { into_slice_range(self).get_unchecked_mut(slice) }
     }
     fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
         if *self.end() == usize::MAX {
@@ -606,14 +621,14 @@ impl<T: DeviceCopy> DeviceSliceIndex<T> for RangeInclusive<usize> {
 
 impl<T: DeviceCopy> DeviceSliceIndex<T> for RangeToInclusive<usize> {
     unsafe fn get_unchecked(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
-        (0..=self.end).get_unchecked(slice)
+        unsafe { (0..=self.end).get_unchecked(slice) }
     }
     fn index(self, slice: &DeviceSlice<T>) -> &DeviceSlice<T> {
         (0..=self.end).index(slice)
     }
 
     unsafe fn get_unchecked_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
-        (0..=self.end).get_unchecked_mut(slice)
+        unsafe { (0..=self.end).get_unchecked_mut(slice) }
     }
     fn index_mut(self, slice: &mut DeviceSlice<T>) -> &mut DeviceSlice<T> {
         (0..=self.end).index_mut(slice)
@@ -717,12 +732,14 @@ impl<T: DeviceCopy, I: AsRef<[T]> + AsMut<[T]> + ?Sized> AsyncCopyDestination<I>
         );
         let size = self.size_in_bytes();
         if size != 0 {
-            driver_sys::cuMemcpyHtoDAsync(
-                self.as_raw_ptr(),
-                val.as_ptr() as *const c_void,
-                size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemcpyHtoDAsync(
+                    self.as_raw_ptr(),
+                    val.as_ptr() as *const c_void,
+                    size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?
         }
         Ok(())
@@ -736,12 +753,14 @@ impl<T: DeviceCopy, I: AsRef<[T]> + AsMut<[T]> + ?Sized> AsyncCopyDestination<I>
         );
         let size = self.size_in_bytes();
         if size != 0 {
-            driver_sys::cuMemcpyDtoHAsync(
-                val.as_mut_ptr() as *mut c_void,
-                self.as_raw_ptr(),
-                size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemcpyDtoHAsync(
+                    val.as_mut_ptr() as *mut c_void,
+                    self.as_raw_ptr(),
+                    size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?
         }
         Ok(())
@@ -755,12 +774,14 @@ impl<T: DeviceCopy> AsyncCopyDestination<DeviceSlice<T>> for DeviceSlice<T> {
         );
         let size = self.size_in_bytes();
         if size != 0 {
-            driver_sys::cuMemcpyDtoDAsync(
-                self.as_raw_ptr(),
-                val.as_raw_ptr(),
-                size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemcpyDtoDAsync(
+                    self.as_raw_ptr(),
+                    val.as_raw_ptr(),
+                    size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?
         }
         Ok(())
@@ -773,12 +794,14 @@ impl<T: DeviceCopy> AsyncCopyDestination<DeviceSlice<T>> for DeviceSlice<T> {
         );
         let size = self.size_in_bytes();
         if size != 0 {
-            driver_sys::cuMemcpyDtoDAsync(
-                val.as_raw_ptr(),
-                self.as_raw_ptr(),
-                size,
-                stream.as_inner(),
-            )
+            unsafe {
+                driver_sys::cuMemcpyDtoDAsync(
+                    val.as_raw_ptr(),
+                    self.as_raw_ptr(),
+                    size,
+                    stream.as_inner(),
+                )
+            }
             .to_result()?
         }
         Ok(())
@@ -786,10 +809,10 @@ impl<T: DeviceCopy> AsyncCopyDestination<DeviceSlice<T>> for DeviceSlice<T> {
 }
 impl<T: DeviceCopy> AsyncCopyDestination<DeviceBuffer<T>> for DeviceSlice<T> {
     unsafe fn async_copy_from(&mut self, val: &DeviceBuffer<T>, stream: &Stream) -> CudaResult<()> {
-        self.async_copy_from(val as &DeviceSlice<T>, stream)
+        unsafe { self.async_copy_from(val as &DeviceSlice<T>, stream) }
     }
 
     unsafe fn async_copy_to(&self, val: &mut DeviceBuffer<T>, stream: &Stream) -> CudaResult<()> {
-        self.async_copy_to(val as &mut DeviceSlice<T>, stream)
+        unsafe { self.async_copy_to(val as &mut DeviceSlice<T>, stream) }
     }
 }
diff --git a/crates/cust/src/memory/device/device_variable.rs b/crates/cust/src/memory/device/device_variable.rs
index 14eb52bf..33c80d14 100644
--- a/crates/cust/src/memory/device/device_variable.rs
+++ b/crates/cust/src/memory/device/device_variable.rs
@@ -1,6 +1,6 @@
 use crate::error::CudaResult;
-use crate::memory::device::CopyDestination;
 use crate::memory::DeviceCopy;
+use crate::memory::device::CopyDestination;
 use crate::memory::{DeviceBox, DevicePointer};
 use std::ops::{Deref, DerefMut};
 
diff --git a/crates/cust/src/memory/locked/locked_box.rs b/crates/cust/src/memory/locked/locked_box.rs
index 0af28245..eb19cd38 100644
--- a/crates/cust/src/memory/locked/locked_box.rs
+++ b/crates/cust/src/memory/locked/locked_box.rs
@@ -6,7 +6,7 @@ use std::{
 
 use crate::{
     error::CudaResult,
-    memory::{cuda_free_locked, cuda_malloc_locked, DeviceCopy},
+    memory::{DeviceCopy, cuda_free_locked, cuda_malloc_locked},
 };
 
 /// Page-locked box in host memory.
@@ -45,7 +45,7 @@ impl<T: DeviceCopy> LockedBox<T> {
         if mem::size_of::<T>() == 0 {
             Ok(LockedBox { ptr: null_mut() })
         } else {
-            let ptr = cuda_malloc_locked(1)?;
+            let ptr = unsafe { cuda_malloc_locked(1)? };
             Ok(LockedBox { ptr })
         }
     }
diff --git a/crates/cust/src/memory/locked/locked_buffer.rs b/crates/cust/src/memory/locked/locked_buffer.rs
index 14fd321d..e310c619 100644
--- a/crates/cust/src/memory/locked/locked_buffer.rs
+++ b/crates/cust/src/memory/locked/locked_buffer.rs
@@ -1,6 +1,6 @@
 use crate::error::*;
-use crate::memory::malloc::{cuda_free_locked, cuda_malloc_locked};
 use crate::memory::DeviceCopy;
+use crate::memory::malloc::{cuda_free_locked, cuda_malloc_locked};
 use std::mem;
 use std::ops;
 use std::ptr;
@@ -98,7 +98,7 @@ impl<T: DeviceCopy> LockedBuffer<T> {
     /// ```
     pub unsafe fn uninitialized(size: usize) -> CudaResult<Self> {
         let ptr: *mut T = if size > 0 && mem::size_of::<T>() > 0 {
-            cuda_malloc_locked(size)?
+            unsafe { cuda_malloc_locked(size)? }
         } else {
             ptr::NonNull::dangling().as_ptr()
         };
diff --git a/crates/cust/src/memory/malloc.rs b/crates/cust/src/memory/malloc.rs
index 6255778c..83b17c63 100644
--- a/crates/cust/src/memory/malloc.rs
+++ b/crates/cust/src/memory/malloc.rs
@@ -48,7 +48,7 @@ pub unsafe fn cuda_malloc<T: DeviceCopy>(count: usize) -> CudaResult<DevicePoint
     }
 
     let mut ptr = 0;
-    driver_sys::cuMemAlloc(&mut ptr, size).to_result()?;
+    unsafe { driver_sys::cuMemAlloc(&mut ptr, size).to_result()? };
     Ok(DevicePointer::from_raw(ptr))
 }
 
@@ -71,11 +71,13 @@ pub unsafe fn cuda_malloc_async<T: DeviceCopy>(
     }
 
     let mut ptr: *mut c_void = ptr::null_mut();
-    driver_sys::cuMemAllocAsync(
-        &mut ptr as *mut *mut c_void as *mut u64,
-        size,
-        stream.as_inner(),
-    )
+    unsafe {
+        driver_sys::cuMemAllocAsync(
+            &mut ptr as *mut *mut c_void as *mut u64,
+            size,
+            stream.as_inner(),
+        )
+    }
     .to_result()?;
     let ptr = ptr as *mut T;
     Ok(DevicePointer::from_raw(ptr as driver_sys::CUdeviceptr))
@@ -97,7 +99,7 @@ pub unsafe fn cuda_free_async<T: DeviceCopy>(
         return Err(CudaError::InvalidMemoryAllocation);
     }
 
-    driver_sys::cuMemFreeAsync(p.as_raw(), stream.as_inner()).to_result()
+    unsafe { driver_sys::cuMemFreeAsync(p.as_raw(), stream.as_inner()) }.to_result()
 }
 
 /// Unsafe wrapper around the `cuMemAllocManaged` function, which allocates some unified memory and
@@ -140,14 +142,16 @@ pub unsafe fn cuda_malloc_unified<T: DeviceCopy>(count: usize) -> CudaResult<Uni
     }
 
     let mut ptr: *mut c_void = ptr::null_mut();
-    driver_sys::cuMemAllocManaged(
-        &mut ptr as *mut *mut c_void as *mut u64,
-        size,
-        driver_sys::CUmemAttach_flags_enum::CU_MEM_ATTACH_GLOBAL as u32,
-    )
-    .to_result()?;
-    let ptr = ptr as *mut T;
-    Ok(UnifiedPointer::wrap(ptr))
+    unsafe {
+        driver_sys::cuMemAllocManaged(
+            &mut ptr as *mut *mut c_void as *mut u64,
+            size,
+            driver_sys::CUmemAttach_flags_enum::CU_MEM_ATTACH_GLOBAL as u32,
+        )
+        .to_result()?;
+        let ptr = ptr as *mut T;
+        Ok(UnifiedPointer::wrap(ptr))
+    }
 }
 
 /// Unsafe wrapper around the `cuMemAllocPitch` function, which allocates device memory in two dimensions
@@ -203,7 +207,7 @@ pub unsafe fn cuda_malloc_pitched<T: DeviceCopy>(
 
     let mut ptr = 0;
     let mut pitch = 0;
-    driver_sys::cuMemAllocPitch(&mut ptr, &mut pitch, width_bytes, height, element_size)
+    unsafe { driver_sys::cuMemAllocPitch(&mut ptr, &mut pitch, width_bytes, height, element_size) }
         .to_result()?;
     Ok((DevicePointer::from_raw(ptr), pitch))
 }
@@ -236,7 +240,7 @@ pub unsafe fn cuda_free<T: DeviceCopy>(ptr: DevicePointer<T>) -> CudaResult<()>
         return Err(CudaError::InvalidMemoryAllocation);
     }
 
-    driver_sys::cuMemFree(ptr.as_raw()).to_result()?;
+    unsafe { driver_sys::cuMemFree(ptr.as_raw()).to_result()? };
     Ok(())
 }
 
@@ -269,7 +273,7 @@ pub unsafe fn cuda_free_unified<T: DeviceCopy>(mut p: UnifiedPointer<T>) -> Cuda
         return Err(CudaError::InvalidMemoryAllocation);
     }
 
-    driver_sys::cuMemFree(ptr as u64).to_result()?;
+    unsafe { driver_sys::cuMemFree(ptr as u64) }.to_result()?;
     Ok(())
 }
 
@@ -311,7 +315,7 @@ pub unsafe fn cuda_malloc_locked<T>(count: usize) -> CudaResult<*mut T> {
     }
 
     let mut ptr: *mut c_void = ptr::null_mut();
-    driver_sys::cuMemAllocHost(&mut ptr as *mut *mut c_void, size).to_result()?;
+    unsafe { driver_sys::cuMemAllocHost(&mut ptr as *mut *mut c_void, size) }.to_result()?;
     let ptr = ptr as *mut T;
     Ok(ptr)
 }
@@ -344,7 +348,7 @@ pub unsafe fn cuda_free_locked<T>(ptr: *mut T) -> CudaResult<()> {
         return Err(CudaError::InvalidMemoryAllocation);
     }
 
-    driver_sys::cuMemFreeHost(ptr as *mut c_void).to_result()?;
+    unsafe { driver_sys::cuMemFreeHost(ptr as *mut c_void) }.to_result()?;
     Ok(())
 }
 
diff --git a/crates/cust/src/memory/mod.rs b/crates/cust/src/memory/mod.rs
index aa349145..11b487d5 100644
--- a/crates/cust/src/memory/mod.rs
+++ b/crates/cust/src/memory/mod.rs
@@ -212,7 +212,7 @@ pub unsafe fn memcpy_htod(
     src_ptr: *const c_void,
     size: usize,
 ) -> CudaResult<()> {
-    driver_sys::cuMemcpyHtoD(d_ptr, src_ptr, size).to_result()?;
+    unsafe { driver_sys::cuMemcpyHtoD(d_ptr, src_ptr, size).to_result()? };
     Ok(())
 }
 
@@ -223,7 +223,7 @@ pub unsafe fn memcpy_dtoh(
     src_ptr: driver_sys::CUdeviceptr,
     size: usize,
 ) -> CudaResult<()> {
-    driver_sys::cuMemcpyDtoH(d_ptr, src_ptr, size).to_result()?;
+    unsafe { driver_sys::cuMemcpyDtoH(d_ptr, src_ptr, size).to_result()? };
     Ok(())
 }
 
@@ -309,7 +309,7 @@ pub unsafe fn memcpy_2d_htod<T: DeviceCopy>(
         Height: height,
     };
 
-    driver_sys::cuMemcpy2D(&pcopy).to_result()?;
+    unsafe { driver_sys::cuMemcpy2D(&pcopy).to_result()? };
     Ok(())
 }
 
@@ -395,7 +395,7 @@ pub unsafe fn memcpy_2d_dtoh<T: DeviceCopy>(
         Height: height,
     };
 
-    driver_sys::cuMemcpy2D(&pcopy).to_result()?;
+    unsafe { driver_sys::cuMemcpy2D(&pcopy).to_result()? };
     Ok(())
 }
 
diff --git a/crates/cust/src/memory/pointer.rs b/crates/cust/src/memory/pointer.rs
index b22ff5d7..d40beed0 100644
--- a/crates/cust/src/memory/pointer.rs
+++ b/crates/cust/src/memory/pointer.rs
@@ -214,7 +214,7 @@ impl<T: DeviceCopy> DevicePointer<T> {
     where
         T: Sized,
     {
-        self.offset(count as isize)
+        unsafe { self.offset(count as isize) }
     }
 
     /// Calculates the offset from a pointer (convenience for
@@ -256,7 +256,7 @@ impl<T: DeviceCopy> DevicePointer<T> {
     where
         T: Sized,
     {
-        self.offset((count as isize).wrapping_neg())
+        unsafe { self.offset((count as isize).wrapping_neg()) }
     }
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
@@ -481,7 +481,7 @@ impl<T: DeviceCopy> UnifiedPointer<T> {
     where
         T: Sized,
     {
-        Self::wrap(self.0.offset(count))
+        unsafe { Self::wrap(self.0.offset(count)) }
     }
 
     /// Calculates the offset from a unified pointer using wrapping arithmetic.
@@ -559,7 +559,7 @@ impl<T: DeviceCopy> UnifiedPointer<T> {
     where
         T: Sized,
     {
-        self.offset(count as isize)
+        unsafe { self.offset(count as isize) }
     }
 
     /// Calculates the offset from a pointer (convenience for
@@ -600,7 +600,7 @@ impl<T: DeviceCopy> UnifiedPointer<T> {
     where
         T: Sized,
     {
-        self.offset((count as isize).wrapping_neg())
+        unsafe { self.offset((count as isize).wrapping_neg()) }
     }
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
diff --git a/crates/cust/src/memory/unified.rs b/crates/cust/src/memory/unified.rs
index d833a62d..c45f5ce3 100644
--- a/crates/cust/src/memory/unified.rs
+++ b/crates/cust/src/memory/unified.rs
@@ -15,8 +15,8 @@ use crate::device::Device;
 #[allow(unused_imports)]
 use crate::device::DeviceAttribute;
 use crate::error::*;
-use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified};
 use crate::memory::UnifiedPointer;
+use crate::memory::malloc::{cuda_free_unified, cuda_malloc_unified};
 use crate::prelude::Stream;
 
 /// A pointer type for heap-allocation in CUDA unified memory.
@@ -88,7 +88,7 @@ impl<T: DeviceCopy> UnifiedBox<T> {
                 ptr: UnifiedPointer::null(),
             })
         } else {
-            let ptr = cuda_malloc_unified(1)?;
+            let ptr = unsafe { cuda_malloc_unified(1)? };
             Ok(UnifiedBox { ptr })
         }
     }
@@ -117,7 +117,7 @@ impl<T: DeviceCopy> UnifiedBox<T> {
     /// ```
     pub unsafe fn from_raw(ptr: *mut T) -> Self {
         UnifiedBox {
-            ptr: UnifiedPointer::wrap(ptr),
+            ptr: unsafe { UnifiedPointer::wrap(ptr) },
         }
     }
 
@@ -420,9 +420,9 @@ impl<T: DeviceCopy> UnifiedBuffer<T> {
     /// ```
     pub unsafe fn uninitialized(size: usize) -> CudaResult<Self> {
         let ptr = if size > 0 && mem::size_of::<T>() > 0 {
-            cuda_malloc_unified(size)?
+            unsafe { cuda_malloc_unified(size)? }
         } else {
-            UnifiedPointer::wrap(ptr::NonNull::dangling().as_ptr())
+            unsafe { UnifiedPointer::wrap(ptr::NonNull::dangling().as_ptr()) }
         };
         Ok(UnifiedBuffer {
             buf: ptr,
diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs
index b7efd93c..124a67fd 100644
--- a/crates/cust/src/module.rs
+++ b/crates/cust/src/module.rs
@@ -1,6 +1,6 @@
 //! Functions and types for working with CUDA modules.
 
-use std::ffi::{c_void, CStr, CString};
+use std::ffi::{CStr, CString, c_void};
 use std::fmt;
 use std::marker::PhantomData;
 use std::mem;
@@ -63,7 +63,7 @@ pub enum ModuleJitOption {
     /// architecture.
     Fallback(JitFallback),
     /// Generates debug info in the compiled binary.
-    GenenerateDebugInfo(bool),
+    GenerateDebugInfo(bool),
     /// Generates line info in the compiled binary.
     GenerateLineInfo(bool),
 }
@@ -103,13 +103,13 @@ impl ModuleJitOption {
                     raw_opts.push(driver_sys::CUjit_option::CU_JIT_FALLBACK_STRATEGY);
                     raw_vals.push(*fallback as usize as *mut c_void);
                 }
-                Self::GenenerateDebugInfo(gen) => {
+                Self::GenerateDebugInfo(gen_) => {
                     raw_opts.push(driver_sys::CUjit_option::CU_JIT_GENERATE_DEBUG_INFO);
-                    raw_vals.push(*gen as usize as *mut c_void);
+                    raw_vals.push(*gen_ as usize as *mut c_void);
                 }
-                Self::GenerateLineInfo(gen) => {
+                Self::GenerateLineInfo(gen_) => {
                     raw_opts.push(driver_sys::CUjit_option::CU_JIT_GENERATE_LINE_INFO);
-                    raw_vals.push(*gen as usize as *mut c_void)
+                    raw_vals.push(*gen_ as usize as *mut c_void)
                 }
             }
         }
@@ -231,13 +231,15 @@ impl Module {
             inner: ptr::null_mut(),
         };
         let (mut options, mut option_values) = ModuleJitOption::into_raw(options);
-        driver_sys::cuModuleLoadDataEx(
-            &mut module.inner as *mut driver_sys::CUmodule,
-            image,
-            options.len() as c_uint,
-            options.as_mut_ptr(),
-            option_values.as_mut_ptr(),
-        )
+        unsafe {
+            driver_sys::cuModuleLoadDataEx(
+                &mut module.inner as *mut driver_sys::CUmodule,
+                image,
+                options.len() as c_uint,
+                options.as_mut_ptr(),
+                option_values.as_mut_ptr(),
+            )
+        }
         .to_result()?;
         Ok(module)
     }
diff --git a/crates/cust/src/prelude.rs b/crates/cust/src/prelude.rs
index 559a55ba..41758f3e 100644
--- a/crates/cust/src/prelude.rs
+++ b/crates/cust/src/prelude.rs
@@ -3,6 +3,7 @@
 //! This allows the user to `use cust::prelude::*;` and have the most commonly-used types
 //! available quickly.
 
+pub use crate::CudaFlags;
 pub use crate::context::{Context, ContextFlags};
 pub use crate::device::Device;
 pub use crate::event::{Event, EventFlags, EventStatus};
@@ -15,4 +16,3 @@ pub use crate::memory::{
 pub use crate::module::Module;
 pub use crate::stream::{Stream, StreamFlags};
 pub use crate::util::*;
-pub use crate::CudaFlags;
diff --git a/crates/cust/src/stream.rs b/crates/cust/src/stream.rs
index 3ee5c5f1..2cbbab84 100644
--- a/crates/cust/src/stream.rs
+++ b/crates/cust/src/stream.rs
@@ -16,7 +16,7 @@ use std::panic;
 use std::ptr;
 
 use cust_raw::driver_sys;
-use cust_raw::driver_sys::{cudaError_enum, CUstream};
+use cust_raw::driver_sys::{CUstream, cudaError_enum};
 
 use crate::error::{CudaResult, DropResult, ToResult};
 use crate::event::Event;
@@ -271,19 +271,21 @@ impl Stream {
         let grid_size: GridSize = grid_size.into();
         let block_size: BlockSize = block_size.into();
 
-        driver_sys::cuLaunchKernel(
-            func.to_raw(),
-            grid_size.x,
-            grid_size.y,
-            grid_size.z,
-            block_size.x,
-            block_size.y,
-            block_size.z,
-            shared_mem_bytes,
-            self.inner,
-            args.as_ptr() as *mut _,
-            ptr::null_mut(),
-        )
+        unsafe {
+            driver_sys::cuLaunchKernel(
+                func.to_raw(),
+                grid_size.x,
+                grid_size.y,
+                grid_size.z,
+                block_size.x,
+                block_size.y,
+                block_size.z,
+                shared_mem_bytes,
+                self.inner,
+                args.as_ptr() as *mut _,
+                ptr::null_mut(),
+            )
+        }
         .to_result()
     }
 
@@ -357,7 +359,7 @@ unsafe extern "C" fn callback_wrapper<T>(
 {
     // Stop panics from unwinding across the FFI
     let _ = panic::catch_unwind(|| {
-        let callback: Box<T> = Box::from_raw(callback as *mut T);
+        let callback: Box<T> = unsafe { Box::from_raw(callback as *mut T) };
         callback(status.to_result());
     });
 }
diff --git a/crates/cust/src/surface.rs b/crates/cust/src/surface.rs
index 181b9411..79c31a94 100644
--- a/crates/cust/src/surface.rs
+++ b/crates/cust/src/surface.rs
@@ -4,8 +4,8 @@ use std::{
 };
 
 use cust_raw::driver_sys::{
-    cuSurfObjectCreate, cuSurfObjectDestroy, cuSurfObjectGetResourceDesc, CUsurfObject,
-    CUDA_RESOURCE_DESC,
+    CUDA_RESOURCE_DESC, CUsurfObject, cuSurfObjectCreate, cuSurfObjectDestroy,
+    cuSurfObjectGetResourceDesc,
 };
 
 use crate::{
@@ -75,8 +75,10 @@ impl Surface {
     unsafe fn resource_desc(&mut self) -> CudaResult<ManuallyDrop<ResourceDescriptor>> {
         let raw = {
             let mut uninit = MaybeUninit::<CUDA_RESOURCE_DESC>::uninit();
-            cuSurfObjectGetResourceDesc(uninit.as_mut_ptr(), self.handle).to_result()?;
-            uninit.assume_init()
+            unsafe {
+                cuSurfObjectGetResourceDesc(uninit.as_mut_ptr(), self.handle).to_result()?;
+                uninit.assume_init()
+            }
         };
         Ok(ManuallyDrop::new(ResourceDescriptor::from_raw(raw)))
     }
diff --git a/crates/cust/src/texture.rs b/crates/cust/src/texture.rs
index f8019b26..60a918dd 100644
--- a/crates/cust/src/texture.rs
+++ b/crates/cust/src/texture.rs
@@ -1,15 +1,15 @@
-use std::mem::transmute;
 use std::mem::ManuallyDrop;
 use std::mem::MaybeUninit;
+use std::mem::transmute;
 use std::os::raw::c_ulonglong;
 use std::os::raw::{c_float, c_uint};
 use std::ptr;
 
 use cust_raw::driver_sys;
 use cust_raw::driver_sys::{
-    cuTexObjectCreate, cuTexObjectDestroy, cuTexObjectGetResourceDesc,
-    CUDA_RESOURCE_DESC_st__bindgen_ty_1, CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1,
-    CUresourcetype, CUtexObject, CUDA_RESOURCE_DESC, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC,
+    CUDA_RESOURCE_DESC, CUDA_RESOURCE_DESC_st__bindgen_ty_1,
+    CUDA_RESOURCE_DESC_st__bindgen_ty_1__bindgen_ty_1, CUDA_RESOURCE_VIEW_DESC, CUDA_TEXTURE_DESC,
+    CUresourcetype, CUtexObject, cuTexObjectCreate, cuTexObjectDestroy, cuTexObjectGetResourceDesc,
 };
 
 use crate::error::CudaResult;
@@ -491,8 +491,10 @@ impl Texture {
     unsafe fn resource_desc(&mut self) -> CudaResult<ManuallyDrop<ResourceDescriptor>> {
         let raw = {
             let mut uninit = MaybeUninit::<CUDA_RESOURCE_DESC>::uninit();
-            cuTexObjectGetResourceDesc(uninit.as_mut_ptr(), self.handle).to_result()?;
-            uninit.assume_init()
+            unsafe {
+                cuTexObjectGetResourceDesc(uninit.as_mut_ptr(), self.handle).to_result()?;
+                uninit.assume_init()
+            }
         };
         Ok(ManuallyDrop::new(ResourceDescriptor::from_raw(raw)))
     }
diff --git a/crates/cust/src/util.rs b/crates/cust/src/util.rs
index 160622b9..066bca45 100644
--- a/crates/cust/src/util.rs
+++ b/crates/cust/src/util.rs
@@ -1,8 +1,8 @@
 use crate::{
     error::CudaResult,
     memory::{
-        array::{ArrayObject, ArrayPrimitive},
         DeviceBox, DeviceCopy, UnifiedBuffer,
+        array::{ArrayObject, ArrayPrimitive},
     },
     prelude::DeviceBuffer,
     surface::Surface,
diff --git a/crates/cust_core/Cargo.toml b/crates/cust_core/Cargo.toml
index 60e2d17f..87826a56 100644
--- a/crates/cust_core/Cargo.toml
+++ b/crates/cust_core/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cust_core"
 version = "0.1.1"
-edition = "2021"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "Core library for cust that can be shared across CPU and GPU"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/cust_core/src/lib.rs b/crates/cust_core/src/lib.rs
index 40d4819e..0afb8750 100644
--- a/crates/cust_core/src/lib.rs
+++ b/crates/cust_core/src/lib.rs
@@ -108,36 +108,36 @@ pub mod _hidden {
     {
     }
     unsafe impl<
-            A: DeviceCopy,
-            B: DeviceCopy,
-            C: DeviceCopy,
-            D: DeviceCopy,
-            E: DeviceCopy,
-            F: DeviceCopy,
-        > DeviceCopy for (A, B, C, D, E, F)
+        A: DeviceCopy,
+        B: DeviceCopy,
+        C: DeviceCopy,
+        D: DeviceCopy,
+        E: DeviceCopy,
+        F: DeviceCopy,
+    > DeviceCopy for (A, B, C, D, E, F)
     {
     }
     unsafe impl<
-            A: DeviceCopy,
-            B: DeviceCopy,
-            C: DeviceCopy,
-            D: DeviceCopy,
-            E: DeviceCopy,
-            F: DeviceCopy,
-            G: DeviceCopy,
-        > DeviceCopy for (A, B, C, D, E, F, G)
+        A: DeviceCopy,
+        B: DeviceCopy,
+        C: DeviceCopy,
+        D: DeviceCopy,
+        E: DeviceCopy,
+        F: DeviceCopy,
+        G: DeviceCopy,
+    > DeviceCopy for (A, B, C, D, E, F, G)
     {
     }
     unsafe impl<
-            A: DeviceCopy,
-            B: DeviceCopy,
-            C: DeviceCopy,
-            D: DeviceCopy,
-            E: DeviceCopy,
-            F: DeviceCopy,
-            G: DeviceCopy,
-            H: DeviceCopy,
-        > DeviceCopy for (A, B, C, D, E, F, G, H)
+        A: DeviceCopy,
+        B: DeviceCopy,
+        C: DeviceCopy,
+        D: DeviceCopy,
+        E: DeviceCopy,
+        F: DeviceCopy,
+        G: DeviceCopy,
+        H: DeviceCopy,
+    > DeviceCopy for (A, B, C, D, E, F, G, H)
     {
     }
 
diff --git a/crates/cust_derive/Cargo.toml b/crates/cust_derive/Cargo.toml
index 207f7207..9d21ba0c 100644
--- a/crates/cust_derive/Cargo.toml
+++ b/crates/cust_derive/Cargo.toml
@@ -2,7 +2,7 @@
 name = "cust_derive"
 version = "0.2.0"
 authors = ["Brook Heisler <brookheisler@gmail.com>", "Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
-edition = "2018"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "Macros for cust"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/cust_derive/src/lib.rs b/crates/cust_derive/src/lib.rs
index ef9f7d21..5a3a4011 100644
--- a/crates/cust_derive/src/lib.rs
+++ b/crates/cust_derive/src/lib.rs
@@ -6,21 +6,21 @@ extern crate syn;
 
 use proc_macro2::{Ident, Span, TokenStream};
 use syn::{
-    parse_str, Data, DataEnum, DataStruct, DataUnion, DeriveInput, Field, Fields, Generics,
-    TypeParamBound,
+    Data, DataEnum, DataStruct, DataUnion, DeriveInput, Field, Fields, Generics, TypeParamBound,
+    parse_str,
 };
 
 #[proc_macro_derive(DeviceCopyCore)]
 pub fn device_copy_core(input: BaseTokenStream) -> BaseTokenStream {
     let ast = syn::parse(input).unwrap();
-    let gen = impl_device_copy(&ast, quote!(::cust_core::DeviceCopy));
-    BaseTokenStream::from(gen)
+    let code = impl_device_copy(&ast, quote!(::cust_core::DeviceCopy));
+    BaseTokenStream::from(code)
 }
 #[proc_macro_derive(DeviceCopy)]
 pub fn device_copy(input: BaseTokenStream) -> BaseTokenStream {
     let ast = syn::parse(input).unwrap();
-    let gen = impl_device_copy(&ast, quote!(::cust::memory::DeviceCopy));
-    BaseTokenStream::from(gen)
+    let code = impl_device_copy(&ast, quote!(::cust::memory::DeviceCopy));
+    BaseTokenStream::from(code)
 }
 
 use proc_macro::TokenStream as BaseTokenStream;
@@ -53,11 +53,11 @@ fn impl_device_copy(input: &DeriveInput, import: TokenStream) -> TokenStream {
 
     // Finally, generate the unsafe impl and the type-checking function.
     let generated_code = quote! {
-        unsafe impl#impl_generics #import for #input_type#type_generics #where_clause {}
+        unsafe impl #impl_generics #import for #input_type #type_generics #where_clause {}
 
         #[doc(hidden)]
         #[allow(all)]
-        fn #type_test_func_ident#impl_generics(value: &#input_type#type_generics) #where_clause {
+        fn #type_test_func_ident #impl_generics(value: &#input_type #type_generics) #where_clause {
             fn assert_impl<T: #import>() {}
             #check_types_code
         }
diff --git a/crates/cust_raw/Cargo.toml b/crates/cust_raw/Cargo.toml
index f5fe1f86..c930adce 100644
--- a/crates/cust_raw/Cargo.toml
+++ b/crates/cust_raw/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "cust_raw"
 version = "0.11.3"
-edition = "2021"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "Low level bindings to the CUDA Driver API"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/cust_raw/build/main.rs b/crates/cust_raw/build/main.rs
index 212d5510..0c0d1175 100644
--- a/crates/cust_raw/build/main.rs
+++ b/crates/cust_raw/build/main.rs
@@ -154,6 +154,7 @@ fn create_cuda_driver_bindings(
         .size_t_is_usize(true)
         .layout_tests(true)
         .must_use_type("CUresult")
+        .wrap_unsafe_ops(true)
         // The CUDA docs have lots of malformed Doxygen directives, e.g.
         //
         //   \sa
@@ -217,6 +218,7 @@ fn create_cuda_runtime_bindings(
         .size_t_is_usize(true)
         .layout_tests(true)
         .must_use_type("cudaError_t")
+        .wrap_unsafe_ops(true)
         // See the comment on `generate_comments` in `create_cuda_runtime_bindings`.
         .generate_comments(false)
         .generate()
@@ -270,6 +272,7 @@ fn create_cublas_bindings(sdk: &cuda_sdk::CudaSdk, outdir: &path::Path, manifest
             .size_t_is_usize(true)
             .layout_tests(true)
             .must_use_type("cublasStatus_t")
+            .wrap_unsafe_ops(true)
             // See the comment on `generate_comments` in `create_cuda_runtime_bindings`.
             .generate_comments(false)
             .generate()
@@ -312,6 +315,7 @@ fn create_nvptx_compiler_bindings(
         .size_t_is_usize(true)
         .layout_tests(true)
         .must_use_type("nvPTXCompileResult")
+        .wrap_unsafe_ops(true)
         // See the comment on `generate_comments` in `create_cuda_runtime_bindings`.
         .generate_comments(false)
         .generate()
@@ -347,6 +351,7 @@ fn create_nvvm_bindings(sdk: &cuda_sdk::CudaSdk, outdir: &path::Path, manifest_d
         .size_t_is_usize(true)
         .layout_tests(true)
         .must_use_type("nvvmResult")
+        .wrap_unsafe_ops(true)
         // See the comment on `generate_comments` in `create_cuda_runtime_bindings`.
         .generate_comments(false)
         .generate()
diff --git a/crates/gpu_rand/Cargo.toml b/crates/gpu_rand/Cargo.toml
index f8ede974..6f2cee1b 100644
--- a/crates/gpu_rand/Cargo.toml
+++ b/crates/gpu_rand/Cargo.toml
@@ -3,7 +3,7 @@ name = "gpu_rand"
 version = "0.1.3"
 authors = ["The Rand Project Developers", "The Rust CUDA Project Developers"]
 license = "MIT OR Apache-2.0"
-edition = "2021"
+edition = "2024"
 description = "GPU-friendly random number generators for the Rust CUDA Project"
 repository = "https://github.com/Rust-GPU/rust-cuda"
 readme = "../../README.md"
diff --git a/crates/gpu_rand/src/lib.rs b/crates/gpu_rand/src/lib.rs
index 9ddeccaf..d8189cc7 100644
--- a/crates/gpu_rand/src/lib.rs
+++ b/crates/gpu_rand/src/lib.rs
@@ -5,7 +5,7 @@
 //! assembly. However, it is supposed to also work on the CPU, allowing you to reuse the same random states across CPU and GPU.
 //!
 //! A lot of the initial code is taken from the [rust-random project](https://github.com/rust-random) and modified to make it able to
-//! pass to the GPU, as well as cleaning up certain things and updating it to edition 2021.
+//! pass to the GPU, as well as cleaning up certain things and updating it to edition 2024.
 //! The following generators are implemented:
 //!
 
diff --git a/crates/gpu_rand/src/xoroshiro/mod.rs b/crates/gpu_rand/src/xoroshiro/mod.rs
index 7fcda15f..79113942 100644
--- a/crates/gpu_rand/src/xoroshiro/mod.rs
+++ b/crates/gpu_rand/src/xoroshiro/mod.rs
@@ -75,11 +75,11 @@ mod xoshiro512starstar;
 pub use common::Seed512;
 pub use rand_core;
 pub use splitmix64::SplitMix64;
+pub use xoroshiro64star::Xoroshiro64Star;
+pub use xoroshiro64starstar::Xoroshiro64StarStar;
 pub use xoroshiro128plus::Xoroshiro128Plus;
 pub use xoroshiro128plusplus::Xoroshiro128PlusPlus;
 pub use xoroshiro128starstar::Xoroshiro128StarStar;
-pub use xoroshiro64star::Xoroshiro64Star;
-pub use xoroshiro64starstar::Xoroshiro64StarStar;
 pub use xoshiro128plus::Xoshiro128Plus;
 pub use xoshiro128plusplus::Xoshiro128PlusPlus;
 pub use xoshiro128starstar::Xoshiro128StarStar;
diff --git a/crates/nvvm/Cargo.toml b/crates/nvvm/Cargo.toml
index 22cf71d2..3498a286 100644
--- a/crates/nvvm/Cargo.toml
+++ b/crates/nvvm/Cargo.toml
@@ -2,7 +2,7 @@
 name = "nvvm"
 version = "0.1.1"
 authors = ["Riccardo D'Ambrosio <rdambrosio016@gmail.com>"]
-edition = "2018"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to libnvvm"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/nvvm/src/lib.rs b/crates/nvvm/src/lib.rs
index c6706ea0..9a460c6c 100644
--- a/crates/nvvm/src/lib.rs
+++ b/crates/nvvm/src/lib.rs
@@ -746,8 +746,8 @@ impl NvvmProgram {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use std::str::FromStr;
     use NvvmArch::*;
+    use std::str::FromStr;
 
     #[test]
     fn nvvm_arch_capability_value() {
@@ -836,7 +836,9 @@ mod tests {
 
         assert_eq!(
             Compute70.all_target_features(),
-            vec![Compute50, Compute52, Compute53, Compute60, Compute61, Compute62, Compute70]
+            vec![
+                Compute50, Compute52, Compute53, Compute60, Compute61, Compute62, Compute70
+            ]
         );
 
         assert_eq!(
diff --git a/crates/optix/examples/path_tracer/Cargo.toml b/crates/optix/examples/path_tracer/Cargo.toml
index 4b1bf7aa..ed25bf73 100644
--- a/crates/optix/examples/path_tracer/Cargo.toml
+++ b/crates/optix/examples/path_tracer/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "path-tracer"
 version = "0.1.0"
-edition = "2018"
+edition = "2024"
 
 [dependencies]
 glam = { version = "0.30", features = ["bytemuck", "cuda", "mint"] }
diff --git a/crates/optix/examples/path_tracer/kernels/Cargo.toml b/crates/optix/examples/path_tracer/kernels/Cargo.toml
index 1f3c34ea..87f93da1 100644
--- a/crates/optix/examples/path_tracer/kernels/Cargo.toml
+++ b/crates/optix/examples/path_tracer/kernels/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "path-tracer-kernels"
 version = "0.1.0"
-edition = "2018"
+edition = "2024"
 
 [dependencies]
 cuda_std = { version = "0.2", path = "../../../../cuda_std" }
diff --git a/crates/ptx/Cargo.toml b/crates/ptx/Cargo.toml
index eff4da3c..9ce27d8d 100644
--- a/crates/ptx/Cargo.toml
+++ b/crates/ptx/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "ptx"
 version = "0.1.0"
-edition = "2018"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "PTX parser and analyzer"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/ptx/src/lexer.rs b/crates/ptx/src/lexer.rs
index 440e5b5f..354c1c26 100644
--- a/crates/ptx/src/lexer.rs
+++ b/crates/ptx/src/lexer.rs
@@ -107,10 +107,10 @@ impl<'src> Lexer<'src> {
         Some(Ok(match cur {
             AsciiChar::Percent => {
                 let peek = self.peek();
-                if let Some(peeked) = peek {
-                    if is_ident_continue(peeked) {
-                        return Some(Ok(self.opcode_or_ident()));
-                    }
+                if let Some(peeked) = peek
+                    && is_ident_continue(peeked)
+                {
+                    return Some(Ok(self.opcode_or_ident()));
                 }
                 self.eat_and_ret_token(1, TokenKind::Modulo)
             }
@@ -440,14 +440,14 @@ impl<'src> Lexer<'src> {
         let cur = self.cur;
         let ident = self.eat_until(|c, _| is_ident_continue(c));
         // check if its an instruction
-        if ident.chars().all(|c| c.is_ascii_alphanumeric()) {
-            if let Ok(kind) = InstructionKind::from_str(ident.as_str()) {
-                *self.values.last_mut().unwrap() = Some(TokenValue::Instruction(kind));
-                return Token {
-                    kind: TokenKind::Instruction,
-                    range: cur..self.cur,
-                };
-            }
+        if ident.chars().all(|c| c.is_ascii_alphanumeric())
+            && let Ok(kind) = InstructionKind::from_str(ident.as_str())
+        {
+            *self.values.last_mut().unwrap() = Some(TokenValue::Instruction(kind));
+            return Token {
+                kind: TokenKind::Instruction,
+                range: cur..self.cur,
+            };
         }
 
         *self.values.last_mut().unwrap() =
@@ -468,7 +468,7 @@ impl<'src> Lexer<'src> {
                 return Ok(Token {
                     kind: TokenKind::Dot,
                     range: cur..self.cur,
-                })
+                });
             }
             Some(c) if !is_ident_continue(c) => {
                 return Ok(Token {
diff --git a/crates/ptx_compiler/Cargo.toml b/crates/ptx_compiler/Cargo.toml
index fc6c2e28..dfc9c66e 100644
--- a/crates/ptx_compiler/Cargo.toml
+++ b/crates/ptx_compiler/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "ptx_compiler"
 version = "0.1.1"
-edition = "2021"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "High level bindings to CUDA's ptx compilation APIs"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/crates/rustc_codegen_nvvm_macros/Cargo.toml b/crates/rustc_codegen_nvvm_macros/Cargo.toml
index 9d78a324..2cac50cc 100644
--- a/crates/rustc_codegen_nvvm_macros/Cargo.toml
+++ b/crates/rustc_codegen_nvvm_macros/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "rustc_codegen_nvvm_macros"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "Macros for rustc_codegen_nvvm"
 repository = "https://github.com/Rust-GPU/rust-cuda"
diff --git a/tests/compiletests/Cargo.toml b/tests/compiletests/Cargo.toml
index 989726d6..4c212d73 100644
--- a/tests/compiletests/Cargo.toml
+++ b/tests/compiletests/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "compiletests"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 
 [[bin]]
 name = "compiletests"
diff --git a/tests/compiletests/src/main.rs b/tests/compiletests/src/main.rs
index b6888e95..8672854b 100644
--- a/tests/compiletests/src/main.rs
+++ b/tests/compiletests/src/main.rs
@@ -109,7 +109,7 @@ impl Runner {
                     .iter()
                     .map(|dir| format!("-L dependency={}", dir.display()))
                     .fold(String::new(), |a, b| b + " " + &a),
-                "--edition 2021",
+                "--edition 2024",
                 &*format!("--extern noprelude:core={}", deps.core.display()),
                 &*format!(
                     "--extern noprelude:compiler_builtins={}",
@@ -458,7 +458,9 @@ fn setup_windows_dll_path(codegen_backend_path: &Path) {
             } else {
                 format!("{dir_str}{separator}{existing_path}")
             };
-            env::set_var(lib_path_var, new_path);
+            unsafe {
+                env::set_var(lib_path_var, new_path);
+            }
         }
     }
 
@@ -477,24 +479,24 @@ fn setup_windows_dll_path(codegen_backend_path: &Path) {
     ];
 
     for llvm_config in &llvm_config_paths {
-        if let Ok(output) = Command::new(llvm_config).arg("--bindir").output() {
-            if output.status.success() {
-                if let Ok(bindir) = String::from_utf8(output.stdout) {
-                    let bindir = bindir.trim();
-                    let bindir_path = Path::new(bindir);
-                    if bindir_path.exists() {
-                        add_to_dylib_path(bindir_path);
-                        // Also add the lib directory which might contain DLLs
-                        if let Some(parent) = bindir_path.parent() {
-                            let libdir = parent.join("lib");
-                            if libdir.exists() {
-                                add_to_dylib_path(&libdir);
-                            }
+        if let Ok(output) = Command::new(llvm_config).arg("--bindir").output()
+            && output.status.success()
+        {
+            if let Ok(bindir) = String::from_utf8(output.stdout) {
+                let bindir = bindir.trim();
+                let bindir_path = Path::new(bindir);
+                if bindir_path.exists() {
+                    add_to_dylib_path(bindir_path);
+                    // Also add the lib directory which might contain DLLs
+                    if let Some(parent) = bindir_path.parent() {
+                        let libdir = parent.join("lib");
+                        if libdir.exists() {
+                            add_to_dylib_path(&libdir);
                         }
                     }
                 }
-                break;
             }
+            break;
         }
     }
 
@@ -629,6 +631,8 @@ fn setup_cuda_environment() {
             format!("{new_paths}{separator}{existing_path}")
         };
 
-        env::set_var(lib_path_var, new_lib_path);
+        unsafe {
+            env::set_var(lib_path_var, new_lib_path);
+        }
     }
 }
diff --git a/tests/compiletests/ui/atomic/atomic_operations.rs b/tests/compiletests/ui/atomic/atomic_operations.rs
index 48ae8d9c..812a0adb 100644
--- a/tests/compiletests/ui/atomic/atomic_operations.rs
+++ b/tests/compiletests/ui/atomic/atomic_operations.rs
@@ -17,7 +17,7 @@ pub unsafe fn test_cuda_atomic_floats() {
 
     // Block-scoped atomic float
     let block_atomic = BlockAtomicF32::new(1.5);
-    let _old = block_atomic.fetch_add(0.5, Ordering::Relaxed);
+    let _old = unsafe { block_atomic.fetch_add(0.5, Ordering::Relaxed) };
 
     // System-scoped atomic float
     let system_atomic = SystemAtomicF32::new(0.0);
@@ -29,7 +29,7 @@ pub unsafe fn test_cuda_atomic_floats() {
 
     // Test block-scoped f64
     let block_f64 = BlockAtomicF64::new(2.718);
-    let _old = block_f64.fetch_sub(0.5, Ordering::Relaxed);
+    let _old = unsafe { block_f64.fetch_sub(0.5, Ordering::Relaxed) };
 
     // Test bitwise operations on floats
     let _old = atomic_f32.fetch_and(3.14, Ordering::Relaxed);
diff --git a/tests/compiletests/ui/core/ops/range_contains.rs b/tests/compiletests/ui/core/ops/range_contains.rs
index f7f733b7..0a9c5b25 100644
--- a/tests/compiletests/ui/core/ops/range_contains.rs
+++ b/tests/compiletests/ui/core/ops/range_contains.rs
@@ -8,5 +8,7 @@ fn has_two_decimal_digits(x: u32) -> bool {
 
 #[kernel]
 pub unsafe fn main(i: u32, o: *mut u32) {
-    *o = has_two_decimal_digits(i) as u32;
+    unsafe {
+        *o = has_two_decimal_digits(i) as u32;
+    }
 }
diff --git a/tests/compiletests/ui/dis/simple_add.rs b/tests/compiletests/ui/dis/simple_add.rs
index 15b6b179..47f526cc 100644
--- a/tests/compiletests/ui/dis/simple_add.rs
+++ b/tests/compiletests/ui/dis/simple_add.rs
@@ -7,6 +7,8 @@ use cuda_std::kernel;
 
 #[kernel]
 pub unsafe fn simple_add_kernel(a: *const f32, b: *const f32, c: *mut f32) {
-    let sum = *a + *b;
-    *c = sum;
+    unsafe {
+        let sum = *a + *b;
+        *c = sum;
+    }
 }
diff --git a/tests/compiletests/ui/dis/simple_add.stderr b/tests/compiletests/ui/dis/simple_add.stderr
index 0386db78..5aa92e63 100644
--- a/tests/compiletests/ui/dis/simple_add.stderr
+++ b/tests/compiletests/ui/dis/simple_add.stderr
@@ -19,17 +19,17 @@ $L__tmp0:
     cvta.to.global.u64     %rd4, %rd3;
     cvta.to.global.u64     %rd5, %rd2;
     cvta.to.global.u64     %rd6, %rd1;
-    .loc    1 10 15
+    .loc    1 11 19
     ld.global.f32     %f1, [%rd6];
-    .loc    1 10 20
+    .loc    1 11 24
     ld.global.f32     %f2, [%rd5];
-    .loc    1 10 15
+    .loc    1 11 19
     add.f32     %f3, %f1, %f2;
 $L__tmp1:
-    .loc    1 11 5
+    .loc    1 12 9
     st.global.f32     [%rd4], %f3;
 $L__tmp2:
-    .loc    1 12 2
+    .loc    1 14 2
     ret;
 $L__tmp3:
 $L__func_end0:
diff --git a/tests/compiletests/ui/glam/mat3_vec3_multiply.rs b/tests/compiletests/ui/glam/mat3_vec3_multiply.rs
index adda6e82..de2e0d17 100644
--- a/tests/compiletests/ui/glam/mat3_vec3_multiply.rs
+++ b/tests/compiletests/ui/glam/mat3_vec3_multiply.rs
@@ -7,5 +7,7 @@ use cuda_std::kernel;
 #[kernel]
 pub unsafe fn mat3_vec3_multiply(input: glam::Mat3, output: *mut glam::Vec3) {
     let vector = input * glam::Vec3::new(1.0, 2.0, 3.0);
-    *output = vector;
+    unsafe {
+        *output = vector;
+    }
 }
diff --git a/tests/compiletests/ui/glam/mat4_operations.rs b/tests/compiletests/ui/glam/mat4_operations.rs
index e54d2d10..d15c5b28 100644
--- a/tests/compiletests/ui/glam/mat4_operations.rs
+++ b/tests/compiletests/ui/glam/mat4_operations.rs
@@ -16,15 +16,21 @@ pub unsafe fn mat4_transform_operations(
 ) {
     // Transform a 3D point (w=1 implied)
     let transformed_point = matrix.transform_point3(point);
-    *result_point = transformed_point;
+    unsafe {
+        *result_point = transformed_point;
+    }
 
     // Transform a 4D vector
     let transformed_vector = matrix * vector;
-    *result_vector = transformed_vector;
+    unsafe {
+        *result_vector = transformed_vector;
+    }
 
     // Calculate determinant
     let det = matrix.determinant();
-    *result_determinant = det;
+    unsafe {
+        *result_determinant = det;
+    }
 }
 
 #[kernel]
@@ -39,13 +45,19 @@ pub unsafe fn mat4_construction(
 ) {
     // Create translation matrix
     let trans_mat = Mat4::from_translation(translation);
-    *result_translation = trans_mat;
+    unsafe {
+        *result_translation = trans_mat;
+    }
 
     // Create scale matrix
     let scale_mat = Mat4::from_scale(scale);
-    *result_scale = scale_mat;
+    unsafe {
+        *result_scale = scale_mat;
+    }
 
     // Create rotation matrix
     let rot_mat = Mat4::from_axis_angle(axis, angle_radians);
-    *result_rotation = rot_mat;
+    unsafe {
+        *result_rotation = rot_mat;
+    }
 }
diff --git a/tests/compiletests/ui/glam/vec3_operations.rs b/tests/compiletests/ui/glam/vec3_operations.rs
index 9ab43513..2c33c87e 100644
--- a/tests/compiletests/ui/glam/vec3_operations.rs
+++ b/tests/compiletests/ui/glam/vec3_operations.rs
@@ -15,15 +15,21 @@ pub unsafe fn vec3_basic_ops(
 ) {
     // Vector addition
     let sum = a + b;
-    *result_add = sum;
+    unsafe {
+        *result_add = sum;
+    }
 
     // Dot product
     let dot = a.dot(b);
-    *result_dot = dot;
+    unsafe {
+        *result_dot = dot;
+    }
 
     // Cross product
     let cross = a.cross(b);
-    *result_cross = cross;
+    unsafe {
+        *result_cross = cross;
+    }
 }
 
 #[kernel]
@@ -34,9 +40,13 @@ pub unsafe fn vec3_normalization(
 ) {
     // Get length
     let len = input.length();
-    *result_length = len;
+    unsafe {
+        *result_length = len;
+    }
 
     // Normalize
     let normalized = input.normalize();
-    *result_normalized = normalized;
+    unsafe {
+        *result_normalized = normalized;
+    }
 }
diff --git a/tests/compiletests/ui/hello_world.rs b/tests/compiletests/ui/hello_world.rs
index 82174c88..ae093134 100644
--- a/tests/compiletests/ui/hello_world.rs
+++ b/tests/compiletests/ui/hello_world.rs
@@ -4,5 +4,7 @@ use cuda_std::kernel;
 
 #[kernel]
 pub unsafe fn add_one(x: *mut f32) {
-    *x = *x + 1.0;
+    unsafe {
+        *x = *x + 1.0;
+    }
 }
diff --git a/tests/compiletests/ui/lang/consts/constant_memory_overflow.rs b/tests/compiletests/ui/lang/consts/constant_memory_overflow.rs
index ad38b46b..c3993952 100644
--- a/tests/compiletests/ui/lang/consts/constant_memory_overflow.rs
+++ b/tests/compiletests/ui/lang/consts/constant_memory_overflow.rs
@@ -17,5 +17,5 @@ static BIG_ARRAY_3: [u32; ARRAY_SIZE] = [333u32; ARRAY_SIZE];
 
 #[kernel]
 pub unsafe fn test_kernel(out: *mut u32) {
-    *out = BIG_ARRAY_1[0] + BIG_ARRAY_2[0] + BIG_ARRAY_3[0];
+    unsafe { *out = BIG_ARRAY_1[0] + BIG_ARRAY_2[0] + BIG_ARRAY_3[0] };
 }
diff --git a/tests/compiletests/ui/lang/consts/shallow-ref.rs b/tests/compiletests/ui/lang/consts/shallow-ref.rs
index 0065d47f..7d9051c0 100644
--- a/tests/compiletests/ui/lang/consts/shallow-ref.rs
+++ b/tests/compiletests/ui/lang/consts/shallow-ref.rs
@@ -20,7 +20,9 @@ pub unsafe fn test_shallow_ref(
     bool_out: *mut u32,
     vec_out: *mut Vec2,
 ) {
-    *scalar_out = scalar_load(&123);
-    *bool_out = (vec_in == Vec2::ZERO) as u32;
-    *vec_out = ROT90.transpose() * vec_in;
+    unsafe {
+        *scalar_out = scalar_load(&123);
+        *bool_out = (vec_in == Vec2::ZERO) as u32;
+        *vec_out = ROT90.transpose() * vec_in;
+    }
 }
diff --git a/tests/compiletests/ui/lang/core/array/init_array_i16.rs b/tests/compiletests/ui/lang/core/array/init_array_i16.rs
index 61eb4b6b..ede3b2cd 100644
--- a/tests/compiletests/ui/lang/core/array/init_array_i16.rs
+++ b/tests/compiletests/ui/lang/core/array/init_array_i16.rs
@@ -6,5 +6,7 @@ use cuda_std::kernel;
 #[kernel]
 pub unsafe fn test_init_array_i16(o: *mut i16) {
     let array = [0i16; 4];
-    *o = array[1];
+    unsafe {
+        *o = array[1];
+    }
 }
diff --git a/tests/compiletests/ui/lang/core/array/init_array_i32.rs b/tests/compiletests/ui/lang/core/array/init_array_i32.rs
index 4efb24f7..59c5850c 100644
--- a/tests/compiletests/ui/lang/core/array/init_array_i32.rs
+++ b/tests/compiletests/ui/lang/core/array/init_array_i32.rs
@@ -6,5 +6,7 @@ use cuda_std::kernel;
 #[kernel]
 pub unsafe fn test_init_array_i32(o: *mut i32) {
     let array = [0i32; 4];
-    *o = array[1];
+    unsafe {
+        *o = array[1];
+    }
 }
diff --git a/tests/compiletests/ui/lang/core/array/init_array_i64.rs b/tests/compiletests/ui/lang/core/array/init_array_i64.rs
index 0c161014..7625cac5 100644
--- a/tests/compiletests/ui/lang/core/array/init_array_i64.rs
+++ b/tests/compiletests/ui/lang/core/array/init_array_i64.rs
@@ -6,5 +6,7 @@ use cuda_std::kernel;
 #[kernel]
 pub unsafe fn test_init_array_i64(o: *mut i64) {
     let array = [0i64; 4];
-    *o = array[1];
+    unsafe {
+        *o = array[1];
+    }
 }
diff --git a/tests/compiletests/ui/lang/core/array/init_array_i8.rs b/tests/compiletests/ui/lang/core/array/init_array_i8.rs
index b39b274f..a443bb7f 100644
--- a/tests/compiletests/ui/lang/core/array/init_array_i8.rs
+++ b/tests/compiletests/ui/lang/core/array/init_array_i8.rs
@@ -6,5 +6,7 @@ use cuda_std::kernel;
 #[kernel]
 pub unsafe fn test_init_array_i8(o: *mut i8) {
     let array = [0i8; 4];
-    *o = array[1];
+    unsafe {
+        *o = array[1];
+    }
 }
diff --git a/tests/compiletests/ui/lang/core/intrinsics/log10.rs b/tests/compiletests/ui/lang/core/intrinsics/log10.rs
index a133c7e8..f4f3c72c 100644
--- a/tests/compiletests/ui/lang/core/intrinsics/log10.rs
+++ b/tests/compiletests/ui/lang/core/intrinsics/log10.rs
@@ -10,5 +10,7 @@ use cuda_std::GpuFloat;
 
 #[kernel]
 pub unsafe fn test_log10(input: *const f32, output: *mut f32) {
-    *output = (*input).log10();
+    unsafe {
+        *output = (*input).log10();
+    }
 }
diff --git a/tests/compiletests/ui/lang/core/unwrap_or.rs b/tests/compiletests/ui/lang/core/unwrap_or.rs
index 1deb6858..a6b54c7f 100644
--- a/tests/compiletests/ui/lang/core/unwrap_or.rs
+++ b/tests/compiletests/ui/lang/core/unwrap_or.rs
@@ -8,5 +8,7 @@ use cuda_std::kernel;
 
 #[kernel]
 pub unsafe fn test_unwrap_or(out: *mut u32) {
-    *out = None.unwrap_or(15);
+    unsafe {
+        *out = None.unwrap_or(15);
+    }
 }
diff --git a/tests/compiletests/ui/lang/f32/signum.rs b/tests/compiletests/ui/lang/f32/signum.rs
index 09beaa23..8f7d6e02 100644
--- a/tests/compiletests/ui/lang/f32/signum.rs
+++ b/tests/compiletests/ui/lang/f32/signum.rs
@@ -5,5 +5,7 @@ use cuda_std::kernel;
 
 #[kernel]
 pub unsafe fn test_signum(i: f32, o: *mut f32) {
-    *o = i.signum();
+    unsafe {
+        *o = i.signum();
+    }
 }
diff --git a/tests/compiletests/ui/lang/u32/bit_reverse.rs b/tests/compiletests/ui/lang/u32/bit_reverse.rs
index f8a8379f..188319f7 100644
--- a/tests/compiletests/ui/lang/u32/bit_reverse.rs
+++ b/tests/compiletests/ui/lang/u32/bit_reverse.rs
@@ -6,40 +6,56 @@ use cuda_std::kernel;
 
 #[kernel]
 pub unsafe fn reverse_bits_u8(buffer: *const u8, out: *mut u8) {
-    *out = (*buffer).reverse_bits();
+    unsafe {
+        *out = (*buffer).reverse_bits();
+    }
 }
 
 #[kernel]
 pub unsafe fn reverse_bits_u16(buffer: *const u16, out: *mut u16) {
-    *out = (*buffer).reverse_bits();
+    unsafe {
+        *out = (*buffer).reverse_bits();
+    }
 }
 
 #[kernel]
 pub unsafe fn reverse_bits_u32(buffer: *const u32, out: *mut u32) {
-    *out = (*buffer).reverse_bits();
+    unsafe {
+        *out = (*buffer).reverse_bits();
+    }
 }
 
 #[kernel]
 pub unsafe fn reverse_bits_u64(buffer: *const u64, out: *mut u64) {
-    *out = (*buffer).reverse_bits();
+    unsafe {
+        *out = (*buffer).reverse_bits();
+    }
 }
 
 #[kernel]
 pub unsafe fn reverse_bits_i8(buffer: *const i8, out: *mut i8) {
-    *out = (*buffer).reverse_bits();
+    unsafe {
+        *out = (*buffer).reverse_bits();
+    }
 }
 
 #[kernel]
 pub unsafe fn reverse_bits_i16(buffer: *const i16, out: *mut i16) {
-    *out = (*buffer).reverse_bits();
+    unsafe {
+        *out = (*buffer).reverse_bits();
+    }
 }
 
 #[kernel]
 pub unsafe fn reverse_bits_i32(buffer: *const i32, out: *mut i32) {
-    *out = (*buffer).reverse_bits();
+    unsafe {
+        *out = (*buffer).reverse_bits();
+    }
 }
 
 #[kernel]
 pub unsafe fn reverse_bits_i64(buffer: *const i64, out: *mut i64) {
-    *out = (*buffer).reverse_bits();
+    unsafe {
+        *out = (*buffer).reverse_bits();
+    }
 }
diff --git a/tests/compiletests/ui/lang/u32/count_ones.rs b/tests/compiletests/ui/lang/u32/count_ones.rs
index cbb36fcc..310eb508 100644
--- a/tests/compiletests/ui/lang/u32/count_ones.rs
+++ b/tests/compiletests/ui/lang/u32/count_ones.rs
@@ -6,40 +6,56 @@ use cuda_std::kernel;
 
 #[kernel]
 pub unsafe fn count_ones_u8(buffer: *const u8, out: *mut u32) {
-    *out = (*buffer).count_ones();
+    unsafe {
+        *out = (*buffer).count_ones();
+    }
 }
 
 #[kernel]
 pub unsafe fn count_ones_u16(buffer: *const u16, out: *mut u32) {
-    *out = (*buffer).count_ones();
+    unsafe {
+        *out = (*buffer).count_ones();
+    }
 }
 
 #[kernel]
 pub unsafe fn count_ones_u32(buffer: *const u32, out: *mut u32) {
-    *out = (*buffer).count_ones();
+    unsafe {
+        *out = (*buffer).count_ones();
+    }
 }
 
 #[kernel]
 pub unsafe fn count_ones_u64(buffer: *const u64, out: *mut u32) {
-    *out = (*buffer).count_ones();
+    unsafe {
+        *out = (*buffer).count_ones();
+    }
 }
 
 #[kernel]
 pub unsafe fn count_ones_i8(buffer: *const i8, out: *mut u32) {
-    *out = (*buffer).count_ones();
+    unsafe {
+        *out = (*buffer).count_ones();
+    }
 }
 
 #[kernel]
 pub unsafe fn count_ones_i16(buffer: *const i16, out: *mut u32) {
-    *out = (*buffer).count_ones();
+    unsafe {
+        *out = (*buffer).count_ones();
+    }
 }
 
 #[kernel]
 pub unsafe fn count_ones_i32(buffer: *const i32, out: *mut u32) {
-    *out = (*buffer).count_ones();
+    unsafe {
+        *out = (*buffer).count_ones();
+    }
 }
 
 #[kernel]
 pub unsafe fn count_ones_i64(buffer: *const i64, out: *mut u32) {
-    *out = (*buffer).count_ones();
+    unsafe {
+        *out = (*buffer).count_ones();
+    }
 }
diff --git a/tests/compiletests/ui/shared/shared_memory.rs b/tests/compiletests/ui/shared/shared_memory.rs
index 545930be..a9ed3b7b 100644
--- a/tests/compiletests/ui/shared/shared_memory.rs
+++ b/tests/compiletests/ui/shared/shared_memory.rs
@@ -16,13 +16,13 @@ pub unsafe fn test_static_shared_memory() {
     let tid = thread::thread_idx_x() as usize;
 
     // Write to shared memory
-    SHARED_DATA[tid].write(tid as i32);
+    unsafe { SHARED_DATA[tid] }.write(tid as i32);
 
     // Synchronize threads before reading
     thread::sync_threads();
 
     // Read from shared memory
-    let _value = SHARED_DATA[tid].assume_init();
+    let _value = unsafe { SHARED_DATA[tid].assume_init() };
 }
 
 #[kernel]
diff --git a/tests/compiletests/ui/warp/warp_functions.rs b/tests/compiletests/ui/warp/warp_functions.rs
index 32af2bec..2c0eea27 100644
--- a/tests/compiletests/ui/warp/warp_functions.rs
+++ b/tests/compiletests/ui/warp/warp_functions.rs
@@ -6,15 +6,17 @@ use cuda_std::warp;
 
 #[kernel]
 pub unsafe fn test_warp_functions() {
-    // Test lane ID function
-    let _lane = warp::lane_id();
+    unsafe {
+        // Test lane ID function
+        let _lane = warp::lane_id();
 
-    // Test active mask function
-    let _mask = warp::activemask();
+        // Test active mask function
+        let _mask = warp::activemask();
 
-    // Test warp sync with full mask
-    warp::sync_warp(0xFFFFFFFF);
+        // Test warp sync with full mask
+        warp::sync_warp(0xFFFFFFFF);
 
-    // Test warp sync with partial mask
-    warp::sync_warp(0x0000FFFF);
+        // Test warp sync with partial mask
+        warp::sync_warp(0x0000FFFF);
+    }
 }
diff --git a/tests/compiletests/ui/warp/warp_shuffle.rs b/tests/compiletests/ui/warp/warp_shuffle.rs
index dca08649..bb25f530 100644
--- a/tests/compiletests/ui/warp/warp_shuffle.rs
+++ b/tests/compiletests/ui/warp/warp_shuffle.rs
@@ -10,7 +10,7 @@ pub unsafe fn test_warp_shuffle_functions() {
     let width = 32_u32; // Full warp width
 
     // Test warp_shuffle_xor with various types
-    {
+    unsafe {
         // 8-bit types
         let val_i8: i8 = 42;
         let (res_i8, pred_i8) = warp::warp_shuffle_xor(mask, val_i8, 1, width);
@@ -54,7 +54,7 @@ pub unsafe fn test_warp_shuffle_functions() {
     }
 
     // Test warp_shuffle_down with various types
-    {
+    unsafe {
         let delta = 1_u32;
 
         let val_i32: i32 = 42;
@@ -77,7 +77,7 @@ pub unsafe fn test_warp_shuffle_functions() {
     }
 
     // Test warp_shuffle_up with various types
-    {
+    unsafe {
         let delta = 1_u32;
 
         let val_i32: i32 = 42;
@@ -100,7 +100,7 @@ pub unsafe fn test_warp_shuffle_functions() {
     }
 
     // Test warp_shuffle_idx with various types
-    {
+    unsafe {
         let idx = 5_u32;
 
         let val_i32: i32 = 42;
@@ -123,14 +123,14 @@ pub unsafe fn test_warp_shuffle_functions() {
     }
 
     // Test with different mask values
-    {
+    unsafe {
         let partial_mask = 0x0000FFFF_u32; // Lower 16 lanes
         let val: i32 = 123;
         let (res, pred) = warp::warp_shuffle_xor(partial_mask, val, 1, width);
     }
 
     // Test with different width values (must be power of 2 and <= 32)
-    {
+    unsafe {
         let val: i32 = 456;
         let lane_mask = 1_u32;
 
@@ -149,7 +149,7 @@ pub unsafe fn test_warp_shuffle_functions() {
 
     // Test with half-precision floating point types (if available)
     #[cfg(feature = "half")]
-    {
+    unsafe {
         use half::{bf16, f16};
 
         let val_f16 = f16::from_f32(1.5);
@@ -166,31 +166,31 @@ pub unsafe fn test_warp_shuffle_edge_cases() {
     let mask = 0xFFFFFFFF_u32;
 
     // Test with lane_mask = 0 (should shuffle with same lane)
-    {
+    unsafe {
         let val: i32 = 999;
         let (res, pred) = warp::warp_shuffle_xor(mask, val, 0, 32);
     }
 
     // Test with maximum lane_mask
-    {
+    unsafe {
         let val: i32 = 888;
         let (res, pred) = warp::warp_shuffle_xor(mask, val, 31, 32);
     }
 
     // Test shuffle_down with delta = 0
-    {
+    unsafe {
         let val: i32 = 777;
         let (res, pred) = warp::warp_shuffle_down(mask, val, 0, 32);
     }
 
     // Test shuffle_up with delta = 0
-    {
+    unsafe {
         let val: i32 = 666;
         let (res, pred) = warp::warp_shuffle_up(mask, val, 0, 32);
     }
 
     // Test shuffle_idx with idx = 0 and idx = 31
-    {
+    unsafe {
         let val: i32 = 555;
         let (res0, pred0) = warp::warp_shuffle_idx(mask, val, 0, 32);
         let (res31, pred31) = warp::warp_shuffle_idx(mask, val, 31, 32);
@@ -204,7 +204,7 @@ pub unsafe fn test_warp_shuffle_practical() {
     let mask = 0xFFFFFFFF_u32;
 
     // Butterfly reduction pattern using XOR shuffle
-    {
+    unsafe {
         let mut val = lane_id as i32;
 
         // Stage 1: XOR with distance 16
@@ -229,19 +229,19 @@ pub unsafe fn test_warp_shuffle_practical() {
     }
 
     // Broadcast from lane 0 using shuffle_idx
-    {
+    unsafe {
         let my_val = lane_id * 10;
         let (broadcast_val, is_valid) = warp::warp_shuffle_idx(mask, my_val, 0, 32);
     }
 
     // Shift pattern using shuffle_down
-    {
+    unsafe {
         let my_val = lane_id as f32;
         let (shifted_val, is_valid) = warp::warp_shuffle_down(mask, my_val, 1, 32);
     }
 
     // Reverse shift using shuffle_up
-    {
+    unsafe {
         let my_val = (31 - lane_id) as f32;
         let (shifted_val, is_valid) = warp::warp_shuffle_up(mask, my_val, 1, 32);
     }
diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml
index 2e8fff28..7467646b 100644
--- a/xtask/Cargo.toml
+++ b/xtask/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "xtask"
 version = "0.0.0"
-edition = "2021"
+edition = "2024"
 license = "MIT"
 
 [dependencies]