diff --git a/Cargo.lock b/Cargo.lock index bb9fb845dbc..a4774689436 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -83,6 +83,12 @@ dependencies = [ "libc", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anstream" version = "0.6.21" @@ -1548,6 +1554,12 @@ dependencies = [ "libbz2-rs-sys", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "castaway" version = "0.2.4" @@ -1654,6 +1666,33 @@ dependencies = [ "phf 0.12.1", ] +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "cipher" version = "0.4.4" @@ -2068,6 +2107,39 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "itertools 0.13.0", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_json", + "tinytemplate", + "walkdir", +] + 
+[[package]] +name = "criterion-plot" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338" +dependencies = [ + "cast", + "itertools 0.13.0", +] + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -6465,6 +6537,12 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea" +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + [[package]] name = "opendal" version = "0.54.1" @@ -7116,6 +7194,34 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + [[package]] name = "polling" version = "3.11.0" @@ -9389,6 +9495,16 @@ dependencies = [ "zerovec", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 
+dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.10.0" @@ -10148,6 +10264,7 @@ name = "vortex-cuda" version = "0.1.0" dependencies = [ "async-trait", + "criterion", "cudarc", "tokio", "tracing", diff --git a/Cargo.toml b/Cargo.toml index 8f09ef253a7..6a60714b37d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,6 +87,7 @@ cc = "1.2" cfg-if = "1.0.1" chrono = "0.4.42" clap = "4.5" +criterion = "0.7" crossterm = "0.29" cudarc = { version = "0.18.2", features = [ # 12.8 matches the CUDA toolkit version pre-installed on the lambda.ai GPU base diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml index 900a7cc02e7..cf822f1ec02 100644 --- a/vortex-cuda/Cargo.toml +++ b/vortex-cuda/Cargo.toml @@ -29,6 +29,11 @@ vortex-session = { workspace = true } vortex-utils = { workspace = true } [dev-dependencies] +criterion = { workspace = true } tokio = { workspace = true, features = ["rt", "macros"] } [build-dependencies] + +[[bench]] +name = "for_cuda" +harness = false diff --git a/vortex-cuda/benches/for_cuda.rs b/vortex-cuda/benches/for_cuda.rs new file mode 100644 index 00000000000..b781168700a --- /dev/null +++ b/vortex-cuda/benches/for_cuda.rs @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow(clippy::unwrap_used)] +#![allow(clippy::cast_possible_truncation)] + +use std::mem::size_of; +use std::time::Duration; + +use criterion::BenchmarkId; +use criterion::Criterion; +use criterion::Throughput; +use criterion::criterion_group; +use criterion::criterion_main; +use cudarc::driver::PushKernelArg; +use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::PrimitiveArray; +use vortex_buffer::Buffer; +use vortex_cuda::CudaExecutionCtx; +use vortex_cuda::CudaSession; +use vortex_cuda::has_nvcc; +use vortex_error::VortexExpect; +use vortex_fastlanes::FoRArray; 
+use vortex_session::VortexSession;
+
+const BENCH_ARGS: &[(usize, &str)] = &[
+    (1_000, "1K"),
+    (10_000, "10K"),
+    (100_000, "100K"),
+    (1_000_000, "1M"),
+    (10_000_000, "10M"),
+    (100_000_000, "100M"),
+];
+
+/// Builds a FoR array over the `u32` values `0..len` with a reference value of 10.
+fn make_for_array(len: usize) -> FoRArray {
+    let primitive_array = PrimitiveArray::new(
+        Buffer::from((0u32..len as u32).collect::<Vec<u32>>()),
+        vortex_array::validity::Validity::NonNullable,
+    )
+    .into_array();
+
+    let for_offset = 10u32;
+
+    FoRArray::try_new(primitive_array, for_offset.into())
+        .vortex_expect("failed to create FoR array")
+}
+
+/// Launches the FoR kernel and returns the GPU time elapsed between the recorded events.
+fn launch_for_kernel_timed(
+    for_array: &FoRArray,
+    reference: u32,
+    device_data: cudarc::driver::CudaSlice<u32>,
+    cuda_ctx: &mut CudaExecutionCtx,
+) -> vortex_error::VortexResult<Duration> {
+    let array_len = for_array.len() as u64;
+
+    let events = vortex_cuda::launch_cuda_kernel!(
+        execution_ctx: cuda_ctx,
+        module: "for",
+        ptypes: &[for_array.ptype()],
+        launch_args: [device_data, reference, array_len],
+        event_recording: CU_EVENT_BLOCKING_SYNC,
+        array_len: for_array.len()
+    );
+
+    let elapsed_ms = events
+        .before_launch
+        .elapsed_ms(&events.after_launch) // synchronizes
+        .map_err(|e| vortex_error::vortex_err!("failed to get elapsed time: {}", e))?;
+
+    Ok(Duration::from_secs_f32(elapsed_ms / 1000.0))
+}
+
+fn benchmark_for_cuda(c: &mut Criterion) {
+    if !has_nvcc() {
+        eprintln!("nvcc not found, skipping CUDA benchmarks");
+        return;
+    }
+
+    let mut group = c.benchmark_group("FoR_cuda");
+    group.sample_size(10);
+
+    for (len, label) in BENCH_ARGS {
+        let for_array = make_for_array(*len);
+
+        group.throughput(Throughput::Bytes((len * size_of::<u32>()) as u64));
+        group.bench_with_input(
+            BenchmarkId::new("u32_FoR", label),
+            &for_array,
+            |b, for_array| {
+                b.iter_custom(|iters| {
+                    let mut cuda_ctx = CudaSession::new_ctx(VortexSession::empty())
+                        .vortex_expect("failed to create execution context");
+
+                    let encoded = for_array.encoded();
+                    let unpacked_array = encoded.to_primitive();
+                    let unpacked_slice = unpacked_array.as_slice::<u32>();
+
+                    let reference = 10u32;
+                    let mut total_time = Duration::ZERO;
+
+                    for _ in 0..iters {
+                        let device_data = cuda_ctx
+                            .to_device(unpacked_slice)
+                            .vortex_expect("failed to copy to device");
+
+                        let kernel_time = launch_for_kernel_timed(
+                            for_array,
+                            reference,
+                            device_data,
+                            &mut cuda_ctx,
+                        )
+                        .vortex_expect("kernel launch failed");
+
+                        total_time += kernel_time;
+                    }
+
+                    total_time
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, benchmark_for_cuda);
+criterion_main!(benches);
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
index bd2c90d9d92..ca798939493 100644
--- a/vortex-cuda/src/lib.rs
+++ b/vortex-cuda/src/lib.rs
@@ -3,16 +3,17 @@
 
 //! CUDA support for Vortex arrays.
 
-mod executor;
+pub mod executor;
 mod for_;
 mod kernel;
 mod session;
 
 use std::process::Command;
 
+pub use executor::CudaExecutionCtx;
 pub use executor::CudaKernelEvents;
 use for_::ForExecutor;
-use session::CudaSession;
+pub use session::CudaSession;
 
 /// Check if the NVIDIA CUDA Compiler is available.
 pub fn has_nvcc() -> bool {