diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index 16cd6a1564..927128c8d8 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -4,7 +4,7 @@ use alloc::vec::Vec; #[cfg(feature = "cuda")] use cryptography_cuda::{ device::memory::HostOrDeviceSlice, device::stream::CudaStream, intt_batch, lde_batch, - ntt_batch, types::*, + ntt_batch, transpose_rev_batch, types::*, }; use itertools::Itertools; use plonky2_field::types::Field; @@ -140,6 +140,59 @@ impl, C: GenericConfig, const D: usize> fft_root_table: Option<&FftRootTable>, ) -> Self { let degree = polynomials[0].len(); + let log_n = log2_strict(degree) + rate_bits; + let num_gpus: usize = std::env::var("NUM_OF_GPUS") + .expect("NUM_OF_GPUS should be set") + .parse() + .unwrap(); + + #[cfg(feature = "cuda")] + if(log_n > 10 && polynomials.len() > 0){ + let lde_values = Self::from_coeffs_gpu( + &polynomials, + rate_bits, + blinding, + cap_height, + timing, + fft_root_table, + log_n, + degree + ); + + if num_gpus != 1 { + let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values)); + reverse_index_bits_in_place(&mut leaves); + + let merkle_tree = timed!( + timing, + "build Merkle tree", + MerkleTree::new(leaves, cap_height) + ); + + return Self { + polynomials, + merkle_tree, + degree_log: log2_strict(degree), + rate_bits, + blinding, + }; + } + + let merkle_tree = timed!( + timing, + "build Merkle tree", + MerkleTree::new(lde_values, cap_height) + ); + + return Self { + polynomials, + merkle_tree, + degree_log: log2_strict(degree), + rate_bits, + blinding, + }; + } + let lde_values = timed!( timing, "FFT + blinding", @@ -163,120 +216,168 @@ impl, C: GenericConfig, const D: usize> } } - fn lde_values( + #[cfg(feature = "cuda")] + pub fn from_coeffs_gpu( polynomials: &[PolynomialCoeffs], rate_bits: usize, blinding: bool, + cap_height: usize, + timing: &mut TimingTree, fft_root_table: Option<&FftRootTable>, - ) -> Vec> { - let degree = polynomials[0].len(); - #[cfg(all(feature = "cuda", feature = "batch"))] - let log_n = log2_strict(degree) + rate_bits; - + log_n: usize, + degree: usize + )-> Vec>{ // If blinding, salt with two random elements to each leaf vector. let salt_size = if blinding { SALT_SIZE } else { 0 }; println!("salt_size: {:?}", salt_size); - #[cfg(all(feature = "cuda", feature = "batch"))] let num_gpus: usize = std::env::var("NUM_OF_GPUS") .expect("NUM_OF_GPUS should be set") .parse() .unwrap(); - // let num_gpus: usize = 1; - #[cfg(all(feature = "cuda", feature = "batch"))] + // let num_gpus: usize = 1; + println!("get num of gpus: {:?}", num_gpus); let total_num_of_fft = polynomials.len(); println!("total_num_of_fft: {:?}", total_num_of_fft); - #[cfg(all(feature = "cuda", feature = "batch"))] let per_device_batch = total_num_of_fft.div_ceil(num_gpus); - - #[cfg(all(feature = "cuda", feature = "batch"))] let chunk_size = total_num_of_fft.div_ceil(num_gpus); - #[cfg(all(feature = "cuda", feature = "batch"))] - if (log_n > 10 && polynomials.len() > 0) { - let start_lde = std::time::Instant::now(); - - // let poly_chunk = polynomials; - // let id = 0; - let ret = polynomials - .par_chunks(chunk_size) - .enumerate() - .flat_map(|(id, poly_chunk)| { - - println!( - "invoking ntt_batch, device_id: {:?}, per_device_batch: {:?}", - id, per_device_batch - ); - - let start = std::time::Instant::now(); + let start_lde = std::time::Instant::now(); - let input_domain_size = 1 << log2_strict(degree); - let device_input_data: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(id as i32, input_domain_size * polynomials.len()) - .unwrap(); - let device_input_data = std::sync::RwLock::new(device_input_data); - - poly_chunk.par_iter().enumerate().for_each(|(i, p)| { - // println!("copy for index: {:?}", i); - let _guard = device_input_data.read().unwrap(); - _guard.copy_from_host_offset( - p.coeffs.as_slice(), - input_domain_size * i, - input_domain_size, - ); - }); - - println!("data transform elapsed: {:?}", start.elapsed()); - let mut cfg_lde = NTTConfig::default(); - cfg_lde.batches = per_device_batch as u32; - cfg_lde.extension_rate_bits = rate_bits as u32; - cfg_lde.are_inputs_on_device = true; - cfg_lde.are_outputs_on_device = true; - cfg_lde.with_coset = true; - println!( - "start cuda_malloc with elements: {:?}", - (1 << log_n) * per_device_batch + // let poly_chunk = polynomials; + // let id = 0; + let ret = polynomials + .par_chunks(chunk_size) + .enumerate() + .flat_map(|(id, poly_chunk)| { + + println!( + "invoking ntt_batch, device_id: {:?}, per_device_batch: {:?}", + id, per_device_batch + ); + + let start = std::time::Instant::now(); + + let input_domain_size = 1 << log2_strict(degree); + let device_input_data: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(id as i32, input_domain_size * polynomials.len()) + .unwrap(); + let device_input_data = std::sync::RwLock::new(device_input_data); + + poly_chunk.par_iter().enumerate().for_each(|(i, p)| { + // println!("copy for index: {:?}", i); + let _guard = device_input_data.read().unwrap(); + _guard.copy_from_host_offset( + p.coeffs.as_slice(), + input_domain_size * i, + input_domain_size, ); - let mut device_output_data: HostOrDeviceSlice<'_, F> = + }); + + println!("data transform elapsed: {:?}", start.elapsed()); + let mut cfg_lde = NTTConfig::default(); + cfg_lde.batches = per_device_batch as u32; + cfg_lde.extension_rate_bits = rate_bits as u32; + cfg_lde.are_inputs_on_device = true; + cfg_lde.are_outputs_on_device = true; + cfg_lde.with_coset = true; + println!( + "start cuda_malloc with elements: {:?}", + (1 << log_n) * per_device_batch + ); + let mut device_output_data: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch) + .unwrap(); + + let start = std::time::Instant::now(); + lde_batch::( + id, + device_output_data.as_mut_ptr(), + device_input_data.read().unwrap().as_ptr(), + log2_strict(degree), + cfg_lde, + ); + + println!("real lde_batch elapsed: {:?}", start.elapsed()); + + if num_gpus == 1 { + let mut device_transpose_data: HostOrDeviceSlice<'_, F> = HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch) .unwrap(); - let start = std::time::Instant::now(); - lde_batch::( - id, - device_output_data.as_mut_ptr(), - device_input_data.read().unwrap().as_ptr(), - log2_strict(degree), - cfg_lde, - ); - println!("real lde_batch elapsed: {:?}", start.elapsed()); - let start = std::time::Instant::now(); - let nums: Vec = (0..poly_chunk.len()).collect(); - let r = nums - .par_iter() - .map(|i| { - let mut host_data: Vec = vec![F::ZERO; 1 << log_n]; - device_output_data.copy_to_host_offset( - host_data.as_mut_slice(), - (1 << log_n) * i, - 1 << log_n, - ); - PolynomialValues::new(host_data).values - }) - .collect::>>(); - println!("collect data from gpu used: {:?}", start.elapsed()); - r - }) - // .chain( - // (0..salt_size) - // .into_par_iter() - // .map(|_| F::rand_vec(degree << rate_bits)), - // ) - .collect(); - println!("real lde elapsed: {:?}", start_lde.elapsed()); - return ret; - } + let mut cfg_trans = TransposeConfig::default(); + cfg_trans.batches = per_device_batch as u32; + cfg_trans.are_inputs_on_device = true; + cfg_trans.are_outputs_on_device = true; + transpose_rev_batch( + id as i32, + device_transpose_data.as_mut_ptr(), + device_output_data.as_mut_ptr(), + log2_strict(degree), + cfg_trans + ); + + let start = std::time::Instant::now(); + let nums: Vec = (0..poly_chunk.len()).collect(); + let r = nums + .par_iter() + .map(|i| { + let mut host_data: Vec = vec![F::ZERO; per_device_batch]; + device_transpose_data.copy_to_host_offset( + host_data.as_mut_slice(), + per_device_batch * i, + per_device_batch, + ); + PolynomialValues::new(host_data).values + }) + .collect::>>(); + println!("collect data from gpu used: {:?}", start.elapsed()); + return r; + } + + let start = std::time::Instant::now(); + let nums: Vec = (0..poly_chunk.len()).collect(); + + let r = nums + .par_iter() + .map(|i| { + let mut host_data: Vec = vec![F::ZERO; 1 << log_n]; + device_output_data.copy_to_host_offset( + host_data.as_mut_slice(), + (1 << log_n) * i, + 1 << log_n, + ); + PolynomialValues::new(host_data).values + }) + .collect::>>(); + println!("collect data from gpu used: {:?}", start.elapsed()); + return r; + + }) + // .chain( + // (0..salt_size) + // .into_par_iter() + // .map(|_| F::rand_vec(degree << rate_bits)), + // ) + .collect(); + println!("real lde elapsed: {:?}", start_lde.elapsed()); + return ret; + + } + + fn lde_values( + polynomials: &[PolynomialCoeffs], + rate_bits: usize, + blinding: bool, + fft_root_table: Option<&FftRootTable>, + ) -> Vec> { + let degree = polynomials[0].len(); + // If blinding, salt with two random elements to each leaf vector. + let salt_size = if blinding { SALT_SIZE } else { 0 }; + println!("salt_size: {:?}", salt_size); + let total_num_of_fft = polynomials.len(); + println!("total_num_of_fft: {:?}", total_num_of_fft); let ret = polynomials .par_iter()