diff --git a/.gitignore b/.gitignore index b8c715a199..3a37949d38 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,5 @@ pgo-data.profdata docs/**bench** **/*/libposeidon-permute-c-mac.a -**/*/go-iden3-crypto/ \ No newline at end of file +**/*/go-iden3-crypto/ +.vscode diff --git a/.gitmodules b/.gitmodules index dc862bf2bb..79586d66ea 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "depends/cryptography_cuda"] path = depends/cryptography_cuda url = git@github.com:okx/cryptography_cuda.git + branch =5c5c3ca7987125507c8eb17572ecb355ffed2256 \ No newline at end of file diff --git a/depends/cryptography_cuda b/depends/cryptography_cuda index f22980c5eb..5c5c3ca798 160000 --- a/depends/cryptography_cuda +++ b/depends/cryptography_cuda @@ -1 +1 @@ -Subproject commit f22980c5eb81301d56fa39baf777c57c19a97102 +Subproject commit 5c5c3ca7987125507c8eb17572ecb355ffed2256 diff --git a/field/examples/fft.rs b/field/examples/fft.rs index a02ff26283..a4a7e72098 100644 --- a/field/examples/fft.rs +++ b/field/examples/fft.rs @@ -15,7 +15,7 @@ fn main() { let domain_size = 1usize << 10; let v: Vec = (0..domain_size).map(|_| random_fr()).collect(); - let mut buffer = v.clone(); + let buffer = v.clone(); let coeffs = buffer .iter() diff --git a/field/src/fft.rs b/field/src/fft.rs index 7c84949c13..912c94da5e 100644 --- a/field/src/fft.rs +++ b/field/src/fft.rs @@ -2,7 +2,7 @@ use alloc::vec::Vec; use core::cmp::{max, min}; #[cfg(feature = "cuda")] -use cryptography_cuda::{intt, ntt, types::NTTInputOutputOrder}; +use cryptography_cuda::{ntt, types::NTTInputOutputOrder}; use plonky2_util::{log2_strict, reverse_index_bits_in_place}; use unroll::unroll_for_loops; @@ -34,6 +34,7 @@ pub fn fft_root_table(n: usize) -> FftRootTable { root_table } +#[allow(dead_code)] #[cfg(feature = "cuda")] fn fft_dispatch_gpu( input: &mut [F], diff --git a/field/src/lib.rs b/field/src/lib.rs index e0454e52d4..fdb058cc9d 100644 --- a/field/src/lib.rs +++ b/field/src/lib.rs @@ -80,7 
+80,7 @@ lazy_static! { #[cfg(test)] mod test { - use super::*; + #[cfg(feature = "precompile")] #[test] diff --git a/plonky2/src/fri/oracle.rs b/plonky2/src/fri/oracle.rs index ae657eebb4..99197093c8 100644 --- a/plonky2/src/fri/oracle.rs +++ b/plonky2/src/fri/oracle.rs @@ -3,8 +3,7 @@ use alloc::vec::Vec; #[cfg(feature = "cuda")] use cryptography_cuda::{ - device::memory::HostOrDeviceSlice, device::stream::CudaStream, intt_batch, lde_batch, - ntt_batch, transpose_rev_batch, types::*, + device::memory::HostOrDeviceSlice, lde_batch_multi_gpu, transpose_rev_batch, types::*, }; use itertools::Itertools; use plonky2_field::types::Field; @@ -140,10 +139,12 @@ impl, C: GenericConfig, const D: usize> fft_root_table: Option<&FftRootTable>, ) -> Self { let degree = polynomials[0].len(); - let log_n = log2_strict(degree) + rate_bits; #[cfg(feature = "cuda")] - if(log_n > 10 && polynomials.len() > 0){ + let log_n = log2_strict(degree); + + #[cfg(feature = "cuda")] + if log_n + rate_bits > 10 && polynomials.len() > 0 { let lde_values = Self::from_coeffs_gpu( &polynomials, rate_bits, @@ -155,30 +156,10 @@ impl, C: GenericConfig, const D: usize> degree ); - let num_gpus: usize = std::env::var("NUM_OF_GPUS") + let _num_gpus: usize = std::env::var("NUM_OF_GPUS") .expect("NUM_OF_GPUS should be set") .parse() .unwrap(); - - - if num_gpus != 1 { - let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values)); - reverse_index_bits_in_place(&mut leaves); - - let merkle_tree = timed!( - timing, - "build Merkle tree", - MerkleTree::new(leaves, cap_height) - ); - - return Self { - polynomials, - merkle_tree, - degree_log: log2_strict(degree), - rate_bits, - blinding, - }; - } let merkle_tree = timed!( timing, @@ -223,15 +204,18 @@ impl, C: GenericConfig, const D: usize> polynomials: &[PolynomialCoeffs], rate_bits: usize, blinding: bool, - cap_height: usize, - timing: &mut TimingTree, - fft_root_table: Option<&FftRootTable>, + _cap_height: usize, + _timing: &mut TimingTree, + 
_fft_root_table: Option<&FftRootTable>, log_n: usize, - degree: usize + _degree: usize )-> Vec>{ // If blinding, salt with two random elements to each leaf vector. + + let salt_size = if blinding { SALT_SIZE } else { 0 }; println!("salt_size: {:?}", salt_size); + let output_domain_size = log_n + rate_bits; let num_gpus: usize = std::env::var("NUM_OF_GPUS") .expect("NUM_OF_GPUS should be set") @@ -242,134 +226,85 @@ impl, C: GenericConfig, const D: usize> println!("get num of gpus: {:?}", num_gpus); let total_num_of_fft = polynomials.len(); println!("total_num_of_fft: {:?}", total_num_of_fft); - let per_device_batch = total_num_of_fft.div_ceil(num_gpus); - let chunk_size = total_num_of_fft.div_ceil(num_gpus); + + let total_num_input_elements = total_num_of_fft * (1 << log_n); + let total_num_output_elements = total_num_of_fft * (1 << output_domain_size); let start_lde = std::time::Instant::now(); // let poly_chunk = polynomials; // let id = 0; - let ret = polynomials - .par_chunks(chunk_size) - .enumerate() - .flat_map(|(id, poly_chunk)| { - - println!( - "invoking ntt_batch, device_id: {:?}, per_device_batch: {:?}", - id, per_device_batch - ); - - let start = std::time::Instant::now(); - - let input_domain_size = 1 << log2_strict(degree); - let device_input_data: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(id as i32, input_domain_size * polynomials.len()) - .unwrap(); - let device_input_data = std::sync::RwLock::new(device_input_data); - - poly_chunk.par_iter().enumerate().for_each(|(i, p)| { - // println!("copy for index: {:?}", i); - let _guard = device_input_data.read().unwrap(); - _guard.copy_from_host_offset( - p.coeffs.as_slice(), - input_domain_size * i, - input_domain_size, - ); - }); - - println!("data transform elapsed: {:?}", start.elapsed()); - let mut cfg_lde = NTTConfig::default(); - cfg_lde.batches = per_device_batch as u32; + + let mut gpu_input: Vec = polynomials + .into_iter() + .flat_map( + |v| + v.coeffs.iter().cloned() + ) + 
.collect(); + + let mut cfg_lde = NTTConfig::default(); + cfg_lde.batches = total_num_of_fft as u32; cfg_lde.extension_rate_bits = rate_bits as u32; - cfg_lde.are_inputs_on_device = true; + cfg_lde.are_inputs_on_device = false; cfg_lde.are_outputs_on_device = true; cfg_lde.with_coset = true; - println!( - "start cuda_malloc with elements: {:?}", - (1 << log_n) * per_device_batch - ); - let mut device_output_data: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch) - .unwrap(); - - let start = std::time::Instant::now(); - lde_batch::( - id, - device_output_data.as_mut_ptr(), - device_input_data.read().unwrap().as_ptr(), - log2_strict(degree), - cfg_lde, - ); - - println!("real lde_batch elapsed: {:?}", start.elapsed()); - - if num_gpus == 1 { - let mut device_transpose_data: HostOrDeviceSlice<'_, F> = - HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch) - .unwrap(); - - let mut cfg_trans = TransposeConfig::default(); - cfg_trans.batches = per_device_batch as u32; - cfg_trans.are_inputs_on_device = true; - cfg_trans.are_outputs_on_device = true; - - let start = std::time::Instant::now(); - transpose_rev_batch( - id as i32, - device_transpose_data.as_mut_ptr(), - device_output_data.as_mut_ptr(), - log_n, - cfg_trans - ); - - println!("real transpose_rev_batch elapsed: {:?}", start.elapsed()); - - let start = std::time::Instant::now(); - let nums: Vec = (0..(1< = vec![F::ZERO; per_device_batch]; - device_transpose_data.copy_to_host_offset( - host_data.as_mut_slice(), - per_device_batch * i, - per_device_batch, - ); - PolynomialValues::new(host_data).values - }) - .collect::>>(); - println!("collect data from gpu used: {:?}", start.elapsed()); - return r; - } - - let start = std::time::Instant::now(); - let nums: Vec = (0..poly_chunk.len()).collect(); - - let r = nums - .par_iter() - .map(|i| { - let mut host_data: Vec = vec![F::ZERO; 1 << log_n]; - device_output_data.copy_to_host_offset( - 
host_data.as_mut_slice(), - (1 << log_n) * i, - 1 << log_n, - ); - PolynomialValues::new(host_data).values - }) - .collect::>>(); - println!("collect data from gpu used: {:?}", start.elapsed()); - return r; - - }) - // .chain( - // (0..salt_size) - // .into_par_iter() - // .map(|_| F::rand_vec(degree << rate_bits)), - // ) - .collect(); - println!("real lde elapsed: {:?}", start_lde.elapsed()); - return ret; + cfg_lde.is_multi_gpu = true; + + + let mut device_output_data: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements).unwrap(); + + lde_batch_multi_gpu::( + device_output_data.as_mut_ptr(), + gpu_input.as_mut_ptr(), + num_gpus, + cfg_lde.clone(), + log_n, + total_num_input_elements, + total_num_output_elements, + ); + + println!("real lde_batch elapsed: {:?}", start_lde.elapsed()); + + let mut cfg_trans = TransposeConfig::default(); + cfg_trans.batches = total_num_of_fft as u32; + cfg_trans.are_inputs_on_device = true; + cfg_trans.are_outputs_on_device = true; + + let mut device_transpose_data: HostOrDeviceSlice<'_, F> = + HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements) + .unwrap(); + + let start = std::time::Instant::now(); + transpose_rev_batch( + 0 as i32, + device_transpose_data.as_mut_ptr(), + device_output_data.as_mut_ptr(), + output_domain_size, + cfg_trans + ); + + println!("real transpose_rev_batch elapsed: {:?}", start.elapsed()); + + let start = std::time::Instant::now(); + let nums: Vec = (0..(1<< output_domain_size)).collect(); + let r = nums + .par_iter() + .map(|i| { + let mut host_data: Vec = vec![F::ZERO; total_num_of_fft]; + device_transpose_data.copy_to_host_offset( + host_data.as_mut_slice(), + total_num_of_fft * i, /* start of row i: each row holds total_num_of_fft values */ + total_num_of_fft, + ).expect("copy to host error"); + PolynomialValues::new(host_data).values + }) + .collect::>>(); + println!("collect data from gpu used: {:?}", start.elapsed()); + println!("real lde elapsed: {:?}", start_lde.elapsed()); + return r; } fn lde_values( diff --git
a/plonky2/src/gates/gate.rs b/plonky2/src/gates/gate.rs index 82fc412f32..a077d0ba6b 100644 --- a/plonky2/src/gates/gate.rs +++ b/plonky2/src/gates/gate.rs @@ -278,7 +278,7 @@ pub struct PrefixedGate, const D: usize> { } /// A gate's filter designed so that it is non-zero if `s = row`. -fn compute_filter(row: usize, group_range: Range, s: K, many_selector: bool) -> K { +pub fn compute_filter(row: usize, group_range: Range, s: K, many_selector: bool) -> K { debug_assert!(group_range.contains(&row)); group_range .filter(|&i| i != row) @@ -287,7 +287,7 @@ fn compute_filter(row: usize, group_range: Range, s: K, many_se .product() } -fn compute_filter_circuit, const D: usize>( +pub fn compute_filter_circuit, const D: usize>( builder: &mut CircuitBuilder, row: usize, group_range: Range, diff --git a/plonky2/tests/factorial_test.rs b/plonky2/tests/factorial_test.rs new file mode 100644 index 0000000000..84ddb3beb4 --- /dev/null +++ b/plonky2/tests/factorial_test.rs @@ -0,0 +1,50 @@ + +use plonky2::field::types::Field; +use plonky2::iop::witness::{PartialWitness, WitnessWrite}; +use plonky2::plonk::circuit_builder::CircuitBuilder; +use plonky2::plonk::circuit_data::CircuitConfig; +use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; + +#[cfg(feature = "cuda")] +use crate::test_utils::init_cuda; +#[cfg(feature = "cuda")] +pub mod test_utils; + +#[test] +fn test_factorial_proof(){ + + #[cfg(feature = "cuda")] + init_cuda(); + + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let config = CircuitConfig::standard_recursion_config(); + let mut builder = CircuitBuilder::::new(config); + + // The arithmetic circuit. + let initial = builder.add_virtual_target(); + let mut cur_target = initial; + for i in 2..101 { + let i_target = builder.constant(F::from_canonical_u32(i)); + cur_target = builder.mul(cur_target, i_target); + } + + // Public inputs are the initial value (provided below) and the result (which is generated). 
+ builder.register_public_input(initial); + builder.register_public_input(cur_target); + + let mut pw = PartialWitness::new(); + pw.set_target(initial, F::ONE); + + let data = builder.build::(); + let proof = data.prove(pw).unwrap(); + + println!( + "Factorial starting at {} is {}", + proof.public_inputs[0], proof.public_inputs[1] + ); + + data.verify(proof).expect("factorial proof failed to verify"); +} diff --git a/plonky2/tests/fibonacci_test.rs b/plonky2/tests/fibonacci_test.rs new file mode 100644 index 0000000000..6b09ca6cda --- /dev/null +++ b/plonky2/tests/fibonacci_test.rs @@ -0,0 +1,48 @@ +use plonky2::field::types::Field; +use plonky2::iop::witness::{PartialWitness, WitnessWrite}; +use plonky2::plonk::circuit_builder::CircuitBuilder; +use plonky2::plonk::circuit_data::CircuitConfig; +use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; +#[cfg(feature = "cuda")] +use crate::test_utils::init_cuda; +#[cfg(feature = "cuda")] +pub mod test_utils; + +#[test] +fn test_fibonacci_proof() { + + #[cfg(feature = "cuda")] + init_cuda(); + + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let config = CircuitConfig::standard_recursion_config(); + let mut builder = CircuitBuilder::::new(config); + + // The arithmetic circuit. NOTE(review): this multiplies by 2..=100 (a factorial), not a Fibonacci recurrence. + let initial = builder.add_virtual_target(); + let mut cur_target = initial; + for i in 2..101 { + let i_target = builder.constant(F::from_canonical_u32(i)); + cur_target = builder.mul(cur_target, i_target); + } + + // Public inputs are the initial value (provided below) and the result (which is generated).
+ builder.register_public_input(initial); + builder.register_public_input(cur_target); + + let mut pw = PartialWitness::new(); + pw.set_target(initial, F::ONE); + + let data = builder.build::(); + let proof = data.prove(pw).unwrap(); + + println!( + "Factorial starting at {} is {}", + proof.public_inputs[0], proof.public_inputs[1] + ); + + data.verify(proof).expect("fibonacci_test proof failed to verify"); +} \ No newline at end of file diff --git a/plonky2/tests/range_check_test.rs b/plonky2/tests/range_check_test.rs new file mode 100644 index 0000000000..f5f83d9c67 --- /dev/null +++ b/plonky2/tests/range_check_test.rs @@ -0,0 +1,39 @@ + +use plonky2::field::types::Field; +use plonky2::iop::witness::{PartialWitness, WitnessWrite}; +use plonky2::plonk::circuit_builder::CircuitBuilder; +use plonky2::plonk::circuit_data::CircuitConfig; +use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig}; +#[cfg(feature = "cuda")] +use crate::test_utils::init_cuda; +#[cfg(feature = "cuda")] +pub mod test_utils; + +#[test] +fn test_range_check_proof() { + #[cfg(feature = "cuda")] + init_cuda(); + + const D: usize = 2; + type C = PoseidonGoldilocksConfig; + type F = >::F; + + let config = CircuitConfig::standard_recursion_config(); + let mut builder = CircuitBuilder::::new(config); + + // The secret value.
+ let value = builder.add_virtual_target(); + builder.register_public_input(value); + + let log_max = 6; + builder.range_check(value, log_max); + + let mut pw = PartialWitness::new(); + pw.set_target(value, F::from_canonical_usize(42)); + + let data = builder.build::(); + let proof = data.prove(pw).unwrap(); + + + data.verify(proof).expect("range-check proof failed to verify"); +} diff --git a/plonky2/tests/test_utils.rs b/plonky2/tests/test_utils.rs new file mode 100644 index 0000000000..1eb3a7cdda --- /dev/null +++ b/plonky2/tests/test_utils.rs @@ -0,0 +1,26 @@ +#[cfg(feature = "cuda")] +pub fn init_cuda() { + use plonky2_field::goldilocks_field::GoldilocksField; + use plonky2_field::types::Field; + use plonky2_field::types::PrimeField64; + + use cryptography_cuda::{ + get_number_of_gpus_rs, init_twiddle_factors_rs, init_coset_rs, + }; + + let num_of_gpus = get_number_of_gpus_rs(); + println!("num of gpus: {:?}", num_of_gpus); + std::env::set_var("NUM_OF_GPUS", num_of_gpus.to_string()); + + let log_ns: Vec = (6..22).collect(); + + let mut device_id = 0; + while device_id < num_of_gpus { + init_coset_rs(device_id, 24, GoldilocksField::coset_shift().to_canonical_u64()); + for log_n in &log_ns { + // println!("{:?}", log_n); + init_twiddle_factors_rs(device_id, *log_n); + } + device_id = device_id + 1; + } +} \ No newline at end of file diff --git a/run_proof_tests.sh b/run_proof_tests.sh new file mode 100755 index 0000000000..d1ac722f9b --- /dev/null +++ b/run_proof_tests.sh @@ -0,0 +1,4 @@ +#!/bin/sh -e +cargo test --package plonky2 --features=cuda,batch --release --test fibonacci_test -- test_fibonacci_proof --exact --nocapture +cargo test --package plonky2 --features=cuda,batch --release --test range_check_test -- test_range_check_proof --exact --nocapture +cargo test --package plonky2 --features=cuda,batch --release --test factorial_test -- test_factorial_proof --exact --nocapture