Commit: merge with latest dev
dloghin committed Mar 20, 2024
2 parents a3f39c2 + 278b155 commit 1d46a15
Showing 11 changed files with 257 additions and 153 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -15,4 +15,4 @@ docs/**bench**
**/*/libposeidon-permute-c-mac.a
**/*/go-iden3-crypto/
node_modules
.vscode
.vscode
1 change: 1 addition & 0 deletions .gitmodules
@@ -1,3 +1,4 @@
[submodule "depends/cryptography_cuda"]
path = depends/cryptography_cuda
url = [email protected]:okx/cryptography_cuda.git
branch = 5c5c3ca7987125507c8eb17572ecb355ffed2256
2 changes: 1 addition & 1 deletion depends/cryptography_cuda
3 changes: 2 additions & 1 deletion field/src/fft.rs
@@ -2,7 +2,7 @@ use alloc::vec::Vec;
use core::cmp::{max, min};

#[cfg(feature = "cuda")]
use cryptography_cuda::{intt, ntt, types::NTTInputOutputOrder};
use cryptography_cuda::{ntt, types::NTTInputOutputOrder};
use plonky2_util::{log2_strict, reverse_index_bits_in_place};
use unroll::unroll_for_loops;

@@ -34,6 +34,7 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
root_table
}

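// Single-polynomial GPU dispatch, gated behind the `cuda` feature; the
// `#[allow(dead_code)]` below silences the warning now that the batched
// LDE path in fri/oracle.rs appears to bypass this helper.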
#[allow(dead_code)]
#[cfg(feature = "cuda")]
fn fft_dispatch_gpu<F: Field>(
input: &mut [F],
231 changes: 83 additions & 148 deletions plonky2/src/fri/oracle.rs
@@ -3,8 +3,7 @@ use alloc::vec::Vec;

#[cfg(feature = "cuda")]
use cryptography_cuda::{
device::memory::HostOrDeviceSlice, device::stream::CudaStream, intt_batch, lde_batch,
ntt_batch, transpose_rev_batch, types::*,
device::memory::HostOrDeviceSlice, lde_batch_multi_gpu, transpose_rev_batch, types::*,
};
use itertools::Itertools;
use plonky2_field::types::Field;
@@ -140,10 +139,12 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
fft_root_table: Option<&FftRootTable<F>>,
) -> Self {
let degree = polynomials[0].len();
let _log_n = log2_strict(degree) + rate_bits;

#[cfg(feature = "cuda")]
if(log_n > 10 && polynomials.len() > 0){
let log_n = log2_strict(degree);

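// Take the GPU path only for non-trivial sizes: the extended domain must
// exceed 2^10 elements and there must be at least one polynomial to commit to.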
#[cfg(feature = "cuda")]
if log_n + rate_bits > 10 && polynomials.len() > 0 {
let lde_values = Self::from_coeffs_gpu(
&polynomials,
rate_bits,
@@ -155,30 +156,10 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
degree
);

let num_gpus: usize = std::env::var("NUM_OF_GPUS")
let _num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
.unwrap();
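// Parsed here only to fail fast when NUM_OF_GPUS is unset; the value itself
// is consumed inside from_coeffs_gpu.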


if num_gpus != 1 {
let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values));
reverse_index_bits_in_place(&mut leaves);

let merkle_tree = timed!(
timing,
"build Merkle tree",
MerkleTree::new(leaves, cap_height)
);

return Self {
polynomials,
merkle_tree,
degree_log: log2_strict(degree),
rate_bits,
blinding,
};
}

let merkle_tree = timed!(
timing,
@@ -223,15 +204,18 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
polynomials: &[PolynomialCoeffs<F>],
rate_bits: usize,
blinding: bool,
cap_height: usize,
timing: &mut TimingTree,
fft_root_table: Option<&FftRootTable<F>>,
_cap_height: usize,
_timing: &mut TimingTree,
_fft_root_table: Option<&FftRootTable<F>>,
log_n: usize,
degree: usize
_degree: usize
) -> Vec<Vec<F>> {
// If blinding, salt with two random elements to each leaf vector.


let salt_size = if blinding { SALT_SIZE } else { 0 };
println!("salt_size: {:?}", salt_size);
let output_domain_size = log_n + rate_bits;

let num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
@@ -242,134 +226,85 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
println!("get num of gpus: {:?}", num_gpus);
let total_num_of_fft = polynomials.len();
println!("total_num_of_fft: {:?}", total_num_of_fft);
let per_device_batch = total_num_of_fft.div_ceil(num_gpus);
let chunk_size = total_num_of_fft.div_ceil(num_gpus);

let total_num_input_elements = total_num_of_fft * (1 << log_n);
let total_num_output_elements = total_num_of_fft * (1 << output_domain_size);
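// Each polynomial contributes 2^log_n input coefficients and
// 2^(log_n + rate_bits) output evaluations to these batch totals.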

let start_lde = std::time::Instant::now();

// let poly_chunk = polynomials;
// let id = 0;
let ret = polynomials
.par_chunks(chunk_size)
.enumerate()
.flat_map(|(id, poly_chunk)| {

println!(
"invoking ntt_batch, device_id: {:?}, per_device_batch: {:?}",
id, per_device_batch
);

let start = std::time::Instant::now();

let input_domain_size = 1 << log2_strict(degree);
let device_input_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(id as i32, input_domain_size * polynomials.len())
.unwrap();
let device_input_data = std::sync::RwLock::new(device_input_data);

poly_chunk.par_iter().enumerate().for_each(|(i, p)| {
// println!("copy for index: {:?}", i);
let _guard = device_input_data.read().unwrap();
_guard.copy_from_host_offset(
p.coeffs.as_slice(),
input_domain_size * i,
input_domain_size,
);
});

println!("data transform elapsed: {:?}", start.elapsed());
let mut cfg_lde = NTTConfig::default();
cfg_lde.batches = per_device_batch as u32;

let mut gpu_input: Vec<F> = polynomials
.into_iter()
.flat_map(|v| v.coeffs.iter().cloned())
.collect();
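// gpu_input now holds every polynomial's coefficients back to back, so the
// whole batch can be handed to the multi-GPU LDE in a single transfer.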

let mut cfg_lde = NTTConfig::default();
cfg_lde.batches = total_num_of_fft as u32;
cfg_lde.extension_rate_bits = rate_bits as u32;
cfg_lde.are_inputs_on_device = true;
cfg_lde.are_inputs_on_device = false;
cfg_lde.are_outputs_on_device = true;
cfg_lde.with_coset = true;
println!(
"start cuda_malloc with elements: {:?}",
(1 << log_n) * per_device_batch
);
let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch)
.unwrap();

let start = std::time::Instant::now();
lde_batch::<F>(
id,
device_output_data.as_mut_ptr(),
device_input_data.read().unwrap().as_ptr(),
log2_strict(degree),
cfg_lde,
);

println!("real lde_batch elapsed: {:?}", start.elapsed());

if num_gpus == 1 {
let mut device_transpose_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch)
.unwrap();

let mut cfg_trans = TransposeConfig::default();
cfg_trans.batches = per_device_batch as u32;
cfg_trans.are_inputs_on_device = true;
cfg_trans.are_outputs_on_device = true;

let start = std::time::Instant::now();
transpose_rev_batch(
id as i32,
device_transpose_data.as_mut_ptr(),
device_output_data.as_mut_ptr(),
log_n,
cfg_trans
);

println!("real transpose_rev_batch elapsed: {:?}", start.elapsed());

let start = std::time::Instant::now();
let nums: Vec<usize> = (0..(1<<log_n)).collect();
let r = nums
.par_iter()
.map(|i| {
let mut host_data: Vec<F> = vec![F::ZERO; per_device_batch];
device_transpose_data.copy_to_host_offset(
host_data.as_mut_slice(),
per_device_batch * i,
per_device_batch,
);
PolynomialValues::new(host_data).values
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
return r;
}

let start = std::time::Instant::now();
let nums: Vec<usize> = (0..poly_chunk.len()).collect();

let r = nums
.par_iter()
.map(|i| {
let mut host_data: Vec<F> = vec![F::ZERO; 1 << log_n];
device_output_data.copy_to_host_offset(
host_data.as_mut_slice(),
(1 << log_n) * i,
1 << log_n,
);
PolynomialValues::new(host_data).values
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
return r;

})
// .chain(
// (0..salt_size)
// .into_par_iter()
// .map(|_| F::rand_vec(degree << rate_bits)),
// )
.collect();
println!("real lde elapsed: {:?}", start_lde.elapsed());
return ret;
cfg_lde.is_multi_gpu = true;
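// With this config, inputs are read from the host buffer while outputs stay
// resident on device 0, so the transpose below needs no host round trip.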


let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements).unwrap();

lde_batch_multi_gpu::<F>(
device_output_data.as_mut_ptr(),
gpu_input.as_mut_ptr(),
num_gpus,
cfg_lde.clone(),
log_n,
total_num_input_elements,
total_num_output_elements,
);
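// The multi-GPU kernel is expected to split the batch across `num_gpus`
// devices and gather all output evaluations into the device-0 buffer.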

println!("real lde_batch elapsed: {:?}", start_lde.elapsed());

let mut cfg_trans = TransposeConfig::default();
cfg_trans.batches = total_num_of_fft as u32;
cfg_trans.are_inputs_on_device = true;
cfg_trans.are_outputs_on_device = true;
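// The device-side transpose below yields leaf-major, bit-reversed rows,
// mirroring the CPU path's transpose + reverse_index_bits_in_place.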

let mut device_transpose_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements)
.unwrap();

let start = std::time::Instant::now();

transpose_rev_batch(
0 as i32,
device_transpose_data.as_mut_ptr(),
device_output_data.as_mut_ptr(),
output_domain_size,
cfg_trans
);

println!("real transpose_rev_batch elapsed: {:?}", start.elapsed());

let start = std::time::Instant::now();
let nums: Vec<usize> = (0..(1 << output_domain_size)).collect();
let r = nums
.par_iter()
.map(|i| {
// Each row of the transposed buffer holds one evaluation point for all
// `total_num_of_fft` polynomials; copy row `i` back at its row offset.
let mut host_data: Vec<F> = vec![F::ZERO; total_num_of_fft];
device_transpose_data.copy_to_host_offset(
host_data.as_mut_slice(),
total_num_of_fft * i,
total_num_of_fft,
).expect("copy to host error");
PolynomialValues::new(host_data).values
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
println!("real lde elapsed: {:?}", start_lde.elapsed());
return r;
}

fn lde_values(
4 changes: 2 additions & 2 deletions plonky2/src/gates/gate.rs
@@ -274,7 +274,7 @@ pub struct PrefixedGate<F: RichField + Extendable<D>, const D: usize> {
}

/// A gate's filter designed so that it is non-zero if `s = row`.
fn compute_filter<K: Field>(row: usize, group_range: Range<usize>, s: K, many_selector: bool) -> K {
pub fn compute_filter<K: Field>(row: usize, group_range: Range<usize>, s: K, many_selector: bool) -> K {
debug_assert!(group_range.contains(&row));
group_range
.filter(|&i| i != row)
@@ -283,7 +283,7 @@ fn compute_filter<K: Field>(row: usize, group_range: Range<usize>, s: K, many_se
.product()
}

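/// In-circuit analogue of `compute_filter`, evaluated with a `CircuitBuilder`.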
fn compute_filter_circuit<F: RichField + Extendable<D>, const D: usize>(
pub fn compute_filter_circuit<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
row: usize,
group_range: Range<usize>,
50 changes: 50 additions & 0 deletions plonky2/tests/factorial_test.rs
@@ -0,0 +1,50 @@

use plonky2::field::types::Field;
use plonky2::iop::witness::{PartialWitness, WitnessWrite};
use plonky2::plonk::circuit_builder::CircuitBuilder;
use plonky2::plonk::circuit_data::CircuitConfig;
use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};

#[cfg(feature = "cuda")]
use crate::test_utils::init_cuda;
#[cfg(feature = "cuda")]
pub mod test_utils;

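// Proves knowledge of 100! starting from a public initial value of one, then
// verifies the proof; with the `cuda` feature enabled, the GPU is initialized first.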
#[test]
fn test_factorial_proof() {

#[cfg(feature = "cuda")]
init_cuda();

const D: usize = 2;
type C = PoseidonGoldilocksConfig;
type F = <C as GenericConfig<D>>::F;

let config = CircuitConfig::standard_recursion_config();
let mut builder = CircuitBuilder::<F, D>::new(config);

// The arithmetic circuit.
let initial = builder.add_virtual_target();
let mut cur_target = initial;
for i in 2..101 {
let i_target = builder.constant(F::from_canonical_u32(i));
cur_target = builder.mul(cur_target, i_target);
}

// Public inputs are the initial value (provided below) and the result (which is generated).
builder.register_public_input(initial);
builder.register_public_input(cur_target);

let mut pw = PartialWitness::new();
pw.set_target(initial, F::ONE);

let data = builder.build::<C>();
let proof = data.prove(pw).unwrap();

println!(
"Factorial starting at {} is {}",
proof.public_inputs[0], proof.public_inputs[1]
);

data.verify(proof).unwrap();
}