Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tests for the GPU integration #9

Merged
merged 11 commits (source and target branch names missing from page extraction)
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ pgo-data.profdata
docs/**bench**

**/*/libposeidon-permute-c-mac.a
**/*/go-iden3-crypto/
**/*/go-iden3-crypto/
.vscode
1 change: 1 addition & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[submodule "depends/cryptography_cuda"]
path = depends/cryptography_cuda
url = [email protected]:okx/cryptography_cuda.git
branch =5c5c3ca7987125507c8eb17572ecb355ffed2256
2 changes: 1 addition & 1 deletion depends/cryptography_cuda
2 changes: 1 addition & 1 deletion field/examples/fft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ fn main() {
let domain_size = 1usize << 10;

let v: Vec<u64> = (0..domain_size).map(|_| random_fr()).collect();
let mut buffer = v.clone();
let buffer = v.clone();

let coeffs = buffer
.iter()
Expand Down
3 changes: 2 additions & 1 deletion field/src/fft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use alloc::vec::Vec;
use core::cmp::{max, min};

#[cfg(feature = "cuda")]
use cryptography_cuda::{intt, ntt, types::NTTInputOutputOrder};
use cryptography_cuda::{ntt, types::NTTInputOutputOrder};
use plonky2_util::{log2_strict, reverse_index_bits_in_place};
use unroll::unroll_for_loops;

Expand Down Expand Up @@ -34,6 +34,7 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
root_table
}

#[allow(dead_code)]
#[cfg(feature = "cuda")]
fn fft_dispatch_gpu<F: Field>(
input: &mut [F],
Expand Down
2 changes: 1 addition & 1 deletion field/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ lazy_static! {

#[cfg(test)]
mod test {
use super::*;


#[cfg(feature = "precompile")]
#[test]
Expand Down
231 changes: 83 additions & 148 deletions plonky2/src/fri/oracle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ use alloc::vec::Vec;

#[cfg(feature = "cuda")]
use cryptography_cuda::{
device::memory::HostOrDeviceSlice, device::stream::CudaStream, intt_batch, lde_batch,
ntt_batch, transpose_rev_batch, types::*,
device::memory::HostOrDeviceSlice, lde_batch_multi_gpu, transpose_rev_batch, types::*,
};
use itertools::Itertools;
use plonky2_field::types::Field;
Expand Down Expand Up @@ -140,10 +139,12 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
fft_root_table: Option<&FftRootTable<F>>,
) -> Self {
let degree = polynomials[0].len();
let log_n = log2_strict(degree) + rate_bits;

#[cfg(feature = "cuda")]
if(log_n > 10 && polynomials.len() > 0){
let log_n = log2_strict(degree);

#[cfg(feature = "cuda")]
if log_n + rate_bits > 10 && polynomials.len() > 0 {
let lde_values = Self::from_coeffs_gpu(
&polynomials,
rate_bits,
Expand All @@ -155,30 +156,10 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
degree
);

let num_gpus: usize = std::env::var("NUM_OF_GPUS")
let _num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
.unwrap();


if num_gpus != 1 {
let mut leaves = timed!(timing, "transpose LDEs", transpose(&lde_values));
reverse_index_bits_in_place(&mut leaves);

let merkle_tree = timed!(
timing,
"build Merkle tree",
MerkleTree::new(leaves, cap_height)
);

return Self {
polynomials,
merkle_tree,
degree_log: log2_strict(degree),
rate_bits,
blinding,
};
}

let merkle_tree = timed!(
timing,
Expand Down Expand Up @@ -223,15 +204,18 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
polynomials: &[PolynomialCoeffs<F>],
rate_bits: usize,
blinding: bool,
cap_height: usize,
timing: &mut TimingTree,
fft_root_table: Option<&FftRootTable<F>>,
_cap_height: usize,
_timing: &mut TimingTree,
_fft_root_table: Option<&FftRootTable<F>>,
log_n: usize,
degree: usize
_degree: usize
)-> Vec<Vec<F>>{
// If blinding, salt with two random elements to each leaf vector.


let salt_size = if blinding { SALT_SIZE } else { 0 };
println!("salt_size: {:?}", salt_size);
let output_domain_size = log_n + rate_bits;

let num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
Expand All @@ -242,134 +226,85 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
println!("get num of gpus: {:?}", num_gpus);
let total_num_of_fft = polynomials.len();
println!("total_num_of_fft: {:?}", total_num_of_fft);
let per_device_batch = total_num_of_fft.div_ceil(num_gpus);
let chunk_size = total_num_of_fft.div_ceil(num_gpus);

let total_num_input_elements = total_num_of_fft * (1 << log_n);
let total_num_output_elements = total_num_of_fft * (1 << output_domain_size);

let start_lde = std::time::Instant::now();

// let poly_chunk = polynomials;
// let id = 0;
let ret = polynomials
.par_chunks(chunk_size)
.enumerate()
.flat_map(|(id, poly_chunk)| {

println!(
"invoking ntt_batch, device_id: {:?}, per_device_batch: {:?}",
id, per_device_batch
);

let start = std::time::Instant::now();

let input_domain_size = 1 << log2_strict(degree);
let device_input_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(id as i32, input_domain_size * polynomials.len())
.unwrap();
let device_input_data = std::sync::RwLock::new(device_input_data);

poly_chunk.par_iter().enumerate().for_each(|(i, p)| {
// println!("copy for index: {:?}", i);
let _guard = device_input_data.read().unwrap();
_guard.copy_from_host_offset(
p.coeffs.as_slice(),
input_domain_size * i,
input_domain_size,
);
});

println!("data transform elapsed: {:?}", start.elapsed());
let mut cfg_lde = NTTConfig::default();
cfg_lde.batches = per_device_batch as u32;

let mut gpu_input: Vec<F> = polynomials
.into_iter()
.flat_map(
|v|
v.coeffs.iter().cloned()
)
.collect();

let mut cfg_lde = NTTConfig::default();
cfg_lde.batches = total_num_of_fft as u32;
cfg_lde.extension_rate_bits = rate_bits as u32;
cfg_lde.are_inputs_on_device = true;
cfg_lde.are_inputs_on_device = false;
cfg_lde.are_outputs_on_device = true;
cfg_lde.with_coset = true;
println!(
"start cuda_malloc with elements: {:?}",
(1 << log_n) * per_device_batch
);
let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch)
.unwrap();

let start = std::time::Instant::now();
lde_batch::<F>(
id,
device_output_data.as_mut_ptr(),
device_input_data.read().unwrap().as_ptr(),
log2_strict(degree),
cfg_lde,
);

println!("real lde_batch elapsed: {:?}", start.elapsed());

if num_gpus == 1 {
let mut device_transpose_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch)
.unwrap();

let mut cfg_trans = TransposeConfig::default();
cfg_trans.batches = per_device_batch as u32;
cfg_trans.are_inputs_on_device = true;
cfg_trans.are_outputs_on_device = true;

let start = std::time::Instant::now();
transpose_rev_batch(
id as i32,
device_transpose_data.as_mut_ptr(),
device_output_data.as_mut_ptr(),
log_n,
cfg_trans
);

println!("real transpose_rev_batch elapsed: {:?}", start.elapsed());

let start = std::time::Instant::now();
let nums: Vec<usize> = (0..(1<<log_n)).collect();
let r = nums
.par_iter()
.map(|i| {
let mut host_data: Vec<F> = vec![F::ZERO; per_device_batch];
device_transpose_data.copy_to_host_offset(
host_data.as_mut_slice(),
per_device_batch * i,
per_device_batch,
);
PolynomialValues::new(host_data).values
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
return r;
}

let start = std::time::Instant::now();
let nums: Vec<usize> = (0..poly_chunk.len()).collect();

let r = nums
.par_iter()
.map(|i| {
let mut host_data: Vec<F> = vec![F::ZERO; 1 << log_n];
device_output_data.copy_to_host_offset(
host_data.as_mut_slice(),
(1 << log_n) * i,
1 << log_n,
);
PolynomialValues::new(host_data).values
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
return r;

})
// .chain(
// (0..salt_size)
// .into_par_iter()
// .map(|_| F::rand_vec(degree << rate_bits)),
// )
.collect();
println!("real lde elapsed: {:?}", start_lde.elapsed());
return ret;
cfg_lde.is_multi_gpu = true;


let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements).unwrap();

lde_batch_multi_gpu::<F>(
device_output_data.as_mut_ptr(),
gpu_input.as_mut_ptr(),
num_gpus,
cfg_lde.clone(),
log_n,
total_num_input_elements,
total_num_output_elements,
);

println!("real lde_batch elapsed: {:?}", start_lde.elapsed());

let mut cfg_trans = TransposeConfig::default();
cfg_trans.batches = total_num_of_fft as u32;
cfg_trans.are_inputs_on_device = true;
cfg_trans.are_outputs_on_device = true;

let mut device_transpose_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements)
.unwrap();

let start = std::time::Instant::now();

transpose_rev_batch(
0 as i32,
device_transpose_data.as_mut_ptr(),
device_output_data.as_mut_ptr(),
output_domain_size,
cfg_trans
);

println!("real transpose_rev_batch elapsed: {:?}", start.elapsed());

let start = std::time::Instant::now();
let nums: Vec<usize> = (0..(1<< output_domain_size)).collect();
let r = nums
.par_iter()
.map(|_i| {
let mut host_data: Vec<F> = vec![F::ZERO; total_num_of_fft];
device_transpose_data.copy_to_host_offset(
host_data.as_mut_slice(),
0,
total_num_of_fft,
).expect("copy to host error");
PolynomialValues::new(host_data).values
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
println!("real lde elapsed: {:?}", start_lde.elapsed());
return r;
}

fn lde_values(
Expand Down
4 changes: 2 additions & 2 deletions plonky2/src/gates/gate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ pub struct PrefixedGate<F: RichField + Extendable<D>, const D: usize> {
}

/// A gate's filter designed so that it is non-zero if `s = row`.
fn compute_filter<K: Field>(row: usize, group_range: Range<usize>, s: K, many_selector: bool) -> K {
pub fn compute_filter<K: Field>(row: usize, group_range: Range<usize>, s: K, many_selector: bool) -> K {
debug_assert!(group_range.contains(&row));
group_range
.filter(|&i| i != row)
Expand All @@ -287,7 +287,7 @@ fn compute_filter<K: Field>(row: usize, group_range: Range<usize>, s: K, many_se
.product()
}

fn compute_filter_circuit<F: RichField + Extendable<D>, const D: usize>(
pub fn compute_filter_circuit<F: RichField + Extendable<D>, const D: usize>(
builder: &mut CircuitBuilder<F, D>,
row: usize,
group_range: Range<usize>,
Expand Down
Loading
Loading