DPC++ multiGPU stencil and/or transpose #479


Draft
wants to merge 72 commits into base: main
Commits
a78fdbd
add stencil DPC++ skeletons
Jun 23, 2020
7af1beb
cleanup comments
Jun 23, 2020
684ac41
some fixes - GPU broken
Jun 23, 2020
d3717be
cleanup
Jun 23, 2020
add9e23
workaround Level Zero SPIR detection
Jun 23, 2020
691938c
some fixes - GPU broken
Jun 23, 2020
b31429b
undo minor mistake
Jun 24, 2020
583795d
add transpose multi-GPU DPC++ skeleton
Jun 24, 2020
053cb6d
WIP
Jun 29, 2020
17ff292
add ignores
Jul 20, 2020
9a95639
fix message
Jul 20, 2020
3113c03
add fill, cleanup dead code
Jul 21, 2020
3d33f2a
start decomposition
Jul 21, 2020
74e286b
device queues class and related methods
Jul 21, 2020
da01de4
use device queues stuff - currently broken
Jul 21, 2020
cc31187
trying to fix
Jul 21, 2020
18931c4
trying to fix
Jul 21, 2020
5351212
trying to fix
Jul 21, 2020
945f87f
working with inlined methods
Jul 21, 2020
994f9cc
working with inlined methods
Jul 21, 2020
f2c874e
cleaned up
Jul 21, 2020
7fca06f
cleaned up
Jul 21, 2020
a0ac723
fixed bugs
Jul 21, 2020
fe2d829
working but to be replaced
Jul 21, 2020
6239694
working
Jul 21, 2020
d299f95
fix input helper comment
Jul 22, 2020
e7bb773
add broadcast and reduce (unused and untested)
Jul 22, 2020
bc0ce20
add stencil DPC++ skeletons
Jun 23, 2020
4de1f83
cleanup comments
Jun 23, 2020
af3de74
some fixes - GPU broken
Jun 23, 2020
12abe88
cleanup
Jun 23, 2020
ce6a713
workaround Level Zero SPIR detection
Jun 23, 2020
5466251
some fixes - GPU broken
Jun 23, 2020
02d1f8d
undo minor mistake
Jun 24, 2020
0d358f9
add transpose multi-GPU DPC++ skeleton
Jun 24, 2020
83892be
WIP
Jun 29, 2020
9a8006a
add ignores
Jul 20, 2020
83faa81
fix message
Jul 20, 2020
4e81836
add fill, cleanup dead code
Jul 21, 2020
767db07
start decomposition
Jul 21, 2020
68f1a94
device queues class and related methods
Jul 21, 2020
452c0a3
use device queues stuff - currently broken
Jul 21, 2020
e068ccf
trying to fix
Jul 21, 2020
fc1a39d
trying to fix
Jul 21, 2020
a99fee7
trying to fix
Jul 21, 2020
ee31d76
working with inlined methods
Jul 21, 2020
4fe3b65
working with inlined methods
Jul 21, 2020
63b2d55
cleaned up
Jul 21, 2020
069ceae
cleaned up
Jul 21, 2020
af851be
fixed bugs
Jul 21, 2020
45c3a0b
working but to be replaced
Jul 21, 2020
aac9768
working
Jul 21, 2020
02591d6
fix input helper comment
Jul 22, 2020
f013421
add broadcast and reduce (unused and untested)
Jul 22, 2020
ed9924b
Merge branch 'dpcpp-multi-gpu-transpose' of https://github.com/jeffha…
Jul 24, 2020
d90f799
Merge branch 'default' into dpcpp-multi-gpu-transpose
Jul 24, 2020
e0fe457
remove unnecessary preprocessor
Jul 24, 2020
f5c510d
remove 2D indexing code that won't work with USM
Jul 24, 2020
76a8bd5
fix a bunch of problems
Jul 24, 2020
b5666fd
need a unit test for the collectives...
Jul 26, 2020
ee43a2d
whitespace
Jul 26, 2020
b94896d
hoist invariant; start alltoall
Jul 26, 2020
da594fb
progress on unit test
Jul 26, 2020
0a74fa0
work around issue with CPU-only in "multigpu" tester
Jul 26, 2020
edb8687
broadcast and reduce tested
Jul 26, 2020
dfacd83
there are bugs somewhere
Jul 26, 2020
2bd3063
merge fix
Sep 6, 2020
c222179
Merge branch 'default' into dpcpp-multi-gpu-transpose
Oct 14, 2020
1e855f3
Merge branch 'default' into dpcpp-multi-gpu-transpose
Oct 26, 2020
0193c93
never commit binaries
Oct 28, 2020
69f39c5
never commit binaries
Oct 28, 2020
7a08484
Merge branch 'default' into dpcpp-multi-gpu-transpose
Oct 28, 2020
8 changes: 5 additions & 3 deletions .gitignore
@@ -168,6 +168,8 @@ Cxx11/dgemm-mpi
Cxx11/dgemm-sycl
Cxx11/dgemm-blas-sycl
Cxx11/dgemm-mkl-sycl
Cxx11/dgemm-multigpu-onemkl
Cxx11/dgemm-onemkl
Cxx11/dgemm-kokkos
Cxx11/dgemm-kernels-kokkos
Cxx11/dgemm-raja
@@ -202,7 +204,7 @@ Cxx11/nstream-opencl
Cxx11/nstream-valarray
Cxx11/nstream-vector
Cxx11/nstream-openmp
Cxx11/nstream-openmp
Cxx11/nstream-openmp-target
Cxx11/nstream-mpi
Cxx11/nstream-pstl
Cxx11/nstream-raja
@@ -217,13 +219,13 @@ Cxx11/nstream-cublas
Cxx11/nstream-cuda
Cxx11/nstream-cuda-managed
Cxx11/nstream-managed-cuda
Cxx11/nstream-openmp-target
Cxx11/nstream-sycl
Cxx11/nstream-sycl-usm
Cxx11/nstream-sycl-explicit
Cxx11/nstream-sycl-explicit-usm
Cxx11/nstream-dpcpp
Cxx11/nstream-multigpu-dpcpp
Cxx11/nstream-onemkl
Cxx11/nstream-celerity
Cxx11/nstream-hpx
Cxx11/nstream-upcxx
@@ -244,9 +246,9 @@ Cxx11/pic-tbb
Cxx11/sparse-vector
Cxx11/stencil
Cxx11/stencil-opencl
Cxx11/stencil-openmp
Cxx11/stencil-openmp-target
Cxx11/stencil-vector
Cxx11/stencil-openmp
Cxx11/stencil-cilk
Cxx11/stencil-stl
Cxx11/stencil-pstl
Binary file added C1z/dgemm
Binary file added C1z/dgemm-target
212 changes: 212 additions & 0 deletions C1z/dgemm-target2.c
@@ -0,0 +1,212 @@
///
/// Copyright (c) 2020, Intel Corporation
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
/// copyright notice, this list of conditions and the following
/// disclaimer in the documentation and/or other materials provided
/// with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
/// contributors may be used to endorse or promote products
/// derived from this software without specific prior written
/// permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

//////////////////////////////////////////////////////////////////////
///
/// NAME: dgemm
///
/// PURPOSE: This program tests the efficiency with which a dense
///          matrix-matrix multiplication is carried out
///
/// USAGE: The program takes as input the matrix order,
/// the number of times the matrix-matrix multiplication
/// is carried out, and, optionally, a tile size for matrix
/// blocking
///
/// <progname> <# iterations> <matrix order> [<tile size>]
///
/// The output consists of diagnostics to make sure the
/// algorithm worked, and of timing statistics.
///
/// FUNCTIONS CALLED:
///
/// Other than OpenMP or standard C functions, the following
/// functions are used in this program:
///
/// wtime()
///
/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
/// Converted to C++11 by Jeff Hammond, December, 2017.
/// Converted to C11 by Jeff Hammond, October, 2020.
///
//////////////////////////////////////////////////////////////////////

#include "prk_util.h"
#include "prk_openmp.h"

void prk_dgemm(const int order,
const double * A,
const double * B,
double * C)
{
OMP_TARGET( teams distribute parallel for simd is_device_ptr(A,B,C) )
for (int i=0; i<order; ++i) {
for (int k=0; k<order; ++k) {
for (int j=0; j<order; ++j) {
C[i*order+j] += A[i*order+k] * B[k*order+j];
}
}
}
}

void prk_dgemm_tiled(const int order, const int tile_size,
const double * A,
const double * B,
double * C)
{
OMP_TARGET( teams distribute collapse(3) is_device_ptr(A,B,C) )
for (int it=0; it<order; it+=tile_size) {
for (int kt=0; kt<order; kt+=tile_size) {
for (int jt=0; jt<order; jt+=tile_size) {
// ICC will not hoist these on its own...
int iend = MIN(order,it+tile_size);
int jend = MIN(order,jt+tile_size);
int kend = MIN(order,kt+tile_size);
OMP( parallel for simd )
for (int i=it; i<iend; ++i) {
for (int k=kt; k<kend; ++k) {
for (int j=jt; j<jend; ++j) {
C[i*order+j] += A[i*order+k] * B[k*order+j];
}
}
}
}
}
}
}

int main(int argc, char * argv[])
{
printf("Parallel Research Kernels version %d\n", PRKVERSION );
printf("C11/OpenMP TARGET Dense matrix-matrix multiplication: C += A x B\n");

//////////////////////////////////////////////////////////////////////
/// Read and test input parameters
//////////////////////////////////////////////////////////////////////

if (argc < 3) {
printf("Usage: <# iterations> <matrix order> [tile size]\n");
return 1;
}

int iterations = atoi(argv[1]);
if (iterations < 1) {
printf("ERROR: iterations must be >= 1\n");
return 1;
}

int order = atoi(argv[2]);
if (order <= 0) {
printf("ERROR: Matrix Order must be greater than 0\n");
return 1;
}

// default tile size for blocking the matrix multiplication
int tile_size = (argc>3) ? atoi(argv[3]) : 32;
// a non-positive tile size means no blocking (use the full order)
if (tile_size <= 0) tile_size = order;

printf("Number of threads (max) = %d\n", omp_get_max_threads());
printf("Number of iterations = %d\n", iterations);
printf("Matrix order = %d\n", order);
printf("Tile size = %d\n", tile_size);

//////////////////////////////////////////////////////////////////////
/// Allocate space for matrices
//////////////////////////////////////////////////////////////////////

double dgemm_time = 0.0;

size_t bytes = order*order*sizeof(double);
double * restrict A = prk_malloc(bytes);
double * restrict B = prk_malloc(bytes);
double * restrict C = prk_malloc(bytes);

for (int i=0;i<order; i++) {
for (int j=0;j<order;j++) {
A[i*order+j] = (double)j;
B[i*order+j] = (double)j;
C[i*order+j] = 0.0;
}
}

OMP_TARGET( data map(to: A[0:order*order], B[0:order*order]) map(tofrom: C[0:order*order]) )
{
for (int iter = 0; iter<=iterations; iter++) {

if (iter==1) dgemm_time = omp_get_wtime();

if (tile_size < order) {
prk_dgemm_tiled(order, tile_size, A, B, C);
} else {
prk_dgemm(order, A, B, C);
}
}
dgemm_time = omp_get_wtime() - dgemm_time;
}

//////////////////////////////////////////////////////////////////////
// Analyze and output results
//////////////////////////////////////////////////////////////////////

const double forder = (double)order;
const double reference = 0.25 * pow(forder,3) * pow(forder-1.0,2) * (iterations+1.0);
double checksum = 0.0;
OMP_PARALLEL_FOR_REDUCE( +:checksum )
for (int j=0; j<order; j++) {
for (int i=0; i<order; i++) {
checksum += C[i*order+j];
}
}

prk_free(A);
prk_free(B);

#ifdef VERBOSE
printf("Reference checksum = %lf, checksum = %lf\n", reference, checksum);
#endif

const double epsilon = 1.0e-8;
const double residuum = fabs(checksum-reference)/reference;
if (residuum < epsilon) {
printf("Solution validates\n");
const double avgtime = dgemm_time/iterations;
double nflops = 2.0*forder*forder*forder;
printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0E-06 * nflops/avgtime, avgtime);
} else {
printf("ERROR: Checksum = %lf, Reference checksum = %lf\n", checksum, reference);
return 1;
}

return 0;
}
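A note on the validation step above: the reference value follows directly from the initialization in this file (A[i][j] = B[i][j] = j, C = 0). Each multiply adds sum_k A_ik * B_kj = j * n(n-1)/2 to C_ij, and the timed loop runs iterations+1 times, so

$$\sum_{i,j} C_{ij} = (\mathrm{iterations}+1)\sum_{i=0}^{n-1}\sum_{j=0}^{n-1} j\,\frac{n(n-1)}{2} = 0.25\,n^{3}(n-1)^{2}\,(\mathrm{iterations}+1),$$

which is exactly the reference expression computed before the checksum comparison.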


Binary file added C1z/nstream-ua-target
7 changes: 6 additions & 1 deletion Cxx11/Makefile
@@ -95,7 +95,11 @@ sycl: nstream-sycl nstream-sycl-explicit p2p-hyperplane-sycl stencil-sycl transp
stencil-2d-sycl transpose-2d-sycl pic-sycl \
nstream-sycl-usm nstream-sycl-explicit-usm stencil-sycl-usm transpose-sycl-usm

dpcpp: sycl nstream-dpcpp nstream-multigpu-dpcpp transpose-dpcpp
dpcpp: sycl nstream-dpcpp stencil-dpcpp transpose-dpcpp \
nstream-multigpu-dpcpp stencil-multigpu-dpcpp transpose-multigpu-dpcpp

test_dpcpp_collectives: test_dpcpp_collectives.cc prk_util.h prk_sycl.h
$(SYCLCXX) $(SYCLFLAGS) -g $< -o $@

tbb: p2p-innerloop-tbb p2p-tbb stencil-tbb transpose-tbb nstream-tbb \
p2p-hyperplane-tbb p2p-tasks-tbb
@@ -285,6 +289,7 @@ clean:
-rm -f *-sycl-explicit-usm
-rm -f *-dpct
-rm -f *-dpcpp
-rm -f test_dpcpp_collectives
-rm -f *-celerity
-rm -f *-tbb
-rm -f *-stl
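Per the commit log, this branch also adds broadcast/reduce collectives over the per-device queues ("add broadcast and reduce (unused and untested)") together with the test_dpcpp_collectives target above; the collectives themselves live in prk_sycl.h and are not shown in this diff view. A minimal host-staged broadcast over USM device buffers might look like the sketch below (the name and the staging strategy are assumptions, not the PR's actual API):

```cpp
#include <CL/sycl.hpp>
#include <vector>

// Illustrative only: copy device 0's buffer to every other device via the host.
void broadcast_sketch(std::vector<sycl::queue>& qs,
                      std::vector<double*>& bufs,  // one USM device allocation per queue
                      size_t n)                    // elements per buffer
{
    std::vector<double> host(n);
    // pull the root buffer (device 0) back to the host...
    qs[0].memcpy(host.data(), bufs[0], n * sizeof(double)).wait();
    // ...then push it to every other device and wait for all copies to land
    for (size_t g = 1; g < qs.size(); ++g) {
        qs[g].memcpy(bufs[g], host.data(), n * sizeof(double));
    }
    for (auto& q : qs) q.wait();
}
```

A real implementation could use direct device-to-device copies where the runtime supports them; host staging is just the simplest correct baseline.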
313 changes: 313 additions & 0 deletions Cxx11/dgemm-onemkl-dpct-bad-job.cc
@@ -0,0 +1,313 @@
///
/// Copyright (c) 2018, Intel Corporation
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
/// copyright notice, this list of conditions and the following
/// disclaimer in the documentation and/or other materials provided
/// with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
/// contributors may be used to endorse or promote products
/// derived from this software without specific prior written
/// permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

//////////////////////////////////////////////////////////////////////
///
/// NAME: dgemm
///
/// PURPOSE: This program tests the efficiency with which a dense
///          matrix-matrix multiplication is carried out
///
/// USAGE: The program takes as input the matrix order,
/// the number of times the matrix-matrix multiplication
/// is carried out, and, optionally, a tile size for matrix
/// blocking
///
/// <progname> <# iterations> <matrix order> [<batches>]
///
/// The output consists of diagnostics to make sure the
/// algorithm worked, and of timing statistics.
///
/// FUNCTIONS CALLED:
///
/// Other than OpenMP or standard C functions, the following
/// functions are used in this program:
///
/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
/// Converted to C++11 by Jeff Hammond, December, 2017.
///
//////////////////////////////////////////////////////////////////////

#include <CL/sycl.hpp>

#include "prk_util.h"
#include "prk_sycl.h"

#include <mkl_blas_sycl.hpp>
#include <mkl_lapack_sycl.hpp>
#include <mkl_sycl_types.hpp>

void init(int order, const int matrices, double * A, double * B, double * C, sycl::nd_item<3> item_ct1)
{
auto i = item_ct1.get_group(2) * item_ct1.get_local_range().get(2) + item_ct1.get_local_id(2);
auto j = item_ct1.get_group(1) * item_ct1.get_local_range().get(1) + item_ct1.get_local_id(1);

for (int b=0; b<matrices; ++b) {
if ((i<order) && (j<order)) {
A[b*order*order+i*order+j] = i;
B[b*order*order+i*order+j] = i;
C[b*order*order+i*order+j] = 0;
}
}
}

void init(int order, const int matrices, double * C, sycl::nd_item<3> item_ct1)
{
auto i = item_ct1.get_group(2) * item_ct1.get_local_range().get(2) + item_ct1.get_local_id(2);
auto j = item_ct1.get_group(1) * item_ct1.get_local_range().get(1) + item_ct1.get_local_id(1);

for (int b=0; b<matrices; ++b) {
if ((i<order) && (j<order)) {
C[b*order*order+i*order+j] = 0;
}
}
}

void prk_dgemm(sycl::queue &q, const int order, const int batches, double *A, double *B, double *C)
{
const double alpha = 1.0;
const double beta = 1.0;

for (int b=0; b<batches; ++b) {
double * pA = &(A[b*order*order]);
double * pB = &(B[b*order*order]);
double * pC = &(C[b*order*order]);
mkl::blas::gemm(q, mkl::transpose::nontrans, // opA
mkl::transpose::nontrans, // opB
order, order, order, // m, n, k
alpha, // alpha
pA, order, // A, lda
pB, order, // B, ldb
beta, // beta
pC, order); // C, ldc
}
q.wait_and_throw();
}

int main(int argc, char * argv[])
{
std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
std::cout << "C++11/oneMKL Dense matrix-matrix multiplication: C += A x B" << std::endl;

//////////////////////////////////////////////////////////////////////
/// Read and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
int order;
int batches = 0;
int input_copy = 0;
try {
if (argc < 3) {
throw "Usage: <# iterations> <matrix order> [<batches>] [<copy input every iteration [0/1]>]";
}

iterations = std::atoi(argv[1]);
if (iterations < 1) {
throw "ERROR: iterations must be >= 1";
}

order = std::atoi(argv[2]);
if (order <= 0) {
throw "ERROR: Matrix Order must be greater than 0";
} else if (order > std::floor(std::sqrt(INT_MAX))) {
throw "ERROR: matrix dimension too large - overflow risk";
}

if (argc > 3) {
batches = std::atoi(argv[3]);
}

if (argc > 4) {
input_copy = std::atoi(argv[4]);
if (input_copy != 0 && input_copy != 1) {
throw "ERROR: input_copy was not 0 or 1";
}
}
}
catch (const char * e) {
std::cout << e << std::endl;
return 1;
}

std::cout << "Number of iterations = " << iterations << std::endl;
std::cout << "Matrix order = " << order << std::endl;
if (batches == 0) {
std::cout << "No batching" << std::endl;
} else if (batches < 0) {
std::cout << "Batch size = " << -batches << " (loop over legacy BLAS)" << std::endl;
} else if (batches > 0) {
std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl;
}
std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl;

sycl::queue q(sycl::default_selector{});
prk::SYCL::print_device_platform(q);

const int tile_size = 32;
sycl::range<3> dimGrid(prk::divceil(order, tile_size), prk::divceil(order, tile_size), 1);
sycl::range<3> dimBlock(tile_size, tile_size, 1);

//////////////////////////////////////////////////////////////////////
// Allocate space for matrices
//////////////////////////////////////////////////////////////////////

double dgemm_time(0);

const int matrices = (batches == 0 ? 1 : abs(batches));
const size_t nelems = (size_t)order * (size_t)order;
const size_t bytes = nelems * sizeof(double);

// host buffers
double * h_a = sycl::malloc_host<double>(nelems, q);
double * h_b = sycl::malloc_host<double>(nelems, q);
double * h_c = sycl::malloc_host<double>(nelems, q);

// device buffers
double * d_a = sycl::malloc_device<double>( matrices * nelems, q);
double * d_b = sycl::malloc_device<double>( matrices * nelems, q);
double * d_c = sycl::malloc_device<double>( matrices * nelems, q);

if (input_copy) {

for (int i=0; i<order; ++i) {
for (int j=0; j<order; ++j) {
h_a[i*order+j] = i;
h_b[i*order+j] = i;
}
}

for (int b=0; b<matrices; ++b) {
q.memcpy( &(d_a[b * order * order]), h_a, bytes);
q.memcpy( &(d_b[b * order * order]), h_b, bytes);
}
q.wait_and_throw();

q.submit([&](sycl::handler &cgh) {
auto dpct_global_range = dimGrid * dimBlock;

cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(dpct_global_range.get(2), dpct_global_range.get(1), dpct_global_range.get(0)),
sycl::range<3>(dimBlock.get(2), dimBlock.get(1), dimBlock.get(0))),
[=](sycl::nd_item<3> item_ct1) {
init(order, matrices, d_c, item_ct1);
});
});

} else {

q.submit([&](sycl::handler &cgh) {
auto dpct_global_range = dimGrid * dimBlock;

cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(dpct_global_range.get(2), dpct_global_range.get(1), dpct_global_range.get(0)),
sycl::range<3>(dimBlock.get(2), dimBlock.get(1), dimBlock.get(0))),
[=](sycl::nd_item<3> item_ct1) {
init(order, matrices, d_a, d_b, d_c, item_ct1);
});
});
}
q.wait_and_throw();

double xfer(0);
double comp(0);
{
for (int iter = 0; iter<=iterations; iter++) {

if (iter==1) dgemm_time = prk::wtime();

if (input_copy) {
double t0 = prk::wtime();
for (int b=0; b<matrices; ++b) {
q.memcpy( &(d_a[b * order * order]), h_a, bytes);
q.memcpy( &(d_b[b * order * order]), h_b, bytes);
}
q.wait_and_throw();
double t1 = prk::wtime();
if (iter==1) xfer += (t1-t0);
}

{
double t0 = prk::wtime();
prk_dgemm(q, order, matrices, d_a, d_b, d_c);
double t1 = prk::wtime();
if (iter==1) comp += (t1-t0);
}
}
dgemm_time = prk::wtime() - dgemm_time;
}
std::cout << "xfer, comp = " << xfer << "," << comp << std::endl;

// copy output back to host
q.memcpy(&(h_c[0]), d_c, matrices * bytes).wait(); // finish this copy before freeing d_c below
sycl::free(d_c, q);
sycl::free(d_b, q);
sycl::free(d_a, q);
sycl::free(h_a, q);
sycl::free(h_b, q);
q.wait_and_throw();

//////////////////////////////////////////////////////////////////////
/// Analyze and output results
//////////////////////////////////////////////////////////////////////

const auto epsilon = 1.0e-8;
const auto forder = static_cast<double>(order);
const auto reference = 0.25 * prk::pow(forder, 3) * prk::pow(forder - 1.0, 2) * (iterations + 1);
double residuum(0);
for (int b=0; b<matrices; ++b) {
const auto checksum = prk::reduce( &(h_c[b*order*order+0]), &(h_c[b*order*order+nelems]), 0.0);
residuum += std::abs(checksum - reference) / reference;
}
residuum /= matrices;

if (residuum < epsilon) {
#if VERBOSE
std::cout << "Reference checksum = " << reference << "\n"
<< "Residuum = " << residuum << std::endl;
#endif
std::cout << "Solution validates" << std::endl;
auto avgtime = dgemm_time/iterations/matrices;
auto nflops = 2.0 * prk::pow(forder, 3);
std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime
<< " Avg time (s): " << avgtime << std::endl;
} else {
std::cout << "Reference checksum = " << reference << "\n"
<< "Residuum = " << residuum << std::endl;
return 1;
}

sycl::free(h_c, q);

return 0;
}
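Following the usage string in the header above, a typical run of this dpct-converted oneMKL dgemm would look something like the following (binary name and argument values are illustrative, not taken from the PR):

```sh
# 10 iterations, order-1024 matrices, 4 matrices in the batch,
# re-copying the inputs every iteration
./dgemm-onemkl-dpct-bad-job 10 1024 4 1
```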


392 changes: 392 additions & 0 deletions Cxx11/matrix-multiply.cpp
@@ -0,0 +1,392 @@
/***************************************************************************
*
* Copyright (C) 2016 Codeplay Software Limited
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* For your convenience, a copy of the License has been included in this
* repository.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Codeplay's ComputeCpp SDK
*
* matrix-multiply.cpp
*
* Description:
* Example of matrix multiplication in SYCL.
*
**************************************************************************/

/* This example compares an OpenMP blocked matrix multiplication
* implementation with a SYCL blocked matrix multiplication example.
* The purpose is not to compare performance, but to show the similarities
* and differences between them.
* See block_host for the OpenMP implementation. */

#include <CL/sycl.hpp>

#include <chrono>
#include <cmath>
#include <ctime>
#include <iostream>

using namespace cl::sycl;

class mxm_kernel;

void display_matrix(float* m, int matSize) {
if (matSize > 16) {
return;
}

std::cout << "=======" << std::endl;
for (int i = 0; i < matSize; i++) {
for (int j = 0; j < matSize; j++) {
std::cout << m[i * matSize + j] << " ";
}
std::cout << std::endl;
}
std::cout << "=======" << std::endl;
;
}

/* Implements a host C++ version of the matrix multiplication.
* If compiler supports OpenMP, code is parallelized. Scheduling
* uses static chunks of block_size. */
void block_host(float* MA, float* MB, float* MC, int matSize) {
/* We set the block size to 32 for simplicity, though the optimal
* value will depend on the platform this is run on. */
int block_size = 32;
int numBlocks = block_size / matSize;
int extraBlockLength = block_size % matSize;
numBlocks = extraBlockLength ? (numBlocks + 1) : (numBlocks);

#pragma omp parallel for collapse(2)
for (int bIndexI = 0; bIndexI < matSize; bIndexI += block_size)
for (int bIndexJ = 0; bIndexJ < matSize; bIndexJ += block_size)
for (int bIndexK = 0; bIndexK < matSize; bIndexK += block_size) {
int i = bIndexI;
int j = bIndexJ;
int k = bIndexK;
for (int bi = i; bi < std::min(i + block_size, matSize); bi++)
for (int bj = j; bj < std::min(j + block_size, matSize); bj++)
for (int bk = k; bk < std::min(k + block_size, matSize); bk++) {
MC[bi * matSize + bj] +=
MA[bi * matSize + bk] * MB[bk * matSize + bj];
}
}
}

/* Obtains the previous power of two from the given integer.
* It works by masking out all ones after the first one bit,
* then leaves the first one bit intact, effectively
* yielding the first power of two < x. */
inline int prevPowerOfTwo(int x) {
if (x < 0) {
return 0;
}
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return x - (x >> 1);
}

/* Checks if X is a power of two.
* If there are bits sets to one after AND with the
* previous number, then it is not a power of two.
*/
inline bool isPowerOfTwo(int x) { return (x & (x - 1)) == 0; }

/* Function template that performs the matrix * matrix operation. (It is
* a template because only some OpenCL devices support double-precision
* floating-point numbers, but it is interesting to make the comparison
* where available.)
* Broadly, the function chooses an appropriate work size, then enqueues
* the matrix * matrix lambda on the queue provided. Because the queues
* are constructed inside this function, it will block until the work is
* finished.
* Note that this example only works for powers of two.
* */
template <typename T>
bool local_mxm(cl::sycl::queue& q, T* MA, T* MB, T* MC, int matSize) {
// Make sure it is power of two before running
if (!isPowerOfTwo(matSize)) {
std::cout << " This example only works with power of two sizes "
<< std::endl;
return true;
}

auto device = q.get_device();
auto maxBlockSize = device.get_info<cl::sycl::info::device::max_work_group_size>();
auto blockSize = prevPowerOfTwo(std::sqrt(maxBlockSize));
std::cout << " The Device Max Work Group Size is : " << maxBlockSize
<< std::endl;
std::cout << " The order is : " << matSize << std::endl;
std::cout << " The blockSize is : " << blockSize << std::endl;
// Make sure the block size is not larger than the mat size
blockSize = std::min(matSize, blockSize);

{
/* Buffers can be constructed with property lists. In this example,
* the buffer is given the property "use host pointer", which tells
* the runtime to use the host pointer for all data storage (instead
* of making copies internally). Additionally, when running on a
* device that shares memory with the host (for example a CPU),
* "zero-copy" memory optimisations can be used by the driver. */
range<1> dimensions(matSize * matSize);
const property_list props = {property::buffer::use_host_ptr()};
buffer<T> bA(MA, dimensions, props);
buffer<T> bB(MB, dimensions, props);
buffer<T> bC(MC, dimensions, props);

q.submit([&](handler& cgh) {
auto pA = bA.template get_access<access::mode::read>(cgh);
auto pB = bB.template get_access<access::mode::read>(cgh);
auto pC = bC.template get_access<access::mode::write>(cgh);
auto localRange = range<1>(blockSize * blockSize);

accessor<T, 1, access::mode::read_write, access::target::local> pBA(
localRange, cgh);
accessor<T, 1, access::mode::read_write, access::target::local> pBB(
localRange, cgh);

cgh.parallel_for<mxm_kernel>(
nd_range<2>{range<2>(matSize, matSize),
range<2>(blockSize, blockSize)},
[=](nd_item<2> it) {
// Current block
int blockX = it.get_group(1);
int blockY = it.get_group(0);

// Current local item
int localX = it.get_local_id(1);
int localY = it.get_local_id(0);

// Start in the A matrix
int a_start = matSize * blockSize * blockY;
// End in the b matrix
int a_end = a_start + matSize - 1;
// Start in the b matrix
int b_start = blockSize * blockX;

// Result for the current C(i,j) element
T tmp = 0.0f;
// We go through all a, b blocks
for (int a = a_start, b = b_start; a <= a_end;
a += blockSize, b += (blockSize * matSize)) {
// Copy the values in shared memory collectively
pBA[localY * blockSize + localX] =
pA[a + matSize * localY + localX];
// Note the swap of X/Y to maintain contiguous access
pBB[localX * blockSize + localY] =
pB[b + matSize * localY + localX];
it.barrier(access::fence_space::local_space);
// Now each thread adds the value of its sum
for (int k = 0; k < blockSize; k++) {
tmp +=
pBA[localY * blockSize + k] * pBB[localX * blockSize + k];
}
// The barrier ensures that all threads have written to local
// memory before continuing
it.barrier(access::fence_space::local_space);
}
auto elemIndex = it.get_global_id(0) * it.get_global_range()[1] +
it.get_global_id(1);
// Each thread updates its position
pC[elemIndex] = tmp;
});
});
}
return false;
}

/* Helper function to indicate the parameters the sample takes. */
void usage(std::string programName) {
std::cout << " Incorrect number of parameters " << std::endl;
std::cout << " Usage: " << std::endl;
std::cout << programName << " [matrix size] [omp|sycl]" << std::endl;
std::cout << "[matrix size] : Size of the matrix to multiply (minimum 32)"
<< std::endl;
std::cout << "[omp|sycl] : Run the OpenMP or the SYCL variant. "
<< " Default is to use both " << std::endl;
}

int main(int argc, char* argv[]) {
float* MA;
float* MB;
float* MC;
bool sycl = true;
bool omp = true;
bool error = false;

if (argc != 2 && argc != 3) {
usage(argv[0]);
return 1;
}

int matSize = 0;
try {
matSize = std::stoi(argv[1]);
} catch (...) {
usage(argv[0]);
return 1;
}

if (matSize < 32) {
usage(argv[0]);
return 1;
}

if (argc == 3) {
if (std::string(argv[2]) == "omp") {
omp = true;
sycl = false;
} else if (std::string(argv[2]) == "sycl") {
omp = false;
sycl = true;
} else {
usage(argv[0]);
}
}

MA = new float[matSize * matSize];
MB = new float[matSize * matSize];
MC = new float[matSize * matSize];

// Matrix initialization
#pragma omp parallel for collapse(2)
for (int i = 0; i < matSize; i++)
for (int j = 0; j < matSize; j++) {
MA[i * matSize + j] = 0.0f;
if (i == j) {
MA[i * matSize + j] = 1.0f;
}
MB[i * matSize + j] = 2.0f;
MC[i * matSize + j] = 0.0f; // i * matSize + j;
}

std::cout << " Input matrix " << std::endl;
display_matrix(MA, matSize);
display_matrix(MB, matSize);
display_matrix(MC, matSize);

if (omp) {
#if defined(_OPENMP)
std::cout << "OpenMP: ";
#else
std::cout << "C++: ";
#endif

{
auto start = std::chrono::steady_clock::now();
block_host(MA, MB, MC, matSize);
auto end = std::chrono::steady_clock::now();
auto time =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
std::cout << "Time: " << time << std::endl;
float flops =
(2.0f * matSize * matSize * matSize / (time / 1000.0f)) * 1.0e-9f;
std::cout << "GFLOPs: " << flops << std::endl;

bool error = false;
// Testing
for (int i = 0; i < matSize; i++)
for (int j = 0; j < matSize; j++) {
if (std::fabs(MC[i * matSize + j] - MB[i * matSize + j]) > 1e-8) {
std::cout << " Position " << i << ", " << j
<< " differs: " << MC[i * matSize + j]
<< " != " << MB[i * matSize + j] << std::endl;
error = true;
}
}
if (!error) {
std::cout << "Success" << std::endl;
} else {
std::cout << " Error in the computation " << std::endl;
}
}
}

if (sycl) {
std::cout << " ***** SYCL " << std::endl;
// Matrix initialization
for (int i = 0; i < matSize; i++)
for (int j = 0; j < matSize; j++) {
MC[i * matSize + j] = 0.0f; // i * matSize + j;
}

{
{
/* Create the SYCL queue - note that we add an async handler function
* to capture potential asynchronous errors. This function will be
* called every time there is an asynchronous error on the queue (i.e.
* some error occurs while the queue is executing kernels) and one of
* cl::sycl::queue::throw() or cl::sycl::queue::wait_and_throw() is
* called. */
queue q([&](exception_list eL) {
try {
for (auto& e : eL) {
std::rethrow_exception(e);
}
} catch (cl::sycl::exception e) {
std::cout << " An exception has been thrown: " << e.what()
<< std::endl;
}
});

auto start = std::chrono::steady_clock::now();
error = local_mxm(q, MA, MB, MC, matSize);
q.wait_and_throw();
auto end = std::chrono::steady_clock::now();
auto time =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
.count();
std::cout << "SYCL: ";
std::cout << "Time: " << time << std::endl;
float flops =
(2.0f * matSize * matSize * matSize / (time / 1000.0f)) * 1.0e-9f;
std::cout << "GFLOPs: " << flops << std::endl;
std::cout << " Output " << std::endl;
}

if (!error) {
display_matrix(MC, matSize);
error = false;
// Testing
for (int i = 0; i < matSize; i++)
for (int j = 0; j < matSize; j++) {
if (std::fabs(MC[i * matSize + j] - MB[i * matSize + j]) > 1e-8) {
std::cout << " Position " << i << ", " << j
<< " differs: " << MC[i * matSize + j]
<< " != " << MB[i * matSize + j] << std::endl;
error = true;
}
}
if (!error) {
std::cout << "Success" << std::endl;
;
} else {
std::cout << " Error in the computation " << std::endl;
}
}
}
}

delete[] MA;
delete[] MB;
delete[] MC;

return error ? 1 : 0;
}
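For orientation, the sample's own usage() function implies invocations like these (the matrix size must be at least 32, and the SYCL path additionally requires a power of two; the binary name is illustrative):

```sh
./matrix-multiply 1024        # run both the OpenMP and SYCL variants
./matrix-multiply 1024 sycl   # run only the SYCL variant
```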
7 changes: 4 additions & 3 deletions Cxx11/nstream-dpcpp.cc
@@ -134,9 +134,10 @@ int main(int argc, char * argv[])
double * d_A = syclx::malloc_device<double>(length, q);
double * d_B = syclx::malloc_device<double>(length, q);
double * d_C = syclx::malloc_device<double>(length, q);
q.memcpy(d_A, &(h_A[0]), bytes).wait();
q.memcpy(d_B, &(h_B[0]), bytes).wait();
q.memcpy(d_C, &(h_C[0]), bytes).wait();
q.memcpy(d_A, &(h_A[0]), bytes);
q.memcpy(d_B, &(h_B[0]), bytes);
q.memcpy(d_C, &(h_C[0]), bytes);
q.wait();

double scalar(3);
{
228 changes: 228 additions & 0 deletions Cxx11/nstream-multigpu-dpcpp-after.cc
@@ -0,0 +1,228 @@
///
/// Copyright (c) 2020, Intel Corporation
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
/// copyright notice, this list of conditions and the following
/// disclaimer in the documentation and/or other materials provided
/// with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
/// contributors may be used to endorse or promote products
/// derived from this software without specific prior written
/// permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

//////////////////////////////////////////////////////////////////////
///
/// NAME: nstream
///
/// PURPOSE: To compute memory bandwidth when adding a vector of a given
/// number of double precision values to the scalar multiple of
/// another vector of the same length, and storing the result in
/// a third vector.
///
/// USAGE: The program takes as input the number
/// of iterations to loop over the triad vectors, the length of the
/// vectors, and the offset between vectors
///
/// <progname> <# iterations> <vector length> <offset>
///
/// The output consists of diagnostics to make sure the
/// algorithm worked, and of timing statistics.
///
/// NOTES: Bandwidth is determined as the number of words read, plus the
/// number of words written, times the size of the words, divided
/// by the execution time. For a vector length of N, the total
/// number of words read and written is 4*N*sizeof(double).
///
/// HISTORY: This code is loosely based on the Stream benchmark by John
/// McCalpin, but does not follow all the Stream rules. Hence,
/// reported results should not be associated with Stream in
/// external publications
///
/// Converted to C++11 by Jeff Hammond, November 2017.
///
//////////////////////////////////////////////////////////////////////

#include "prk_sycl.h"
#include "prk_util.h"

int main(int argc, char * argv[])
{
std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
std::cout << "C++11/DPC++ STREAM triad: A = B + scalar * C" << std::endl;

auto qs = prk::SYCL::queues();

//////////////////////////////////////////////////////////////////////
/// Read and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
size_t length, local_length;
int use_ngpu = 1;
try {
if (argc < 3) {
throw "Usage: <# iterations> <vector length> [<use_ngpu>]";
}

iterations = std::atoi(argv[1]);
if (iterations < 1) {
throw "ERROR: iterations must be >= 1";
}

length = std::atoi(argv[2]);
if (length <= 0) {
throw "ERROR: vector length must be positive";
}

if (argc > 3) {
use_ngpu = std::atoi(argv[3]);
}
if ( use_ngpu > qs.size() ) {
std::string error = "You cannot use more devices ("
+ std::to_string(use_ngpu)
+ ") than you have ("
+ std::to_string(qs.size()) + ")";
throw error;
}

if (length % use_ngpu != 0) {
std::string error = "ERROR: vector length ("
+ std::to_string(length)
+ ") should be divisible by # procs ("
+ std::to_string(use_ngpu) + ")";
throw error;
}
local_length = length / use_ngpu;
}
catch (const char * e) {
std::cout << e << std::endl;
return 1;
}

std::cout << "Number of devices = " << use_ngpu << std::endl;
std::cout << "Number of iterations = " << iterations << std::endl;
std::cout << "Vector length = " << length << std::endl;
std::cout << "Vector length (local) = " << local_length << std::endl;

int np = use_ngpu;

//////////////////////////////////////////////////////////////////////
// Allocate space and perform the computation
//////////////////////////////////////////////////////////////////////

double nstream_time(0);

auto h_A = prk::vector<double>(length, 0);
auto h_B = prk::vector<double>(length, 2);
auto h_C = prk::vector<double>(length, 2);

auto d_A = std::vector<double*> (np, nullptr);
auto d_B = std::vector<double*> (np, nullptr);
auto d_C = std::vector<double*> (np, nullptr);

qs.allocate<double>(d_A, local_length);
qs.allocate<double>(d_B, local_length);
qs.allocate<double>(d_C, local_length);
qs.waitall();

qs.scatter<double>(d_A, h_A, local_length);
qs.scatter<double>(d_B, h_B, local_length);
qs.scatter<double>(d_C, h_C, local_length);
qs.waitall();

// overwrite host buffer with garbage to detect bugs
h_A.fill(-77777777);

const double scalar(3);
{
for (int iter = 0; iter<=iterations; iter++) {

if (iter==1) nstream_time = prk::wtime();

#if 1
for (int g=0; g<np; ++g) {
auto q = qs.queue(g);

auto p_A = d_A[g];
auto p_B = d_B[g];
auto p_C = d_C[g];

const size_t size = local_length;

q.submit([&](sycl::handler& h) {
h.parallel_for( sycl::range<1>{size}, [=] (sycl::id<1> i) {
p_A[i] += p_B[i] + scalar * p_C[i];
});
});
}
qs.waitall();
#endif
}
nstream_time = prk::wtime() - nstream_time;
}

qs.gather<double>(h_A, d_A, local_length);
qs.waitall();

qs.free(d_A);
qs.free(d_B);
qs.free(d_C);
qs.waitall();

//////////////////////////////////////////////////////////////////////
/// Analyze and output results
//////////////////////////////////////////////////////////////////////

double ar(0);
double br(2);
double cr(2);
for (int i=0; i<=iterations; i++) {
ar += br + scalar * cr;
}

ar *= length;

double asum(0);
for (int i=0; i<length; i++) {
asum += prk::abs(h_A[i]);
}

double epsilon=1.e-8;
if (prk::abs(ar - asum) / asum > epsilon) {
std::cout << "Failed Validation on output array\n"
<< std::setprecision(16)
<< " Expected checksum: " << ar << "\n"
<< " Observed checksum: " << asum << std::endl;
std::cout << "ERROR: solution did not validate" << std::endl;
return 1;
} else {
std::cout << "Solution validates" << std::endl;
double avgtime = nstream_time/iterations;
double nbytes = 4.0 * length * sizeof(double);
std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
<< " Avg time (s): " << avgtime << std::endl;
}

return 0;
}
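This file, the -good copy that follows, and the reworked nstream-multigpu-dpcpp.cc further below all take the same arguments; per the usage string and the divisibility check above, a typical invocation (values are illustrative) is:

```sh
# 10 iterations, 100,000,000-element vectors, split across 2 devices
# (the vector length must be divisible by the device count)
./nstream-multigpu-dpcpp 10 100000000 2
```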


228 changes: 228 additions & 0 deletions Cxx11/nstream-multigpu-dpcpp-after.cc-good
@@ -0,0 +1,228 @@
///
/// Copyright (c) 2020, Intel Corporation
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
/// copyright notice, this list of conditions and the following
/// disclaimer in the documentation and/or other materials provided
/// with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
/// contributors may be used to endorse or promote products
/// derived from this software without specific prior written
/// permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

//////////////////////////////////////////////////////////////////////
///
/// NAME: nstream
///
/// PURPOSE: To compute memory bandwidth when adding a vector of a given
/// number of double precision values to the scalar multiple of
/// another vector of the same length, and storing the result in
/// a third vector.
///
/// USAGE: The program takes as input the number
/// of iterations to loop over the triad vectors, the length of the
/// vectors, and the offset between vectors
///
/// <progname> <# iterations> <vector length> <offset>
///
/// The output consists of diagnostics to make sure the
/// algorithm worked, and of timing statistics.
///
/// NOTES: Bandwidth is determined as the number of words read, plus the
/// number of words written, times the size of the words, divided
/// by the execution time. For a vector length of N, the total
/// number of words read and written is 4*N*sizeof(double).
///
/// HISTORY: This code is loosely based on the Stream benchmark by John
/// McCalpin, but does not follow all the Stream rules. Hence,
/// reported results should not be associated with Stream in
/// external publications
///
/// Converted to C++11 by Jeff Hammond, November 2017.
///
//////////////////////////////////////////////////////////////////////

#include "prk_sycl.h"
#include "prk_util.h"

int main(int argc, char * argv[])
{
std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
std::cout << "C++11/DPC++ STREAM triad: A = B + scalar * C" << std::endl;

auto qs = prk::SYCL::queues();

//////////////////////////////////////////////////////////////////////
/// Read and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
size_t length, local_length;
int use_ngpu = 1;
try {
if (argc < 3) {
throw "Usage: <# iterations> <vector length> [<use_ngpu>]";
}

iterations = std::atoi(argv[1]);
if (iterations < 1) {
throw "ERROR: iterations must be >= 1";
}

length = std::atoi(argv[2]);
if (length <= 0) {
throw "ERROR: vector length must be positive";
}

if (argc > 3) {
use_ngpu = std::atoi(argv[3]);
}
if ( use_ngpu > qs.size() ) {
std::string error = "You cannot use more devices ("
+ std::to_string(use_ngpu)
+ ") than you have ("
+ std::to_string(qs.size()) + ")";
throw error;
}

if (length % use_ngpu != 0) {
std::string error = "ERROR: vector length ("
+ std::to_string(length)
+ ") should be divisible by # procs ("
+ std::to_string(use_ngpu) + ")";
throw error;
}
local_length = length / use_ngpu;
}
catch (const char * e) {
std::cout << e << std::endl;
return 1;
}

std::cout << "Number of devices = " << use_ngpu << std::endl;
std::cout << "Number of iterations = " << iterations << std::endl;
std::cout << "Vector length = " << length << std::endl;
std::cout << "Vector length (local) = " << local_length << std::endl;

int np = use_ngpu;

//////////////////////////////////////////////////////////////////////
// Allocate space and perform the computation
//////////////////////////////////////////////////////////////////////

double nstream_time(0);

auto h_A = prk::vector<double>(length, 0);
auto h_B = prk::vector<double>(length, 2);
auto h_C = prk::vector<double>(length, 2);

auto d_A = std::vector<double*> (np, nullptr);
auto d_B = std::vector<double*> (np, nullptr);
auto d_C = std::vector<double*> (np, nullptr);

qs.allocate<double>(d_A, local_length);
qs.allocate<double>(d_B, local_length);
qs.allocate<double>(d_C, local_length);
qs.waitall();

qs.scatter<double>(d_A, h_A, local_length);
qs.scatter<double>(d_B, h_B, local_length);
qs.scatter<double>(d_C, h_C, local_length);
qs.waitall();

// overwrite host buffer with garbage to detect bugs
h_A.fill(-77777777);

const double scalar(3);
{
for (int iter = 0; iter<=iterations; iter++) {

if (iter==1) nstream_time = prk::wtime();

#if 1
for (int g=0; g<np; ++g) {
auto q = qs.queue(g);

auto p_A = d_A[g];
auto p_B = d_B[g];
auto p_C = d_C[g];

const size_t size = local_length;

q.submit([&](sycl::handler& h) {
h.parallel_for( sycl::range<1>{size}, [=] (sycl::id<1> i) {
p_A[i] += p_B[i] + scalar * p_C[i];
});
});
}
qs.waitall();
#endif
}
nstream_time = prk::wtime() - nstream_time;
}

qs.gather<double>(h_A, d_A, local_length);
qs.waitall();

qs.free(d_A);
qs.free(d_B);
qs.free(d_C);
qs.waitall();

//////////////////////////////////////////////////////////////////////
/// Analyze and output results
//////////////////////////////////////////////////////////////////////

double ar(0);
double br(2);
double cr(2);
for (int i=0; i<=iterations; i++) {
ar += br + scalar * cr;
}

ar *= length;

double asum(0);
for (int i=0; i<length; i++) {
asum += prk::abs(h_A[i]);
}

double epsilon=1.e-8;
if (prk::abs(ar - asum) / asum > epsilon) {
std::cout << "Failed Validation on output array\n"
<< std::setprecision(16)
<< " Expected checksum: " << ar << "\n"
<< " Observed checksum: " << asum << std::endl;
std::cout << "ERROR: solution did not validate" << std::endl;
return 1;
} else {
std::cout << "Solution validates" << std::endl;
double avgtime = nstream_time/iterations;
double nbytes = 4.0 * length * sizeof(double);
std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime
<< " Avg time (s): " << avgtime << std::endl;
}

return 0;
}


146 changes: 48 additions & 98 deletions Cxx11/nstream-multigpu-dpcpp.cc
@@ -69,12 +69,14 @@ int main(int argc, char * argv[])
std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
std::cout << "C++11/DPC++ STREAM triad: A = B + scalar * C" << std::endl;

auto qs = prk::SYCL::queues();

//////////////////////////////////////////////////////////////////////
/// Read and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
size_t length;
size_t length, local_length;
int use_ngpu = 1;
try {
if (argc < 3) {
@@ -94,147 +96,95 @@ int main(int argc, char * argv[])
if (argc > 3) {
use_ngpu = std::atoi(argv[3]);
}
if ( use_ngpu > qs.size() ) {
std::string error = "You cannot use more devices ("
+ std::to_string(use_ngpu)
+ ") than you have ("
+ std::to_string(qs.size()) + ")";
throw error;
}

if (length % use_ngpu != 0) {
std::string error = "ERROR: vector length ("
+ std::to_string(length)
+ ") should be divisible by # procs ("
+ std::to_string(use_ngpu) + ")";
throw error;
}
local_length = length / use_ngpu;
}
catch (const char * e) {
std::cout << e << std::endl;
return 1;
}

std::cout << "Number of devices = " << use_ngpu << std::endl;
std::cout << "Number of iterations = " << iterations << std::endl;
std::cout << "Vector length = " << length << std::endl;
std::cout << "Number of GPUs to use = " << use_ngpu << std::endl;

std::vector<sycl::queue> qs;

auto platforms = sycl::platform::get_platforms();
for (auto & p : platforms) {
auto pname = p.get_info<sycl::info::platform::name>();
std::cout << "*Platform: " << pname << std::endl;
if ( pname.find("Level-Zero") != std::string::npos) {
std::cout << "*Level Zero GPU skipped" << std::endl;
break;
}
if ( pname.find("Intel") == std::string::npos) {
std::cout << "*non-Intel skipped" << std::endl;
break;
}
auto devices = p.get_devices();
for (auto & d : devices ) {
std::cout << "**Device: " << d.get_info<sycl::info::device::name>() << std::endl;
if ( d.is_gpu() || d.is_cpu() ) {
std::cout << "**Device is CPU or GPU - adding to vector of queues" << std::endl;
qs.push_back(sycl::queue(d));
}
}
}
std::cout << "Vector length (local) = " << local_length << std::endl;

int haz_ngpu = qs.size();
std::cout << "Number of CPUs and GPUs found = " << haz_ngpu << std::endl;

if (use_ngpu > haz_ngpu) {
std::cout << "You cannot use more GPUs (" << use_ngpu << ") than you have (" << haz_ngpu << ")" << std::endl;
}

int ngpus = use_ngpu;
int np = use_ngpu;

//////////////////////////////////////////////////////////////////////
// Allocate space and perform the computation
//////////////////////////////////////////////////////////////////////

double nstream_time(0);

const size_t bytes = length * sizeof(double);
auto h_A = prk::vector<double>(length, 0);
auto h_B = prk::vector<double>(length, 2);
auto h_C = prk::vector<double>(length, 2);

auto h_A = prk::vector<double>(length);
auto h_B = prk::vector<double>(length);
auto h_C = prk::vector<double>(length);
auto d_A = std::vector<double*> (np, nullptr);
auto d_B = std::vector<double*> (np, nullptr);
auto d_C = std::vector<double*> (np, nullptr);

for (size_t i=0; i<length; ++i) {
h_A[i] = 0;
h_B[i] = 2;
h_C[i] = 2;
}
qs.allocate<double>(d_A, local_length);
qs.allocate<double>(d_B, local_length);
qs.allocate<double>(d_C, local_length);
qs.waitall();

std::vector<size_t> ls(ngpus,0);
{
const size_t elements_per_gpu = prk::divceil(length, ngpus);
for (int g=0; g<ngpus; ++g) {
ls[g] = elements_per_gpu;
}
if (elements_per_gpu * ngpus > length) {
ls[ngpus-1] = length - (ngpus-1) * elements_per_gpu;
}
}

auto d_A = std::vector<double*> (ngpus, nullptr);
auto d_B = std::vector<double*> (ngpus, nullptr);
auto d_C = std::vector<double*> (ngpus, nullptr);

for (int g=0; g<ngpus; ++g) {
auto q = qs[g];

const auto local_length = ls[g];
const auto local_bytes = local_length * sizeof(double);

d_A[g] = syclx::malloc_device<double>(local_length, q);
d_B[g] = syclx::malloc_device<double>(local_length, q);
d_C[g] = syclx::malloc_device<double>(local_length, q);
q.wait();

const size_t start = (g>0) ? ls[g-1] : 0;
const size_t size = ls[g] * sizeof(double);
q.memcpy(d_A[g], &(h_A[start]), size);
q.memcpy(d_B[g], &(h_B[start]), size);
q.memcpy(d_C[g], &(h_C[start]), size);
q.wait();
}
qs.scatter<double>(d_A, h_A, local_length);
qs.scatter<double>(d_B, h_B, local_length);
qs.scatter<double>(d_C, h_C, local_length);
qs.waitall();

for (size_t i=0; i<length; ++i) {
h_A[i] = -77777777;
}
// overwrite host buffer with garbage to detect bugs
h_A.fill(-77777777);

const double scalar(3);
{
for (int iter = 0; iter<=iterations; iter++) {

if (iter==1) nstream_time = prk::wtime();

for (int g=0; g<ngpus; ++g) {
auto q = qs[g];
for (int g=0; g<np; ++g) {
auto q = qs.queue(g);

auto p_A = d_A[g];
auto p_B = d_B[g];
auto p_C = d_C[g];

const size_t size = ls[g];
const size_t size = local_length;

q.submit([&](sycl::handler& h) {
h.parallel_for( sycl::range<1>{size}, [=] (sycl::id<1> i) {
p_A[i] += p_B[i] + scalar * p_C[i];
});
});
}
for (auto & q : qs) {
q.wait();
}
qs.waitall();
}
nstream_time = prk::wtime() - nstream_time;
}

for (int g=0; g<ngpus; ++g) {
auto q = qs[g];
qs.gather<double>(h_A, d_A, local_length);
qs.waitall();

const size_t start = (g>0) ? ls[g-1] : 0;
const size_t size = ls[g] * sizeof(double);

q.memcpy(&(h_A[start]), d_A[g], size);
q.wait();

syclx::free(d_C[g], q);
syclx::free(d_B[g], q);
syclx::free(d_A[g], q);
q.wait();
}
qs.free(d_A);
qs.free(d_B);
qs.free(d_C);
qs.waitall();

//////////////////////////////////////////////////////////////////////
/// Analyze and output results
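The prk::SYCL::queues helper that this rewrite relies on (allocate, scatter, gather, free, waitall, queue(g)) lives in prk_sycl.h, which is not part of this diff view. The following is only a sketch of a class with the same surface, written against plain std::vector host buffers instead of prk::vector, so the calls above are easier to follow; the PR's actual implementation may differ.

```cpp
#include <CL/sycl.hpp>
#include <vector>

// Illustrative multi-device queue helper: one queue per visible CPU/GPU
// device, USM device allocations, and blocked host<->device scatter/gather.
class queues_sketch {
    std::vector<sycl::queue> list_;
  public:
    queues_sketch() {
        for (auto& d : sycl::device::get_devices()) {
            if (d.is_gpu() || d.is_cpu()) list_.emplace_back(d);
        }
    }
    size_t size() const { return list_.size(); }
    sycl::queue& queue(size_t g) { return list_[g]; }
    void waitall() { for (auto& q : list_) q.wait(); }

    template <typename T>
    void allocate(std::vector<T*>& ptrs, size_t n) {
        for (size_t g = 0; g < ptrs.size(); ++g)
            ptrs[g] = sycl::malloc_device<T>(n, list_[g]);
    }
    template <typename T>
    void free(std::vector<T*>& ptrs) {
        for (size_t g = 0; g < ptrs.size(); ++g) sycl::free(ptrs[g], list_[g]);
    }
    // host -> devices: device g receives elements [g*n, (g+1)*n) of src
    template <typename T>
    void scatter(std::vector<T*>& dst, const std::vector<T>& src, size_t n) {
        for (size_t g = 0; g < dst.size(); ++g)
            list_[g].memcpy(dst[g], &src[g*n], n * sizeof(T));
    }
    // devices -> host: the inverse of scatter
    template <typename T>
    void gather(std::vector<T>& dst, const std::vector<T*>& src, size_t n) {
        for (size_t g = 0; g < src.size(); ++g)
            list_[g].memcpy(&dst[g*n], src[g], n * sizeof(T));
    }
};
```

One queue per device keeps the per-device kernels and copies independent; waitall() is the only synchronization point, which matches how the benchmark loop above uses it.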
83,953 changes: 83,953 additions & 0 deletions Cxx11/nstream-openmp-target.bc


1 change: 1 addition & 0 deletions Cxx11/occa-git
Submodule occa-git added at a73b68
14 changes: 14 additions & 0 deletions Cxx11/pic-dir/Makefile
@@ -0,0 +1,14 @@


all: pic-dpcpp

pic-dpcpp: pic-dpcpp.cc random_draw.o
dpcpp -DPRKVERSION=\"ALCF\" -DUSE_CPU -g -O3 pic-dpcpp.cc random_draw.o -o pic-dpcpp

random_draw.o: random_draw.c random_draw.h
clang -O3 -c random_draw.c -o random_draw.o

clean:
-rm -f random_draw.o
-rm -f pic-dpcpp.o
-rm -f pic-dpcpp
6 changes: 6 additions & 0 deletions Cxx11/pic-dir/PIC.md
@@ -0,0 +1,6 @@
```sh
./pic-dpcpp 10 1000 1000000 1 2 GEOMETRIC 0.99
./pic-dpcpp 10 1000 1000000 0 1 SINUSOIDAL
./pic-dpcpp 10 1000 1000000 1 0 LINEAR 1.0 3.0
./pic-dpcpp 10 1000 1000000 1 0 PATCH 0 200 100 200
```
Binary file added Cxx11/pic-dir/pic-dpcpp
686 changes: 686 additions & 0 deletions Cxx11/pic-dir/pic-dpcpp.cc


383 changes: 383 additions & 0 deletions Cxx11/pic-dir/prk_util.h
@@ -0,0 +1,383 @@
///
/// Copyright (c) 2018, Intel Corporation
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
/// copyright notice, this list of conditions and the following
/// disclaimer in the documentation and/or other materials provided
/// with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
/// contributors may be used to endorse or promote products
/// derived from this software without specific prior written
/// permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

#ifndef PRK_UTIL_H
#define PRK_UTIL_H

#include <cstdio>
#include <cstdlib> // atoi, getenv
#include <cstdint>
#include <cfloat> // FLT_MIN
#include <climits>

// Test standard library _after_ standard headers have been included...
#if !defined(__NVCC__) && !defined(__PGI) && !defined(__ibmxl__) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI)
# warning You are using an ancient version of GNU libstdc++. Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option.
#endif

#if !(defined(__cplusplus) && (__cplusplus >= 201103L))
# error You need a C++11 compiler or a newer C++ standard library.
#endif

#include <string>
#include <iostream>
#include <iomanip> // std::setprecision
#include <exception>
#include <list>
#include <vector>

#include <chrono>
#include <typeinfo>
#include <array>
#include <atomic>
#include <numeric>
#include <algorithm>

//#include "prk_simd.h"

#ifdef USE_RANGES
# include "prk_ranges.h"
#endif

// omp_get_wtime()
#if defined(USE_OPENMP) && defined(_OPENMP)
#include <omp.h>
#endif

#define RESTRICT __restrict__

#if (defined(__cplusplus) && (__cplusplus >= 201703L))
#define PRK_UNUSED [[maybe_unused]]
#else
#define PRK_UNUSED
#endif

namespace prk {

template <typename T>
bool is_power_of_2(T n) {
#if defined(__GNUC__) || defined(__clang__)
return (1 == __builtin_popcount(n));
#else
return ( (n & (~n+1)) == n );
#endif
}

int get_alignment(void)
{
/* a := alignment */
#ifdef PRK_ALIGNMENT
int a = PRK_ALIGNMENT;
#else
const char* temp = std::getenv("PRK_ALIGNMENT");
int a = (temp!=nullptr) ? std::atoi(temp) : 64;
if (a < 8) a = 8;
if ( !prk::is_power_of_2(a) ) {
std::cout << "You requested alignment (" << a << ") that is not a power of two!" << std::endl;
std::abort();
}
#endif
return a;
}

#if defined(__INTEL_COMPILER)

template <typename T>
T * malloc(size_t n)
{
const int alignment = prk::get_alignment();
const size_t bytes = n * sizeof(T);
return (T*)_mm_malloc( bytes, alignment);
}

template <typename T>
void free(T * p)
{
_mm_free(p);
p = nullptr;
}

#else // !__INTEL_COMPILER

template <typename T>
T * malloc(size_t n)
{
const int alignment = prk::get_alignment();
const size_t bytes = n * sizeof(T);

// We cannot use C11 aligned_alloc on Mac.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69680
// GCC claims to be C11 without knowing if glibc is compliant...
#if !defined(__GNUC__) && \
!defined(__APPLE__) && \
defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && 0

// From ISO C11:
//
// "The aligned_alloc function allocates space for an object
// whose alignment is specified by alignment, whose size is
// specified by size, and whose value is indeterminate.
// The value of alignment shall be a valid alignment supported
// by the implementation and the value of size shall be an
// integral multiple of alignment."
//
// Thus, if we do not round up the bytes to be a multiple
// of the alignment, we violate ISO C.
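//
// Example: alignment = 64, bytes = 100  =>  excess = 36, padded = 128.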

size_t padded = bytes;
const size_t excess = bytes % alignment;
if (excess>0) padded += (alignment - excess);
return aligned_alloc(alignment,padded);

#else

T * ptr = nullptr;
const int ret = posix_memalign((void**)&ptr,alignment,bytes);
if (ret!=0) ptr = nullptr;
return ptr;

#endif

}

template <typename T>
void free(T * p)
{
std::free(p);
p = nullptr;
}

#endif // __INTEL_COMPILER

template<class I, class T>
const T reduce(I first, I last, T init) {
#if (defined(__cplusplus) && (__cplusplus >= 201703L)) && !defined(__GNUC__)
return std::reduce(first, last, init);
#elif (defined(__cplusplus) && (__cplusplus >= 201103L))
return std::accumulate(first, last, init);
#else
// unreachable, but preserved as reference implementation
T r(0);
for (I i=first; i!=last; ++i) {
r += *i;
}
return r;
#endif
}

template <typename T>
class vector {

private:
T * data_;
size_t size_;

public:

vector(size_t n) {
//this->data_ = new T[n];
this->data_ = prk::malloc<T>(n);
this->size_ = n;
}

vector(size_t n, T v) {
//this->data_ = new T[n];
this->data_ = prk::malloc<T>(n);
for (size_t i=0; i<n; ++i) this->data_[i] = v;
this->size_ = n;
}

~vector() {
//delete[] this->data_;
prk::free<T>(this->data_);
}

void operator~() {
this->~vector();
}

T * data() {
return this->data_;
}

size_t size() {
return this->size_;
}

#if 0
T const & operator[] (int n) const {
return this->data_[n];
}

T & operator[] (int n) {
return this->data_[n];
}
#endif

T const & operator[] (size_t n) const {
return this->data_[n];
}

T & operator[] (size_t n) {
return this->data_[n];
}

T * begin() {
return &(this->data_[0]);
}

T * end() {
return &(this->data_[this->size_]);
}

#if 0
T & begin() {
return this->data_[0];
}

T & end() {
return this->data_[this->size_];
}
#endif
};

static inline double wtime(void)
{
#if defined(USE_OPENMP) && defined(_OPENMP)
return omp_get_wtime();
#else
using t = std::chrono::high_resolution_clock;
auto c = t::now().time_since_epoch().count();
auto n = t::period::num;
auto d = t::period::den;
double r = static_cast<double>(c)/static_cast<double>(d)*static_cast<double>(n);
return r;
#endif
}

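// integer ceiling division, e.g. divceil(7,4) == 2 and divceil(8,4) == 2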
template <class T1, class T2>
static inline auto divceil(T1 numerator, T2 denominator) -> decltype(numerator / denominator) {
return ( numerator / denominator + (numerator % denominator > 0) );
}

bool parse_boolean(const std::string & s)
{
if (s=="t" || s=="T" || s=="y" || s=="Y" || s=="1") {
return true;
} else {
return false;
}

}

template<typename T>
T * alloc(size_t bytes)
{
int alignment = ::prk::get_alignment();
#if defined(__INTEL_COMPILER)
return (T*)_mm_malloc(bytes,alignment);
#else
T * ptr = nullptr;
int ret = posix_memalign((void**)&ptr,alignment,bytes);
if (ret!=0) ptr = nullptr;
return ptr;
#endif

}

template<typename T>
void dealloc(T * p)
{
#if defined(__INTEL_COMPILER)
_mm_free((void*)p);
#else
free((void*)p);
#endif
}

int get_max_matrix_size(void)
{
// std::floor( std::sqrt(INT_MAX) )
return 46340;
}

template <typename T>
T abs(T x) {
return (x >= 0 ? x : -x);
}

template <>
float abs(float x) {
return __builtin_fabsf(x);
}

template <>
double abs(double x) {
return __builtin_fabs(x);
}

template <typename T>
T sqrt(T x) {
double y = static_cast<double>(x);
double z = __builtin_sqrt(y);
return static_cast<T>(z);
}

template <>
float sqrt(float x) {
return __builtin_sqrtf(x);
}

template <>
double sqrt(double x) {
return __builtin_sqrt(x);
}

template <typename T>
T pow(T x, int n) {
double y = static_cast<double>(x);
double z = __builtin_pow(y,n);
return static_cast<T>(z);
}

template <>
double pow(double x, int n) {
return __builtin_pow(x,n);
}

template <>
float pow(float x, int n) {
return static_cast<float>(__builtin_pow(x,n));
}

} // namespace prk

#endif /* PRK_UTIL_H */
190 changes: 190 additions & 0 deletions Cxx11/pic-dir/random_draw.c
@@ -0,0 +1,190 @@
/*
Copyright (c) 2015, Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

/**********************************************************************
Name: LCG
Purpose: Provide a mixed Linear Congruential Generator of pseudo-random
numbers with a period of 2^64, plus tools to jump ahead in a sequence
of such generated numbers. For details, see individual functions.
Functions: LCG_next: a new pseudo-random number
LCG_get_chunk: return subset of an interval of natural numbers
LCG_init: initialize the generator
LCG_jump: jump ahead into a sequence of pseudo-random numbers
random_draw: draw a pseudo-random value with a prescribed mean
Notes: LCG_init must be called by each thread or rank before any jump
into a sequence of pseudo-random numbers is made
History: Written by Rob Van der Wijngaart, December 2015
**********************************************************************/

#include <math.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h>

#include "random_draw.h"

static uint64_t LCG_a = 6364136223846793005;
static uint64_t LCG_c = 1442695040888963407;
static uint64_t LCG_seed_init = 27182818285; //used to (re)set seed

/* generator state is carried per-instance in random_draw_t, so no threadprivate globals are needed */

/* for a range of 0 to size-i, find chunk assigned to calling thread */
void LCG_get_chunk(uint64_t *start, uint64_t *end, int tid, int nthreads, uint64_t size) {
uint64_t chunk, remainder;
chunk = size/nthreads;
remainder = size - chunk*nthreads;

if ((uint64_t)tid < remainder) {
*start = tid*(chunk+1);
*end = *start + chunk;
}
else {
*start = remainder*(chunk+1) + (tid-remainder)*chunk;
*end = *start + chunk -1;
}
return;
}


static uint64_t tail(uint64_t x) {
uint64_t x2 = x;
if (!x) return x;
uint64_t result = 1;
while (x>>=1) result <<=1;
return (x2 - result);
}

/* Sum(i=1,2^k) a^i */
static uint64_t SUMPOWER(int k, random_draw_t *parm) {
if (!k) return LCG_a;
return SUMPOWER(k-1, parm)*(1+parm->LCG_A[k-1]);
}

static int LOG(uint64_t n) {
int result = 0;
while (n>>=1) result++;
return(result);
}

/* Sum(i=1,n) a^i, with n arbitrary */
static uint64_t SUMK(uint64_t n, random_draw_t *parm) {
if (n==0) return(0);
uint64_t HEAD = SUMPOWER(LOG(n),parm);
uint64_t TAILn = tail(n);
if (TAILn==0) return(HEAD);
return(HEAD + (parm->LCG_A[LOG(n)])*SUMK(TAILn,parm));
}

uint64_t LCG_next(uint64_t bound, random_draw_t *parm) {
parm->LCG_seed = LCG_a*parm->LCG_seed + LCG_c;
return (parm->LCG_seed%bound);
}

void LCG_init(random_draw_t *parm){

int i;

parm->LCG_seed = LCG_seed_init;
parm->LCG_A[0] = LCG_a;
for (i=1; i<NMAX; i++) {
parm->LCG_A[i] = parm->LCG_A[i-1]*parm->LCG_A[i-1];
}
return;
}

void LCG_jump(uint64_t m, uint64_t bound, random_draw_t *parm){

int i, index, LCG_power[NMAX];
uint64_t mm, s_part;

for (i=0; i<NMAX; i++) LCG_power[i] = 0;
parm->LCG_seed = LCG_seed_init;

/* Catch two special cases */
switch (m) {
case 0: return;
case 1: LCG_next(bound, parm); return;
}

mm = m;
index = 0;
while (mm) {
LCG_power[index++] = mm&1;
mm >>=1;
}

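/* closed form of m applications of seed' = a*seed + c:
   seed_m = a^m * seed_0 + c*(1 + a + ... + a^(m-1))
          = a^m * seed_0 + (SUMK(m-1) + 1)*c,
   where a^m is assembled below from the precomputed squarings LCG_A[i] */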
s_part = 1;
for (i=0; i<index; i++) if (LCG_power[i]) s_part *= parm->LCG_A[i];
parm->LCG_seed = s_part*parm->LCG_seed + (SUMK(m-1,parm)+1)*LCG_c;
return;
}

uint64_t random_draw(double mu, random_draw_t *parm)
{
const double two_pi = 2.0*3.14159265358979323846;
const uint64_t rand_max = ULLONG_MAX;
const double rand_div = 1.0/ULLONG_MAX;
const uint64_t denominator = UINT_MAX;

static double z0, z1;
double u0, u1, sigma;
static uint64_t numerator;
static uint64_t i0, i1;

if (mu>=1.0) {
/* set std dev equal to 15% of average; ensures result will never be negative */
sigma = mu*0.15;
u0 = LCG_next(rand_max, parm) * rand_div;
u1 = LCG_next(rand_max, parm) * rand_div;

z0 = sqrt(-2.0 * log(u0)) * cos(two_pi * u1);
z1 = sqrt(-2.0 * log(u0)) * sin(two_pi * u1);
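/* z0 and z1 are the paired Box-Muller deviates; only z0 is consumed here */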
return (uint64_t) (z0 * sigma + mu+0.5);
}
else {
/* we need to pick two integers whose quotient approximates mu; set one to UINT_MAX */
numerator = (uint32_t) (mu*(double)denominator);
i0 = LCG_next(denominator, parm); /* don't use value, but must call LCG_next twice */
i1 = LCG_next(denominator, parm);
return ((uint64_t)(i1<=numerator));
}

}
57 changes: 57 additions & 0 deletions Cxx11/pic-dir/random_draw.h
@@ -0,0 +1,57 @@
/*
Copyright (c) 2015, Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef __cplusplus
extern "C" {
#endif

#ifndef RANDOM_DRAW_H
#define RANDOM_DRAW_H

#include <stdint.h> /* uint64_t */

#define NMAX 64

typedef struct {
uint64_t LCG_seed;
uint64_t LCG_A[NMAX];
} random_draw_t;

extern void LCG_init(random_draw_t *);
extern uint64_t LCG_next(uint64_t, random_draw_t *);
extern void LCG_get_chunk(uint64_t *, uint64_t *, int, int, uint64_t);
extern void LCG_jump(uint64_t, uint64_t, random_draw_t *);
extern uint64_t random_draw(double, random_draw_t *);

#endif /* RANDOM_DRAW_H */

#ifdef __cplusplus
}
#endif
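Not part of the diff: a minimal sketch of how the LCG interface declared above is typically driven from a host program. Only the function names come from `random_draw.h`; the worker count, draw count, and `main()` wrapper are illustrative.

```cpp
#include <cstdio>
#include <cstdint>
#include "random_draw.h"

int main()
{
  random_draw_t rng;
  LCG_init(&rng);                        // must precede any jump into the sequence

  // split a stream of 1,000,000 draws across 4 workers; worker 2 shown
  uint64_t start, end;
  LCG_get_chunk(&start, &end, 2, 4, 1000000);

  // each random_draw() below consumes two raw LCG values, so jump by 2*start;
  // the bound argument only matters for the m==1 special case
  LCG_jump(2*start, UINT64_MAX, &rng);

  uint64_t sum = 0;
  for (uint64_t i = start; i <= end; i++) {
    sum += random_draw(10.0, &rng);      // roughly Gaussian draw with mean 10
  }
  std::printf("worker 2: draws %llu..%llu, sum %llu\n",
              (unsigned long long)start, (unsigned long long)end,
              (unsigned long long)sum);
  return 0;
}
```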
35 changes: 35 additions & 0 deletions Cxx11/platforms_devices.cpp
@@ -0,0 +1,35 @@
#include <iostream>
#include <CL/sycl.hpp>
namespace sycl = cl::sycl;
int
main(int argc, char *argv[])
{
auto platforms = sycl::platform::get_platforms();
std::cout << platforms.size() << " Platforms" << std::endl;
for (auto &platform : platforms) {
std::cout << "Platform: "
<< platform.get_info<sycl::info::platform::name>()
<< std::endl;
}

size_t number = 0;
for (auto &OnePlatform : platforms) {
std::cout
<< ++number << " found..."
<< std::endl
<< "Platform: "
<< OnePlatform.get_info<sycl::info::platform::name>()
<< std::endl;

/* Loop through the devices SYCL can find;
* there is always at least one */
auto MyDevices = OnePlatform.get_devices();
for (auto &OneDevice : MyDevices ) {
std::cout
<< " Device: "
<< OneDevice.get_info<sycl::info::device::name>()
<< std::endl;
} //devices
std::cout << std::endl;
}//platforms
}
200 changes: 200 additions & 0 deletions Cxx11/prk_sycl.h
@@ -4,12 +4,17 @@
#include <cstdlib>
#include <iostream>

//#include <iterator> // std::distance
#include <boost/range/adaptor/indexed.hpp>

#include "CL/sycl.hpp"

#ifdef __COMPUTECPP__
#include "SYCL/experimental/usm.h"
#endif

#include "prk_util.h" // prk::vector

namespace sycl = cl::sycl;

#ifdef __COMPUTECPP__
@@ -84,6 +89,201 @@ namespace prk {
#endif
}

class queues {

private:
std::vector<sycl::queue> list;

public:
queues(bool use_cpu = true, bool use_gpu = true)
{
auto platforms = sycl::platform::get_platforms();
for (auto & p : platforms) {
auto pname = p.get_info<sycl::info::platform::name>();
std::cout << "*Platform: " << pname << std::endl;
#if 0
if ( pname.find("Level-Zero") != std::string::npos) {
std::cout << "*Level Zero GPU skipped" << std::endl;
continue;
}
#endif
if ( pname.find("Intel") == std::string::npos) {
std::cout << "*non-Intel skipped" << std::endl;
continue;
}
auto devices = p.get_devices();
for (auto & d : devices ) {
std::cout << "**Device: " << d.get_info<sycl::info::device::name>() << std::endl;
if ( d.is_gpu() && use_gpu ) {
std::cout << "**Device is GPU - adding to vector of queues" << std::endl;
list.push_back(sycl::queue(d));
}
}
#if 1
for (auto & d : devices ) {
std::cout << "**Device: " << d.get_info<sycl::info::device::name>() << std::endl;
if ( d.is_cpu() && use_cpu ) {
std::cout << "**Device is CPU - adding to vector of queues" << std::endl;
list.push_back(sycl::queue(d));
}
}
#endif
}
}

int size(void)
{
return list.size();
}

void wait(int i)
{
if ( i < 0 || i >= this->size() ) {
std::cerr << "ERROR: invalid device id: " << i << std::endl;
}
list.at(i).wait();
}

void waitall(void)
{
for (auto & i : list) {
i.wait();
}
}

sycl::queue queue(int i) {
if ( i < 0 || i >= this->size() ) {
std::cerr << "ERROR: invalid device id: " << i << std::endl;
}
return this->list.at(i);
}

template <typename T>
void allocate(std::vector<T*> & device_pointers,
size_t num_elements)
{
for (const auto & l : list | boost::adaptors::indexed(0) ) {
auto i = l.index();
auto v = l.value();
device_pointers[i] = syclx::malloc_device<T>(num_elements, v);
}
}

template <typename T>
void free(std::vector<T*> & device_pointers)
{
for (const auto & l : list | boost::adaptors::indexed(0) ) {
auto i = l.index();
auto v = l.value();
syclx::free(device_pointers[i], v);
}
}

template <typename T, typename B>
void broadcast(std::vector<T*> & device_pointers,
const B & host_pointer,
size_t num_elements)
{
auto bytes = num_elements * sizeof(T);
for (const auto & l : list | boost::adaptors::indexed(0) ) {
auto i = l.index();
auto v = l.value();
auto target = device_pointers[i];
auto source = &host_pointer[0];
std::cout << "BCAST: device " << i << std::endl;
v.memcpy(target, source, bytes);
}
}

template <typename T, typename B>
void reduce(B & host_pointer,
const std::vector<T*> & device_pointers,
size_t num_elements)
{
std::cout << "REDUCE: num_elements " << num_elements << std::endl;
auto bytes = num_elements * sizeof(T);
std::cout << "REDUCE: bytes " << bytes << std::endl;
auto temp = prk::vector<T>(num_elements, 0);
for (const auto & l : list | boost::adaptors::indexed(0) ) {
auto i = l.index();
auto v = l.value();
std::cout << "REDUCE: device " << i << std::endl;
auto target = &temp[0];
auto source = device_pointers[i];
v.memcpy(target, source, bytes);
v.wait(); // the copy must complete before temp is read on the host below
for (size_t e=0; e<num_elements; ++e) {
host_pointer[e] += temp[e];
}
}
}

template <typename T, typename B>
void gather(B & host_pointer,
const std::vector<T*> & device_pointers,
size_t num_elements)
{
auto bytes = num_elements * sizeof(T);
for (const auto & l : list | boost::adaptors::indexed(0) ) {
auto i = l.index();
auto v = l.value();
auto target = &host_pointer[i * num_elements];
auto source = device_pointers[i];
v.memcpy(target, source, bytes);
}
}

template <typename T, typename B>
void scatter(std::vector<T*> & device_pointers,
const B & host_pointer,
size_t num_elements)
{
auto bytes = num_elements * sizeof(T);
for (const auto & l : list | boost::adaptors::indexed(0) ) {
auto i = l.index();
auto v = l.value();
auto target = device_pointers[i];
auto source = &host_pointer[i * num_elements];
v.memcpy(target, source, bytes);
}
}

// num_elements is defined the same as MPI
// each device contributes np * num_elements
// each device receives np * num_elements
template <typename T>
void alltoall(std::vector<T*> & device_pointers_out,
std::vector<T*> & device_pointers_in,
size_t num_elements)
{
auto bytes = num_elements * sizeof(T);
// allocate np*np temp space on the host, because
// we cannot copy device-to-device if they are in
// different contexts.
// we can specialize for single-context later...
int np = this->list.size();
prk::vector<T> temp(num_elements * np * np);
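// temp layout: the block starting at temp[(i*np + j)*num_elements]
// holds device i's contribution destined for device j (MPI send-buffer ordering)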

// gather phase - contiguous
for (const auto & l : list | boost::adaptors::indexed(0) ) {
auto i = l.index();
auto v = l.value();
auto target = &temp[i * np * num_elements];
auto source = device_pointers_in[i];
v.memcpy(target, source, np * bytes);
}

this->waitall(); // staging copies must land in temp before redistribution begins

// scatter phase - noncontiguous: device i receives chunk i from every source device j
for (const auto & l : list | boost::adaptors::indexed(0) ) {
auto i = l.index();
auto v = l.value();
for (int j=0; j<np; ++j) {
auto target = device_pointers_out[i] + j * num_elements;
auto source = &temp[(j * np + i) * num_elements];
v.memcpy(target, source, bytes);
}
}

this->waitall(); // temp is local to this call, so all reads from it must finish here
}
};

} // namespace SYCL

} // namespace prk
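Not part of the diff: a minimal sketch of how the collectives above compose in a multi-device driver, assuming the class is reachable as `prk::SYCL::queues` (as the enclosing namespaces suggest). The function name, the placeholder kernel, and the sizes are illustrative, not taken from the PR.

```cpp
#include "prk_sycl.h"
#include "prk_util.h"

// hypothetical driver fragment
void run_on_all_devices(size_t local_length)
{
  prk::SYCL::queues qs(true, true);             // enumerate CPU and GPU queues
  const int ndev = qs.size();

  prk::vector<double> h_A(ndev * local_length, 0);

  std::vector<double*> d_A(ndev, nullptr);
  qs.allocate<double>(d_A, local_length);       // one USM allocation per device
  qs.waitall();

  qs.scatter<double>(d_A, h_A, local_length);   // device g receives h_A[g*local_length ...]
  qs.waitall();

  for (int g=0; g<ndev; ++g) {
    auto q = qs.queue(g);
    auto p = d_A[g];
    q.submit([&](sycl::handler& h) {
      h.parallel_for(sycl::range<1>{local_length}, [=] (sycl::id<1> i) {
        p[i] += 1.0;                            // placeholder per-device kernel
      });
    });
  }
  qs.waitall();

  qs.gather<double>(h_A, d_A, local_length);    // copy each device's slice back
  qs.waitall();

  qs.free(d_A);
  qs.waitall();
}
```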
7 changes: 4 additions & 3 deletions Cxx11/prk_util.h
@@ -226,20 +226,17 @@ namespace prk {
public:

vector(size_t n) {
//this->data_ = new T[n];
this->data_ = prk::malloc<T>(n);
this->size_ = n;
}

vector(size_t n, T v) {
//this->data_ = new T[n];
this->data_ = prk::malloc<T>(n);
for (size_t i=0; i<n; ++i) this->data_[i] = v;
this->size_ = n;
}

~vector() {
//delete[] this->data_;
prk::free<T>(this->data_);
}

@@ -281,6 +278,10 @@ namespace prk {
return &(this->data_[this->size_]);
}

void fill(T v) {
for (size_t i=0; i<this->size_; ++i) this->data_[i] = v;
}

#if 0
T & begin() {
return this->data_[0];
255 changes: 255 additions & 0 deletions Cxx11/stencil-dpcpp-1.cc
@@ -0,0 +1,255 @@
///
/// Copyright (c) 2020, Intel Corporation
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
/// copyright notice, this list of conditions and the following
/// disclaimer in the documentation and/or other materials provided
/// with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
/// contributors may be used to endorse or promote products
/// derived from this software without specific prior written
/// permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

//////////////////////////////////////////////////////////////////////
///
/// NAME: Stencil
///
/// PURPOSE: This program tests the efficiency with which a space-invariant,
/// linear, symmetric filter (stencil) can be applied to a square
/// grid or image.
///
/// USAGE: The program takes as input the linear
/// dimension of the grid, and the number of iterations on the grid
///
/// <progname> <iterations> <grid size>
///
/// The output consists of diagnostics to make sure the
/// algorithm worked, and of timing statistics.
///
/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
/// - RvdW: Removed unrolling pragmas for clarity;
/// added constant to array "in" at end of each iteration to force
/// refreshing of neighbor data in parallel versions; August 2013
/// C++11-ification by Jeff Hammond, May 2017.
///
//////////////////////////////////////////////////////////////////////

#include "prk_sycl.h"
#include "prk_util.h"
#include "stencil_sycl.hpp"

template <typename T>
void nothing(sycl::queue & q, const size_t n, const T * in, T * out)
{
std::cout << "You are trying to use a stencil that does not exist.\n";
std::cout << "Please generate the new stencil using the code generator\n";
std::cout << "and add it to the case-switch in the driver." << std::endl;
prk::Abort();
}

int main(int argc, char* argv[])
{
std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
std::cout << "C++11/DPC++ Stencil execution on 2D grid" << std::endl;

//////////////////////////////////////////////////////////////////////
// Process and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
size_t n, tile_size;
bool star = true;
size_t radius = 2;
try {
if (argc < 3) {
throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
}

// number of times to run the algorithm
iterations = std::atoi(argv[1]);
if (iterations < 1) {
throw "ERROR: iterations must be >= 1";
}

// linear grid dimension
n = std::atoi(argv[2]);
if (n < 1) {
throw "ERROR: grid dimension must be positive";
} else if (n > prk::get_max_matrix_size()) {
throw "ERROR: grid dimension too large - overflow risk";
}

// default tile size for tiling of local transpose
tile_size = 32;
if (argc > 3) {
tile_size = std::atoi(argv[3]);
if (tile_size <= 0) tile_size = n;
if (tile_size > n) tile_size = n;
}

// stencil pattern
if (argc > 4) {
auto stencil = std::string(argv[4]);
auto grid = std::string("grid");
star = (stencil == grid) ? false : true;
}

// stencil radius
radius = 2;
if (argc > 5) {
radius = std::atoi(argv[5]);
}

if ( (radius < 1) || (2*radius+1 > n) ) {
throw "ERROR: Stencil radius negative or too large";
}
}
catch (const char * e) {
std::cout << e << std::endl;
return 1;
}

std::cout << "Number of iterations = " << iterations << std::endl;
std::cout << "Grid size = " << n << std::endl;
std::cout << "Tile size = " << tile_size << std::endl;
std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl;
std::cout << "Radius of stencil = " << radius << std::endl;

auto stencil = nothing<double>;
if (star) {
switch (radius) {
case 1: stencil = star1<double>; break;
case 2: stencil = star2<double>; break;
case 3: stencil = star3<double>; break;
case 4: stencil = star4<double>; break;
case 5: stencil = star5<double>; break;
}
}
#if 0
else {
switch (radius) {
case 1: stencil = grid1; break;
case 2: stencil = grid2; break;
case 3: stencil = grid3; break;
case 4: stencil = grid4; break;
case 5: stencil = grid5; break;
}
}
#endif

sycl::queue q(sycl::default_selector{});
prk::SYCL::print_device_platform(q);

//////////////////////////////////////////////////////////////////////
// Allocate space and perform the computation
//////////////////////////////////////////////////////////////////////

double stencil_time(0);

prk::vector<double> h_in(n*n, 0);
prk::vector<double> h_out(n*n, 0);

const size_t bytes = n * n * sizeof(double);

double * d_in = syclx::malloc_device<double>(n*n, q);
double * d_out = syclx::malloc_device<double>(n*n, q);
q.wait();

q.submit([&](sycl::handler& h) {
h.parallel_for(sycl::range<2> {n,n}, [=] (sycl::item<2> it) {
const auto i = it[0];
const auto j = it[1];
d_in[i*n+j] = static_cast<double>(i+j);
d_out[i*n+j] = static_cast<double>(0);
});
});
q.wait();

for (int iter = 0; iter<=iterations; iter++) {

if (iter==1) stencil_time = prk::wtime();

// Apply the stencil operator
stencil(q, n, d_in, d_out);
q.wait();

// Add constant to solution to force refresh of neighbor data, if any
q.submit([&](sycl::handler& h) {
h.parallel_for(sycl::range<2> {n,n}, [=] (sycl::item<2> it) {
const auto i = it[0];
const auto j = it[1];
d_in[i*n+j] += static_cast<double>(1);
});
});
q.wait();
}
stencil_time = prk::wtime() - stencil_time;

q.memcpy(&(h_in[0]), d_in, bytes);
q.memcpy(&(h_out[0]), d_out, bytes);
q.wait();

syclx::free(d_in, q);
syclx::free(d_out,q);
q.wait();

//////////////////////////////////////////////////////////////////////
// Analyze and output results.
//////////////////////////////////////////////////////////////////////

// interior of grid with respect to stencil
size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
double norm = 0.0;
for (int i=radius; i<n-radius; i++) {
for (int j=radius; j<n-radius; j++) {
norm += prk::abs(h_out[i*n+j]);
}
}
norm /= active_points;

// verify correctness
const double epsilon = 1.0e-8;
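// The star weights (defined in stencil_sycl.hpp) form a normalized first-derivative
// filter, so each application to the linear field in(i,j) = i + j (+ iteration shift)
// adds 2 at every interior point; after iterations+1 applications the average |out|
// is 2*(iterations+1).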
double reference_norm = 2.*(iterations+1.);
if (prk::abs(norm-reference_norm) > epsilon) {
std::cout << "ERROR: L1 norm = " << norm
<< " Reference L1 norm = " << reference_norm << std::endl;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
std::cerr << i << "," << j << " = " << h_in[i*n+j] <<", " << h_out[i*n+j] << "\n";
}
}
return 1;
} else {
std::cout << "Solution validates" << std::endl;
#ifdef VERBOSE
std::cout << "L1 norm = " << norm
<< " Reference L1 norm = " << reference_norm << std::endl;
#endif
const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
size_t flops = (2L*(size_t)stencil_size+1L) * active_points;
auto avgtime = stencil_time/iterations;
std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
<< " Avg time (s): " << avgtime << std::endl;
}

return 0;
}
255 changes: 255 additions & 0 deletions Cxx11/stencil-dpcpp-2.cc
@@ -0,0 +1,255 @@
///
/// Copyright (c) 2020, Intel Corporation
///
/// Redistribution and use in source and binary forms, with or without
/// modification, are permitted provided that the following conditions
/// are met:
///
/// * Redistributions of source code must retain the above copyright
/// notice, this list of conditions and the following disclaimer.
/// * Redistributions in binary form must reproduce the above
/// copyright notice, this list of conditions and the following
/// disclaimer in the documentation and/or other materials provided
/// with the distribution.
/// * Neither the name of Intel Corporation nor the names of its
/// contributors may be used to endorse or promote products
/// derived from this software without specific prior written
/// permission.
///
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
/// POSSIBILITY OF SUCH DAMAGE.

//////////////////////////////////////////////////////////////////////
///
/// NAME: Stencil
///
/// PURPOSE: This program tests the efficiency with which a space-invariant,
/// linear, symmetric filter (stencil) can be applied to a square
/// grid or image.
///
/// USAGE: The program takes as input the linear
/// dimension of the grid, and the number of iterations on the grid
///
/// <progname> <iterations> <grid size>
///
/// The output consists of diagnostics to make sure the
/// algorithm worked, and of timing statistics.
///
/// HISTORY: - Written by Rob Van der Wijngaart, February 2009.
/// - RvdW: Removed unrolling pragmas for clarity;
/// added constant to array "in" at end of each iteration to force
/// refreshing of neighbor data in parallel versions; August 2013
/// C++11-ification by Jeff Hammond, May 2017.
///
//////////////////////////////////////////////////////////////////////

#include "prk_sycl.h"
#include "prk_util.h"
#include "stencil_sycl.hpp"

template <typename T>
void nothing(sycl::queue & q, const size_t n, const T * in, T * out)
{
std::cout << "You are trying to use a stencil that does not exist.\n";
std::cout << "Please generate the new stencil using the code generator\n";
std::cout << "and add it to the case-switch in the driver." << std::endl;
prk::Abort();
}

int main(int argc, char* argv[])
{
std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
std::cout << "C++11/DPC++ Stencil execution on 2D grid" << std::endl;

//////////////////////////////////////////////////////////////////////
// Process and test input parameters
//////////////////////////////////////////////////////////////////////

int iterations;
size_t n, tile_size;
bool star = true;
size_t radius = 2;
try {
if (argc < 3) {
throw "Usage: <# iterations> <array dimension> [<tile_size> <star/grid> <radius>]";
}

// number of times to run the algorithm
iterations = std::atoi(argv[1]);
if (iterations < 1) {
throw "ERROR: iterations must be >= 1";
}

// linear grid dimension
n = std::atoi(argv[2]);
if (n < 1) {
throw "ERROR: grid dimension must be positive";
} else if (n > prk::get_max_matrix_size()) {
throw "ERROR: grid dimension too large - overflow risk";
}

// default tile size for tiling of local transpose
tile_size = 32;
if (argc > 3) {
tile_size = std::atoi(argv[3]);
if (tile_size <= 0) tile_size = n;
if (tile_size > n) tile_size = n;
}

// stencil pattern
if (argc > 4) {
auto stencil = std::string(argv[4]);
auto grid = std::string("grid");
star = (stencil == grid) ? false : true;
}

// stencil radius
radius = 2;
if (argc > 5) {
radius = std::atoi(argv[5]);
}

if ( (radius < 1) || (2*radius+1 > n) ) {
throw "ERROR: Stencil radius negative or too large";
}
}
catch (const char * e) {
std::cout << e << std::endl;
return 1;
}

std::cout << "Number of iterations = " << iterations << std::endl;
std::cout << "Grid size = " << n << std::endl;
std::cout << "Tile size = " << tile_size << std::endl;
std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl;
std::cout << "Radius of stencil = " << radius << std::endl;

auto stencil = nothing<double>;
if (star) {
switch (radius) {
case 1: stencil = star1<double>; break;
case 2: stencil = star2<double>; break;
case 3: stencil = star3<double>; break;
case 4: stencil = star4<double>; break;
case 5: stencil = star5<double>; break;
}
}
#if 0
else {
switch (radius) {
case 1: stencil = grid1; break;
case 2: stencil = grid2; break;
case 3: stencil = grid3; break;
case 4: stencil = grid4; break;
case 5: stencil = grid5; break;
}
}
#endif

sycl::queue q(sycl::default_selector{});
prk::SYCL::print_device_platform(q);

//////////////////////////////////////////////////////////////////////
// Allocate space and perform the computation
//////////////////////////////////////////////////////////////////////

double stencil_time(0);

prk::vector<double> h_in(n*n, 0);
prk::vector<double> h_out(n*n, 0);

const size_t bytes = n * n * sizeof(double);

double * d_in = syclx::malloc_device<double>(n*n, q);
double * d_out = syclx::malloc_device<double>(n*n, q);
q.wait();

q.submit([&](sycl::handler& h) {
h.parallel_for(sycl::range<2> {n,n}, [=] (sycl::item<2> it) {
const auto i = it[0];
const auto j = it[1];
d_in[i*n+j] = static_cast<double>(i+j);
d_out[i*n+j] = static_cast<double>(0);
});
});
q.wait();

for (int iter = 0; iter<=iterations; iter++) {

if (iter==1) stencil_time = prk::wtime();

// Apply the stencil operator
stencil(q, n, d_in, d_out);
q.wait();

// Add constant to solution to force refresh of neighbor data, if any
q.submit([&](sycl::handler& h) {
h.parallel_for(sycl::range<2> {n,n}, [=] (sycl::item<2> it) {
const auto i = it[0];
const auto j = it[1];
d_in[i*n+j] += static_cast<double>(1);
});
});
q.wait();
}
stencil_time = prk::wtime() - stencil_time;

q.memcpy(&(h_in[0]), d_in, bytes);
q.memcpy(&(h_out[0]), d_out, bytes);
q.wait();

syclx::free(d_in, q);
syclx::free(d_out,q);
q.wait();

//////////////////////////////////////////////////////////////////////
// Analyze and output results.
//////////////////////////////////////////////////////////////////////

// interior of grid with respect to stencil
size_t active_points = static_cast<size_t>(n-2*radius)*static_cast<size_t>(n-2*radius);
double norm = 0.0;
for (int i=radius; i<n-radius; i++) {
for (int j=radius; j<n-radius; j++) {
norm += prk::abs(h_out[i*n+j]);
}
}
norm /= active_points;

// verify correctness
const double epsilon = 1.0e-8;
double reference_norm = 2.*(iterations+1.);
if (prk::abs(norm-reference_norm) > epsilon) {
std::cout << "ERROR: L1 norm = " << norm
<< " Reference L1 norm = " << reference_norm << std::endl;
for (int i=0; i<n; i++) {
for (int j=0; j<n; j++) {
std::cerr << i << "," << j << " = " << h_in[i*n+j] <<", " << h_out[i*n+j] << "\n";
}
}
return 1;
} else {
std::cout << "Solution validates" << std::endl;
#ifdef VERBOSE
std::cout << "L1 norm = " << norm
<< " Reference L1 norm = " << reference_norm << std::endl;
#endif
const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1);
size_t flops = (2L*(size_t)stencil_size+1L) * active_points;
auto avgtime = stencil_time/iterations;
std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast<double>(flops)/avgtime
<< " Avg time (s): " << avgtime << std::endl;
}

return 0;
}