Skip to content

Commit 38d68cf

Browse files
author
Jeff Hammond
committed
add transpose multi-GPU DPC++ skeleton
1 parent bee98b8 commit 38d68cf

File tree

2 files changed

+182
-1
lines changed

2 files changed

+182
-1
lines changed

Cxx11/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ sycl-usm: nstream-sycl-usm nstream-sycl-explicit-usm stencil-sycl-usm transpose-
9797

9898
sycl-explicit: nstream-sycl-explicit transpose-sycl-explicit
9999

100-
dpcpp: nstream-dpcpp nstream-multigpu-dpcpp stencil-dpcpp stencil-multigpu-dpcpp transpose-dpcpp
100+
dpcpp: nstream-dpcpp nstream-multigpu-dpcpp stencil-dpcpp stencil-multigpu-dpcpp transpose-dpcpp transpose-multigpu-dpcpp
101101

102102
tbb: p2p-innerloop-tbb p2p-tbb stencil-tbb transpose-tbb nstream-tbb \
103103
p2p-hyperplane-tbb p2p-tasks-tbb

Cxx11/transpose-multigpu-dpcpp.cc

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
///
2+
/// Copyright (c) 2020, Intel Corporation
3+
///
4+
/// Redistribution and use in source and binary forms, with or without
5+
/// modification, are permitted provided that the following conditions
6+
/// are met:
7+
///
8+
/// * Redistributions of source code must retain the above copyright
9+
/// notice, this list of conditions and the following disclaimer.
10+
/// * Redistributions in binary form must reproduce the above
11+
/// copyright notice, this list of conditions and the following
12+
/// disclaimer in the documentation and/or other materials provided
13+
/// with the distribution.
14+
/// * Neither the name of Intel Corporation nor the names of its
15+
/// contributors may be used to endorse or promote products
16+
/// derived from this software without specific prior written
17+
/// permission.
18+
///
19+
/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20+
/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21+
/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22+
/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23+
/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24+
/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25+
/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26+
/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27+
/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28+
/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29+
/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30+
/// POSSIBILITY OF SUCH DAMAGE.
31+
32+
//////////////////////////////////////////////////////////////////////
33+
///
34+
/// NAME: transpose
35+
///
36+
/// PURPOSE: This program measures the time for the transpose of a
37+
/// column-major stored matrix into a row-major stored matrix.
38+
///
39+
/// USAGE: Program input is the matrix order and the number of times to
40+
/// repeat the operation:
41+
///
42+
/// transpose <# iterations> <matrix order>
43+
///
44+
/// The output consists of diagnostics to make sure the
45+
/// transpose worked and timing statistics.
46+
///
47+
/// HISTORY: Written by Rob Van der Wijngaart, February 2009.
48+
/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017.
49+
///
50+
//////////////////////////////////////////////////////////////////////
51+
52+
#include "prk_util.h"
53+
#include "prk_sycl.h"
54+
55+
int main(int argc, char * argv[])
56+
{
57+
std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl;
58+
std::cout << "C++11/DPCT Matrix transpose: B = A^T" << std::endl;
59+
60+
//////////////////////////////////////////////////////////////////////
61+
/// Read and test input parameters
62+
//////////////////////////////////////////////////////////////////////
63+
64+
int iterations;
65+
size_t order;
66+
try {
67+
if (argc < 3) {
68+
throw "Usage: <# iterations> <matrix order>";
69+
}
70+
71+
iterations = std::atoi(argv[1]);
72+
if (iterations < 1) {
73+
throw "ERROR: iterations must be >= 1";
74+
}
75+
76+
order = std::atoi(argv[2]);
77+
if (order <= 0) {
78+
throw "ERROR: Matrix Order must be greater than 0";
79+
} else if (order > prk::get_max_matrix_size()) {
80+
throw "ERROR: matrix dimension too large - overflow risk";
81+
}
82+
}
83+
catch (const char * e) {
84+
std::cout << e << std::endl;
85+
return 1;
86+
}
87+
88+
std::cout << "Number of iterations = " << iterations << std::endl;
89+
std::cout << "Matrix order = " << order << std::endl;
90+
91+
sycl::queue q(sycl::default_selector{});
92+
prk::SYCL::print_device_platform(q);
93+
94+
//////////////////////////////////////////////////////////////////////
95+
// Allocate space for the input and transpose matrix
96+
//////////////////////////////////////////////////////////////////////
97+
98+
const size_t nelems = (size_t)order * (size_t)order;
99+
const size_t bytes = nelems * sizeof(double);
100+
double * h_a = syclx::malloc_host<double>( nelems, q);
101+
double * h_b = syclx::malloc_host<double>( nelems, q);
102+
103+
// fill A with the sequence 0 to order^2-1
104+
for (int j=0; j<order; j++) {
105+
for (int i=0; i<order; i++) {
106+
h_a[j*order+i] = static_cast<double>(order*j+i);
107+
h_b[j*order+i] = static_cast<double>(0);
108+
}
109+
}
110+
111+
// copy input from host to device
112+
double * A = syclx::malloc_device<double>( nelems, q);
113+
double * B = syclx::malloc_device<double>( nelems, q);
114+
q.memcpy(A, &(h_a[0]), bytes).wait();
115+
q.memcpy(B, &(h_b[0]), bytes).wait();
116+
117+
auto trans_time = 0.0;
118+
119+
for (int iter = 0; iter<=iterations; iter++) {
120+
121+
if (iter==1) trans_time = prk::wtime();
122+
123+
q.submit([&](sycl::handler& h) {
124+
125+
h.parallel_for( sycl::range<2>{order,order}, [=] (sycl::id<2> it) {
126+
#if USE_2D_INDEXING
127+
sycl::id<2> ij{it[0],it[1]};
128+
sycl::id<2> ji{it[1],it[0]};
129+
B[ij] += A[ji];
130+
A[ji] += (T)1;
131+
#else
132+
B[it[0] * order + it[1]] += A[it[1] * order + it[0]];
133+
A[it[1] * order + it[0]] += 1.0;
134+
#endif
135+
});
136+
});
137+
q.wait();
138+
}
139+
trans_time = prk::wtime() - trans_time;
140+
141+
// copy output back to host
142+
q.memcpy(&(h_b[0]), B, bytes).wait();
143+
144+
syclx::free(B, q);
145+
syclx::free(A, q);
146+
147+
//////////////////////////////////////////////////////////////////////
148+
/// Analyze and output results
149+
//////////////////////////////////////////////////////////////////////
150+
151+
const double addit = (iterations+1.) * (iterations/2.);
152+
double abserr(0);
153+
for (int j=0; j<order; j++) {
154+
for (int i=0; i<order; i++) {
155+
const size_t ij = (size_t)i*(size_t)order+(size_t)j;
156+
const size_t ji = (size_t)j*(size_t)order+(size_t)i;
157+
const double reference = static_cast<double>(ij)*(1.+iterations)+addit;
158+
abserr += prk::abs(h_b[ji] - reference);
159+
}
160+
}
161+
162+
syclx::free(h_b, q);
163+
syclx::free(h_a, q);
164+
165+
const auto epsilon = 1.0e-8;
166+
if (abserr < epsilon) {
167+
std::cout << "Solution validates" << std::endl;
168+
auto avgtime = trans_time/iterations;
169+
auto bytes = (size_t)order * (size_t)order * sizeof(double);
170+
std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime
171+
<< " Avg time (s): " << avgtime << std::endl;
172+
} else {
173+
std::cout << "ERROR: Aggregate squared error " << abserr
174+
<< " exceeds threshold " << epsilon << std::endl;
175+
return 1;
176+
}
177+
178+
return 0;
179+
}
180+
181+

0 commit comments

Comments
 (0)