Skip to content

Commit 272aa38

Browse files
author
Jeff Hammond
committed
WIP
1 parent 38d68cf commit 272aa38

File tree

1 file changed

+57
-15
lines changed

1 file changed

+57
-15
lines changed

Cxx11/transpose-multigpu-dpcpp.cc

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,10 @@ int main(int argc, char * argv[])
6363

6464
int iterations;
6565
size_t order;
66+
int use_ngpu = 1;
6667
try {
6768
if (argc < 3) {
68-
throw "Usage: <# iterations> <matrix order>";
69+
throw "Usage: <# iterations> <matrix order> [<use_ngpu>]";
6970
}
7071

7172
iterations = std::atoi(argv[1]);
@@ -79,6 +80,15 @@ int main(int argc, char * argv[])
7980
} else if (order > prk::get_max_matrix_size()) {
8081
throw "ERROR: matrix dimension too large - overflow risk";
8182
}
83+
84+
if (argc > 3) {
85+
use_ngpu = std::atoi(argv[3]);
86+
}
87+
88+
if (order % use_ngpu) {
89+
std::cerr << "order = " << order << ", device count = " << use_ngpu << std::endl;
90+
throw "ERROR: matrix order should be divisible by device count!";
91+
}
8292
}
8393
catch (const char * e) {
8494
std::cout << e << std::endl;
@@ -87,34 +97,66 @@ int main(int argc, char * argv[])
8797

8898
std::cout << "Number of iterations = " << iterations << std::endl;
8999
std::cout << "Matrix order = " << order << std::endl;
100+
std::cout << "Number of GPUs to use = " << use_ngpu << std::endl;
101+
102+
std::vector<sycl::queue> qs;
103+
104+
auto platforms = sycl::platform::get_platforms();
105+
for (auto & p : platforms) {
106+
auto pname = p.get_info<sycl::info::platform::name>();
107+
std::cout << "*Platform: " << pname << std::endl;
108+
if ( pname.find("Level-Zero") != std::string::npos) {
109+
std::cout << "*Level Zero GPU skipped" << std::endl;
110+
break;
111+
}
112+
if ( pname.find("Intel") == std::string::npos) {
113+
std::cout << "*non-Intel skipped" << std::endl;
114+
break;
115+
}
116+
auto devices = p.get_devices();
117+
for (auto & d : devices ) {
118+
std::cout << "**Device: " << d.get_info<sycl::info::device::name>() << std::endl;
119+
if ( d.is_gpu() || d.is_cpu() ) {
120+
std::cout << "**Device is CPU or GPU - adding to vector of queues" << std::endl;
121+
qs.push_back(sycl::queue(d));
122+
}
123+
}
124+
}
125+
126+
int haz_ngpu = qs.size();
127+
std::cout << "Number of CPUs and GPUs found = " << haz_ngpu << std::endl;
90128

91-
sycl::queue q(sycl::default_selector{});
92-
prk::SYCL::print_device_platform(q);
129+
if (use_ngpu > haz_ngpu) {
130+
std::cout << "You cannot use more GPUs (" << use_ngpu << ") than you have (" << haz_ngpu << ")" << std::endl;
131+
}
132+
133+
int ngpus = use_ngpu;
93134

94135
//////////////////////////////////////////////////////////////////////
95136
// Allocate space for the input and transpose matrix
96137
//////////////////////////////////////////////////////////////////////
97138

98-
const size_t nelems = (size_t)order * (size_t)order;
99-
const size_t bytes = nelems * sizeof(double);
100-
double * h_a = syclx::malloc_host<double>( nelems, q);
101-
double * h_b = syclx::malloc_host<double>( nelems, q);
139+
double trans_time(0);
140+
141+
auto h_a = prk::vector<double>(order * order);
142+
auto h_b = prk::vector<double>(order * order);
102143

103144
// fill A with the sequence 0 to order^2-1
104-
for (int j=0; j<order; j++) {
105-
for (int i=0; i<order; i++) {
145+
for (size_t j=0; j<order; j++) {
146+
for (size_t i=0; i<order; i++) {
106147
h_a[j*order+i] = static_cast<double>(order*j+i);
107148
h_b[j*order+i] = static_cast<double>(0);
108149
}
109150
}
110151

111-
// copy input from host to device
112-
double * A = syclx::malloc_device<double>( nelems, q);
113-
double * B = syclx::malloc_device<double>( nelems, q);
114-
q.memcpy(A, &(h_a[0]), bytes).wait();
115-
q.memcpy(B, &(h_b[0]), bytes).wait();
152+
const size_t bytes = order * order * sizeof(double);
116153

117-
auto trans_time = 0.0;
154+
// copy input from host to device
155+
double * A = syclx::malloc_device<double>(order * order, q);
156+
double * B = syclx::malloc_device<double>(order * order, q);
157+
q.memcpy(A, &(h_a[0]), bytes);
158+
q.memcpy(B, &(h_b[0]), bytes);
159+
q.wait();
118160

119161
for (int iter = 0; iter<=iterations; iter++) {
120162

0 commit comments

Comments
 (0)