@@ -63,9 +63,10 @@ int main(int argc, char * argv[])
63
63
64
64
int iterations;
65
65
size_t order;
66
+ int use_ngpu = 1 ;
66
67
try {
67
68
if (argc < 3 ) {
68
- throw " Usage: <# iterations> <matrix order>" ;
69
+ throw " Usage: <# iterations> <matrix order> [<use_ngpu>] " ;
69
70
}
70
71
71
72
iterations = std::atoi (argv[1 ]);
@@ -79,6 +80,15 @@ int main(int argc, char * argv[])
79
80
} else if (order > prk::get_max_matrix_size ()) {
80
81
throw " ERROR: matrix dimension too large - overflow risk" ;
81
82
}
83
+
84
+ if (argc > 3 ) {
85
+ use_ngpu = std::atoi (argv[3 ]);
86
+ }
87
+
88
+ if (order % use_ngpu) {
89
+ std::cerr << " order = " << order << " , device count = " << use_ngpu << std::endl;
90
+ throw " ERROR: matrix order should be divisible by device count!" ;
91
+ }
82
92
}
83
93
catch (const char * e) {
84
94
std::cout << e << std::endl;
@@ -87,34 +97,66 @@ int main(int argc, char * argv[])
87
97
88
98
std::cout << " Number of iterations = " << iterations << std::endl;
89
99
std::cout << " Matrix order = " << order << std::endl;
100
+ std::cout << " Number of GPUs to use = " << use_ngpu << std::endl;
101
+
102
+ std::vector<sycl::queue> qs;
103
+
104
+ auto platforms = sycl::platform::get_platforms ();
105
+ for (auto & p : platforms) {
106
+ auto pname = p.get_info <sycl::info::platform::name>();
107
+ std::cout << " *Platform: " << pname << std::endl;
108
+ if ( pname.find (" Level-Zero" ) != std::string::npos) {
109
+ std::cout << " *Level Zero GPU skipped" << std::endl;
110
+ break ;
111
+ }
112
+ if ( pname.find (" Intel" ) == std::string::npos) {
113
+ std::cout << " *non-Intel skipped" << std::endl;
114
+ break ;
115
+ }
116
+ auto devices = p.get_devices ();
117
+ for (auto & d : devices ) {
118
+ std::cout << " **Device: " << d.get_info <sycl::info::device::name>() << std::endl;
119
+ if ( d.is_gpu () || d.is_cpu () ) {
120
+ std::cout << " **Device is CPU or GPU - adding to vector of queues" << std::endl;
121
+ qs.push_back (sycl::queue (d));
122
+ }
123
+ }
124
+ }
125
+
126
+ int haz_ngpu = qs.size ();
127
+ std::cout << " Number of CPUs and GPUs found = " << haz_ngpu << std::endl;
90
128
91
- sycl::queue q (sycl::default_selector{});
92
- prk::SYCL::print_device_platform (q);
129
+ if (use_ngpu > haz_ngpu) {
130
+ std::cout << " You cannot use more GPUs (" << use_ngpu << " ) than you have (" << haz_ngpu << " )" << std::endl;
131
+ }
132
+
133
+ int ngpus = use_ngpu;
93
134
94
135
// ////////////////////////////////////////////////////////////////////
95
136
// Allocate space for the input and transpose matrix
96
137
// ////////////////////////////////////////////////////////////////////
97
138
98
- const size_t nelems = ( size_t )order * ( size_t )order ;
99
- const size_t bytes = nelems * sizeof ( double );
100
- double * h_a = syclx::malloc_host <double >( nelems, q );
101
- double * h_b = syclx::malloc_host <double >( nelems, q );
139
+ double trans_time ( 0 ) ;
140
+
141
+ auto h_a = prk::vector <double >(order * order );
142
+ auto h_b = prk::vector <double >(order * order );
102
143
103
144
// fill A with the sequence 0 to order^2-1
104
- for (int j=0 ; j<order; j++) {
105
- for (int i=0 ; i<order; i++) {
145
+ for (size_t j=0 ; j<order; j++) {
146
+ for (size_t i=0 ; i<order; i++) {
106
147
h_a[j*order+i] = static_cast <double >(order*j+i);
107
148
h_b[j*order+i] = static_cast <double >(0 );
108
149
}
109
150
}
110
151
111
- // copy input from host to device
112
- double * A = syclx::malloc_device<double >( nelems, q);
113
- double * B = syclx::malloc_device<double >( nelems, q);
114
- q.memcpy (A, &(h_a[0 ]), bytes).wait ();
115
- q.memcpy (B, &(h_b[0 ]), bytes).wait ();
152
+ const size_t bytes = order * order * sizeof (double );
116
153
117
- auto trans_time = 0.0 ;
154
+ // copy input from host to device
155
+ double * A = syclx::malloc_device<double >(order * order, q);
156
+ double * B = syclx::malloc_device<double >(order * order, q);
157
+ q.memcpy (A, &(h_a[0 ]), bytes);
158
+ q.memcpy (B, &(h_b[0 ]), bytes);
159
+ q.wait ();
118
160
119
161
for (int iter = 0 ; iter<=iterations; iter++) {
120
162
0 commit comments