34
34
35
35
#include " rocm_smi/rocm_smi.h"
36
36
#include " include/gpu_util.h"
37
+ #define __HIP_PLATFORM_HCC__
38
+ #include " hip/hip_runtime.h"
39
+ #include " hip/hip_runtime_api.h"
40
+
37
41
std::vector<uint16_t > rvs::gpulist::location_id;
38
42
std::vector<uint16_t > rvs::gpulist::gpu_id;
39
43
std::vector<uint16_t > rvs::gpulist::device_id;
@@ -44,10 +48,6 @@ using std::vector;
44
48
using std::string;
45
49
using std::ifstream;
46
50
47
- /* No of GPU devices with MCM GPU */
48
- #define MAX_NUM_MCM_GPU 4
49
-
50
-
51
51
int gpu_num_subdirs (const char * dirpath, const char * prefix) {
52
52
int count = 0 ;
53
53
DIR *dirp;
@@ -269,19 +269,68 @@ void gpu_get_all_domain_id(std::vector<uint16_t>* pgpus_domain_id,
269
269
* @param device_id GPU Device ID
270
270
 * @return true if the GPU is a secondary die of an MCM GPU (its energy accumulator reads zero), false otherwise.
271
271
**/
272
-
273
-
274
272
bool gpu_check_if_mcm_die (int idx) {
275
273
rsmi_status_t ret;
276
274
uint64_t val =0 , time_stamp;
277
275
float cntr_resolution;
276
+ uint32_t smi_index = 0 ;
277
+
278
+ if (gpu_hip_to_smi_index (idx, &smi_index)) {
279
+ return false ;
280
+ }
281
+
278
282
// in case of secondary die, energy accumulator will return zero.
279
- ret = rsmi_dev_energy_count_get (idx , &val, &cntr_resolution, &time_stamp);
283
+ ret = rsmi_dev_energy_count_get (smi_index , &val, &cntr_resolution, &time_stamp);
280
284
if (!((RSMI_STATUS_SUCCESS == ret) && val == 0 ))
281
- return false ;
285
+ return false ;
282
286
return true ;
283
287
}
284
288
289
+ /* *
290
+ * @brief Get GPU smi index from hip index.
291
+ * @param hip_index GPU hip index
292
+ * @param smi_index GPU smi index
293
+ * @return 0 if successful, -1 otherwise
294
+ **/
295
+ int gpu_hip_to_smi_index (int hip_index, uint32_t * smi_index) {
296
+
297
+ int hip_num_gpu_devices = 0 ;
298
+ uint32_t smi_num_devices = 0 ;
299
+ uint64_t val_ui64 = 0 ;
300
+ std::map<uint64_t , int > smi_map;
301
+
302
+ // map this to smi as only these are visible
303
+ hipGetDeviceCount (&hip_num_gpu_devices);
304
+ if (hip_index >= hip_num_gpu_devices) {
305
+ return -1 ;
306
+ }
307
+
308
+ rsmi_status_t err = rsmi_num_monitor_devices (&smi_num_devices);
309
+ if ( err == RSMI_STATUS_SUCCESS){
310
+ for (auto i = 0 ; i < smi_num_devices; ++i){
311
+ err = rsmi_dev_pci_id_get (i, &val_ui64);
312
+ smi_map.insert ({val_ui64, i});
313
+ }
314
+ }
315
+ else {
316
+ return -1 ;
317
+ }
318
+
319
+ // get GPU device properties
320
+ hipDeviceProp_t props;
321
+ hipGetDeviceProperties (&props, hip_index);
322
+
323
+ // compute device location_id (needed to match this device
324
+ // with one of those found while querying the pci bus
325
+ uint16_t hip_dev_location_id =
326
+ ((((uint16_t ) (props.pciBusID )) << 8 ) | (((uint16_t )(props.pciDeviceID )) << 3 ));
327
+ if (smi_map.find (hip_dev_location_id) != smi_map.end ()) {
328
+ *smi_index = smi_map[hip_dev_location_id];
329
+ return 0 ;
330
+ }
331
+ return -1 ;
332
+ }
333
+
285
334
/* *
286
335
* @brief Initialize gpulist helper class
287
336
* @return 0 if successful, -1 otherwise
0 commit comments