Skip to content

Commit f1f5f89

Browse files
authored
Merge pull request #680 from ROCm-Developer-Tools/rocm-rel-6.0-staging
GPU hip index to smi index mapping
2 parents cead0c4 + 407b386 commit f1f5f89

File tree

3 files changed

+60
-17
lines changed

3 files changed

+60
-17
lines changed

iet.so/src/action.cpp

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -443,15 +443,8 @@ bool iet_action::do_edp_test(map<int, uint16_t> iet_gpus_device_index) {
443443
if (property_wait != 0) // delay iet execution
444444
sleep(property_wait);
445445

446-
447-
uint32_t smi_num_devices;
448-
rsmi_status_t err = rsmi_num_monitor_devices(&smi_num_devices);
449-
if(smi_num_devices != hip_num_gpu_devices)
450-
gpu_masking = true;
451-
if(gpu_masking){ // this is the case when using HIP_VISIBLE_DEVICES variable to modify GPU visibility
452-
// smi output wont be affected by the flag and hence indices should be appropriately used.
453-
hip_to_smi_indices();
454-
}
446+
// map hip indexes to smi indexes
447+
hip_to_smi_indices();
455448

456449
IETWorker::set_use_json(bjson);
457450
for (it = iet_gpus_device_index.begin(); it != iet_gpus_device_index.end(); ++it) {

include/gpu_util.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ extern void gpu_get_all_node_id(std::vector<uint16_t>* pgpus_node_id);
4141
extern void gpu_get_all_domain_id(std::vector<uint16_t>* pgpus_domain_id,
4242
std::map<std::pair<uint16_t, uint16_t> , uint16_t>& pgpus_dom_loc_map);
4343
extern bool gpu_check_if_mcm_die (int idx);
44+
extern int gpu_hip_to_smi_index(int hip_index, uint32_t* smi_index);
4445

4546
namespace rvs {
4647

src/gpu_util.cpp

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@
3434

3535
#include "rocm_smi/rocm_smi.h"
3636
#include "include/gpu_util.h"
37+
#define __HIP_PLATFORM_HCC__
38+
#include "hip/hip_runtime.h"
39+
#include "hip/hip_runtime_api.h"
40+
3741
std::vector<uint16_t> rvs::gpulist::location_id;
3842
std::vector<uint16_t> rvs::gpulist::gpu_id;
3943
std::vector<uint16_t> rvs::gpulist::device_id;
@@ -44,10 +48,6 @@ using std::vector;
4448
using std::string;
4549
using std::ifstream;
4650

47-
/* No of GPU devices with MCM GPU */
48-
#define MAX_NUM_MCM_GPU 4
49-
50-
5151
int gpu_num_subdirs(const char* dirpath, const char* prefix) {
5252
int count = 0;
5353
DIR *dirp;
@@ -269,19 +269,68 @@ void gpu_get_all_domain_id(std::vector<uint16_t>* pgpus_domain_id,
269269
* @param idx GPU HIP device index
270270
* @return true if GPU is die in MCM GPU, false if GPU is single die GPU.
271271
**/
272-
273-
274272
bool gpu_check_if_mcm_die (int idx) {
275273
rsmi_status_t ret;
276274
uint64_t val =0 , time_stamp;
277275
float cntr_resolution;
276+
uint32_t smi_index = 0;
277+
278+
// Translate the HIP index into the ROCm SMI index first; a non-zero
// return means no mapping was found, in which case this returns false.
if (gpu_hip_to_smi_index(idx, &smi_index)) {
279+
return false;
280+
}
281+
278282
// in case of secondary die, energy accumulator will return zero.
279-
ret = rsmi_dev_energy_count_get(idx, &val, &cntr_resolution, &time_stamp);
283+
ret = rsmi_dev_energy_count_get(smi_index, &val, &cntr_resolution, &time_stamp);
280284
// Only a successful query that reports a zero energy count yields true;
// a failed query or a non-zero count returns false.
if (!((RSMI_STATUS_SUCCESS == ret) && val == 0))
281-
return false;
285+
return false;
282286
return true;
283287
}
284288

289+
/**
290+
* @brief Get GPU smi index from hip index.
291+
* @param hip_index GPU hip index
292+
* @param smi_index GPU smi index
293+
* @return 0 if successful, -1 otherwise
294+
**/
295+
int gpu_hip_to_smi_index(int hip_index, uint32_t* smi_index) {
296+
297+
int hip_num_gpu_devices = 0;
298+
uint32_t smi_num_devices = 0;
299+
uint64_t val_ui64 = 0;
300+
std::map<uint64_t, int> smi_map;
301+
302+
// map this to smi as only these are visible
303+
hipGetDeviceCount(&hip_num_gpu_devices);
304+
if(hip_index >= hip_num_gpu_devices) {
305+
return -1;
306+
}
307+
308+
rsmi_status_t err = rsmi_num_monitor_devices(&smi_num_devices);
309+
if( err == RSMI_STATUS_SUCCESS){
310+
for(auto i = 0; i < smi_num_devices; ++i){
311+
err = rsmi_dev_pci_id_get(i, &val_ui64);
312+
smi_map.insert({val_ui64, i});
313+
}
314+
}
315+
else {
316+
return -1;
317+
}
318+
319+
// get GPU device properties
320+
hipDeviceProp_t props;
321+
hipGetDeviceProperties(&props, hip_index);
322+
323+
// compute device location_id (needed to match this device
324+
// with one of those found while querying the pci bus
325+
uint16_t hip_dev_location_id =
326+
((((uint16_t) (props.pciBusID)) << 8) | (((uint16_t)(props.pciDeviceID)) << 3));
327+
if(smi_map.find(hip_dev_location_id) != smi_map.end()) {
328+
*smi_index = smi_map[hip_dev_location_id];
329+
return 0;
330+
}
331+
return -1;
332+
}
333+
285334
/**
286335
* @brief Initialize gpulist helper class
287336
* @return 0 if successful, -1 otherwise

0 commit comments

Comments
 (0)