Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add class for AMD ROCm devices #19

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
109 changes: 108 additions & 1 deletion machinestate.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@
VEOS_BASE = "/opt/nec/ve/bin"
# The NvidiaInfo class requires this path if nvidia-smi is not in $PATH
NVIDIA_PATH = "/opt/nvidia/bin"
#The RocmInfo class requires this path to call rocm-smi if not in $PATH
ROCM_PATH = "/opt/rocm/bin"
# The OpenCLInfo class requires this path if clinfo is not in $PATH
CLINFO_PATH = "/usr/bin"

Expand Down Expand Up @@ -1227,7 +1229,8 @@ def __init__(self,
nvidia_path=NVIDIA_PATH,
modulecmd=MODULECMD_PATH,
vecmd_path=VEOS_BASE,
clinfo_path=CLINFO_PATH):
clinfo_path=CLINFO_PATH,
rocm_path=ROCM_PATH):
super(MachineState, self).__init__(extended=extended, anonymous=anonymous)
self.loglevel = loglevel
self.dmifile = dmifile
Expand All @@ -1238,6 +1241,7 @@ def __init__(self,
self.modulecmd = modulecmd
self.vecmd_path = vecmd_path
self.clinfo_path = clinfo_path
self.rocm_path = rocm_path
ostype = get_ostype()
if ostype == "Linux":
self.classlist = [
Expand Down Expand Up @@ -1288,6 +1292,8 @@ def __init__(self,
self.classargs.append({"executable" : executable})
self.classlist.append(OpenCLInfo)
self.classargs.append({"clinfo_path" : clinfo_path})
self.classlist.append(RocmInfo)
self.classargs.append({"rocm_path" : rocm_path})
if likwid_enable:
if likwid_path is None or not pexists(likwid_path):
path = which("likwid-topology")
Expand Down Expand Up @@ -3196,6 +3202,106 @@ def __init__(self, vecmd_path="", extended=False, anonymous=False):
self.subargs = {"vecmd_path" : vecmd_path}

################################################################################
# Infos about AMD ROCm devices
################################################################################
class RocmInfoClass(InfoGroup):
'''Class for a single ROCm devices'''
def __init__(self, device, extended=False, anonymous=False, rocm_path=ROCM_PATH):
super(RocmInfoClass, self).__init__(name="Card{}".format(device),
extended=extended,
anonymous=anonymous)
self.device = device
self.rocm_path = rocm_path
rocmcmd = pjoin(self.rocm_path, "rocm-smi")
if not os.access(rocmcmd, os.X_OK) and which("rocm-smi"):
rocmcmd = which("rocm-smi")
rocm_args = "-d {} -a --json | python3 -m json.tool".format(self.device)
self.addc("Vendor", rocmcmd, rocm_args, r'^\s+"Card vendor\":\s+\"(.*)\"[,]?$', str)
self.addc("VBiosVersion", rocmcmd, rocm_args, r'^\s+"VBIOS version\":\s+\"(.*)\"[,]?$', str)
self.addc("SKU", rocmcmd, rocm_args, r'^\s+"Card SKU\":\s+\"(.*)\"[,]?$', str)
self.addc("Model", rocmcmd, rocm_args, r'^\s+"Card model\":\s+\"(.*)\"[,]?$', str)
self.addc("Series", rocmcmd, rocm_args, r'^\s+"Card series\":\s+\"(.*)\"[,]?$', str)
self.addc("ID", rocmcmd, rocm_args, r'^\s+"GPU ID\":\s+\"(.*)\"[,]?$', str)
self.addc("MemoryVendor", rocmcmd, rocm_args, r'^\s+"GPU memory vendor\":\s+\"(.*)\"[,]?$', str)
self.addc("MaxPackagePowerWatt", rocmcmd, rocm_args, r'^\s+"Max Graphics Package Power \(W\)\":\s+\"(.*)\"[,]?$', float)
self.addc("PerformanceLevel", rocmcmd, rocm_args, r'^\s+"Performance Level\":\s+\"(.*)\"[,]?$', str)
self.addc("SerialNumber", rocmcmd, rocm_args, r'^\s+"Serial Number\":\s+\"(.*)\"[,]?$', str)
self.addc("VoltageMilliVolt", rocmcmd, rocm_args, r'^\s+"Voltage \(mV\)\":\s+\"(.*)\"[,]?$', int)
self.addc("TemperatureEdge", rocmcmd, rocm_args, r'^\s+"Temperature \(Sensor edge\) \(C\)\":\s+\"(.*)\"[,]?$', int)
self.addc("TemperatureJunction", rocmcmd, rocm_args, r'^\s+"Temperature \(Sensor junction\) \(C\)\":\s+\"(.*)\"[,]?$', int)
self.addc("TemperatureMemory", rocmcmd, rocm_args, r'^\s+"Temperature \(Sensor memory\) \(C\)\":\s+\"(.*)\"[,]?$', int)
self.addc("FClk", rocmcmd, rocm_args, r'^\s+"fclk clock speed[:]?\":\s+\"\((.*)\)\"[,]?$', tohertz)
self.addc("MClk", rocmcmd, rocm_args, r'^\s+"mclk clock speed[:]?\":\s+\"\((.*)\)\"[,]?$', tohertz)
self.addc("SClk", rocmcmd, rocm_args, r'^\s+"sclk clock speed[:]?\":\s+\"\((.*)\)\"[,]?$', tohertz)
self.addc("SocClk", rocmcmd, rocm_args, r'^\s+"socclk clock speed[:]?\":\s+\"\((.*)\)\"[,]?$', tohertz)
self.addc("PcieClk", rocmcmd, rocm_args, r'^\s+"pcie clock level\":\s+\"(.*)\"[,]?$', str)
self.required("Vendor", "VBiosVersion", "SKU", "Model", "Series", "PerformanceLevel", "MemoryVendor", "PcieClk")
if self.extended:
self.addc("DmcuFirmwareVersion", rocmcmd, rocm_args, r'^\s+"DMCU firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("CeFirmwareVersion", rocmcmd, rocm_args, r'^\s+"CE firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("AsdFirmwareVersion", rocmcmd, rocm_args, r'^\s+"ASD firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("McFirmwareVersion", rocmcmd, rocm_args, r'^\s+"MC firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("MeFirmwareVersion", rocmcmd, rocm_args, r'^\s+"ME firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("MecFirmwareVersion", rocmcmd, rocm_args, r'^\s+"MEC firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("Mec2FirmwareVersion", rocmcmd, rocm_args, r'^\s+"MEC2 firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("PfpFirmwareVersion", rocmcmd, rocm_args, r'^\s+"PFP firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("RlcFirmwareVersion", rocmcmd, rocm_args, r'^\s+"RLC firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("RlcSrlcFirmwareVersion", rocmcmd, rocm_args, r'^\s+"RLC SRLC firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("RlcSrlgFirmwareVersion", rocmcmd, rocm_args, r'^\s+"RLC SRLG firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("RlcSrlsFirmwareVersion", rocmcmd, rocm_args, r'^\s+"RLC SRLS firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("SdmaFirmwareVersion", rocmcmd, rocm_args, r'^\s+"SDMA firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("Sdma2FirmwareVersion", rocmcmd, rocm_args, r'^\s+"SDMA2 firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("SmcFirmwareVersion", rocmcmd, rocm_args, r'^\s+"SMC firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("SosFirmwareVersion", rocmcmd, rocm_args, r'^\s+"SOS firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("TaRasFirmwareVersion", rocmcmd, rocm_args, r'^\s+"TA RAS firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("TaXgmiFirmwareVersion", rocmcmd, rocm_args, r'^\s+"TA XGMI firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("UvdFirmwareVersion", rocmcmd, rocm_args, r'^\s+"UVD firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("VceFirmwareVersion", rocmcmd, rocm_args, r'^\s+"VCE firmware version\":\s+\"(.*)\"[,]?$', str)
self.addc("VcnFirmwareVersion", rocmcmd, rocm_args, r'^\s+"VCN firmware version\":\s+\"(.*)\"[,]?$', str)

class RocmInfo(ListInfoGroup):
'''Class to spawn subclasses for ROCm devices'''
def __init__(self, extended=False, anonymous=False, rocm_path=ROCM_PATH):
super(RocmInfo, self).__init__(name="RocmInfo",
extended=extended,
anonymous=anonymous)
self.rocm_path = rocm_path
rocmsmi = pjoin(self.rocm_path, "rocm-smi")
if not os.access(rocmsmi, os.X_OK) and which("rocm-smi"):
rocmsmi = which("rocm-smi")
rocminfo = pjoin(self.rocm_path, "rocminfo")
if not os.access(rocminfo, os.X_OK) and which("rocminfo"):
rocminfo = which("rocminfo")
hipconfig = pjoin(self.rocm_path, "hipconfig")
if not os.access(hipconfig, os.X_OK) and which("hipconfig"):
hipconfig = which("hipconfig")
data = process_cmd((rocmsmi))
if data is not None:
for l in data.split("\n"):
m = re.match(r"^(\d+)\s+[\d\.]+c.*", l)
if m:
dev = int(m.group(1))
if dev not in devicelist:
self.userlist.append(dev)
self.subclass = RocmInfoClass
self.subargs = {"rocm_path" : self.rocm_path}
self.addc("DriverVersion", rocmsmi, "--showdriverversion", r"Driver version:\s+([\d\.]+)", str)
self.addc("RocmSmiVersion", rocmsmi, "--help", r"ROCM-SMI version: (\d+\.\d+\.\d+)", str)
self.addc("RocmKernelVersion", rocmsmi, "--help", r"Kernel version: (\d+\.\d+\.\d+)", str)
self.addc("RuntimeVersion", rocminfo, regex=r"Runtime Version:\s+(\d+\.\d+\.\d+)", parse=str)
self.addc("HipVersion", hipconfig, "--version")
self.addc("HipRuntime", hipconfig, "--runtime")
self.addc("HipCompiler", hipconfig, "--compiler")
self.addc("HipPlatform", hipconfig, "--platform")
self.addc("HipC++Options", hipconfig, "--cpp_config")
self.required("DriverVersion")
self.required("RuntimeVersion")
self.required("HipVersion")
self.required("HipRuntime")
self.required("HipCompiler")
self.required("HipPlatform")

################################################################################
# Infos from clinfo (OpenCL devices and runtime)
################################################################################
class OpenCLInfoPlatformDeviceClass(InfoGroup):
Expand Down Expand Up @@ -3441,6 +3547,7 @@ def read_config(config={"extended" : False, "anonymous" : False, "executable" :
"modulecmd" : MODULECMD_PATH,
"vecmd_path" : VEOS_BASE,
"nvidia_path" : NVIDIA_PATH,
"rocm_path" : ROCM_PATH,
"loglevel" : DEFAULT_LOGLEVEL,
"clinfo_path" : CLINFO_PATH,
"anonymous" : False,
Expand Down