From 93ee74678dc28f344e9119328cf11fa5000a637d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 03:33:46 +0000 Subject: [PATCH 1/7] Initial plan From 77d4871e362156da233088c5c14dc9d1b7bb0a4b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 03:43:51 +0000 Subject: [PATCH 2/7] Add comprehensive docstrings to iris.hip module and create documentation page Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- docs/conf.py | 1 - docs/reference/api-hip-module.md | 97 +++++++++++ docs/reference/api-reference.md | 2 + docs/sphinx/_toc.yml | 1 + iris/hip.py | 266 +++++++++++++++++++++++++++++++ 5 files changed, 366 insertions(+), 1 deletion(-) create mode 100644 docs/reference/api-hip-module.md diff --git a/docs/conf.py b/docs/conf.py index 35fa7a00..7453ac9b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -95,7 +95,6 @@ "triton.language", "numpy", "iris._distributed_helpers", - "iris.hip", ] # Napoleon settings for Google/NumPy docstring parsing diff --git a/docs/reference/api-hip-module.md b/docs/reference/api-hip-module.md new file mode 100644 index 00000000..960418e4 --- /dev/null +++ b/docs/reference/api-hip-module.md @@ -0,0 +1,97 @@ +# HIP Module API + +Low-level HIP runtime integration for AMD GPU device management and memory operations. + +```{eval-rst} +.. automodule:: iris.hip + :members: + :undoc-members: + :show-inheritance: +``` + +## Device Management + +### count_devices +```{eval-rst} +.. autofunction:: iris.hip.count_devices +``` + +### set_device +```{eval-rst} +.. autofunction:: iris.hip.set_device +``` + +### get_device_id +```{eval-rst} +.. autofunction:: iris.hip.get_device_id +``` + +## Device Attributes + +### get_cu_count +```{eval-rst} +.. autofunction:: iris.hip.get_cu_count +``` + +### get_arch_string +```{eval-rst} +.. autofunction:: iris.hip.get_arch_string +``` + +### get_num_xcc +```{eval-rst} +.. autofunction:: iris.hip.get_num_xcc +``` + +### get_wall_clock_rate +```{eval-rst} +.. autofunction:: iris.hip.get_wall_clock_rate +``` + +### get_rocm_version +```{eval-rst} +.. autofunction:: iris.hip.get_rocm_version +``` + +## Memory Management + +### hip_malloc +```{eval-rst} +.. autofunction:: iris.hip.hip_malloc +``` + +### malloc_fine_grained +```{eval-rst} +.. autofunction:: iris.hip.malloc_fine_grained +``` + +### hip_free +```{eval-rst} +.. autofunction:: iris.hip.hip_free +``` + +## IPC Memory Operations + +### get_ipc_handle +```{eval-rst} +.. autofunction:: iris.hip.get_ipc_handle +``` + +### open_ipc_handle +```{eval-rst} +.. autofunction:: iris.hip.open_ipc_handle +``` + +### hipIpcMemHandle_t +```{eval-rst} +.. autoclass:: iris.hip.hipIpcMemHandle_t + :members: +``` + +## Error Handling + +### hip_try +```{eval-rst} +.. autofunction:: iris.hip.hip_try +``` + diff --git a/docs/reference/api-reference.md b/docs/reference/api-reference.md index 5aab4e30..6460a44e 100644 --- a/docs/reference/api-reference.md +++ b/docs/reference/api-reference.md @@ -5,10 +5,12 @@ Explore Iris APIs. 
The reference is broken down into focused sections to mirror - The `Iris` class itself (constructor and helper utilities) - Tensor-like creation methods on the `Iris` context - Triton device-side functions for remote memory ops and atomics +- HIP runtime integration for low-level device management Use the links below to navigate: - [Iris Class (ctor & helpers)](api-iris-class.md) - [Tensor Creation](api-tensor-creation.md) - [Triton Device Functions](api-device-functions.md) +- [HIP Module](api-hip-module.md) diff --git a/docs/sphinx/_toc.yml b/docs/sphinx/_toc.yml index 5b4cdead..a3761d77 100644 --- a/docs/sphinx/_toc.yml +++ b/docs/sphinx/_toc.yml @@ -15,3 +15,4 @@ subtrees: - file: reference/api-iris-class.md - file: reference/api-tensor-creation.md - file: reference/api-device-functions.md + - file: reference/api-hip-module.md diff --git a/iris/hip.py b/iris/hip.py index 2a03c397..963dc01d 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -1,6 +1,28 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +""" +HIP Runtime Integration Module + +This module provides low-level HIP runtime integration for AMD GPUs, +offering Python bindings to essential HIP runtime functions through ctypes. +It enables device management, memory operations, and inter-process communication +for multi-GPU programming. + +Key Features: +- Device enumeration and management +- IPC (Inter-Process Communication) memory handles +- Device attribute queries (compute units, architecture, XCC count) +- Fine-grained and coarse-grained memory allocation +- ROCm version detection + +Example: + >>> import iris.hip as hip + >>> num_devices = hip.count_devices() + >>> hip.set_device(0) + >>> cu_count = hip.get_cu_count() +""" + import ctypes import numpy as np import sys @@ -11,6 +33,19 @@ def hip_try(err): + """ + Check HIP error codes and raise RuntimeError if an error occurred. + + Args: + err (int): HIP error code returned from a HIP runtime function. + + Raises: + RuntimeError: If err is non-zero, with a descriptive error message. + + Example: + >>> hip_try(0) # No error, returns silently + >>> hip_try(1) # Raises RuntimeError with HIP error message + """ if err != 0: hip_runtime.hipGetErrorString.restype = ctypes.c_char_p error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") @@ -18,10 +53,50 @@ def hip_try(err): class hipIpcMemHandle_t(ctypes.Structure): + """ + HIP IPC (Inter-Process Communication) memory handle structure. + + This structure represents an opaque handle used for sharing memory + between processes on different GPUs. The handle contains 64 bytes + of reserved data that uniquely identifies the shared memory region. + + Attributes: + reserved (ctypes.c_char * 64): Reserved bytes containing the handle data. + + Example: + >>> handle = hipIpcMemHandle_t() + >>> # Use with get_ipc_handle and open_ipc_handle + """ + _fields_ = [("reserved", ctypes.c_char * 64)] def open_ipc_handle(ipc_handle_data, rank): + """ + Open an IPC memory handle to access shared memory from another process. + + This function takes an IPC memory handle (obtained via get_ipc_handle) and + opens it to allow the current process to access the shared memory region. + The memory is opened with lazy peer access enabled. + + Args: + ipc_handle_data (numpy.ndarray): A 64-element uint8 numpy array containing + the IPC handle data. + rank (int): The rank ID of the process opening the handle (used for logging/debugging). 
+ + Returns: + int: The pointer value (as Python int) to the opened shared memory. + + Raises: + ValueError: If ipc_handle_data is not a 64-element uint8 numpy array. + TypeError: If ipc_handle_data is not a numpy.ndarray. + RuntimeError: If the HIP runtime call fails. + + Example: + >>> # On process with rank 1, get the handle from process 0 + >>> ipc_data = all_ipc_handles[0] # From distributed communication + >>> ptr = open_ipc_handle(ipc_data, rank=1) + """ ptr = ctypes.c_void_p() hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) hip_runtime.hipIpcOpenMemHandle.argtypes = [ @@ -55,28 +130,104 @@ def open_ipc_handle(ipc_handle_data, rank): def get_ipc_handle(ptr, rank): + """ + Get an IPC memory handle for a memory pointer to share with other processes. + + This function creates an IPC handle that can be shared with other processes + to allow them to access the memory pointed to by ptr. + + Args: + ptr (ctypes.c_void_p): Pointer to the memory region to share. + rank (int): The rank ID of the process creating the handle (used for logging/debugging). + + Returns: + hipIpcMemHandle_t: An IPC memory handle that can be shared with other processes. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> import ctypes + >>> heap_ptr = ctypes.c_void_p(tensor.data_ptr()) + >>> handle = get_ipc_handle(heap_ptr, rank=0) + """ ipc_handle = hipIpcMemHandle_t() hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) return ipc_handle def count_devices(): + """ + Get the number of available HIP devices (GPUs). + + Returns: + int: The number of HIP-capable devices available on the system. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> num_gpus = count_devices() + >>> print(f"Found {num_gpus} GPU(s)") + """ device_count = ctypes.c_int() hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) return device_count.value def set_device(gpu_id): + """ + Set the current HIP device for subsequent operations. + + Args: + gpu_id (int): The device ID to set as the current device (0-indexed). + + Raises: + RuntimeError: If the HIP runtime call fails or the device ID is invalid. + + Example: + >>> set_device(0) # Use GPU 0 + >>> set_device(1) # Switch to GPU 1 + """ hip_try(hip_runtime.hipSetDevice(gpu_id)) def get_device_id(): + """ + Get the currently active HIP device ID. + + Returns: + int: The ID of the currently active HIP device. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> current_device = get_device_id() + >>> print(f"Using GPU {current_device}") + """ device_id = ctypes.c_int() hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) return device_id.value def get_cu_count(device_id=None): + """ + Get the number of compute units (CUs) for a HIP device. + + Args: + device_id (int, optional): The device ID to query. If None, uses the current device. + + Returns: + int: The number of compute units on the specified device. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> cu_count = get_cu_count() # Current device + >>> cu_count_gpu1 = get_cu_count(device_id=1) # Specific device + """ if device_id is None: device_id = get_device_id() @@ -89,6 +240,20 @@ def get_cu_count(device_id=None): def get_rocm_version(): + """ + Get the installed ROCm version. + + Returns: + tuple: A tuple of (major, minor) version numbers as integers. + + Raises: + FileNotFoundError: If the ROCm version file is not found. + IndexError: If the version file format is unexpected. 
+ + Example: + >>> major, minor = get_rocm_version() + >>> print(f"ROCm version: {major}.{minor}") + """ major, minor = -1, -1 with open("/opt/rocm/.info/version", "r") as version_file: version = version_file.readline().strip() @@ -98,6 +263,22 @@ def get_rocm_version(): def get_wall_clock_rate(device_id): + """ + Get the wall clock rate (GPU clock frequency) for a HIP device. + + Args: + device_id (int): The device ID to query. + + Returns: + int: The wall clock rate in kHz. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> clock_rate = get_wall_clock_rate(0) + >>> print(f"GPU clock rate: {clock_rate} kHz") + """ hipDeviceAttributeWallClockRate = 10017 wall_clock_rate = ctypes.c_int() status = hip_runtime.hipDeviceGetAttribute( @@ -108,6 +289,19 @@ def get_wall_clock_rate(device_id): def get_arch_string(device_id=None): + """ + Get the GPU architecture string for a HIP device. + + Args: + device_id (int, optional): The device ID to query. If None, uses the current device. + + Returns: + str: The architecture name (e.g., "gfx90a", "gfx942"). + + Example: + >>> arch = get_arch_string() + >>> print(f"GPU architecture: {arch}") # e.g., "gfx942" + """ if device_id is None: device_id = get_device_id() arch_full = torch.cuda.get_device_properties(device_id).gcnArchName @@ -116,6 +310,25 @@ def get_arch_string(device_id=None): def get_num_xcc(device_id=None): + """ + Get the number of XCCs (Compute Dies) for a HIP device. + + XCC (eXtended Compute Complex) refers to the compute dies in MI300 series GPUs. + For ROCm versions before 7.0, returns a default value of 8. + + Args: + device_id (int, optional): The device ID to query. If None, uses the current device. + + Returns: + int: The number of XCCs on the device. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> xcc_count = get_num_xcc() + >>> print(f"Number of XCCs: {xcc_count}") + """ if device_id is None: device_id = get_device_id() rocm_major, _ = get_rocm_version() @@ -128,6 +341,26 @@ def get_num_xcc(device_id=None): def malloc_fine_grained(size): + """ + Allocate fine-grained GPU memory accessible by both CPU and GPU. + + Fine-grained memory provides coherent access from both CPU and GPU, + making it suitable for scenarios requiring frequent CPU-GPU data exchange. + + Args: + size (int): Size of memory to allocate in bytes. + + Returns: + ctypes.c_void_p: Pointer to the allocated memory. + + Raises: + RuntimeError: If the HIP runtime call fails or allocation fails. + + Example: + >>> ptr = malloc_fine_grained(1024 * 1024) # Allocate 1 MB + >>> # Use the memory... + >>> hip_free(ptr) + """ hipDeviceMallocFinegrained = 0x1 ptr = ctypes.c_void_p() hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) @@ -135,10 +368,43 @@ def malloc_fine_grained(size): def hip_malloc(size): + """ + Allocate standard (coarse-grained) GPU memory. + + This allocates device memory that is optimized for GPU access but + not directly accessible from the CPU without explicit transfers. + + Args: + size (int): Size of memory to allocate in bytes. + + Returns: + ctypes.c_void_p: Pointer to the allocated memory. + + Raises: + RuntimeError: If the HIP runtime call fails or allocation fails. + + Example: + >>> ptr = hip_malloc(2 * 1024 * 1024) # Allocate 2 MB + >>> # Use the memory... 
+ >>> hip_free(ptr) + """ ptr = ctypes.c_void_p() hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) return ptr def hip_free(ptr): + """ + Free GPU memory allocated by hip_malloc or malloc_fine_grained. + + Args: + ptr (ctypes.c_void_p): Pointer to the memory to free. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> ptr = hip_malloc(1024) + >>> hip_free(ptr) + """ hip_try(hip_runtime.hipFree(ptr)) From 449f3006b2b905a87969e5da32e82f9eee2aaa24 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 04:13:59 +0000 Subject: [PATCH 3/7] Remove unused hip module functions (get_arch_string, hip_malloc, malloc_fine_grained, hip_free) Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- docs/reference/api-hip-module.md | 22 -------- iris/hip.py | 91 -------------------------------- 2 files changed, 113 deletions(-) diff --git a/docs/reference/api-hip-module.md b/docs/reference/api-hip-module.md index 960418e4..2c07bd2c 100644 --- a/docs/reference/api-hip-module.md +++ b/docs/reference/api-hip-module.md @@ -33,11 +33,6 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.get_cu_count ``` -### get_arch_string -```{eval-rst} -.. autofunction:: iris.hip.get_arch_string -``` - ### get_num_xcc ```{eval-rst} .. autofunction:: iris.hip.get_num_xcc @@ -53,23 +48,6 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.get_rocm_version ``` -## Memory Management - -### hip_malloc -```{eval-rst} -.. autofunction:: iris.hip.hip_malloc -``` - -### malloc_fine_grained -```{eval-rst} -.. autofunction:: iris.hip.malloc_fine_grained -``` - -### hip_free -```{eval-rst} -.. autofunction:: iris.hip.hip_free -``` - ## IPC Memory Operations ### get_ipc_handle diff --git a/iris/hip.py b/iris/hip.py index 963dc01d..e9c83bbd 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -288,27 +288,6 @@ def get_wall_clock_rate(device_id): return wall_clock_rate.value -def get_arch_string(device_id=None): - """ - Get the GPU architecture string for a HIP device. - - Args: - device_id (int, optional): The device ID to query. If None, uses the current device. - - Returns: - str: The architecture name (e.g., "gfx90a", "gfx942"). - - Example: - >>> arch = get_arch_string() - >>> print(f"GPU architecture: {arch}") # e.g., "gfx942" - """ - if device_id is None: - device_id = get_device_id() - arch_full = torch.cuda.get_device_properties(device_id).gcnArchName - arch_name = arch_full.split(":")[0] - return arch_name - - def get_num_xcc(device_id=None): """ Get the number of XCCs (Compute Dies) for a HIP device. @@ -338,73 +317,3 @@ def get_num_xcc(device_id=None): xcc_count = ctypes.c_int() hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) return xcc_count.value - - -def malloc_fine_grained(size): - """ - Allocate fine-grained GPU memory accessible by both CPU and GPU. - - Fine-grained memory provides coherent access from both CPU and GPU, - making it suitable for scenarios requiring frequent CPU-GPU data exchange. - - Args: - size (int): Size of memory to allocate in bytes. - - Returns: - ctypes.c_void_p: Pointer to the allocated memory. - - Raises: - RuntimeError: If the HIP runtime call fails or allocation fails. - - Example: - >>> ptr = malloc_fine_grained(1024 * 1024) # Allocate 1 MB - >>> # Use the memory... 
- >>> hip_free(ptr) - """ - hipDeviceMallocFinegrained = 0x1 - ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) - return ptr - - -def hip_malloc(size): - """ - Allocate standard (coarse-grained) GPU memory. - - This allocates device memory that is optimized for GPU access but - not directly accessible from the CPU without explicit transfers. - - Args: - size (int): Size of memory to allocate in bytes. - - Returns: - ctypes.c_void_p: Pointer to the allocated memory. - - Raises: - RuntimeError: If the HIP runtime call fails or allocation fails. - - Example: - >>> ptr = hip_malloc(2 * 1024 * 1024) # Allocate 2 MB - >>> # Use the memory... - >>> hip_free(ptr) - """ - ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) - return ptr - - -def hip_free(ptr): - """ - Free GPU memory allocated by hip_malloc or malloc_fine_grained. - - Args: - ptr (ctypes.c_void_p): Pointer to the memory to free. - - Raises: - RuntimeError: If the HIP runtime call fails. - - Example: - >>> ptr = hip_malloc(1024) - >>> hip_free(ptr) - """ - hip_try(hip_runtime.hipFree(ptr)) From b8778e618ba3cb763a4c0cb84ece87d9867441d4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:06:16 +0000 Subject: [PATCH 4/7] Remove internal APIs from hip module documentation Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- docs/reference/api-hip-module.md | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/docs/reference/api-hip-module.md b/docs/reference/api-hip-module.md index 2c07bd2c..123be4cc 100644 --- a/docs/reference/api-hip-module.md +++ b/docs/reference/api-hip-module.md @@ -2,12 +2,7 @@ Low-level HIP runtime integration for AMD GPU device management and memory operations. -```{eval-rst} -.. automodule:: iris.hip - :members: - :undoc-members: - :show-inheritance: -``` +This module provides public APIs for device management, device attribute queries, and IPC memory operations. ## Device Management @@ -21,11 +16,6 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.set_device ``` -### get_device_id -```{eval-rst} -.. autofunction:: iris.hip.get_device_id -``` - ## Device Attributes ### get_cu_count @@ -43,11 +33,6 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.get_wall_clock_rate ``` -### get_rocm_version -```{eval-rst} -.. autofunction:: iris.hip.get_rocm_version -``` - ## IPC Memory Operations ### get_ipc_handle @@ -60,16 +45,3 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.open_ipc_handle ``` -### hipIpcMemHandle_t -```{eval-rst} -.. autoclass:: iris.hip.hipIpcMemHandle_t - :members: -``` - -## Error Handling - -### hip_try -```{eval-rst} -.. 
autofunction:: iris.hip.hip_try -``` - From a36e9a1b86b4e51114e4d7743a4696f3f1ca7c6b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:44:28 +0000 Subject: [PATCH 5/7] Document only user-facing hip APIs used in examples/tests Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- docs/reference/api-hip-module.md | 35 +++----------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/docs/reference/api-hip-module.md b/docs/reference/api-hip-module.md index 123be4cc..3724a8c5 100644 --- a/docs/reference/api-hip-module.md +++ b/docs/reference/api-hip-module.md @@ -2,46 +2,17 @@ Low-level HIP runtime integration for AMD GPU device management and memory operations. -This module provides public APIs for device management, device attribute queries, and IPC memory operations. - -## Device Management - -### count_devices -```{eval-rst} -.. autofunction:: iris.hip.count_devices -``` - -### set_device -```{eval-rst} -.. autofunction:: iris.hip.set_device -``` +This module provides public APIs for querying device attributes. ## Device Attributes -### get_cu_count -```{eval-rst} -.. autofunction:: iris.hip.get_cu_count -``` - -### get_num_xcc -```{eval-rst} -.. autofunction:: iris.hip.get_num_xcc -``` - ### get_wall_clock_rate ```{eval-rst} .. autofunction:: iris.hip.get_wall_clock_rate ``` -## IPC Memory Operations - -### get_ipc_handle -```{eval-rst} -.. autofunction:: iris.hip.get_ipc_handle -``` - -### open_ipc_handle +### get_num_xcc ```{eval-rst} -.. autofunction:: iris.hip.open_ipc_handle +.. autofunction:: iris.hip.get_num_xcc ``` From 39273b982686947bf6671f5230e3e674d3f9c7df Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:54:59 +0000 Subject: [PATCH 6/7] Fix XCC acronym: Accelerated Compute Core Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- iris/hip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iris/hip.py b/iris/hip.py index e9c83bbd..ef7a3f94 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -292,7 +292,7 @@ def get_num_xcc(device_id=None): """ Get the number of XCCs (Compute Dies) for a HIP device. - XCC (eXtended Compute Complex) refers to the compute dies in MI300 series GPUs. + XCC (Accelerated Compute Core) refers to the compute dies in MI300 series GPUs. For ROCm versions before 7.0, returns a default value of 8. Args: From a123110d8a3abbf29af4da33d8a68e919431f87e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Oct 2025 02:48:31 +0000 Subject: [PATCH 7/7] Add docstrings to hip module after syncing with main --- iris/hip.py | 372 ++++++++++++++++++++++++++-------------------------- 1 file changed, 184 insertions(+), 188 deletions(-) diff --git a/iris/hip.py b/iris/hip.py index ef7a3f94..454c002c 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -27,238 +27,179 @@ import numpy as np import sys import torch - -rt_path = "libamdhip64.so" -hip_runtime = ctypes.cdll.LoadLibrary(rt_path) - - -def hip_try(err): - """ - Check HIP error codes and raise RuntimeError if an error occurred. - - Args: - err (int): HIP error code returned from a HIP runtime function. - - Raises: - RuntimeError: If err is non-zero, with a descriptive error message. 
- - Example: - >>> hip_try(0) # No error, returns silently - >>> hip_try(1) # Raises RuntimeError with HIP error message - """ +import subprocess +import os + +# Auto-detect backend +_is_amd_backend = True +try: + rt_path = "libamdhip64.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) +except OSError: + try: + rt_path = "libcudart.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) + _is_amd_backend = False + except OSError: + rt_path = "libamdhip64.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) + + +def gpu_try(err): if err != 0: - hip_runtime.hipGetErrorString.restype = ctypes.c_char_p - error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") - raise RuntimeError(f"HIP error code {err}: {error_string}") - + if _is_amd_backend: + gpu_runtime.hipGetErrorString.restype = ctypes.c_char_p + error_string = gpu_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"HIP error code {err}: {error_string}") + else: + gpu_runtime.cudaGetErrorString.restype = ctypes.c_char_p + error_string = gpu_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"CUDA error code {err}: {error_string}") -class hipIpcMemHandle_t(ctypes.Structure): - """ - HIP IPC (Inter-Process Communication) memory handle structure. - - This structure represents an opaque handle used for sharing memory - between processes on different GPUs. The handle contains 64 bytes - of reserved data that uniquely identifies the shared memory region. - Attributes: - reserved (ctypes.c_char * 64): Reserved bytes containing the handle data. +def get_ipc_handle_size(): + """Return the IPC handle size for the current backend.""" + return 64 if _is_amd_backend else 128 - Example: - >>> handle = hipIpcMemHandle_t() - >>> # Use with get_ipc_handle and open_ipc_handle - """ - _fields_ = [("reserved", ctypes.c_char * 64)] +class gpuIpcMemHandle_t(ctypes.Structure): + _fields_ = [("reserved", ctypes.c_char * get_ipc_handle_size())] def open_ipc_handle(ipc_handle_data, rank): - """ - Open an IPC memory handle to access shared memory from another process. - - This function takes an IPC memory handle (obtained via get_ipc_handle) and - opens it to allow the current process to access the shared memory region. - The memory is opened with lazy peer access enabled. - - Args: - ipc_handle_data (numpy.ndarray): A 64-element uint8 numpy array containing - the IPC handle data. - rank (int): The rank ID of the process opening the handle (used for logging/debugging). - - Returns: - int: The pointer value (as Python int) to the opened shared memory. - - Raises: - ValueError: If ipc_handle_data is not a 64-element uint8 numpy array. - TypeError: If ipc_handle_data is not a numpy.ndarray. - RuntimeError: If the HIP runtime call fails. 
- - Example: - >>> # On process with rank 1, get the handle from process 0 - >>> ipc_data = all_ipc_handles[0] # From distributed communication - >>> ptr = open_ipc_handle(ipc_data, rank=1) - """ ptr = ctypes.c_void_p() - hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) - hip_runtime.hipIpcOpenMemHandle.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - hipIpcMemHandle_t, - ctypes.c_uint, - ] + handle_size = get_ipc_handle_size() + + if _is_amd_backend: + hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + gpu_runtime.hipIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + gpuIpcMemHandle_t, + ctypes.c_uint, + ] + else: + gpu_runtime.cudaIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + gpuIpcMemHandle_t, + ctypes.c_uint, + ] + cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + if isinstance(ipc_handle_data, np.ndarray): - if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 64: - raise ValueError("ipc_handle_data must be a 64-element uint8 numpy array") + if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != handle_size: + raise ValueError(f"ipc_handle_data must be a {handle_size}-element uint8 numpy array") ipc_handle_bytes = ipc_handle_data.tobytes() - ipc_handle_data = (ctypes.c_char * 64).from_buffer_copy(ipc_handle_bytes) + ipc_handle_data = (ctypes.c_char * handle_size).from_buffer_copy(ipc_handle_bytes) else: - raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 64 elements") + raise TypeError(f"ipc_handle_data must be a numpy.ndarray of dtype uint8 with {handle_size} elements") - raw_memory = ctypes.create_string_buffer(64) - ctypes.memset(raw_memory, 0x00, 64) - ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) + raw_memory = ctypes.create_string_buffer(handle_size) + ctypes.memset(raw_memory, 0x00, handle_size) + ipc_handle_struct = gpuIpcMemHandle_t.from_buffer(raw_memory) ipc_handle_data_bytes = bytes(ipc_handle_data) - ctypes.memmove(raw_memory, ipc_handle_data_bytes, 64) - - hip_try( - hip_runtime.hipIpcOpenMemHandle( - ctypes.byref(ptr), - ipc_handle_struct, - hipIpcMemLazyEnablePeerAccess, + ctypes.memmove(raw_memory, ipc_handle_data_bytes, handle_size) + + if _is_amd_backend: + gpu_try( + gpu_runtime.hipIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + hipIpcMemLazyEnablePeerAccess, + ) + ) + else: + gpu_try( + gpu_runtime.cudaIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + cudaIpcMemLazyEnablePeerAccess, + ) ) - ) return ptr.value def get_ipc_handle(ptr, rank): - """ - Get an IPC memory handle for a memory pointer to share with other processes. - - This function creates an IPC handle that can be shared with other processes - to allow them to access the memory pointed to by ptr. - - Args: - ptr (ctypes.c_void_p): Pointer to the memory region to share. - rank (int): The rank ID of the process creating the handle (used for logging/debugging). - - Returns: - hipIpcMemHandle_t: An IPC memory handle that can be shared with other processes. - - Raises: - RuntimeError: If the HIP runtime call fails. 
- - Example: - >>> import ctypes - >>> heap_ptr = ctypes.c_void_p(tensor.data_ptr()) - >>> handle = get_ipc_handle(heap_ptr, rank=0) - """ - ipc_handle = hipIpcMemHandle_t() - hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + ipc_handle = gpuIpcMemHandle_t() + if _is_amd_backend: + gpu_try(gpu_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + else: + gpu_try(gpu_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) return ipc_handle def count_devices(): - """ - Get the number of available HIP devices (GPUs). - - Returns: - int: The number of HIP-capable devices available on the system. - - Raises: - RuntimeError: If the HIP runtime call fails. - - Example: - >>> num_gpus = count_devices() - >>> print(f"Found {num_gpus} GPU(s)") - """ device_count = ctypes.c_int() - hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + if _is_amd_backend: + gpu_try(gpu_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + else: + gpu_try(gpu_runtime.cudaGetDeviceCount(ctypes.byref(device_count))) return device_count.value def set_device(gpu_id): - """ - Set the current HIP device for subsequent operations. - - Args: - gpu_id (int): The device ID to set as the current device (0-indexed). - - Raises: - RuntimeError: If the HIP runtime call fails or the device ID is invalid. - - Example: - >>> set_device(0) # Use GPU 0 - >>> set_device(1) # Switch to GPU 1 - """ - hip_try(hip_runtime.hipSetDevice(gpu_id)) + if _is_amd_backend: + gpu_try(gpu_runtime.hipSetDevice(gpu_id)) + else: + gpu_try(gpu_runtime.cudaSetDevice(gpu_id)) def get_device_id(): - """ - Get the currently active HIP device ID. - - Returns: - int: The ID of the currently active HIP device. - - Raises: - RuntimeError: If the HIP runtime call fails. - - Example: - >>> current_device = get_device_id() - >>> print(f"Using GPU {current_device}") - """ device_id = ctypes.c_int() - hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) + if _is_amd_backend: + gpu_try(gpu_runtime.hipGetDevice(ctypes.byref(device_id))) + else: + gpu_try(gpu_runtime.cudaGetDevice(ctypes.byref(device_id))) return device_id.value def get_cu_count(device_id=None): - """ - Get the number of compute units (CUs) for a HIP device. - - Args: - device_id (int, optional): The device ID to query. If None, uses the current device. - - Returns: - int: The number of compute units on the specified device. - - Raises: - RuntimeError: If the HIP runtime call fails. - - Example: - >>> cu_count = get_cu_count() # Current device - >>> cu_count_gpu1 = get_cu_count(device_id=1) # Specific device - """ if device_id is None: device_id = get_device_id() - hipDeviceAttributeMultiprocessorCount = 63 cu_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) + if _is_amd_backend: + hipDeviceAttributeMultiprocessorCount = 63 + gpu_try( + gpu_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id) + ) + else: + cudaDevAttrMultiProcessorCount = 16 + gpu_try(gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDevAttrMultiProcessorCount, device_id)) return cu_count.value def get_rocm_version(): - """ - Get the installed ROCm version. + if not _is_amd_backend: + # Not applicable for CUDA + return (-1, -1) - Returns: - tuple: A tuple of (major, minor) version numbers as integers. + major, minor = -1, -1 - Raises: - FileNotFoundError: If the ROCm version file is not found. 
- IndexError: If the version file format is unexpected. + # Try hipconfig --path first + try: + result = subprocess.run(["hipconfig", "--path"], capture_output=True, text=True, check=True) + rocm_path = result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError): + # Then look for $ROCM_PATH environment variable + rocm_path = os.environ.get("ROCM_PATH") + if not rocm_path: + # Finally, try default location + rocm_path = "/opt/rocm" + + # Try to read version from .info/version file + try: + version_file_path = os.path.join(rocm_path, ".info", "version") + with open(version_file_path, "r") as version_file: + version = version_file.readline().strip() + major = int(version.split(".")[0]) + minor = int(version.split(".")[1]) + except (FileNotFoundError, IOError, ValueError, IndexError): + # If we can't read the version file, return -1, -1 + pass - Example: - >>> major, minor = get_rocm_version() - >>> print(f"ROCm version: {major}.{minor}") - """ - major, minor = -1, -1 - with open("/opt/rocm/.info/version", "r") as version_file: - version = version_file.readline().strip() - major = int(version.split(".")[0]) - minor = int(version.split(".")[1]) return (major, minor) @@ -279,21 +220,42 @@ def get_wall_clock_rate(device_id): >>> clock_rate = get_wall_clock_rate(0) >>> print(f"GPU clock rate: {clock_rate} kHz") """ - hipDeviceAttributeWallClockRate = 10017 wall_clock_rate = ctypes.c_int() - status = hip_runtime.hipDeviceGetAttribute( - ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id - ) - hip_try(status) + + if _is_amd_backend: + hipDeviceAttributeWallClockRate = 10017 + status = gpu_runtime.hipDeviceGetAttribute( + ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id + ) + else: + cudaDevAttrClockRate = 13 + status = gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(wall_clock_rate), cudaDevAttrClockRate, device_id) + + gpu_try(status) return wall_clock_rate.value +def get_arch_string(device_id=None): + if device_id is None: + device_id = get_device_id() + + if _is_amd_backend: + arch_full = torch.cuda.get_device_properties(device_id).gcnArchName + arch_name = arch_full.split(":")[0] + return arch_name + else: + # For CUDA, return compute capability + props = torch.cuda.get_device_properties(device_id) + return f"sm_{props.major}{props.minor}" + + def get_num_xcc(device_id=None): """ Get the number of XCCs (Compute Dies) for a HIP device. XCC (Accelerated Compute Core) refers to the compute dies in MI300 series GPUs. For ROCm versions before 7.0, returns a default value of 8. + For CUDA/NVIDIA devices, returns 1 as XCC is AMD-specific. Args: device_id (int, optional): The device ID to query. If None, uses the current device. 
@@ -310,10 +272,44 @@ def get_num_xcc(device_id=None): """ if device_id is None: device_id = get_device_id() + + if not _is_amd_backend: + # XCC is AMD-specific, return 1 for CUDA + return 1 + rocm_major, _ = get_rocm_version() if rocm_major < 7: return 8 hipDeviceAttributeNumberOfXccs = 10018 xcc_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) + gpu_try(gpu_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) return xcc_count.value + + +def malloc_fine_grained(size): + ptr = ctypes.c_void_p() + + if _is_amd_backend: + hipDeviceMallocFinegrained = 0x1 + gpu_try(gpu_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) + else: + # CUDA doesn't have direct equivalent, use regular malloc + gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size)) + + return ptr + + +def hip_malloc(size): + ptr = ctypes.c_void_p() + if _is_amd_backend: + gpu_try(gpu_runtime.hipMalloc(ctypes.byref(ptr), size)) + else: + gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size)) + return ptr + + +def hip_free(ptr): + if _is_amd_backend: + gpu_try(gpu_runtime.hipFree(ptr)) + else: + gpu_try(gpu_runtime.cudaFree(ptr))