From 93ee74678dc28f344e9119328cf11fa5000a637d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 03:33:46 +0000 Subject: [PATCH 1/7] Initial plan From 77d4871e362156da233088c5c14dc9d1b7bb0a4b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 03:43:51 +0000 Subject: [PATCH 2/7] Add comprehensive docstrings to iris.hip module and create documentation page Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- docs/conf.py | 1 - docs/reference/api-hip-module.md | 97 +++++++++++ docs/reference/api-reference.md | 2 + docs/sphinx/_toc.yml | 1 + iris/hip.py | 266 +++++++++++++++++++++++++++++++ 5 files changed, 366 insertions(+), 1 deletion(-) create mode 100644 docs/reference/api-hip-module.md diff --git a/docs/conf.py b/docs/conf.py index 35fa7a00..7453ac9b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -95,7 +95,6 @@ "triton.language", "numpy", "iris._distributed_helpers", - "iris.hip", ] # Napoleon settings for Google/NumPy docstring parsing diff --git a/docs/reference/api-hip-module.md b/docs/reference/api-hip-module.md new file mode 100644 index 00000000..960418e4 --- /dev/null +++ b/docs/reference/api-hip-module.md @@ -0,0 +1,97 @@ +# HIP Module API + +Low-level HIP runtime integration for AMD GPU device management and memory operations. + +```{eval-rst} +.. automodule:: iris.hip + :members: + :undoc-members: + :show-inheritance: +``` + +## Device Management + +### count_devices +```{eval-rst} +.. autofunction:: iris.hip.count_devices +``` + +### set_device +```{eval-rst} +.. autofunction:: iris.hip.set_device +``` + +### get_device_id +```{eval-rst} +.. autofunction:: iris.hip.get_device_id +``` + +## Device Attributes + +### get_cu_count +```{eval-rst} +.. autofunction:: iris.hip.get_cu_count +``` + +### get_arch_string +```{eval-rst} +.. autofunction:: iris.hip.get_arch_string +``` + +### get_num_xcc +```{eval-rst} +.. autofunction:: iris.hip.get_num_xcc +``` + +### get_wall_clock_rate +```{eval-rst} +.. autofunction:: iris.hip.get_wall_clock_rate +``` + +### get_rocm_version +```{eval-rst} +.. autofunction:: iris.hip.get_rocm_version +``` + +## Memory Management + +### hip_malloc +```{eval-rst} +.. autofunction:: iris.hip.hip_malloc +``` + +### malloc_fine_grained +```{eval-rst} +.. autofunction:: iris.hip.malloc_fine_grained +``` + +### hip_free +```{eval-rst} +.. autofunction:: iris.hip.hip_free +``` + +## IPC Memory Operations + +### get_ipc_handle +```{eval-rst} +.. autofunction:: iris.hip.get_ipc_handle +``` + +### open_ipc_handle +```{eval-rst} +.. autofunction:: iris.hip.open_ipc_handle +``` + +### hipIpcMemHandle_t +```{eval-rst} +.. autoclass:: iris.hip.hipIpcMemHandle_t + :members: +``` + +## Error Handling + +### hip_try +```{eval-rst} +.. autofunction:: iris.hip.hip_try +``` + diff --git a/docs/reference/api-reference.md b/docs/reference/api-reference.md index 5aab4e30..6460a44e 100644 --- a/docs/reference/api-reference.md +++ b/docs/reference/api-reference.md @@ -5,10 +5,12 @@ Explore Iris APIs. 
The reference is broken down into focused sections to mirror - The `Iris` class itself (constructor and helper utilities) - Tensor-like creation methods on the `Iris` context - Triton device-side functions for remote memory ops and atomics +- HIP runtime integration for low-level device management Use the links below to navigate: - [Iris Class (ctor & helpers)](api-iris-class.md) - [Tensor Creation](api-tensor-creation.md) - [Triton Device Functions](api-device-functions.md) +- [HIP Module](api-hip-module.md) diff --git a/docs/sphinx/_toc.yml b/docs/sphinx/_toc.yml index 5b4cdead..a3761d77 100644 --- a/docs/sphinx/_toc.yml +++ b/docs/sphinx/_toc.yml @@ -15,3 +15,4 @@ subtrees: - file: reference/api-iris-class.md - file: reference/api-tensor-creation.md - file: reference/api-device-functions.md + - file: reference/api-hip-module.md diff --git a/iris/hip.py b/iris/hip.py index 2a03c397..963dc01d 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -1,6 +1,28 @@ # SPDX-License-Identifier: MIT # Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +""" +HIP Runtime Integration Module + +This module provides low-level HIP runtime integration for AMD GPUs, +offering Python bindings to essential HIP runtime functions through ctypes. +It enables device management, memory operations, and inter-process communication +for multi-GPU programming. + +Key Features: +- Device enumeration and management +- IPC (Inter-Process Communication) memory handles +- Device attribute queries (compute units, architecture, XCC count) +- Fine-grained and coarse-grained memory allocation +- ROCm version detection + +Example: + >>> import iris.hip as hip + >>> num_devices = hip.count_devices() + >>> hip.set_device(0) + >>> cu_count = hip.get_cu_count() +""" + import ctypes import numpy as np import sys @@ -11,6 +33,19 @@ def hip_try(err): + """ + Check HIP error codes and raise RuntimeError if an error occurred. + + Args: + err (int): HIP error code returned from a HIP runtime function. + + Raises: + RuntimeError: If err is non-zero, with a descriptive error message. + + Example: + >>> hip_try(0) # No error, returns silently + >>> hip_try(1) # Raises RuntimeError with HIP error message + """ if err != 0: hip_runtime.hipGetErrorString.restype = ctypes.c_char_p error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") @@ -18,10 +53,50 @@ def hip_try(err): class hipIpcMemHandle_t(ctypes.Structure): + """ + HIP IPC (Inter-Process Communication) memory handle structure. + + This structure represents an opaque handle used for sharing memory + between processes on different GPUs. The handle contains 64 bytes + of reserved data that uniquely identifies the shared memory region. + + Attributes: + reserved (ctypes.c_char * 64): Reserved bytes containing the handle data. + + Example: + >>> handle = hipIpcMemHandle_t() + >>> # Use with get_ipc_handle and open_ipc_handle + """ + _fields_ = [("reserved", ctypes.c_char * 64)] def open_ipc_handle(ipc_handle_data, rank): + """ + Open an IPC memory handle to access shared memory from another process. + + This function takes an IPC memory handle (obtained via get_ipc_handle) and + opens it to allow the current process to access the shared memory region. + The memory is opened with lazy peer access enabled. + + Args: + ipc_handle_data (numpy.ndarray): A 64-element uint8 numpy array containing + the IPC handle data. + rank (int): The rank ID of the process opening the handle (used for logging/debugging). 
+ + Returns: + int: The pointer value (as Python int) to the opened shared memory. + + Raises: + ValueError: If ipc_handle_data is not a 64-element uint8 numpy array. + TypeError: If ipc_handle_data is not a numpy.ndarray. + RuntimeError: If the HIP runtime call fails. + + Example: + >>> # On process with rank 1, get the handle from process 0 + >>> ipc_data = all_ipc_handles[0] # From distributed communication + >>> ptr = open_ipc_handle(ipc_data, rank=1) + """ ptr = ctypes.c_void_p() hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) hip_runtime.hipIpcOpenMemHandle.argtypes = [ @@ -55,28 +130,104 @@ def open_ipc_handle(ipc_handle_data, rank): def get_ipc_handle(ptr, rank): + """ + Get an IPC memory handle for a memory pointer to share with other processes. + + This function creates an IPC handle that can be shared with other processes + to allow them to access the memory pointed to by ptr. + + Args: + ptr (ctypes.c_void_p): Pointer to the memory region to share. + rank (int): The rank ID of the process creating the handle (used for logging/debugging). + + Returns: + hipIpcMemHandle_t: An IPC memory handle that can be shared with other processes. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> import ctypes + >>> heap_ptr = ctypes.c_void_p(tensor.data_ptr()) + >>> handle = get_ipc_handle(heap_ptr, rank=0) + """ ipc_handle = hipIpcMemHandle_t() hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) return ipc_handle def count_devices(): + """ + Get the number of available HIP devices (GPUs). + + Returns: + int: The number of HIP-capable devices available on the system. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> num_gpus = count_devices() + >>> print(f"Found {num_gpus} GPU(s)") + """ device_count = ctypes.c_int() hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) return device_count.value def set_device(gpu_id): + """ + Set the current HIP device for subsequent operations. + + Args: + gpu_id (int): The device ID to set as the current device (0-indexed). + + Raises: + RuntimeError: If the HIP runtime call fails or the device ID is invalid. + + Example: + >>> set_device(0) # Use GPU 0 + >>> set_device(1) # Switch to GPU 1 + """ hip_try(hip_runtime.hipSetDevice(gpu_id)) def get_device_id(): + """ + Get the currently active HIP device ID. + + Returns: + int: The ID of the currently active HIP device. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> current_device = get_device_id() + >>> print(f"Using GPU {current_device}") + """ device_id = ctypes.c_int() hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) return device_id.value def get_cu_count(device_id=None): + """ + Get the number of compute units (CUs) for a HIP device. + + Args: + device_id (int, optional): The device ID to query. If None, uses the current device. + + Returns: + int: The number of compute units on the specified device. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> cu_count = get_cu_count() # Current device + >>> cu_count_gpu1 = get_cu_count(device_id=1) # Specific device + """ if device_id is None: device_id = get_device_id() @@ -89,6 +240,20 @@ def get_cu_count(device_id=None): def get_rocm_version(): + """ + Get the installed ROCm version. + + Returns: + tuple: A tuple of (major, minor) version numbers as integers. + + Raises: + FileNotFoundError: If the ROCm version file is not found. + IndexError: If the version file format is unexpected. 
+ + Example: + >>> major, minor = get_rocm_version() + >>> print(f"ROCm version: {major}.{minor}") + """ major, minor = -1, -1 with open("/opt/rocm/.info/version", "r") as version_file: version = version_file.readline().strip() @@ -98,6 +263,22 @@ def get_rocm_version(): def get_wall_clock_rate(device_id): + """ + Get the wall clock rate (GPU clock frequency) for a HIP device. + + Args: + device_id (int): The device ID to query. + + Returns: + int: The wall clock rate in kHz. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> clock_rate = get_wall_clock_rate(0) + >>> print(f"GPU clock rate: {clock_rate} kHz") + """ hipDeviceAttributeWallClockRate = 10017 wall_clock_rate = ctypes.c_int() status = hip_runtime.hipDeviceGetAttribute( @@ -108,6 +289,19 @@ def get_wall_clock_rate(device_id): def get_arch_string(device_id=None): + """ + Get the GPU architecture string for a HIP device. + + Args: + device_id (int, optional): The device ID to query. If None, uses the current device. + + Returns: + str: The architecture name (e.g., "gfx90a", "gfx942"). + + Example: + >>> arch = get_arch_string() + >>> print(f"GPU architecture: {arch}") # e.g., "gfx942" + """ if device_id is None: device_id = get_device_id() arch_full = torch.cuda.get_device_properties(device_id).gcnArchName @@ -116,6 +310,25 @@ def get_arch_string(device_id=None): def get_num_xcc(device_id=None): + """ + Get the number of XCCs (Compute Dies) for a HIP device. + + XCC (eXtended Compute Complex) refers to the compute dies in MI300 series GPUs. + For ROCm versions before 7.0, returns a default value of 8. + + Args: + device_id (int, optional): The device ID to query. If None, uses the current device. + + Returns: + int: The number of XCCs on the device. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> xcc_count = get_num_xcc() + >>> print(f"Number of XCCs: {xcc_count}") + """ if device_id is None: device_id = get_device_id() rocm_major, _ = get_rocm_version() @@ -128,6 +341,26 @@ def get_num_xcc(device_id=None): def malloc_fine_grained(size): + """ + Allocate fine-grained GPU memory accessible by both CPU and GPU. + + Fine-grained memory provides coherent access from both CPU and GPU, + making it suitable for scenarios requiring frequent CPU-GPU data exchange. + + Args: + size (int): Size of memory to allocate in bytes. + + Returns: + ctypes.c_void_p: Pointer to the allocated memory. + + Raises: + RuntimeError: If the HIP runtime call fails or allocation fails. + + Example: + >>> ptr = malloc_fine_grained(1024 * 1024) # Allocate 1 MB + >>> # Use the memory... + >>> hip_free(ptr) + """ hipDeviceMallocFinegrained = 0x1 ptr = ctypes.c_void_p() hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) @@ -135,10 +368,43 @@ def malloc_fine_grained(size): def hip_malloc(size): + """ + Allocate standard (coarse-grained) GPU memory. + + This allocates device memory that is optimized for GPU access but + not directly accessible from the CPU without explicit transfers. + + Args: + size (int): Size of memory to allocate in bytes. + + Returns: + ctypes.c_void_p: Pointer to the allocated memory. + + Raises: + RuntimeError: If the HIP runtime call fails or allocation fails. + + Example: + >>> ptr = hip_malloc(2 * 1024 * 1024) # Allocate 2 MB + >>> # Use the memory... 
+ >>> hip_free(ptr) + """ ptr = ctypes.c_void_p() hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) return ptr def hip_free(ptr): + """ + Free GPU memory allocated by hip_malloc or malloc_fine_grained. + + Args: + ptr (ctypes.c_void_p): Pointer to the memory to free. + + Raises: + RuntimeError: If the HIP runtime call fails. + + Example: + >>> ptr = hip_malloc(1024) + >>> hip_free(ptr) + """ hip_try(hip_runtime.hipFree(ptr)) From 449f3006b2b905a87969e5da32e82f9eee2aaa24 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 04:13:59 +0000 Subject: [PATCH 3/7] Remove unused hip module functions (get_arch_string, hip_malloc, malloc_fine_grained, hip_free) Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- docs/reference/api-hip-module.md | 22 -------- iris/hip.py | 91 -------------------------------- 2 files changed, 113 deletions(-) diff --git a/docs/reference/api-hip-module.md b/docs/reference/api-hip-module.md index 960418e4..2c07bd2c 100644 --- a/docs/reference/api-hip-module.md +++ b/docs/reference/api-hip-module.md @@ -33,11 +33,6 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.get_cu_count ``` -### get_arch_string -```{eval-rst} -.. autofunction:: iris.hip.get_arch_string -``` - ### get_num_xcc ```{eval-rst} .. autofunction:: iris.hip.get_num_xcc @@ -53,23 +48,6 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.get_rocm_version ``` -## Memory Management - -### hip_malloc -```{eval-rst} -.. autofunction:: iris.hip.hip_malloc -``` - -### malloc_fine_grained -```{eval-rst} -.. autofunction:: iris.hip.malloc_fine_grained -``` - -### hip_free -```{eval-rst} -.. autofunction:: iris.hip.hip_free -``` - ## IPC Memory Operations ### get_ipc_handle diff --git a/iris/hip.py b/iris/hip.py index 963dc01d..e9c83bbd 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -288,27 +288,6 @@ def get_wall_clock_rate(device_id): return wall_clock_rate.value -def get_arch_string(device_id=None): - """ - Get the GPU architecture string for a HIP device. - - Args: - device_id (int, optional): The device ID to query. If None, uses the current device. - - Returns: - str: The architecture name (e.g., "gfx90a", "gfx942"). - - Example: - >>> arch = get_arch_string() - >>> print(f"GPU architecture: {arch}") # e.g., "gfx942" - """ - if device_id is None: - device_id = get_device_id() - arch_full = torch.cuda.get_device_properties(device_id).gcnArchName - arch_name = arch_full.split(":")[0] - return arch_name - - def get_num_xcc(device_id=None): """ Get the number of XCCs (Compute Dies) for a HIP device. @@ -338,73 +317,3 @@ def get_num_xcc(device_id=None): xcc_count = ctypes.c_int() hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) return xcc_count.value - - -def malloc_fine_grained(size): - """ - Allocate fine-grained GPU memory accessible by both CPU and GPU. - - Fine-grained memory provides coherent access from both CPU and GPU, - making it suitable for scenarios requiring frequent CPU-GPU data exchange. - - Args: - size (int): Size of memory to allocate in bytes. - - Returns: - ctypes.c_void_p: Pointer to the allocated memory. - - Raises: - RuntimeError: If the HIP runtime call fails or allocation fails. - - Example: - >>> ptr = malloc_fine_grained(1024 * 1024) # Allocate 1 MB - >>> # Use the memory... 
- >>> hip_free(ptr) - """ - hipDeviceMallocFinegrained = 0x1 - ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) - return ptr - - -def hip_malloc(size): - """ - Allocate standard (coarse-grained) GPU memory. - - This allocates device memory that is optimized for GPU access but - not directly accessible from the CPU without explicit transfers. - - Args: - size (int): Size of memory to allocate in bytes. - - Returns: - ctypes.c_void_p: Pointer to the allocated memory. - - Raises: - RuntimeError: If the HIP runtime call fails or allocation fails. - - Example: - >>> ptr = hip_malloc(2 * 1024 * 1024) # Allocate 2 MB - >>> # Use the memory... - >>> hip_free(ptr) - """ - ptr = ctypes.c_void_p() - hip_try(hip_runtime.hipMalloc(ctypes.byref(ptr), size)) - return ptr - - -def hip_free(ptr): - """ - Free GPU memory allocated by hip_malloc or malloc_fine_grained. - - Args: - ptr (ctypes.c_void_p): Pointer to the memory to free. - - Raises: - RuntimeError: If the HIP runtime call fails. - - Example: - >>> ptr = hip_malloc(1024) - >>> hip_free(ptr) - """ - hip_try(hip_runtime.hipFree(ptr)) From b8778e618ba3cb763a4c0cb84ece87d9867441d4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:06:16 +0000 Subject: [PATCH 4/7] Remove internal APIs from hip module documentation Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- docs/reference/api-hip-module.md | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/docs/reference/api-hip-module.md b/docs/reference/api-hip-module.md index 2c07bd2c..123be4cc 100644 --- a/docs/reference/api-hip-module.md +++ b/docs/reference/api-hip-module.md @@ -2,12 +2,7 @@ Low-level HIP runtime integration for AMD GPU device management and memory operations. -```{eval-rst} -.. automodule:: iris.hip - :members: - :undoc-members: - :show-inheritance: -``` +This module provides public APIs for device management, device attribute queries, and IPC memory operations. ## Device Management @@ -21,11 +16,6 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.set_device ``` -### get_device_id -```{eval-rst} -.. autofunction:: iris.hip.get_device_id -``` - ## Device Attributes ### get_cu_count @@ -43,11 +33,6 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.get_wall_clock_rate ``` -### get_rocm_version -```{eval-rst} -.. autofunction:: iris.hip.get_rocm_version -``` - ## IPC Memory Operations ### get_ipc_handle @@ -60,16 +45,3 @@ Low-level HIP runtime integration for AMD GPU device management and memory opera .. autofunction:: iris.hip.open_ipc_handle ``` -### hipIpcMemHandle_t -```{eval-rst} -.. autoclass:: iris.hip.hipIpcMemHandle_t - :members: -``` - -## Error Handling - -### hip_try -```{eval-rst} -.. 
autofunction:: iris.hip.hip_try -``` - From a36e9a1b86b4e51114e4d7743a4696f3f1ca7c6b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:44:28 +0000 Subject: [PATCH 5/7] Document only user-facing hip APIs used in examples/tests Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- docs/reference/api-hip-module.md | 35 +++----------------------------- 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/docs/reference/api-hip-module.md b/docs/reference/api-hip-module.md index 123be4cc..3724a8c5 100644 --- a/docs/reference/api-hip-module.md +++ b/docs/reference/api-hip-module.md @@ -2,46 +2,17 @@ Low-level HIP runtime integration for AMD GPU device management and memory operations. -This module provides public APIs for device management, device attribute queries, and IPC memory operations. - -## Device Management - -### count_devices -```{eval-rst} -.. autofunction:: iris.hip.count_devices -``` - -### set_device -```{eval-rst} -.. autofunction:: iris.hip.set_device -``` +This module provides public APIs for querying device attributes. ## Device Attributes -### get_cu_count -```{eval-rst} -.. autofunction:: iris.hip.get_cu_count -``` - -### get_num_xcc -```{eval-rst} -.. autofunction:: iris.hip.get_num_xcc -``` - ### get_wall_clock_rate ```{eval-rst} .. autofunction:: iris.hip.get_wall_clock_rate ``` -## IPC Memory Operations - -### get_ipc_handle -```{eval-rst} -.. autofunction:: iris.hip.get_ipc_handle -``` - -### open_ipc_handle +### get_num_xcc ```{eval-rst} -.. autofunction:: iris.hip.open_ipc_handle +.. autofunction:: iris.hip.get_num_xcc ``` From 39273b982686947bf6671f5230e3e674d3f9c7df Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:54:59 +0000 Subject: [PATCH 6/7] Fix XCC acronym: Accelerated Compute Core Co-authored-by: mawad-amd <112003944+mawad-amd@users.noreply.github.com> --- iris/hip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iris/hip.py b/iris/hip.py index e9c83bbd..ef7a3f94 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -292,7 +292,7 @@ def get_num_xcc(device_id=None): """ Get the number of XCCs (Compute Dies) for a HIP device. - XCC (eXtended Compute Complex) refers to the compute dies in MI300 series GPUs. + XCC (Accelerated Compute Core) refers to the compute dies in MI300 series GPUs. For ROCm versions before 7.0, returns a default value of 8. Args: From a123110d8a3abbf29af4da33d8a68e919431f87e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 12 Oct 2025 02:48:31 +0000 Subject: [PATCH 7/7] Add docstrings to hip module after syncing with main --- iris/hip.py | 372 ++++++++++++++++++++++++++-------------------------- 1 file changed, 184 insertions(+), 188 deletions(-) diff --git a/iris/hip.py b/iris/hip.py index ef7a3f94..454c002c 100644 --- a/iris/hip.py +++ b/iris/hip.py @@ -27,238 +27,179 @@ import numpy as np import sys import torch - -rt_path = "libamdhip64.so" -hip_runtime = ctypes.cdll.LoadLibrary(rt_path) - - -def hip_try(err): - """ - Check HIP error codes and raise RuntimeError if an error occurred. - - Args: - err (int): HIP error code returned from a HIP runtime function. - - Raises: - RuntimeError: If err is non-zero, with a descriptive error message. 
- - Example: - >>> hip_try(0) # No error, returns silently - >>> hip_try(1) # Raises RuntimeError with HIP error message - """ +import subprocess +import os + +# Auto-detect backend +_is_amd_backend = True +try: + rt_path = "libamdhip64.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) +except OSError: + try: + rt_path = "libcudart.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) + _is_amd_backend = False + except OSError: + rt_path = "libamdhip64.so" + gpu_runtime = ctypes.cdll.LoadLibrary(rt_path) + + +def gpu_try(err): if err != 0: - hip_runtime.hipGetErrorString.restype = ctypes.c_char_p - error_string = hip_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") - raise RuntimeError(f"HIP error code {err}: {error_string}") - + if _is_amd_backend: + gpu_runtime.hipGetErrorString.restype = ctypes.c_char_p + error_string = gpu_runtime.hipGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"HIP error code {err}: {error_string}") + else: + gpu_runtime.cudaGetErrorString.restype = ctypes.c_char_p + error_string = gpu_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8") + raise RuntimeError(f"CUDA error code {err}: {error_string}") -class hipIpcMemHandle_t(ctypes.Structure): - """ - HIP IPC (Inter-Process Communication) memory handle structure. - - This structure represents an opaque handle used for sharing memory - between processes on different GPUs. The handle contains 64 bytes - of reserved data that uniquely identifies the shared memory region. - Attributes: - reserved (ctypes.c_char * 64): Reserved bytes containing the handle data. +def get_ipc_handle_size(): + """Return the IPC handle size for the current backend.""" + return 64 if _is_amd_backend else 128 - Example: - >>> handle = hipIpcMemHandle_t() - >>> # Use with get_ipc_handle and open_ipc_handle - """ - _fields_ = [("reserved", ctypes.c_char * 64)] +class gpuIpcMemHandle_t(ctypes.Structure): + _fields_ = [("reserved", ctypes.c_char * get_ipc_handle_size())] def open_ipc_handle(ipc_handle_data, rank): - """ - Open an IPC memory handle to access shared memory from another process. - - This function takes an IPC memory handle (obtained via get_ipc_handle) and - opens it to allow the current process to access the shared memory region. - The memory is opened with lazy peer access enabled. - - Args: - ipc_handle_data (numpy.ndarray): A 64-element uint8 numpy array containing - the IPC handle data. - rank (int): The rank ID of the process opening the handle (used for logging/debugging). - - Returns: - int: The pointer value (as Python int) to the opened shared memory. - - Raises: - ValueError: If ipc_handle_data is not a 64-element uint8 numpy array. - TypeError: If ipc_handle_data is not a numpy.ndarray. - RuntimeError: If the HIP runtime call fails. 
- - Example: - >>> # On process with rank 1, get the handle from process 0 - >>> ipc_data = all_ipc_handles[0] # From distributed communication - >>> ptr = open_ipc_handle(ipc_data, rank=1) - """ ptr = ctypes.c_void_p() - hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) - hip_runtime.hipIpcOpenMemHandle.argtypes = [ - ctypes.POINTER(ctypes.c_void_p), - hipIpcMemHandle_t, - ctypes.c_uint, - ] + handle_size = get_ipc_handle_size() + + if _is_amd_backend: + hipIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + gpu_runtime.hipIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + gpuIpcMemHandle_t, + ctypes.c_uint, + ] + else: + gpu_runtime.cudaIpcOpenMemHandle.argtypes = [ + ctypes.POINTER(ctypes.c_void_p), + gpuIpcMemHandle_t, + ctypes.c_uint, + ] + cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1) + if isinstance(ipc_handle_data, np.ndarray): - if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 64: - raise ValueError("ipc_handle_data must be a 64-element uint8 numpy array") + if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != handle_size: + raise ValueError(f"ipc_handle_data must be a {handle_size}-element uint8 numpy array") ipc_handle_bytes = ipc_handle_data.tobytes() - ipc_handle_data = (ctypes.c_char * 64).from_buffer_copy(ipc_handle_bytes) + ipc_handle_data = (ctypes.c_char * handle_size).from_buffer_copy(ipc_handle_bytes) else: - raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 64 elements") + raise TypeError(f"ipc_handle_data must be a numpy.ndarray of dtype uint8 with {handle_size} elements") - raw_memory = ctypes.create_string_buffer(64) - ctypes.memset(raw_memory, 0x00, 64) - ipc_handle_struct = hipIpcMemHandle_t.from_buffer(raw_memory) + raw_memory = ctypes.create_string_buffer(handle_size) + ctypes.memset(raw_memory, 0x00, handle_size) + ipc_handle_struct = gpuIpcMemHandle_t.from_buffer(raw_memory) ipc_handle_data_bytes = bytes(ipc_handle_data) - ctypes.memmove(raw_memory, ipc_handle_data_bytes, 64) - - hip_try( - hip_runtime.hipIpcOpenMemHandle( - ctypes.byref(ptr), - ipc_handle_struct, - hipIpcMemLazyEnablePeerAccess, + ctypes.memmove(raw_memory, ipc_handle_data_bytes, handle_size) + + if _is_amd_backend: + gpu_try( + gpu_runtime.hipIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + hipIpcMemLazyEnablePeerAccess, + ) + ) + else: + gpu_try( + gpu_runtime.cudaIpcOpenMemHandle( + ctypes.byref(ptr), + ipc_handle_struct, + cudaIpcMemLazyEnablePeerAccess, + ) ) - ) return ptr.value def get_ipc_handle(ptr, rank): - """ - Get an IPC memory handle for a memory pointer to share with other processes. - - This function creates an IPC handle that can be shared with other processes - to allow them to access the memory pointed to by ptr. - - Args: - ptr (ctypes.c_void_p): Pointer to the memory region to share. - rank (int): The rank ID of the process creating the handle (used for logging/debugging). - - Returns: - hipIpcMemHandle_t: An IPC memory handle that can be shared with other processes. - - Raises: - RuntimeError: If the HIP runtime call fails. 
- - Example: - >>> import ctypes - >>> heap_ptr = ctypes.c_void_p(tensor.data_ptr()) - >>> handle = get_ipc_handle(heap_ptr, rank=0) - """ - ipc_handle = hipIpcMemHandle_t() - hip_try(hip_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + ipc_handle = gpuIpcMemHandle_t() + if _is_amd_backend: + gpu_try(gpu_runtime.hipIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) + else: + gpu_try(gpu_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr)) return ipc_handle def count_devices(): - """ - Get the number of available HIP devices (GPUs). - - Returns: - int: The number of HIP-capable devices available on the system. - - Raises: - RuntimeError: If the HIP runtime call fails. - - Example: - >>> num_gpus = count_devices() - >>> print(f"Found {num_gpus} GPU(s)") - """ device_count = ctypes.c_int() - hip_try(hip_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + if _is_amd_backend: + gpu_try(gpu_runtime.hipGetDeviceCount(ctypes.byref(device_count))) + else: + gpu_try(gpu_runtime.cudaGetDeviceCount(ctypes.byref(device_count))) return device_count.value def set_device(gpu_id): - """ - Set the current HIP device for subsequent operations. - - Args: - gpu_id (int): The device ID to set as the current device (0-indexed). - - Raises: - RuntimeError: If the HIP runtime call fails or the device ID is invalid. - - Example: - >>> set_device(0) # Use GPU 0 - >>> set_device(1) # Switch to GPU 1 - """ - hip_try(hip_runtime.hipSetDevice(gpu_id)) + if _is_amd_backend: + gpu_try(gpu_runtime.hipSetDevice(gpu_id)) + else: + gpu_try(gpu_runtime.cudaSetDevice(gpu_id)) def get_device_id(): - """ - Get the currently active HIP device ID. - - Returns: - int: The ID of the currently active HIP device. - - Raises: - RuntimeError: If the HIP runtime call fails. - - Example: - >>> current_device = get_device_id() - >>> print(f"Using GPU {current_device}") - """ device_id = ctypes.c_int() - hip_try(hip_runtime.hipGetDevice(ctypes.byref(device_id))) + if _is_amd_backend: + gpu_try(gpu_runtime.hipGetDevice(ctypes.byref(device_id))) + else: + gpu_try(gpu_runtime.cudaGetDevice(ctypes.byref(device_id))) return device_id.value def get_cu_count(device_id=None): - """ - Get the number of compute units (CUs) for a HIP device. - - Args: - device_id (int, optional): The device ID to query. If None, uses the current device. - - Returns: - int: The number of compute units on the specified device. - - Raises: - RuntimeError: If the HIP runtime call fails. - - Example: - >>> cu_count = get_cu_count() # Current device - >>> cu_count_gpu1 = get_cu_count(device_id=1) # Specific device - """ if device_id is None: device_id = get_device_id() - hipDeviceAttributeMultiprocessorCount = 63 cu_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id)) + if _is_amd_backend: + hipDeviceAttributeMultiprocessorCount = 63 + gpu_try( + gpu_runtime.hipDeviceGetAttribute(ctypes.byref(cu_count), hipDeviceAttributeMultiprocessorCount, device_id) + ) + else: + cudaDevAttrMultiProcessorCount = 16 + gpu_try(gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDevAttrMultiProcessorCount, device_id)) return cu_count.value def get_rocm_version(): - """ - Get the installed ROCm version. + if not _is_amd_backend: + # Not applicable for CUDA + return (-1, -1) - Returns: - tuple: A tuple of (major, minor) version numbers as integers. + major, minor = -1, -1 - Raises: - FileNotFoundError: If the ROCm version file is not found. 
- IndexError: If the version file format is unexpected. + # Try hipconfig --path first + try: + result = subprocess.run(["hipconfig", "--path"], capture_output=True, text=True, check=True) + rocm_path = result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError): + # Then look for $ROCM_PATH environment variable + rocm_path = os.environ.get("ROCM_PATH") + if not rocm_path: + # Finally, try default location + rocm_path = "/opt/rocm" + + # Try to read version from .info/version file + try: + version_file_path = os.path.join(rocm_path, ".info", "version") + with open(version_file_path, "r") as version_file: + version = version_file.readline().strip() + major = int(version.split(".")[0]) + minor = int(version.split(".")[1]) + except (FileNotFoundError, IOError, ValueError, IndexError): + # If we can't read the version file, return -1, -1 + pass - Example: - >>> major, minor = get_rocm_version() - >>> print(f"ROCm version: {major}.{minor}") - """ - major, minor = -1, -1 - with open("/opt/rocm/.info/version", "r") as version_file: - version = version_file.readline().strip() - major = int(version.split(".")[0]) - minor = int(version.split(".")[1]) return (major, minor) @@ -279,21 +220,42 @@ def get_wall_clock_rate(device_id): >>> clock_rate = get_wall_clock_rate(0) >>> print(f"GPU clock rate: {clock_rate} kHz") """ - hipDeviceAttributeWallClockRate = 10017 wall_clock_rate = ctypes.c_int() - status = hip_runtime.hipDeviceGetAttribute( - ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id - ) - hip_try(status) + + if _is_amd_backend: + hipDeviceAttributeWallClockRate = 10017 + status = gpu_runtime.hipDeviceGetAttribute( + ctypes.byref(wall_clock_rate), hipDeviceAttributeWallClockRate, device_id + ) + else: + cudaDevAttrClockRate = 13 + status = gpu_runtime.cudaDeviceGetAttribute(ctypes.byref(wall_clock_rate), cudaDevAttrClockRate, device_id) + + gpu_try(status) return wall_clock_rate.value +def get_arch_string(device_id=None): + if device_id is None: + device_id = get_device_id() + + if _is_amd_backend: + arch_full = torch.cuda.get_device_properties(device_id).gcnArchName + arch_name = arch_full.split(":")[0] + return arch_name + else: + # For CUDA, return compute capability + props = torch.cuda.get_device_properties(device_id) + return f"sm_{props.major}{props.minor}" + + def get_num_xcc(device_id=None): """ Get the number of XCCs (Compute Dies) for a HIP device. XCC (Accelerated Compute Core) refers to the compute dies in MI300 series GPUs. For ROCm versions before 7.0, returns a default value of 8. + For CUDA/NVIDIA devices, returns 1 as XCC is AMD-specific. Args: device_id (int, optional): The device ID to query. If None, uses the current device. 
@@ -310,10 +272,44 @@ def get_num_xcc(device_id=None): """ if device_id is None: device_id = get_device_id() + + if not _is_amd_backend: + # XCC is AMD-specific, return 1 for CUDA + return 1 + rocm_major, _ = get_rocm_version() if rocm_major < 7: return 8 hipDeviceAttributeNumberOfXccs = 10018 xcc_count = ctypes.c_int() - hip_try(hip_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) + gpu_try(gpu_runtime.hipDeviceGetAttribute(ctypes.byref(xcc_count), hipDeviceAttributeNumberOfXccs, device_id)) return xcc_count.value + + +def malloc_fine_grained(size): + ptr = ctypes.c_void_p() + + if _is_amd_backend: + hipDeviceMallocFinegrained = 0x1 + gpu_try(gpu_runtime.hipExtMallocWithFlags(ctypes.byref(ptr), size, hipDeviceMallocFinegrained)) + else: + # CUDA doesn't have direct equivalent, use regular malloc + gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size)) + + return ptr + + +def hip_malloc(size): + ptr = ctypes.c_void_p() + if _is_amd_backend: + gpu_try(gpu_runtime.hipMalloc(ctypes.byref(ptr), size)) + else: + gpu_try(gpu_runtime.cudaMalloc(ctypes.byref(ptr), size)) + return ptr + + +def hip_free(ptr): + if _is_amd_backend: + gpu_try(gpu_runtime.hipFree(ptr)) + else: + gpu_try(gpu_runtime.cudaFree(ptr))