Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
dcf704f
Add chunked VMem allocator with power-of-two free lists
mawad-amd Mar 25, 2026
2bc42d0
Apply Ruff auto-fixes
github-actions[bot] Mar 25, 2026
c700b55
Fix free_list_size_classes test: use granularity-adaptive sizes
mawad-amd Mar 25, 2026
9c8bdf4
Fix import_external_tensor: tolerate hipMemSetAccess failure on DMA-B…
mawad-amd Mar 25, 2026
6475347
Fix import chunks: separate from shareable chunks for peer exchange
mawad-amd Mar 25, 2026
7b0b816
Fix VA space exhaustion: reduce default VA size, add cleanup
mawad-amd Mar 25, 2026
b3fa1e5
Apply Ruff auto-fixes
github-actions[bot] Mar 25, 2026
8dacfc8
Fix chunk_size and VA sizing for small heaps
mawad-amd Mar 25, 2026
846033a
Fix hipErrorUnknown from async GPU ops during VMem close
mawad-amd Mar 25, 2026
0082dbc
Add GPU sync to SymmetricHeap.close() before freeing VA ranges
mawad-amd Mar 25, 2026
1504643
Apply Ruff auto-fixes
github-actions[bot] Mar 25, 2026
3402fa9
Fix hipErrorUnknown: unmap peer-imported chunks before freeing VA ranges
mawad-amd Mar 26, 2026
637ca7d
Fix import_external_tensor: use External Memory API for hipMalloc ten…
mawad-amd Mar 26, 2026
8c9bdb7
Apply Ruff auto-fixes
github-actions[bot] Mar 26, 2026
0964473
Fix test_chunked_stats: improve GC reliability for weakref finalizer
mawad-amd Mar 26, 2026
bb1c114
Remove non-deterministic GC assertion from test_chunked_stats
mawad-amd Mar 26, 2026
30e9e3f
Increase default VA reservation to 128 GiB
mawad-amd Mar 26, 2026
10fa7e7
Document ROCm 7.1+ requirement for hipMemImportFromShareableHandle
mawad-amd Mar 26, 2026
eb09151
Document ROCm 7.1+ requirement for hipMemImportFromShareableHandle
mawad-amd Mar 26, 2026
938f435
Fixing import issues on vmem_chunked_allocator.py.
artulab Apr 28, 2026
53cb6f3
Refactor memory drivers to decouple virtual address reservation
artulab Apr 28, 2026
d821268
Apply Ruff auto-fixes
github-actions[bot] Apr 28, 2026
bedf2cb
feat(drivers): implement VMM local drivers, factory, and fabric VA su…
artulab Apr 28, 2026
c33dc44
Refactor SymmetricHeap and VMemChunkedAllocator to use topology-aware…
artulab May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions iris/drivers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@
# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.

"""
Shared driver package types for fabric backends.
Shared driver package types for memory backends.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Optional

from iris.drivers.base import BaseFabricDriver
from iris.drivers.base import BaseDriver

__all__ = ["DriverStack"]


# NOTE(review): this is a rendered diff without +/- markers; where two near-identical
# lines appear back to back, the first looks like the removed line and the second its
# replacement (the file header reports 4 additions & 4 deletions).
@dataclass
class DriverStack:
# Pairs a vendor string with an optional driver instance for one rank.
"""Fabric drivers available for a rank."""
"""Driver available for a rank."""

vendor: str
# NOTE(review): `fabric` (BaseFabricDriver) appears to be replaced by `driver` (BaseDriver).
# Renaming a dataclass field also renames its constructor keyword and attribute, so
# confirm every DriverStack(fabric=...) / stack.fabric call site was updated.
fabric: Optional[BaseFabricDriver]
driver: Optional[BaseDriver]
41 changes: 33 additions & 8 deletions iris/drivers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@
# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.

"""
Abstract base classes, shared dataclasses, and exceptions for fabric drivers.
Abstract base classes, shared dataclasses, and exceptions for memory drivers.
"""

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any
from typing import Any, Optional

from iris.host.distributed.topology import InterconnectLevel

# Public names exported by the drivers base module.
__all__ = [
"PeerMapping",
"LocalAllocation",
# NOTE(review): the next two entries look like the removed/added pair of a rename
# (BaseFabricDriver -> BaseDriver); confirm no compatibility alias is expected by importers.
"BaseFabricDriver",
"BaseDriver",
"DriverError",
"DriverNotSupported",
]
Expand All @@ -40,6 +40,7 @@ class LocalAllocation:
va: int
size: int
handle: Any
_va_owned: bool = True


class DriverError(RuntimeError):
Expand All @@ -50,23 +51,23 @@ class DriverNotSupported(DriverError):
"""The current hardware or software stack does not support this driver."""


class BaseFabricDriver(ABC):
"""Cross-node fabric memory sharing (for example NVSwitch or xGMI)."""
class BaseDriver(ABC):
"""Generic base class for local and fabric memory drivers."""

# Abstract hook: concrete drivers must override it. Takes the local GPU ordinal as an
# int and returns None.
@abstractmethod
def initialize(self, device_ordinal: int) -> None:
"""Prepare the driver for a specific local GPU."""

# Abstract hook returning a LocalAllocation for `size` bytes.
# NOTE(review): two signatures are shown because this is a diff rendering; the second
# one (with `va`) appears to be the new version.
# The added `va` parameter defaults to None, so existing single-argument calls remain
# valid; overriding subclasses should accept `va` as well to stay signature-compatible.
@abstractmethod
def allocate_exportable(self, size: int) -> LocalAllocation:
"""Allocate memory that can be shared through the fabric transport."""
def allocate_exportable(self, size: int, va: Optional[int] = None) -> LocalAllocation:
"""Allocate exportable memory, optionally mapping it at a caller-reserved VA."""

# Abstract hook: takes a LocalAllocation and returns the exported handle as raw bytes.
@abstractmethod
def export_handle(self, allocation: LocalAllocation) -> bytes:
"""Export a transport-specific handle for a local allocation."""

# Abstract hook: takes a peer rank, the peer's handle bytes and a size, and returns a
# PeerMapping.
# NOTE(review): two signatures are shown because this is a diff rendering; the second
# one (with `va`) appears to be the new version. `va` defaults to None, so existing
# three-argument calls remain valid.
# NOTE(review): the docstring does not say what va=None means — presumably the driver
# chooses the mapping address itself; confirm and document.
@abstractmethod
def import_and_map(self, peer_rank: int, handle_bytes: bytes, size: int) -> PeerMapping:
def import_and_map(self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None) -> PeerMapping:
"""Import a peer handle and map it into the local virtual address space."""

@abstractmethod
Expand All @@ -76,3 +77,27 @@ def cleanup_import(self, mapping: PeerMapping) -> None:
# Abstract hook: takes the LocalAllocation to release and returns None.
@abstractmethod
def cleanup_local(self, allocation: LocalAllocation) -> None:
"""Release a locally-exported allocation."""

# Declared abstract on an ABC, so a BaseDriver subclass that does not implement it
# cannot be instantiated. Returns the granularity as an int number of bytes.
@abstractmethod
def get_minimum_granularity(self) -> int:
"""Minimum allocation granularity in bytes for this driver+device."""

# Abstract hook: returns the reserved virtual address as an int.
# NOTE(review): the meaning of the default alignment=0 is not stated — presumably
# "use the driver's default alignment"; confirm and document.
@abstractmethod
def reserve_va(self, size: int, alignment: int = 0) -> int:
"""Reserve a virtual address range without backing physical memory."""

# Abstract counterpart to reserve_va: takes the address and size of the range to free.
# NOTE(review): whether `size` must equal the originally reserved size (versus a
# sub-range) is not specified here; confirm against the concrete drivers.
@abstractmethod
def free_va(self, va: int, size: int) -> None:
"""Free a VA range previously returned by reserve_va."""

# Optional capability: not marked abstract, so subclasses are not forced to implement it.
# The base implementation always raises DriverNotSupported, naming the concrete class
# via type(self).__name__; drivers that support the query override this method.
def get_address_range(self, ptr: int) -> tuple[int, int]:
"""Return the base VA and size of the allocation containing ptr."""
raise DriverNotSupported(
f"{type(self).__name__} does not support get_address_range"
)

# Optional capability: not marked abstract, so subclasses are not forced to implement it.
# The base implementation always raises DriverNotSupported, naming the concrete class
# via type(self).__name__; drivers that can export a handle for a raw pointer override it.
def export_pointer_handle(self, ptr: int, size: int) -> bytes:
"""Export a peer handle for an arbitrary device pointer."""
raise DriverNotSupported(
f"{type(self).__name__} does not support export_pointer_handle"
)
28 changes: 24 additions & 4 deletions iris/drivers/fabric/amd.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,30 +7,50 @@

from __future__ import annotations

from iris.drivers.base import BaseFabricDriver, DriverNotSupported, LocalAllocation, PeerMapping
from typing import Optional

from iris.drivers.base import (
BaseDriver,
DriverNotSupported,
LocalAllocation,
PeerMapping,
)

# Only the driver class is exported from this module.
__all__ = ["AmdFabricDriver"]

# Single message reused by every method that raises DriverNotSupported in this module.
_NOT_IMPLEMENTED_MESSAGE = "AMD fabric driver not yet implemented"


# Placeholder driver: every method below, including initialize, raises
# DriverNotSupported with _NOT_IMPLEMENTED_MESSAGE and performs no other work.
# NOTE(review): this is a diff rendering; where two `class`/`def` headers appear back to
# back, the first looks like the removed line and the second its replacement.
class AmdFabricDriver(BaseFabricDriver):
class AmdFabricDriver(BaseDriver):
"""AMD fabric driver placeholder."""

def initialize(self, device_ordinal: int) -> None:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)

# Signature gains an optional `va` (default None), matching the base-class declaration.
def allocate_exportable(self, size: int) -> LocalAllocation:
def allocate_exportable(
self, size: int, va: Optional[int] = None
) -> LocalAllocation:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)

def export_handle(self, allocation: LocalAllocation) -> bytes:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)

# Signature gains an optional `va` (default None), matching the base-class declaration.
def import_and_map(self, peer_rank: int, handle_bytes: bytes, size: int) -> PeerMapping:
def import_and_map(
self, peer_rank: int, handle_bytes: bytes, size: int, va: Optional[int] = None
) -> PeerMapping:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)

# NOTE(review): the cleanup methods raise as well, so callers cannot use them as
# unconditional no-op teardown hooks for this placeholder.
def cleanup_import(self, mapping: PeerMapping) -> None:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)

def cleanup_local(self, allocation: LocalAllocation) -> None:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)

# The three stubs below have no removed counterpart in this view, so they appear to be
# newly added alongside the VA-reservation hooks.
def get_minimum_granularity(self) -> int:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)

def reserve_va(self, size: int, alignment: int = 0) -> int:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)

def free_va(self, va: int, size: int) -> None:
raise DriverNotSupported(_NOT_IMPLEMENTED_MESSAGE)
Loading
Loading