From 29cc96a8b2271a6a6e15bc3282b87008a4603867 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 13 Feb 2025 12:10:45 -0800 Subject: [PATCH 1/8] TensorRT-LLM import fix and aot_joint_export specify as explicit setting in dynamo.compile --- py/torch_tensorrt/dynamo/_compiler.py | 9 ++ .../dynamo/conversion/converter_utils.py | 127 +++++++++++------- 2 files changed, 91 insertions(+), 45 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 6928347baa..2e3824f4a9 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -98,6 +98,7 @@ def cross_compile_for_windows( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, + use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows @@ -173,6 +174,7 @@ def cross_compile_for_windows( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). + use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -332,6 +334,7 @@ def cross_compile_for_windows( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, + "use_aot_joint_export": use_aot_joint_export, } # disable the following settings is not supported for cross compilation for windows feature @@ -421,6 +424,7 @@ def compile( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, + use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -498,6 +502,7 @@ def compile( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). 
+ **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -674,6 +679,7 @@ def compile( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, + "use_aot_joint_export": use_aot_joint_export, } settings = CompilationSettings(**compilation_options) @@ -964,6 +970,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, + use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -1029,6 +1036,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). + use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs """ @@ -1147,6 +1155,7 @@ def convert_exported_program_to_serialized_trt_engine( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, + "use_aot_joint_export": use_aot_joint_export, } settings = CompilationSettings(**compilation_options) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index bcb8495c67..d1e85e6e3d 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -3,6 +3,8 @@ import functools import logging import os +import subprocess +import sys from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload import numpy as np @@ -13,6 +15,7 @@ from torch.fx.node import Argument, Target from torch.fx.passes.shape_prop import TensorMetadata from torch_tensorrt import _enums +from torch_tensorrt._enums import Platform from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -1011,57 +1014,91 @@ def load_tensorrt_llm() -> bool: Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. 
""" - try: - import tensorrt_llm as trt_llm # noqa: F401 - _LOGGER.info("TensorRT-LLM successfully imported") - return True - except (ImportError, AssertionError) as e_import_error: - # Check for environment variable for the plugin library path - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") - if not plugin_lib_path: + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + if not plugin_lib_path: + _LOGGER.warning( + "Please set the TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops or else set the USE_TRTLLM_PLUGINS variable to download the shared library", + ) + use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( + "1", + "true", + "yes", + "on", + ) + if not use_trtllm_plugin: _LOGGER.warning( - "TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops", + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library" ) return False + else: + py_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + platform = Platform.current_platform() + if Platform == Platform.LINUX_X86_64: + platform = "linux_x86_64" + elif Platform == Platform.LINUX_AARCH64: + platform = "linux_aarch64" + + if py_version not in ("cp310", "cp312"): + _LOGGER.warning( + "No available wheel for python versions other than py3.10 and py3.12" + ) + if py_version == "cp310" and platform == "linux_aarch64": + _LOGGER.warning("No available wheel for python3.10 with Linux aarch64") - _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}") - try: - # Load the shared library - handle = ctypes.CDLL(plugin_lib_path) - _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") - except OSError as e_os_error: - _LOGGER.error( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", - exc_info=e_os_error, + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + file_name = ( + "tensorrt_llm-0.17.0.post1-{py_version}-{py_version}-{platform}.whl" ) - return False + download_url = base_url + file_name + cmd = ["wget", download_url] + subprocess.run(cmd) + if os.path.exists(file_name): + _LOGGER.info("filename download is completed") + import zipfile + + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall( + "./tensorrt_llm" + ) # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = ( + "./tensorrt_llm" + "libnvinfer_plugin_tensorrt_llm.so" + ) + try: + # Load the shared library + handle = ctypes.CDLL(plugin_lib_path) + _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") + except OSError as e_os_error: + _LOGGER.error( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" + f"Ensure the path is correct and the library is compatible", + exc_info=e_os_error, + ) + return False - try: - # Configure plugin initialization arguments - handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] - handle.initTrtLlmPlugins.restype = ctypes.c_bool - except AttributeError as e_plugin_unavailable: - _LOGGER.warning( - "Unable to initialize the TensorRT-LLM plugin library", - exc_info=e_plugin_unavailable, - ) - return False + try: + # Configure plugin initialization arguments + handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + handle.initTrtLlmPlugins.restype = 
ctypes.c_bool + except AttributeError as e_plugin_unavailable: + _LOGGER.warning( + "Unable to initialize the TensorRT-LLM plugin library", + exc_info=e_plugin_unavailable, + ) + return False - try: - # Initialize the plugin - TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" - if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): - _LOGGER.info("TensorRT-LLM plugin successfully initialized") - return True - else: - _LOGGER.warning("TensorRT-LLM plugin library failed in initialization") - return False - except Exception as e_initialization_error: - _LOGGER.warning( - "Exception occurred during TensorRT-LLM plugin library initialization", - exc_info=e_initialization_error, - ) + try: + # Initialize the plugin + TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" + if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): + _LOGGER.info("TensorRT-LLM plugin successfully initialized") + return True + else: + _LOGGER.warning("TensorRT-LLM plugin library failed in initialization") return False - return False + except Exception as e_initialization_error: + _LOGGER.warning( + "Exception occurred during TensorRT-LLM plugin library initialization", + exc_info=e_initialization_error, + ) + return False From 02e537bc757749fa615a7051acac06d87cad3f45 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 27 Feb 2025 12:02:31 -0800 Subject: [PATCH 2/8] TRT-LLM installation utilities and adding test cases --- .../dynamo/conversion/converter_utils.py | 116 +++++++++++++----- tests/py/dynamo/conversion/harness.py | 13 ++ tests/py/dynamo/conversion/test_nccl_ops.py | 80 ++++++++++++ 3 files changed, 178 insertions(+), 31 deletions(-) create mode 100644 tests/py/dynamo/conversion/test_nccl_ops.py diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index d1e85e6e3d..454a0ba519 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -3,6 +3,7 @@ import functools import logging import os +import shutil import subprocess import sys from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload @@ -1007,6 +1008,84 @@ def args_bounds_check( return args[i] if len(args) > i and args[i] is not None else replacement +def install_wget(platform: str) -> None: + if shutil.which("wget"): + _LOGGER.debug("wget is already installed") + return + if platform.startswith("linux"): + try: + # if its root + if os.geteuid() == 0: + subprocess.run(["apt-get", "update"], check=True) + subprocess.run(["apt-get", "install", "-y", "wget"], check=True) + else: + _LOGGER.debug("Please run with sudo permissions") + subprocess.run(["sudo", "apt-get", "update"], check=True) + subprocess.run(["sudo", "apt-get", "install", "-y", "wget"], check=True) + except subprocess.CalledProcessError as e: + _LOGGER.debug("Error installing wget:", e) + + +def install_mpi(platform: str) -> None: + if platform.startswith("linux"): + try: + # if its root + if os.geteuid() == 0: + subprocess.run(["apt-get", "update"], check=True) + subprocess.run(["apt-get", "install", "-y", "libmpich-dev"], check=True) + subprocess.run( + ["apt-get", "install", "-y", "libopenmpi-dev"], check=True + ) + else: + _LOGGER.debug("Please run with sudo permissions") + subprocess.run(["sudo", "apt-get", "update"], check=True) + subprocess.run( + ["sudo", "apt-get", "install", "-y", "libmpich-dev"], check=True + ) + subprocess.run( + ["sudo", "apt-get", "install", "-y", "libopenmpi-dev"], 
check=True + ) + except subprocess.CalledProcessError as e: + _LOGGER.debug("Error installing mpi libs:", e) + + +def download_plugin_lib_path(py_version: str, platform: str) -> str: + plugin_lib_path = None + if py_version not in ("cp310", "cp312"): + _LOGGER.warning( + "No available wheel for python versions other than py3.10 and py3.12" + ) + install_wget(platform) + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + file_name = f"tensorrt_llm-0.17.0.post1-{py_version}-{py_version}-{platform}.whl" + download_url = base_url + file_name + cmd = ["wget", download_url] + try: + if not (os.path.exists(file_name)): + _LOGGER.info(f"Running command: {' '.join(cmd)}") + subprocess.run(cmd) + _LOGGER.info("Download complete of wheel") + if os.path.exists(file_name): + _LOGGER.info("filename now present") + if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): + plugin_lib_path = ( + "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + ) + else: + import zipfile + + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = ( + "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + ) + except subprocess.CalledProcessError as e: + _LOGGER.debug(f"Error occurred while trying to download: {e}") + except Exception as e: + _LOGGER.debug(f"An unexpected error occurred: {e}") + return plugin_lib_path + + def load_tensorrt_llm() -> bool: """ Attempts to load the TensorRT-LLM plugin and initialize it. @@ -1014,12 +1093,13 @@ def load_tensorrt_llm() -> bool: Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. """ - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") if not plugin_lib_path: _LOGGER.warning( "Please set the TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops or else set the USE_TRTLLM_PLUGINS variable to download the shared library", ) + for key, value in os.environ.items(): + print(f"{key}: {value}") use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( "1", "true", @@ -1034,38 +1114,12 @@ def load_tensorrt_llm() -> bool: else: py_version = f"cp{sys.version_info.major}{sys.version_info.minor}" platform = Platform.current_platform() - if Platform == Platform.LINUX_X86_64: - platform = "linux_x86_64" - elif Platform == Platform.LINUX_AARCH64: - platform = "linux_aarch64" - - if py_version not in ("cp310", "cp312"): - _LOGGER.warning( - "No available wheel for python versions other than py3.10 and py3.12" - ) - if py_version == "cp310" and platform == "linux_aarch64": - _LOGGER.warning("No available wheel for python3.10 with Linux aarch64") - base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = ( - "tensorrt_llm-0.17.0.post1-{py_version}-{py_version}-{platform}.whl" - ) - download_url = base_url + file_name - cmd = ["wget", download_url] - subprocess.run(cmd) - if os.path.exists(file_name): - _LOGGER.info("filename download is completed") - import zipfile - - with zipfile.ZipFile(file_name, "r") as zip_ref: - zip_ref.extractall( - "./tensorrt_llm" - ) # Extract to a folder named 'tensorrt_llm' - plugin_lib_path = ( - "./tensorrt_llm" + "libnvinfer_plugin_tensorrt_llm.so" - ) + platform = str(platform).lower() + plugin_lib_path = download_plugin_lib_path(py_version, platform) try: - # Load the shared library + # Load the shared + install_mpi(platform) handle = ctypes.CDLL(plugin_lib_path) _LOGGER.info(f"Successfully loaded 
plugin library: {plugin_lib_path}") except OSError as e_os_error: diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 6ff45507a0..e45a11f0d5 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -353,6 +353,7 @@ def generate_graph( enable_passes: bool, propagate_shapes: bool = False, settings: CompilationSettings = CompilationSettings(), + fuse_distributed_ops: bool = False, torch_export_dynamic_shapes: Optional[Any] = None, ): mod = mod.eval() @@ -368,6 +369,16 @@ def generate_graph( tuple(torch_export_inputs), dynamic_shapes=torch_export_dynamic_shapes, ) + if fuse_distributed_ops: + from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( + fuse_distributed_ops, + ) + + gm = exported_program.graph_module + gm = fuse_distributed_ops(gm, settings) + exported_program = exported_program.run_decompositions( + get_decompositions(False) + ) if enable_passes: exported_program = pre_export_lowering(exported_program, settings) exported_program = exported_program.run_decompositions( @@ -406,6 +417,7 @@ def run_test( propagate_shapes=False, int32_reqd=False, immutable_weights=True, + fuse_distributed_ops=False, ): # TODO: lan to remove this and set use_dynamo_traccer to True by default # once all the converter test files are moved to use_dynamo_tracer @@ -426,6 +438,7 @@ def run_test( enable_passes=enable_passes, propagate_shapes=propagate_shapes, settings=compilation_settings, + fuse_distributed_ops=fuse_distributed_ops, ) num_inputs = len(inputs) diff --git a/tests/py/dynamo/conversion/test_nccl_ops.py b/tests/py/dynamo/conversion/test_nccl_ops.py new file mode 100644 index 0000000000..4db24881c8 --- /dev/null +++ b/tests/py/dynamo/conversion/test_nccl_ops.py @@ -0,0 +1,80 @@ +import os + +import torch +import torch.distributed as dist +import torch.nn as nn +from parameterized import parameterized +from torch.testing._internal.common_utils import run_tests + + +def set_environment_variables(): + os.environ["WORLD_SIZE"] = str(1) + os.environ["RANK"] = str(0) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(29500) + os.environ["USE_TRTLLM_PLUGINS"] = "1" + + +set_environment_variables() +dist.init_process_group(backend="nccl", init_method="env://") +group = dist.new_group(ranks=[0]) +group_name = group.group_name + +from .harness import DispatchTestCase + + +class TestGatherNcclOpsConverter(DispatchTestCase): + @parameterized.expand([(8)]) + def test_nccl_ops(self, linear_layer_dim): + class DistributedGatherModel(nn.Module): + def __init__(self, input_dim): + super().__init__() + self.fc = torch.nn.Linear(input_dim, input_dim) + + def forward(self, x): + x = self.fc(x) + world_size = 1 + gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor( + x, world_size, group_name + ) + gathered_tensor = torch.ops._c10d_functional.wait_tensor( + gathered_tensor + ) + return gathered_tensor + + inputs = [torch.randn(1, linear_layer_dim).to("cuda")] + + self.run_test( + DistributedGatherModel(linear_layer_dim).cuda(), + inputs, + use_dynamo_tracer=True, + fuse_distributed_ops=True, + ) + + # TODO: Look at this + # @parameterized.expand( + # [ + # (8) + # ] + # ) + # def test_nccl_ops_scatter(self, linear_layer_dim): + + # class DistributedReduceScatterModel(nn.Module): + # def __init__(self, input_dim): + # super().__init__() + # def forward(self, x): + # world_size = 1 + # scatter_reduce_tensor = torch.ops._c10d_functional.reduce_scatter_tensor(x, "sum", 
world_size, group_name) + # scatter_reduce_tensor = torch.ops._c10d_functional.wait_tensor(scatter_reduce_tensor) + # return scatter_reduce_tensor + # inputs = [torch.zeros(1, linear_layer_dim).to("cuda")] + + # self.run_test( + # DistributedReduceScatterModel(linear_layer_dim).cuda(), + # inputs, + # use_dynamo_tracer=True, + # ) + + +if __name__ == "__main__": + run_tests() From 636faa2cc899cbef153a9cf2a04c1c8ed23088ad Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 15 Apr 2025 13:59:48 -0700 Subject: [PATCH 3/8] adding the option in _compiler.py --- py/torch_tensorrt/dynamo/_compiler.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 2e3824f4a9..f7d9f6e9d3 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -98,7 +98,7 @@ def cross_compile_for_windows( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, - use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows @@ -174,7 +174,7 @@ def cross_compile_for_windows( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). - use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -334,7 +334,7 @@ def cross_compile_for_windows( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, - "use_aot_joint_export": use_aot_joint_export, + "use_distributed_mode_trace": use_distributed_mode_trace, } # disable the following settings is not supported for cross compilation for windows feature @@ -424,7 +424,7 @@ def compile( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, - use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -502,7 +502,7 @@ def compile( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. 
l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). - + ç **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -679,7 +679,7 @@ def compile( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, - "use_aot_joint_export": use_aot_joint_export, + "use_distributed_mode_trace": use_distributed_mode_trace, } settings = CompilationSettings(**compilation_options) @@ -970,7 +970,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, - use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -1036,7 +1036,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). - use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs """ @@ -1155,7 +1155,7 @@ def convert_exported_program_to_serialized_trt_engine( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, - "use_aot_joint_export": use_aot_joint_export, + "use_distributed_mode_trace": use_distributed_mode_trace, } settings = CompilationSettings(**compilation_options) From aae4bb31c42e11e8ed7fc56b4799b8bffcc523cd Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 17 Apr 2025 14:19:08 -0700 Subject: [PATCH 4/8] changes in the TRT-LLM loading tool- removing install_wget, install_unzip, install_mpi --- .../dynamo/conversion/converter_utils.py | 113 ++++++------------ tests/py/dynamo/conversion/harness.py | 13 -- tests/py/dynamo/conversion/test_nccl_ops.py | 80 ------------- 3 files changed, 39 insertions(+), 167 deletions(-) delete mode 100644 tests/py/dynamo/conversion/test_nccl_ops.py diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 454a0ba519..b7f7fb3e90 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -1008,87 +1008,51 @@ def args_bounds_check( return args[i] if len(args) > i and args[i] is not None else replacement -def install_wget(platform: str) -> None: - if shutil.which("wget"): - _LOGGER.debug("wget is already installed") - return - if platform.startswith("linux"): - try: - # if its root - if os.geteuid() == 0: - subprocess.run(["apt-get", "update"], check=True) - subprocess.run(["apt-get", "install", "-y", "wget"], check=True) - else: - 
_LOGGER.debug("Please run with sudo permissions") - subprocess.run(["sudo", "apt-get", "update"], check=True) - subprocess.run(["sudo", "apt-get", "install", "-y", "wget"], check=True) - except subprocess.CalledProcessError as e: - _LOGGER.debug("Error installing wget:", e) - - -def install_mpi(platform: str) -> None: - if platform.startswith("linux"): - try: - # if its root - if os.geteuid() == 0: - subprocess.run(["apt-get", "update"], check=True) - subprocess.run(["apt-get", "install", "-y", "libmpich-dev"], check=True) - subprocess.run( - ["apt-get", "install", "-y", "libopenmpi-dev"], check=True - ) - else: - _LOGGER.debug("Please run with sudo permissions") - subprocess.run(["sudo", "apt-get", "update"], check=True) - subprocess.run( - ["sudo", "apt-get", "install", "-y", "libmpich-dev"], check=True - ) - subprocess.run( - ["sudo", "apt-get", "install", "-y", "libopenmpi-dev"], check=True - ) - except subprocess.CalledProcessError as e: - _LOGGER.debug("Error installing mpi libs:", e) - - def download_plugin_lib_path(py_version: str, platform: str) -> str: plugin_lib_path = None - if py_version not in ("cp310", "cp312"): - _LOGGER.warning( - "No available wheel for python versions other than py3.10 and py3.12" - ) - install_wget(platform) + + # Downloading TRT-LLM lib + # TODO: check how to fix the 0.18.0 hardcode below base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = f"tensorrt_llm-0.17.0.post1-{py_version}-{py_version}-{platform}.whl" + file_name = f"tensorrt_llm-0.18.0.post1-{py_version}-{py_version}-{platform}.whl" download_url = base_url + file_name cmd = ["wget", download_url] - try: - if not (os.path.exists(file_name)): - _LOGGER.info(f"Running command: {' '.join(cmd)}") - subprocess.run(cmd) - _LOGGER.info("Download complete of wheel") - if os.path.exists(file_name): - _LOGGER.info("filename now present") - if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): - plugin_lib_path = ( - "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" - ) - else: - import zipfile + if not (os.path.exists(file_name)): + try: + subprocess.run(cmd, check=True) + _LOGGER.debug("Download succeeded and TRT-LLM wheel is now present") + except subprocess.CalledProcessError as e: + _LOGGER.error( + "Download failed (file not found or connection issue). Error code:", + e.returncode, + ) + except FileNotFoundError: + _LOGGER.error("wget is required but not found. Please install wget.") - with zipfile.ZipFile(file_name, "r") as zip_ref: - zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' - plugin_lib_path = ( - "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" - ) - except subprocess.CalledProcessError as e: - _LOGGER.debug(f"Error occurred while trying to download: {e}") - except Exception as e: - _LOGGER.debug(f"An unexpected error occurred: {e}") + # Proceeding with the unzip of the wheel file + # This will exist if the filename was already downloaded + if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): + plugin_lib_path = "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + else: + try: + import zipfile + except: + raise ImportError( + "zipfile module is required but not found. 
Please install zipfile" + ) + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = ( + "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + ) return plugin_lib_path def load_tensorrt_llm() -> bool: """ Attempts to load the TensorRT-LLM plugin and initialize it. + Either the env variable TRTLLM_PLUGINS_PATH specifies the path + If the above is not, the user can specify USE_TRTLLM_PLUGINS as either of 1, true, yes, on to download the TRT-LLM distribution and load it Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. @@ -1098,8 +1062,9 @@ def load_tensorrt_llm() -> bool: _LOGGER.warning( "Please set the TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops or else set the USE_TRTLLM_PLUGINS variable to download the shared library", ) - for key, value in os.environ.items(): - print(f"{key}: {value}") + # for key, value in os.environ.items(): + # print(f"{key}: {value}") + # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( "1", "true", @@ -1112,14 +1077,14 @@ def load_tensorrt_llm() -> bool: ) return False else: - py_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + # this is used as the default py version + py_version = f"cp312" platform = Platform.current_platform() platform = str(platform).lower() plugin_lib_path = download_plugin_lib_path(py_version, platform) try: - # Load the shared - install_mpi(platform) + # Load the shared TRT-LLM file handle = ctypes.CDLL(plugin_lib_path) _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") except OSError as e_os_error: diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index e45a11f0d5..6ff45507a0 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -353,7 +353,6 @@ def generate_graph( enable_passes: bool, propagate_shapes: bool = False, settings: CompilationSettings = CompilationSettings(), - fuse_distributed_ops: bool = False, torch_export_dynamic_shapes: Optional[Any] = None, ): mod = mod.eval() @@ -369,16 +368,6 @@ def generate_graph( tuple(torch_export_inputs), dynamic_shapes=torch_export_dynamic_shapes, ) - if fuse_distributed_ops: - from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( - fuse_distributed_ops, - ) - - gm = exported_program.graph_module - gm = fuse_distributed_ops(gm, settings) - exported_program = exported_program.run_decompositions( - get_decompositions(False) - ) if enable_passes: exported_program = pre_export_lowering(exported_program, settings) exported_program = exported_program.run_decompositions( @@ -417,7 +406,6 @@ def run_test( propagate_shapes=False, int32_reqd=False, immutable_weights=True, - fuse_distributed_ops=False, ): # TODO: lan to remove this and set use_dynamo_traccer to True by default # once all the converter test files are moved to use_dynamo_tracer @@ -438,7 +426,6 @@ def run_test( enable_passes=enable_passes, propagate_shapes=propagate_shapes, settings=compilation_settings, - fuse_distributed_ops=fuse_distributed_ops, ) num_inputs = len(inputs) diff --git a/tests/py/dynamo/conversion/test_nccl_ops.py b/tests/py/dynamo/conversion/test_nccl_ops.py deleted file mode 100644 index 4db24881c8..0000000000 --- a/tests/py/dynamo/conversion/test_nccl_ops.py +++ /dev/null @@ 
-1,80 +0,0 @@ -import os - -import torch -import torch.distributed as dist -import torch.nn as nn -from parameterized import parameterized -from torch.testing._internal.common_utils import run_tests - - -def set_environment_variables(): - os.environ["WORLD_SIZE"] = str(1) - os.environ["RANK"] = str(0) - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = str(29500) - os.environ["USE_TRTLLM_PLUGINS"] = "1" - - -set_environment_variables() -dist.init_process_group(backend="nccl", init_method="env://") -group = dist.new_group(ranks=[0]) -group_name = group.group_name - -from .harness import DispatchTestCase - - -class TestGatherNcclOpsConverter(DispatchTestCase): - @parameterized.expand([(8)]) - def test_nccl_ops(self, linear_layer_dim): - class DistributedGatherModel(nn.Module): - def __init__(self, input_dim): - super().__init__() - self.fc = torch.nn.Linear(input_dim, input_dim) - - def forward(self, x): - x = self.fc(x) - world_size = 1 - gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor( - x, world_size, group_name - ) - gathered_tensor = torch.ops._c10d_functional.wait_tensor( - gathered_tensor - ) - return gathered_tensor - - inputs = [torch.randn(1, linear_layer_dim).to("cuda")] - - self.run_test( - DistributedGatherModel(linear_layer_dim).cuda(), - inputs, - use_dynamo_tracer=True, - fuse_distributed_ops=True, - ) - - # TODO: Look at this - # @parameterized.expand( - # [ - # (8) - # ] - # ) - # def test_nccl_ops_scatter(self, linear_layer_dim): - - # class DistributedReduceScatterModel(nn.Module): - # def __init__(self, input_dim): - # super().__init__() - # def forward(self, x): - # world_size = 1 - # scatter_reduce_tensor = torch.ops._c10d_functional.reduce_scatter_tensor(x, "sum", world_size, group_name) - # scatter_reduce_tensor = torch.ops._c10d_functional.wait_tensor(scatter_reduce_tensor) - # return scatter_reduce_tensor - # inputs = [torch.zeros(1, linear_layer_dim).to("cuda")] - - # self.run_test( - # DistributedReduceScatterModel(linear_layer_dim).cuda(), - # inputs, - # use_dynamo_tracer=True, - # ) - - -if __name__ == "__main__": - run_tests() From 85811f11e25fab0e27a62f7ca21b8286a3d83121 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 17 Apr 2025 16:35:53 -0700 Subject: [PATCH 5/8] Further changes in error logging of the TRT-LLM installation tool --- .../dynamo/conversion/converter_utils.py | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index b7f7fb3e90..dd0d37c7d3 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -1014,7 +1014,7 @@ def download_plugin_lib_path(py_version: str, platform: str) -> str: # Downloading TRT-LLM lib # TODO: check how to fix the 0.18.0 hardcode below base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = f"tensorrt_llm-0.18.0.post1-{py_version}-{py_version}-{platform}.whl" + file_name = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl" download_url = base_url + file_name cmd = ["wget", download_url] if not (os.path.exists(file_name)): @@ -1051,19 +1051,14 @@ def download_plugin_lib_path(py_version: str, platform: str) -> str: def load_tensorrt_llm() -> bool: """ Attempts to load the TensorRT-LLM plugin and initialize it. 
- Either the env variable TRTLLM_PLUGINS_PATH specifies the path - If the above is not, the user can specify USE_TRTLLM_PLUGINS as either of 1, true, yes, on to download the TRT-LLM distribution and load it + Either the env variable TRTLLM_PLUGINS_PATH can specify the path + Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. """ plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") if not plugin_lib_path: - _LOGGER.warning( - "Please set the TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops or else set the USE_TRTLLM_PLUGINS variable to download the shared library", - ) - # for key, value in os.environ.items(): - # print(f"{key}: {value}") # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( "1", @@ -1073,7 +1068,7 @@ def load_tensorrt_llm() -> bool: ) if not use_trtllm_plugin: _LOGGER.warning( - "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library" + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" ) return False else: @@ -1083,16 +1078,25 @@ def load_tensorrt_llm() -> bool: platform = str(platform).lower() plugin_lib_path = download_plugin_lib_path(py_version, platform) + try: # Load the shared TRT-LLM file handle = ctypes.CDLL(plugin_lib_path) _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") except OSError as e_os_error: - _LOGGER.error( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", - exc_info=e_os_error, - ) + if "libmpi" in str(e_os_error): + _LOGGER.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " + f"The dependency libmpi.so is missing. 
" + f"Please install the packages libmpich-dev and libopenmpi-dev.", + exc_info=e_os_error, + ) + else: + _LOGGER.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" + f"Ensure the path is correct and the library is compatible", + exc_info=e_os_error, + ) return False try: @@ -1121,3 +1125,4 @@ def load_tensorrt_llm() -> bool: exc_info=e_initialization_error, ) return False + return False From 77f2145cdb4b07a462ffa3585bd665c68dd6af99 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 17 Apr 2025 16:58:54 -0700 Subject: [PATCH 6/8] moving the load_tensorrt_llm to dynamo/utils.py --- .../dynamo/conversion/converter_utils.py | 124 ----------------- .../conversion/custom_ops_converters.py | 2 +- py/torch_tensorrt/dynamo/utils.py | 127 +++++++++++++++++- 3 files changed, 127 insertions(+), 126 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index dd0d37c7d3..46db1c0d3b 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -3,9 +3,6 @@ import functools import logging import os -import shutil -import subprocess -import sys from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload import numpy as np @@ -16,7 +13,6 @@ from torch.fx.node import Argument, Target from torch.fx.passes.shape_prop import TensorMetadata from torch_tensorrt import _enums -from torch_tensorrt._enums import Platform from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -1006,123 +1002,3 @@ def args_bounds_check( args: Tuple[Argument, ...], i: int, replacement: Optional[Any] = None ) -> Any: return args[i] if len(args) > i and args[i] is not None else replacement - - -def download_plugin_lib_path(py_version: str, platform: str) -> str: - plugin_lib_path = None - - # Downloading TRT-LLM lib - # TODO: check how to fix the 0.18.0 hardcode below - base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl" - download_url = base_url + file_name - cmd = ["wget", download_url] - if not (os.path.exists(file_name)): - try: - subprocess.run(cmd, check=True) - _LOGGER.debug("Download succeeded and TRT-LLM wheel is now present") - except subprocess.CalledProcessError as e: - _LOGGER.error( - "Download failed (file not found or connection issue). Error code:", - e.returncode, - ) - except FileNotFoundError: - _LOGGER.error("wget is required but not found. Please install wget.") - - # Proceeding with the unzip of the wheel file - # This will exist if the filename was already downloaded - if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): - plugin_lib_path = "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" - else: - try: - import zipfile - except: - raise ImportError( - "zipfile module is required but not found. Please install zipfile" - ) - with zipfile.ZipFile(file_name, "r") as zip_ref: - zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' - plugin_lib_path = ( - "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" - ) - return plugin_lib_path - - -def load_tensorrt_llm() -> bool: - """ - Attempts to load the TensorRT-LLM plugin and initialize it. 
- Either the env variable TRTLLM_PLUGINS_PATH can specify the path - Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it - - Returns: - bool: True if the plugin was successfully loaded and initialized, False otherwise. - """ - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") - if not plugin_lib_path: - # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user - use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( - "1", - "true", - "yes", - "on", - ) - if not use_trtllm_plugin: - _LOGGER.warning( - "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" - ) - return False - else: - # this is used as the default py version - py_version = f"cp312" - platform = Platform.current_platform() - - platform = str(platform).lower() - plugin_lib_path = download_plugin_lib_path(py_version, platform) - - try: - # Load the shared TRT-LLM file - handle = ctypes.CDLL(plugin_lib_path) - _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") - except OSError as e_os_error: - if "libmpi" in str(e_os_error): - _LOGGER.warning( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " - f"The dependency libmpi.so is missing. " - f"Please install the packages libmpich-dev and libopenmpi-dev.", - exc_info=e_os_error, - ) - else: - _LOGGER.warning( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", - exc_info=e_os_error, - ) - return False - - try: - # Configure plugin initialization arguments - handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] - handle.initTrtLlmPlugins.restype = ctypes.c_bool - except AttributeError as e_plugin_unavailable: - _LOGGER.warning( - "Unable to initialize the TensorRT-LLM plugin library", - exc_info=e_plugin_unavailable, - ) - return False - - try: - # Initialize the plugin - TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" - if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): - _LOGGER.info("TensorRT-LLM plugin successfully initialized") - return True - else: - _LOGGER.warning("TensorRT-LLM plugin library failed in initialization") - return False - except Exception as e_initialization_error: - _LOGGER.warning( - "Exception occurred during TensorRT-LLM plugin library initialization", - exc_info=e_initialization_error, - ) - return False - return False diff --git a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py index 79611c7552..3e67457e54 100644 --- a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py @@ -11,11 +11,11 @@ from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( dynamo_tensorrt_converter, ) -from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( tensorrt_fused_nccl_all_gather_op, tensorrt_fused_nccl_reduce_scatter_op, ) +from torch_tensorrt.dynamo.utils import load_tensorrt_llm _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index e4018ae95c..2a65437946 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -1,7 +1,12 
@@ from __future__ import annotations +import ctypes import gc import logging +import os +import shutil +import subprocess +import sys import warnings from dataclasses import fields, replace from enum import Enum @@ -14,7 +19,7 @@ from torch._subclasses.fake_tensor import FakeTensor from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch_tensorrt._Device import Device -from torch_tensorrt._enums import dtype +from torch_tensorrt._enums import Platform, dtype from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import _defaults @@ -812,3 +817,123 @@ def is_tegra_platform() -> bool: if torch.cuda.get_device_capability() in [(8, 7), (7, 2)]: return True return False + + +def download_plugin_lib_path(py_version: str, platform: str) -> str: + plugin_lib_path = None + + # Downloading TRT-LLM lib + # TODO: check how to fix the 0.18.0 hardcode below + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + file_name = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl" + download_url = base_url + file_name + cmd = ["wget", download_url] + if not (os.path.exists(file_name)): + try: + subprocess.run(cmd, check=True) + logger.debug("Download succeeded and TRT-LLM wheel is now present") + except subprocess.CalledProcessError as e: + logger.error( + "Download failed (file not found or connection issue). Error code:", + e.returncode, + ) + except FileNotFoundError: + logger.error("wget is required but not found. Please install wget.") + + # Proceeding with the unzip of the wheel file + # This will exist if the filename was already downloaded + if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): + plugin_lib_path = "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + else: + try: + import zipfile + except: + raise ImportError( + "zipfile module is required but not found. Please install zipfile" + ) + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = ( + "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + ) + return plugin_lib_path + + +def load_tensorrt_llm() -> bool: + """ + Attempts to load the TensorRT-LLM plugin and initialize it. + Either the env variable TRTLLM_PLUGINS_PATH can specify the path + Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it + + Returns: + bool: True if the plugin was successfully loaded and initialized, False otherwise. + """ + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + if not plugin_lib_path: + # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user + use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( + "1", + "true", + "yes", + "on", + ) + if not use_trtllm_plugin: + logger.warning( + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. 
Please set either of the two to use TRT-LLM libraries in torchTRT" + ) + return False + else: + # this is used as the default py version + py_version = f"cp312" + platform = Platform.current_platform() + + platform = str(platform).lower() + plugin_lib_path = download_plugin_lib_path(py_version, platform) + + try: + # Load the shared TRT-LLM file + handle = ctypes.CDLL(plugin_lib_path) + logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") + except OSError as e_os_error: + if "libmpi" in str(e_os_error): + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " + f"The dependency libmpi.so is missing. " + f"Please install the packages libmpich-dev and libopenmpi-dev.", + exc_info=e_os_error, + ) + else: + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" + f"Ensure the path is correct and the library is compatible", + exc_info=e_os_error, + ) + return False + + try: + # Configure plugin initialization arguments + handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + handle.initTrtLlmPlugins.restype = ctypes.c_bool + except AttributeError as e_plugin_unavailable: + logger.warning( + "Unable to initialize the TensorRT-LLM plugin library", + exc_info=e_plugin_unavailable, + ) + return False + + try: + # Initialize the plugin + TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" + if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): + logger.info("TensorRT-LLM plugin successfully initialized") + return True + else: + logger.warning("TensorRT-LLM plugin library failed in initialization") + return False + except Exception as e_initialization_error: + logger.warning( + "Exception occurred during TensorRT-LLM plugin library initialization", + exc_info=e_initialization_error, + ) + return False + return False From 3469736223ca931612fcd8a66c83e006b314a0a6 Mon Sep 17 00:00:00 2001 From: apbose Date: Fri, 25 Apr 2025 11:33:52 -0700 Subject: [PATCH 7/8] correcting misprint for TRT LLM load --- py/torch_tensorrt/dynamo/_compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index f7d9f6e9d3..7de6ddb0d0 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -502,7 +502,7 @@ def compile( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). - ç + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. 
This is enabled when DTensors or distributed tensors are present in distributed model **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT From 9c238ae5bb3b5ffdbfb6715b95e7e2f401a3c114 Mon Sep 17 00:00:00 2001 From: apbose Date: Mon, 28 Apr 2025 21:20:45 -0700 Subject: [PATCH 8/8] Using python lib for download to make it platform agnostic --- py/torch_tensorrt/dynamo/utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 2a65437946..a8b1b10e3a 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -7,10 +7,12 @@ import shutil import subprocess import sys +import urllib.request import warnings from dataclasses import fields, replace from enum import Enum from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from urllib.error import URLError import numpy as np import sympy @@ -827,18 +829,21 @@ def download_plugin_lib_path(py_version: str, platform: str) -> str: base_url = "https://pypi.nvidia.com/tensorrt-llm/" file_name = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl" download_url = base_url + file_name - cmd = ["wget", download_url] if not (os.path.exists(file_name)): try: - subprocess.run(cmd, check=True) + logger.debug(f"Downloading {download_url} ...") + urllib.request.urlretrieve(download_url, file_name) logger.debug("Download succeeded and TRT-LLM wheel is now present") - except subprocess.CalledProcessError as e: + except urllib.error.HTTPError as e: logger.error( - "Download failed (file not found or connection issue). Error code:", - e.returncode, + f"HTTP error {e.code} when trying to download {download_url}: {e.reason}" ) - except FileNotFoundError: - logger.error("wget is required but not found. Please install wget.") + except urllib.error.URLError as e: + logger.error( + f"URL error when trying to download {download_url}: {e.reason}" + ) + except OSError as e: + logger.error(f"Local file write error: {e}") # Proceeding with the unzip of the wheel file # This will exist if the filename was already downloaded
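The end state of this series exposes the distributed trace mode as an ordinary compile option. A minimal usage sketch follows; the toy module, input shapes, and the keyword-argument style are illustrative assumptions rather than code from this PR, and the flag's default comes from _defaults.USE_DISTRIBUTED_MODE_TRACE:

    import torch
    import torch_tensorrt


    class ToyModel(torch.nn.Module):  # hypothetical stand-in for a distributed model
        def __init__(self) -> None:
            super().__init__()
            self.fc = torch.nn.Linear(8, 8)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.fc(x)


    model = ToyModel().eval().cuda()
    inputs = (torch.randn(1, 8).cuda(),)

    exported = torch.export.export(model, inputs)
    trt_module = torch_tensorrt.dynamo.compile(
        exported,
        inputs=list(inputs),
        # New setting from this series: trace through aot_autograd when DTensors
        # or distributed collectives are present in the model.
        use_distributed_mode_trace=True,
    )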
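When USE_TRTLLM_PLUGINS is enabled and TRTLLM_PLUGINS_PATH is unset, the loader fetches the TensorRT-LLM wheel and pulls the plugin shared library out of it. A condensed sketch of that flow, assuming network access and the same hardcoded 0.18.0 wheel coordinates used in the final patch (it is not the exact library code):

    import os
    import urllib.request
    import zipfile


    def fetch_plugin_lib(py_version: str = "cp312", platform: str = "linux_x86_64") -> str:
        # Wheel name and index mirror the values hardcoded in download_plugin_lib_path.
        wheel = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl"
        url = "https://pypi.nvidia.com/tensorrt-llm/" + wheel
        if not os.path.exists(wheel):
            urllib.request.urlretrieve(url, wheel)  # may raise HTTPError/URLError
        so_path = "./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"
        if not os.path.exists(so_path):
            with zipfile.ZipFile(wheel) as zf:  # a .whl file is a zip archive
                zf.extractall(".")
        return so_path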
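Once a plugin path is resolved, either from TRTLLM_PLUGINS_PATH or from the download step above, initialization is a plain ctypes call. This sketch mirrors the initTrtLlmPlugins handling in load_tensorrt_llm as it stands at the end of the series:

    import ctypes


    def init_trtllm_plugins(plugin_lib_path: str) -> bool:
        # dlopen the shared library; raises OSError if a dependency such as libmpi.so is missing.
        handle = ctypes.CDLL(plugin_lib_path)
        # Configure and invoke the plugin initialization entry point.
        handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
        handle.initTrtLlmPlugins.restype = ctypes.c_bool
        return bool(handle.initTrtLlmPlugins(None, "tensorrt_llm".encode("utf-8")))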