From 29cc96a8b2271a6a6e15bc3282b87008a4603867 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 13 Feb 2025 12:10:45 -0800 Subject: [PATCH 1/8] TensorRT-LLM import fix and aot_joint_export specify as explicit setting in dynamo.compile --- py/torch_tensorrt/dynamo/_compiler.py | 9 ++ .../dynamo/conversion/converter_utils.py | 127 +++++++++++------- 2 files changed, 91 insertions(+), 45 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 6928347baa..2e3824f4a9 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -98,6 +98,7 @@ def cross_compile_for_windows( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, + use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows @@ -173,6 +174,7 @@ def cross_compile_for_windows( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). + use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -332,6 +334,7 @@ def cross_compile_for_windows( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, + "use_aot_joint_export": use_aot_joint_export, } # disable the following settings is not supported for cross compilation for windows feature @@ -421,6 +424,7 @@ def compile( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, + use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -498,6 +502,7 @@ def compile( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). 
+ **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -674,6 +679,7 @@ def compile( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, + "use_aot_joint_export": use_aot_joint_export, } settings = CompilationSettings(**compilation_options) @@ -964,6 +970,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, + use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -1029,6 +1036,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). + use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs """ @@ -1147,6 +1155,7 @@ def convert_exported_program_to_serialized_trt_engine( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, + "use_aot_joint_export": use_aot_joint_export, } settings = CompilationSettings(**compilation_options) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index bcb8495c67..d1e85e6e3d 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -3,6 +3,8 @@ import functools import logging import os +import subprocess +import sys from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload import numpy as np @@ -13,6 +15,7 @@ from torch.fx.node import Argument, Target from torch.fx.passes.shape_prop import TensorMetadata from torch_tensorrt import _enums +from torch_tensorrt._enums import Platform from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -1011,57 +1014,91 @@ def load_tensorrt_llm() -> bool: Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. 
""" - try: - import tensorrt_llm as trt_llm # noqa: F401 - _LOGGER.info("TensorRT-LLM successfully imported") - return True - except (ImportError, AssertionError) as e_import_error: - # Check for environment variable for the plugin library path - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") - if not plugin_lib_path: + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + if not plugin_lib_path: + _LOGGER.warning( + "Please set the TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops or else set the USE_TRTLLM_PLUGINS variable to download the shared library", + ) + use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( + "1", + "true", + "yes", + "on", + ) + if not use_trtllm_plugin: _LOGGER.warning( - "TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops", + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library" ) return False + else: + py_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + platform = Platform.current_platform() + if Platform == Platform.LINUX_X86_64: + platform = "linux_x86_64" + elif Platform == Platform.LINUX_AARCH64: + platform = "linux_aarch64" + + if py_version not in ("cp310", "cp312"): + _LOGGER.warning( + "No available wheel for python versions other than py3.10 and py3.12" + ) + if py_version == "cp310" and platform == "linux_aarch64": + _LOGGER.warning("No available wheel for python3.10 with Linux aarch64") - _LOGGER.info(f"TensorRT-LLM Plugin lib path found: {plugin_lib_path}") - try: - # Load the shared library - handle = ctypes.CDLL(plugin_lib_path) - _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") - except OSError as e_os_error: - _LOGGER.error( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", - exc_info=e_os_error, + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + file_name = ( + "tensorrt_llm-0.17.0.post1-{py_version}-{py_version}-{platform}.whl" ) - return False + download_url = base_url + file_name + cmd = ["wget", download_url] + subprocess.run(cmd) + if os.path.exists(file_name): + _LOGGER.info("filename download is completed") + import zipfile + + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall( + "./tensorrt_llm" + ) # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = ( + "./tensorrt_llm" + "libnvinfer_plugin_tensorrt_llm.so" + ) + try: + # Load the shared library + handle = ctypes.CDLL(plugin_lib_path) + _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") + except OSError as e_os_error: + _LOGGER.error( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" + f"Ensure the path is correct and the library is compatible", + exc_info=e_os_error, + ) + return False - try: - # Configure plugin initialization arguments - handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] - handle.initTrtLlmPlugins.restype = ctypes.c_bool - except AttributeError as e_plugin_unavailable: - _LOGGER.warning( - "Unable to initialize the TensorRT-LLM plugin library", - exc_info=e_plugin_unavailable, - ) - return False + try: + # Configure plugin initialization arguments + handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + handle.initTrtLlmPlugins.restype = 
ctypes.c_bool + except AttributeError as e_plugin_unavailable: + _LOGGER.warning( + "Unable to initialize the TensorRT-LLM plugin library", + exc_info=e_plugin_unavailable, + ) + return False - try: - # Initialize the plugin - TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" - if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): - _LOGGER.info("TensorRT-LLM plugin successfully initialized") - return True - else: - _LOGGER.warning("TensorRT-LLM plugin library failed in initialization") - return False - except Exception as e_initialization_error: - _LOGGER.warning( - "Exception occurred during TensorRT-LLM plugin library initialization", - exc_info=e_initialization_error, - ) + try: + # Initialize the plugin + TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" + if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): + _LOGGER.info("TensorRT-LLM plugin successfully initialized") + return True + else: + _LOGGER.warning("TensorRT-LLM plugin library failed in initialization") return False - return False + except Exception as e_initialization_error: + _LOGGER.warning( + "Exception occurred during TensorRT-LLM plugin library initialization", + exc_info=e_initialization_error, + ) + return False From 02e537bc757749fa615a7051acac06d87cad3f45 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 27 Feb 2025 12:02:31 -0800 Subject: [PATCH 2/8] TRT-LLM installation utilities and adding test cases --- .../dynamo/conversion/converter_utils.py | 116 +++++++++++++----- tests/py/dynamo/conversion/harness.py | 13 ++ tests/py/dynamo/conversion/test_nccl_ops.py | 80 ++++++++++++ 3 files changed, 178 insertions(+), 31 deletions(-) create mode 100644 tests/py/dynamo/conversion/test_nccl_ops.py diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index d1e85e6e3d..454a0ba519 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -3,6 +3,7 @@ import functools import logging import os +import shutil import subprocess import sys from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload @@ -1007,6 +1008,84 @@ def args_bounds_check( return args[i] if len(args) > i and args[i] is not None else replacement +def install_wget(platform: str) -> None: + if shutil.which("wget"): + _LOGGER.debug("wget is already installed") + return + if platform.startswith("linux"): + try: + # if its root + if os.geteuid() == 0: + subprocess.run(["apt-get", "update"], check=True) + subprocess.run(["apt-get", "install", "-y", "wget"], check=True) + else: + _LOGGER.debug("Please run with sudo permissions") + subprocess.run(["sudo", "apt-get", "update"], check=True) + subprocess.run(["sudo", "apt-get", "install", "-y", "wget"], check=True) + except subprocess.CalledProcessError as e: + _LOGGER.debug("Error installing wget:", e) + + +def install_mpi(platform: str) -> None: + if platform.startswith("linux"): + try: + # if its root + if os.geteuid() == 0: + subprocess.run(["apt-get", "update"], check=True) + subprocess.run(["apt-get", "install", "-y", "libmpich-dev"], check=True) + subprocess.run( + ["apt-get", "install", "-y", "libopenmpi-dev"], check=True + ) + else: + _LOGGER.debug("Please run with sudo permissions") + subprocess.run(["sudo", "apt-get", "update"], check=True) + subprocess.run( + ["sudo", "apt-get", "install", "-y", "libmpich-dev"], check=True + ) + subprocess.run( + ["sudo", "apt-get", "install", "-y", "libopenmpi-dev"], 
check=True + ) + except subprocess.CalledProcessError as e: + _LOGGER.debug("Error installing mpi libs:", e) + + +def download_plugin_lib_path(py_version: str, platform: str) -> str: + plugin_lib_path = None + if py_version not in ("cp310", "cp312"): + _LOGGER.warning( + "No available wheel for python versions other than py3.10 and py3.12" + ) + install_wget(platform) + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + file_name = f"tensorrt_llm-0.17.0.post1-{py_version}-{py_version}-{platform}.whl" + download_url = base_url + file_name + cmd = ["wget", download_url] + try: + if not (os.path.exists(file_name)): + _LOGGER.info(f"Running command: {' '.join(cmd)}") + subprocess.run(cmd) + _LOGGER.info("Download complete of wheel") + if os.path.exists(file_name): + _LOGGER.info("filename now present") + if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): + plugin_lib_path = ( + "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + ) + else: + import zipfile + + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = ( + "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + ) + except subprocess.CalledProcessError as e: + _LOGGER.debug(f"Error occurred while trying to download: {e}") + except Exception as e: + _LOGGER.debug(f"An unexpected error occurred: {e}") + return plugin_lib_path + + def load_tensorrt_llm() -> bool: """ Attempts to load the TensorRT-LLM plugin and initialize it. @@ -1014,12 +1093,13 @@ def load_tensorrt_llm() -> bool: Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. """ - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") if not plugin_lib_path: _LOGGER.warning( "Please set the TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops or else set the USE_TRTLLM_PLUGINS variable to download the shared library", ) + for key, value in os.environ.items(): + print(f"{key}: {value}") use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( "1", "true", @@ -1034,38 +1114,12 @@ def load_tensorrt_llm() -> bool: else: py_version = f"cp{sys.version_info.major}{sys.version_info.minor}" platform = Platform.current_platform() - if Platform == Platform.LINUX_X86_64: - platform = "linux_x86_64" - elif Platform == Platform.LINUX_AARCH64: - platform = "linux_aarch64" - - if py_version not in ("cp310", "cp312"): - _LOGGER.warning( - "No available wheel for python versions other than py3.10 and py3.12" - ) - if py_version == "cp310" and platform == "linux_aarch64": - _LOGGER.warning("No available wheel for python3.10 with Linux aarch64") - base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = ( - "tensorrt_llm-0.17.0.post1-{py_version}-{py_version}-{platform}.whl" - ) - download_url = base_url + file_name - cmd = ["wget", download_url] - subprocess.run(cmd) - if os.path.exists(file_name): - _LOGGER.info("filename download is completed") - import zipfile - - with zipfile.ZipFile(file_name, "r") as zip_ref: - zip_ref.extractall( - "./tensorrt_llm" - ) # Extract to a folder named 'tensorrt_llm' - plugin_lib_path = ( - "./tensorrt_llm" + "libnvinfer_plugin_tensorrt_llm.so" - ) + platform = str(platform).lower() + plugin_lib_path = download_plugin_lib_path(py_version, platform) try: - # Load the shared library + # Load the shared + install_mpi(platform) handle = ctypes.CDLL(plugin_lib_path) _LOGGER.info(f"Successfully loaded 
plugin library: {plugin_lib_path}") except OSError as e_os_error: diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index 6ff45507a0..e45a11f0d5 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -353,6 +353,7 @@ def generate_graph( enable_passes: bool, propagate_shapes: bool = False, settings: CompilationSettings = CompilationSettings(), + fuse_distributed_ops: bool = False, torch_export_dynamic_shapes: Optional[Any] = None, ): mod = mod.eval() @@ -368,6 +369,16 @@ def generate_graph( tuple(torch_export_inputs), dynamic_shapes=torch_export_dynamic_shapes, ) + if fuse_distributed_ops: + from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( + fuse_distributed_ops, + ) + + gm = exported_program.graph_module + gm = fuse_distributed_ops(gm, settings) + exported_program = exported_program.run_decompositions( + get_decompositions(False) + ) if enable_passes: exported_program = pre_export_lowering(exported_program, settings) exported_program = exported_program.run_decompositions( @@ -406,6 +417,7 @@ def run_test( propagate_shapes=False, int32_reqd=False, immutable_weights=True, + fuse_distributed_ops=False, ): # TODO: lan to remove this and set use_dynamo_traccer to True by default # once all the converter test files are moved to use_dynamo_tracer @@ -426,6 +438,7 @@ def run_test( enable_passes=enable_passes, propagate_shapes=propagate_shapes, settings=compilation_settings, + fuse_distributed_ops=fuse_distributed_ops, ) num_inputs = len(inputs) diff --git a/tests/py/dynamo/conversion/test_nccl_ops.py b/tests/py/dynamo/conversion/test_nccl_ops.py new file mode 100644 index 0000000000..4db24881c8 --- /dev/null +++ b/tests/py/dynamo/conversion/test_nccl_ops.py @@ -0,0 +1,80 @@ +import os + +import torch +import torch.distributed as dist +import torch.nn as nn +from parameterized import parameterized +from torch.testing._internal.common_utils import run_tests + + +def set_environment_variables(): + os.environ["WORLD_SIZE"] = str(1) + os.environ["RANK"] = str(0) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(29500) + os.environ["USE_TRTLLM_PLUGINS"] = "1" + + +set_environment_variables() +dist.init_process_group(backend="nccl", init_method="env://") +group = dist.new_group(ranks=[0]) +group_name = group.group_name + +from .harness import DispatchTestCase + + +class TestGatherNcclOpsConverter(DispatchTestCase): + @parameterized.expand([(8)]) + def test_nccl_ops(self, linear_layer_dim): + class DistributedGatherModel(nn.Module): + def __init__(self, input_dim): + super().__init__() + self.fc = torch.nn.Linear(input_dim, input_dim) + + def forward(self, x): + x = self.fc(x) + world_size = 1 + gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor( + x, world_size, group_name + ) + gathered_tensor = torch.ops._c10d_functional.wait_tensor( + gathered_tensor + ) + return gathered_tensor + + inputs = [torch.randn(1, linear_layer_dim).to("cuda")] + + self.run_test( + DistributedGatherModel(linear_layer_dim).cuda(), + inputs, + use_dynamo_tracer=True, + fuse_distributed_ops=True, + ) + + # TODO: Look at this + # @parameterized.expand( + # [ + # (8) + # ] + # ) + # def test_nccl_ops_scatter(self, linear_layer_dim): + + # class DistributedReduceScatterModel(nn.Module): + # def __init__(self, input_dim): + # super().__init__() + # def forward(self, x): + # world_size = 1 + # scatter_reduce_tensor = torch.ops._c10d_functional.reduce_scatter_tensor(x, "sum", 
world_size, group_name) + # scatter_reduce_tensor = torch.ops._c10d_functional.wait_tensor(scatter_reduce_tensor) + # return scatter_reduce_tensor + # inputs = [torch.zeros(1, linear_layer_dim).to("cuda")] + + # self.run_test( + # DistributedReduceScatterModel(linear_layer_dim).cuda(), + # inputs, + # use_dynamo_tracer=True, + # ) + + +if __name__ == "__main__": + run_tests() From 636faa2cc899cbef153a9cf2a04c1c8ed23088ad Mon Sep 17 00:00:00 2001 From: apbose Date: Tue, 15 Apr 2025 13:59:48 -0700 Subject: [PATCH 3/8] adding the option in _compiler.py --- py/torch_tensorrt/dynamo/_compiler.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 2e3824f4a9..f7d9f6e9d3 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -98,7 +98,7 @@ def cross_compile_for_windows( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, - use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows @@ -174,7 +174,7 @@ def cross_compile_for_windows( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). - use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -334,7 +334,7 @@ def cross_compile_for_windows( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, - "use_aot_joint_export": use_aot_joint_export, + "use_distributed_mode_trace": use_distributed_mode_trace, } # disable the following settings is not supported for cross compilation for windows feature @@ -424,7 +424,7 @@ def compile( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, - use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -502,7 +502,7 @@ def compile( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. 
l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). - + ç **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -679,7 +679,7 @@ def compile( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, - "use_aot_joint_export": use_aot_joint_export, + "use_distributed_mode_trace": use_distributed_mode_trace, } settings = CompilationSettings(**compilation_options) @@ -970,7 +970,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING, tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL, l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING, - use_aot_joint_export: bool = _defaults.USE_AOT_JOINT_EXPORT, + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, **kwargs: Any, ) -> bytes: """Convert an ExportedProgram to a serialized TensorRT engine @@ -1036,7 +1036,7 @@ def convert_exported_program_to_serialized_trt_engine( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). - use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors + use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE, Returns: bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs """ @@ -1155,7 +1155,7 @@ def convert_exported_program_to_serialized_trt_engine( "enable_weight_streaming": enable_weight_streaming, "tiling_optimization_level": tiling_optimization_level, "l2_limit_for_tiling": l2_limit_for_tiling, - "use_aot_joint_export": use_aot_joint_export, + "use_distributed_mode_trace": use_distributed_mode_trace, } settings = CompilationSettings(**compilation_options) From aae4bb31c42e11e8ed7fc56b4799b8bffcc523cd Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 17 Apr 2025 14:19:08 -0700 Subject: [PATCH 4/8] changes in the TRT-LLM loading tool- removing install_wget, install_unzip, install_mpi --- .../dynamo/conversion/converter_utils.py | 113 ++++++------------ tests/py/dynamo/conversion/harness.py | 13 -- tests/py/dynamo/conversion/test_nccl_ops.py | 80 ------------- 3 files changed, 39 insertions(+), 167 deletions(-) delete mode 100644 tests/py/dynamo/conversion/test_nccl_ops.py diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index 454a0ba519..b7f7fb3e90 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -1008,87 +1008,51 @@ def args_bounds_check( return args[i] if len(args) > i and args[i] is not None else replacement -def install_wget(platform: str) -> None: - if shutil.which("wget"): - _LOGGER.debug("wget is already installed") - return - if platform.startswith("linux"): - try: - # if its root - if os.geteuid() == 0: - subprocess.run(["apt-get", "update"], check=True) - subprocess.run(["apt-get", "install", "-y", "wget"], check=True) - else: - 
_LOGGER.debug("Please run with sudo permissions") - subprocess.run(["sudo", "apt-get", "update"], check=True) - subprocess.run(["sudo", "apt-get", "install", "-y", "wget"], check=True) - except subprocess.CalledProcessError as e: - _LOGGER.debug("Error installing wget:", e) - - -def install_mpi(platform: str) -> None: - if platform.startswith("linux"): - try: - # if its root - if os.geteuid() == 0: - subprocess.run(["apt-get", "update"], check=True) - subprocess.run(["apt-get", "install", "-y", "libmpich-dev"], check=True) - subprocess.run( - ["apt-get", "install", "-y", "libopenmpi-dev"], check=True - ) - else: - _LOGGER.debug("Please run with sudo permissions") - subprocess.run(["sudo", "apt-get", "update"], check=True) - subprocess.run( - ["sudo", "apt-get", "install", "-y", "libmpich-dev"], check=True - ) - subprocess.run( - ["sudo", "apt-get", "install", "-y", "libopenmpi-dev"], check=True - ) - except subprocess.CalledProcessError as e: - _LOGGER.debug("Error installing mpi libs:", e) - - def download_plugin_lib_path(py_version: str, platform: str) -> str: plugin_lib_path = None - if py_version not in ("cp310", "cp312"): - _LOGGER.warning( - "No available wheel for python versions other than py3.10 and py3.12" - ) - install_wget(platform) + + # Downloading TRT-LLM lib + # TODO: check how to fix the 0.18.0 hardcode below base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = f"tensorrt_llm-0.17.0.post1-{py_version}-{py_version}-{platform}.whl" + file_name = f"tensorrt_llm-0.18.0.post1-{py_version}-{py_version}-{platform}.whl" download_url = base_url + file_name cmd = ["wget", download_url] - try: - if not (os.path.exists(file_name)): - _LOGGER.info(f"Running command: {' '.join(cmd)}") - subprocess.run(cmd) - _LOGGER.info("Download complete of wheel") - if os.path.exists(file_name): - _LOGGER.info("filename now present") - if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): - plugin_lib_path = ( - "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" - ) - else: - import zipfile + if not (os.path.exists(file_name)): + try: + subprocess.run(cmd, check=True) + _LOGGER.debug("Download succeeded and TRT-LLM wheel is now present") + except subprocess.CalledProcessError as e: + _LOGGER.error( + "Download failed (file not found or connection issue). Error code:", + e.returncode, + ) + except FileNotFoundError: + _LOGGER.error("wget is required but not found. Please install wget.") - with zipfile.ZipFile(file_name, "r") as zip_ref: - zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' - plugin_lib_path = ( - "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" - ) - except subprocess.CalledProcessError as e: - _LOGGER.debug(f"Error occurred while trying to download: {e}") - except Exception as e: - _LOGGER.debug(f"An unexpected error occurred: {e}") + # Proceeding with the unzip of the wheel file + # This will exist if the filename was already downloaded + if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): + plugin_lib_path = "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + else: + try: + import zipfile + except: + raise ImportError( + "zipfile module is required but not found. 
Please install zipfile" + ) + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = ( + "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + ) return plugin_lib_path def load_tensorrt_llm() -> bool: """ Attempts to load the TensorRT-LLM plugin and initialize it. + Either the env variable TRTLLM_PLUGINS_PATH specifies the path + If the above is not, the user can specify USE_TRTLLM_PLUGINS as either of 1, true, yes, on to download the TRT-LLM distribution and load it Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. @@ -1098,8 +1062,9 @@ def load_tensorrt_llm() -> bool: _LOGGER.warning( "Please set the TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops or else set the USE_TRTLLM_PLUGINS variable to download the shared library", ) - for key, value in os.environ.items(): - print(f"{key}: {value}") + # for key, value in os.environ.items(): + # print(f"{key}: {value}") + # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( "1", "true", @@ -1112,14 +1077,14 @@ def load_tensorrt_llm() -> bool: ) return False else: - py_version = f"cp{sys.version_info.major}{sys.version_info.minor}" + # this is used as the default py version + py_version = f"cp312" platform = Platform.current_platform() platform = str(platform).lower() plugin_lib_path = download_plugin_lib_path(py_version, platform) try: - # Load the shared - install_mpi(platform) + # Load the shared TRT-LLM file handle = ctypes.CDLL(plugin_lib_path) _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") except OSError as e_os_error: diff --git a/tests/py/dynamo/conversion/harness.py b/tests/py/dynamo/conversion/harness.py index e45a11f0d5..6ff45507a0 100644 --- a/tests/py/dynamo/conversion/harness.py +++ b/tests/py/dynamo/conversion/harness.py @@ -353,7 +353,6 @@ def generate_graph( enable_passes: bool, propagate_shapes: bool = False, settings: CompilationSettings = CompilationSettings(), - fuse_distributed_ops: bool = False, torch_export_dynamic_shapes: Optional[Any] = None, ): mod = mod.eval() @@ -369,16 +368,6 @@ def generate_graph( tuple(torch_export_inputs), dynamic_shapes=torch_export_dynamic_shapes, ) - if fuse_distributed_ops: - from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( - fuse_distributed_ops, - ) - - gm = exported_program.graph_module - gm = fuse_distributed_ops(gm, settings) - exported_program = exported_program.run_decompositions( - get_decompositions(False) - ) if enable_passes: exported_program = pre_export_lowering(exported_program, settings) exported_program = exported_program.run_decompositions( @@ -417,7 +406,6 @@ def run_test( propagate_shapes=False, int32_reqd=False, immutable_weights=True, - fuse_distributed_ops=False, ): # TODO: lan to remove this and set use_dynamo_traccer to True by default # once all the converter test files are moved to use_dynamo_tracer @@ -438,7 +426,6 @@ def run_test( enable_passes=enable_passes, propagate_shapes=propagate_shapes, settings=compilation_settings, - fuse_distributed_ops=fuse_distributed_ops, ) num_inputs = len(inputs) diff --git a/tests/py/dynamo/conversion/test_nccl_ops.py b/tests/py/dynamo/conversion/test_nccl_ops.py deleted file mode 100644 index 4db24881c8..0000000000 --- a/tests/py/dynamo/conversion/test_nccl_ops.py +++ /dev/null @@ 
-1,80 +0,0 @@ -import os - -import torch -import torch.distributed as dist -import torch.nn as nn -from parameterized import parameterized -from torch.testing._internal.common_utils import run_tests - - -def set_environment_variables(): - os.environ["WORLD_SIZE"] = str(1) - os.environ["RANK"] = str(0) - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = str(29500) - os.environ["USE_TRTLLM_PLUGINS"] = "1" - - -set_environment_variables() -dist.init_process_group(backend="nccl", init_method="env://") -group = dist.new_group(ranks=[0]) -group_name = group.group_name - -from .harness import DispatchTestCase - - -class TestGatherNcclOpsConverter(DispatchTestCase): - @parameterized.expand([(8)]) - def test_nccl_ops(self, linear_layer_dim): - class DistributedGatherModel(nn.Module): - def __init__(self, input_dim): - super().__init__() - self.fc = torch.nn.Linear(input_dim, input_dim) - - def forward(self, x): - x = self.fc(x) - world_size = 1 - gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor( - x, world_size, group_name - ) - gathered_tensor = torch.ops._c10d_functional.wait_tensor( - gathered_tensor - ) - return gathered_tensor - - inputs = [torch.randn(1, linear_layer_dim).to("cuda")] - - self.run_test( - DistributedGatherModel(linear_layer_dim).cuda(), - inputs, - use_dynamo_tracer=True, - fuse_distributed_ops=True, - ) - - # TODO: Look at this - # @parameterized.expand( - # [ - # (8) - # ] - # ) - # def test_nccl_ops_scatter(self, linear_layer_dim): - - # class DistributedReduceScatterModel(nn.Module): - # def __init__(self, input_dim): - # super().__init__() - # def forward(self, x): - # world_size = 1 - # scatter_reduce_tensor = torch.ops._c10d_functional.reduce_scatter_tensor(x, "sum", world_size, group_name) - # scatter_reduce_tensor = torch.ops._c10d_functional.wait_tensor(scatter_reduce_tensor) - # return scatter_reduce_tensor - # inputs = [torch.zeros(1, linear_layer_dim).to("cuda")] - - # self.run_test( - # DistributedReduceScatterModel(linear_layer_dim).cuda(), - # inputs, - # use_dynamo_tracer=True, - # ) - - -if __name__ == "__main__": - run_tests() From 85811f11e25fab0e27a62f7ca21b8286a3d83121 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 17 Apr 2025 16:35:53 -0700 Subject: [PATCH 5/8] Further changes in error logging of the TRT-LLM installation tool --- .../dynamo/conversion/converter_utils.py | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index b7f7fb3e90..dd0d37c7d3 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -1014,7 +1014,7 @@ def download_plugin_lib_path(py_version: str, platform: str) -> str: # Downloading TRT-LLM lib # TODO: check how to fix the 0.18.0 hardcode below base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = f"tensorrt_llm-0.18.0.post1-{py_version}-{py_version}-{platform}.whl" + file_name = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl" download_url = base_url + file_name cmd = ["wget", download_url] if not (os.path.exists(file_name)): @@ -1051,19 +1051,14 @@ def download_plugin_lib_path(py_version: str, platform: str) -> str: def load_tensorrt_llm() -> bool: """ Attempts to load the TensorRT-LLM plugin and initialize it. 
- Either the env variable TRTLLM_PLUGINS_PATH specifies the path - If the above is not, the user can specify USE_TRTLLM_PLUGINS as either of 1, true, yes, on to download the TRT-LLM distribution and load it + Either the env variable TRTLLM_PLUGINS_PATH can specify the path + Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it Returns: bool: True if the plugin was successfully loaded and initialized, False otherwise. """ plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") if not plugin_lib_path: - _LOGGER.warning( - "Please set the TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops or else set the USE_TRTLLM_PLUGINS variable to download the shared library", - ) - # for key, value in os.environ.items(): - # print(f"{key}: {value}") # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( "1", @@ -1073,7 +1068,7 @@ def load_tensorrt_llm() -> bool: ) if not use_trtllm_plugin: _LOGGER.warning( - "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library" + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" ) return False else: @@ -1083,16 +1078,25 @@ def load_tensorrt_llm() -> bool: platform = str(platform).lower() plugin_lib_path = download_plugin_lib_path(py_version, platform) + try: # Load the shared TRT-LLM file handle = ctypes.CDLL(plugin_lib_path) _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") except OSError as e_os_error: - _LOGGER.error( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", - exc_info=e_os_error, - ) + if "libmpi" in str(e_os_error): + _LOGGER.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " + f"The dependency libmpi.so is missing. 
" + f"Please install the packages libmpich-dev and libopenmpi-dev.", + exc_info=e_os_error, + ) + else: + _LOGGER.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" + f"Ensure the path is correct and the library is compatible", + exc_info=e_os_error, + ) return False try: @@ -1121,3 +1125,4 @@ def load_tensorrt_llm() -> bool: exc_info=e_initialization_error, ) return False + return False From 77f2145cdb4b07a462ffa3585bd665c68dd6af99 Mon Sep 17 00:00:00 2001 From: apbose Date: Thu, 17 Apr 2025 16:58:54 -0700 Subject: [PATCH 6/8] moving the load_tensorrt_llm to dynamo/utils.py --- .../dynamo/conversion/converter_utils.py | 124 ----------------- .../conversion/custom_ops_converters.py | 2 +- py/torch_tensorrt/dynamo/utils.py | 127 +++++++++++++++++- 3 files changed, 127 insertions(+), 126 deletions(-) diff --git a/py/torch_tensorrt/dynamo/conversion/converter_utils.py b/py/torch_tensorrt/dynamo/conversion/converter_utils.py index dd0d37c7d3..46db1c0d3b 100644 --- a/py/torch_tensorrt/dynamo/conversion/converter_utils.py +++ b/py/torch_tensorrt/dynamo/conversion/converter_utils.py @@ -3,9 +3,6 @@ import functools import logging import os -import shutil -import subprocess -import sys from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, overload import numpy as np @@ -16,7 +13,6 @@ from torch.fx.node import Argument, Target from torch.fx.passes.shape_prop import TensorMetadata from torch_tensorrt import _enums -from torch_tensorrt._enums import Platform from torch_tensorrt.dynamo._settings import CompilationSettings from torch_tensorrt.dynamo._SourceIR import SourceIR from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext @@ -1006,123 +1002,3 @@ def args_bounds_check( args: Tuple[Argument, ...], i: int, replacement: Optional[Any] = None ) -> Any: return args[i] if len(args) > i and args[i] is not None else replacement - - -def download_plugin_lib_path(py_version: str, platform: str) -> str: - plugin_lib_path = None - - # Downloading TRT-LLM lib - # TODO: check how to fix the 0.18.0 hardcode below - base_url = "https://pypi.nvidia.com/tensorrt-llm/" - file_name = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl" - download_url = base_url + file_name - cmd = ["wget", download_url] - if not (os.path.exists(file_name)): - try: - subprocess.run(cmd, check=True) - _LOGGER.debug("Download succeeded and TRT-LLM wheel is now present") - except subprocess.CalledProcessError as e: - _LOGGER.error( - "Download failed (file not found or connection issue). Error code:", - e.returncode, - ) - except FileNotFoundError: - _LOGGER.error("wget is required but not found. Please install wget.") - - # Proceeding with the unzip of the wheel file - # This will exist if the filename was already downloaded - if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): - plugin_lib_path = "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" - else: - try: - import zipfile - except: - raise ImportError( - "zipfile module is required but not found. Please install zipfile" - ) - with zipfile.ZipFile(file_name, "r") as zip_ref: - zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' - plugin_lib_path = ( - "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" - ) - return plugin_lib_path - - -def load_tensorrt_llm() -> bool: - """ - Attempts to load the TensorRT-LLM plugin and initialize it. 
- Either the env variable TRTLLM_PLUGINS_PATH can specify the path - Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it - - Returns: - bool: True if the plugin was successfully loaded and initialized, False otherwise. - """ - plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") - if not plugin_lib_path: - # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user - use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( - "1", - "true", - "yes", - "on", - ) - if not use_trtllm_plugin: - _LOGGER.warning( - "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. Please set either of the two to use TRT-LLM libraries in torchTRT" - ) - return False - else: - # this is used as the default py version - py_version = f"cp312" - platform = Platform.current_platform() - - platform = str(platform).lower() - plugin_lib_path = download_plugin_lib_path(py_version, platform) - - try: - # Load the shared TRT-LLM file - handle = ctypes.CDLL(plugin_lib_path) - _LOGGER.info(f"Successfully loaded plugin library: {plugin_lib_path}") - except OSError as e_os_error: - if "libmpi" in str(e_os_error): - _LOGGER.warning( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " - f"The dependency libmpi.so is missing. " - f"Please install the packages libmpich-dev and libopenmpi-dev.", - exc_info=e_os_error, - ) - else: - _LOGGER.warning( - f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" - f"Ensure the path is correct and the library is compatible", - exc_info=e_os_error, - ) - return False - - try: - # Configure plugin initialization arguments - handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] - handle.initTrtLlmPlugins.restype = ctypes.c_bool - except AttributeError as e_plugin_unavailable: - _LOGGER.warning( - "Unable to initialize the TensorRT-LLM plugin library", - exc_info=e_plugin_unavailable, - ) - return False - - try: - # Initialize the plugin - TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" - if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): - _LOGGER.info("TensorRT-LLM plugin successfully initialized") - return True - else: - _LOGGER.warning("TensorRT-LLM plugin library failed in initialization") - return False - except Exception as e_initialization_error: - _LOGGER.warning( - "Exception occurred during TensorRT-LLM plugin library initialization", - exc_info=e_initialization_error, - ) - return False - return False diff --git a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py index 79611c7552..3e67457e54 100644 --- a/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py +++ b/py/torch_tensorrt/dynamo/conversion/custom_ops_converters.py @@ -11,11 +11,11 @@ from torch_tensorrt.dynamo.conversion._ConverterRegistry import ( dynamo_tensorrt_converter, ) -from torch_tensorrt.dynamo.conversion.converter_utils import load_tensorrt_llm from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import ( tensorrt_fused_nccl_all_gather_op, tensorrt_fused_nccl_reduce_scatter_op, ) +from torch_tensorrt.dynamo.utils import load_tensorrt_llm _LOGGER: logging.Logger = logging.getLogger(__name__) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index e4018ae95c..2a65437946 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -1,7 +1,12 
@@ from __future__ import annotations +import ctypes import gc import logging +import os +import shutil +import subprocess +import sys import warnings from dataclasses import fields, replace from enum import Enum @@ -14,7 +19,7 @@ from torch._subclasses.fake_tensor import FakeTensor from torch.fx.experimental.proxy_tensor import unset_fake_temporarily from torch_tensorrt._Device import Device -from torch_tensorrt._enums import dtype +from torch_tensorrt._enums import Platform, dtype from torch_tensorrt._features import ENABLED_FEATURES from torch_tensorrt._Input import Input from torch_tensorrt.dynamo import _defaults @@ -812,3 +817,123 @@ def is_tegra_platform() -> bool: if torch.cuda.get_device_capability() in [(8, 7), (7, 2)]: return True return False + + +def download_plugin_lib_path(py_version: str, platform: str) -> str: + plugin_lib_path = None + + # Downloading TRT-LLM lib + # TODO: check how to fix the 0.18.0 hardcode below + base_url = "https://pypi.nvidia.com/tensorrt-llm/" + file_name = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl" + download_url = base_url + file_name + cmd = ["wget", download_url] + if not (os.path.exists(file_name)): + try: + subprocess.run(cmd, check=True) + logger.debug("Download succeeded and TRT-LLM wheel is now present") + except subprocess.CalledProcessError as e: + logger.error( + "Download failed (file not found or connection issue). Error code:", + e.returncode, + ) + except FileNotFoundError: + logger.error("wget is required but not found. Please install wget.") + + # Proceeding with the unzip of the wheel file + # This will exist if the filename was already downloaded + if os.path.exists("./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"): + plugin_lib_path = "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + else: + try: + import zipfile + except: + raise ImportError( + "zipfile module is required but not found. Please install zipfile" + ) + with zipfile.ZipFile(file_name, "r") as zip_ref: + zip_ref.extractall(".") # Extract to a folder named 'tensorrt_llm' + plugin_lib_path = ( + "./tensorrt_llm/libs/" + "libnvinfer_plugin_tensorrt_llm.so" + ) + return plugin_lib_path + + +def load_tensorrt_llm() -> bool: + """ + Attempts to load the TensorRT-LLM plugin and initialize it. + Either the env variable TRTLLM_PLUGINS_PATH can specify the path + Or the user can specify USE_TRTLLM_PLUGINS as either of (1, true, yes, on) to download the TRT-LLM distribution and load it + + Returns: + bool: True if the plugin was successfully loaded and initialized, False otherwise. + """ + plugin_lib_path = os.environ.get("TRTLLM_PLUGINS_PATH") + if not plugin_lib_path: + # this option can be used by user if TRTLLM_PLUGINS_PATH is not set by user + use_trtllm_plugin = os.environ.get("USE_TRTLLM_PLUGINS", "0").lower() in ( + "1", + "true", + "yes", + "on", + ) + if not use_trtllm_plugin: + logger.warning( + "Neither TRTLLM_PLUGIN_PATH is set nor is it directed to download the shared library. 
Please set either of the two to use TRT-LLM libraries in torchTRT" + ) + return False + else: + # this is used as the default py version + py_version = f"cp312" + platform = Platform.current_platform() + + platform = str(platform).lower() + plugin_lib_path = download_plugin_lib_path(py_version, platform) + + try: + # Load the shared TRT-LLM file + handle = ctypes.CDLL(plugin_lib_path) + logger.info(f"Successfully loaded plugin library: {plugin_lib_path}") + except OSError as e_os_error: + if "libmpi" in str(e_os_error): + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}. " + f"The dependency libmpi.so is missing. " + f"Please install the packages libmpich-dev and libopenmpi-dev.", + exc_info=e_os_error, + ) + else: + logger.warning( + f"Failed to load libnvinfer_plugin_tensorrt_llm.so from {plugin_lib_path}" + f"Ensure the path is correct and the library is compatible", + exc_info=e_os_error, + ) + return False + + try: + # Configure plugin initialization arguments + handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + handle.initTrtLlmPlugins.restype = ctypes.c_bool + except AttributeError as e_plugin_unavailable: + logger.warning( + "Unable to initialize the TensorRT-LLM plugin library", + exc_info=e_plugin_unavailable, + ) + return False + + try: + # Initialize the plugin + TRT_LLM_PLUGIN_NAMESPACE = "tensorrt_llm" + if handle.initTrtLlmPlugins(None, TRT_LLM_PLUGIN_NAMESPACE.encode("utf-8")): + logger.info("TensorRT-LLM plugin successfully initialized") + return True + else: + logger.warning("TensorRT-LLM plugin library failed in initialization") + return False + except Exception as e_initialization_error: + logger.warning( + "Exception occurred during TensorRT-LLM plugin library initialization", + exc_info=e_initialization_error, + ) + return False + return False From 3469736223ca931612fcd8a66c83e006b314a0a6 Mon Sep 17 00:00:00 2001 From: apbose Date: Fri, 25 Apr 2025 11:33:52 -0700 Subject: [PATCH 7/8] correcting misprint for TRT LLM load --- py/torch_tensorrt/dynamo/_compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index f7d9f6e9d3..7de6ddb0d0 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -502,7 +502,7 @@ def compile( enable_weight_streaming (bool): Enable weight streaming. tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"]. l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit). - ç + use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. 
This is enabled when DTensors or distributed tensors are present in distributed model **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT From 9c238ae5bb3b5ffdbfb6715b95e7e2f401a3c114 Mon Sep 17 00:00:00 2001 From: apbose Date: Mon, 28 Apr 2025 21:20:45 -0700 Subject: [PATCH 8/8] Using python lib for download to make it platform agnostic --- py/torch_tensorrt/dynamo/utils.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py index 2a65437946..a8b1b10e3a 100644 --- a/py/torch_tensorrt/dynamo/utils.py +++ b/py/torch_tensorrt/dynamo/utils.py @@ -7,10 +7,12 @@ import shutil import subprocess import sys +import urllib.request import warnings from dataclasses import fields, replace from enum import Enum from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from urllib.error import URLError import numpy as np import sympy @@ -827,18 +829,21 @@ def download_plugin_lib_path(py_version: str, platform: str) -> str: base_url = "https://pypi.nvidia.com/tensorrt-llm/" file_name = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl" download_url = base_url + file_name - cmd = ["wget", download_url] if not (os.path.exists(file_name)): try: - subprocess.run(cmd, check=True) + logger.debug(f"Downloading {download_url} ...") + urllib.request.urlretrieve(download_url, file_name) logger.debug("Download succeeded and TRT-LLM wheel is now present") - except subprocess.CalledProcessError as e: + except urllib.error.HTTPError as e: logger.error( - "Download failed (file not found or connection issue). Error code:", - e.returncode, + f"HTTP error {e.code} when trying to download {download_url}: {e.reason}" ) - except FileNotFoundError: - logger.error("wget is required but not found. Please install wget.") + except urllib.error.URLError as e: + logger.error( + f"URL error when trying to download {download_url}: {e.reason}" + ) + except OSError as e: + logger.error(f"Local file write error: {e}") # Proceeding with the unzip of the wheel file # This will exist if the filename was already downloaded
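The end state of this series exposes the distributed trace mode as an ordinary compile option. A minimal usage sketch follows; the toy module, input shapes, and the keyword-argument style are illustrative assumptions rather than code from this PR, and the flag's default comes from _defaults.USE_DISTRIBUTED_MODE_TRACE:

    import torch
    import torch_tensorrt


    class ToyModel(torch.nn.Module):  # hypothetical stand-in for a distributed model
        def __init__(self) -> None:
            super().__init__()
            self.fc = torch.nn.Linear(8, 8)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.fc(x)


    model = ToyModel().eval().cuda()
    inputs = (torch.randn(1, 8).cuda(),)

    exported = torch.export.export(model, inputs)
    trt_module = torch_tensorrt.dynamo.compile(
        exported,
        inputs=list(inputs),
        # New setting from this series: trace through aot_autograd when DTensors
        # or distributed collectives are present in the model.
        use_distributed_mode_trace=True,
    )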
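When USE_TRTLLM_PLUGINS is enabled and TRTLLM_PLUGINS_PATH is unset, the loader fetches the TensorRT-LLM wheel and pulls the plugin shared library out of it. A condensed sketch of that flow, assuming network access and the same hardcoded 0.18.0 wheel coordinates used in the final patch (it is not the exact library code):

    import os
    import urllib.request
    import zipfile


    def fetch_plugin_lib(py_version: str = "cp312", platform: str = "linux_x86_64") -> str:
        # Wheel name and index mirror the values hardcoded in download_plugin_lib_path.
        wheel = f"tensorrt_llm-0.18.0-{py_version}-{py_version}-{platform}.whl"
        url = "https://pypi.nvidia.com/tensorrt-llm/" + wheel
        if not os.path.exists(wheel):
            urllib.request.urlretrieve(url, wheel)  # may raise HTTPError/URLError
        so_path = "./tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"
        if not os.path.exists(so_path):
            with zipfile.ZipFile(wheel) as zf:  # a .whl file is a zip archive
                zf.extractall(".")
        return so_path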
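Once a plugin path is resolved, either from TRTLLM_PLUGINS_PATH or from the download step above, initialization is a plain ctypes call. This sketch mirrors the initTrtLlmPlugins handling in load_tensorrt_llm as it stands at the end of the series:

    import ctypes


    def init_trtllm_plugins(plugin_lib_path: str) -> bool:
        # dlopen the shared library; raises OSError if a dependency such as libmpi.so is missing.
        handle = ctypes.CDLL(plugin_lib_path)
        # Configure and invoke the plugin initialization entry point.
        handle.initTrtLlmPlugins.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
        handle.initTrtLlmPlugins.restype = ctypes.c_bool
        return bool(handle.initTrtLlmPlugins(None, "tensorrt_llm".encode("utf-8")))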