@@ -219,15 +219,6 @@ def test_reduce_scatter(self, device, dtype):
             self.assertEqual(outputs[i], expected[i])


-# Decorator
-def requires_nccl_backend_for_symmem():
-    return skip_but_pass_in_sandcastle_if(
-        not symm_mem.is_nccl_symmem_available(),
-        "test_nccl requires at least NCCL 2.28, skipping tests",
-    )
-
-
-@requires_nccl_backend_for_symmem()
 @requires_cuda_p2p_access()
 class NCCLSymmetricMemoryTest(MultiProcContinuousTest):
     @property
@@ -259,108 +250,6 @@ def foo():
         out = symm_mem.empty(numel, dtype=dtype, device=self.device)
         symm_mem.rendezvous(out, group=group_name)

-    @skip_but_pass_in_sandcastle_if(TEST_WITH_ROCM, "Skip NCCL tests for ROCm")
-    @skip_but_pass_in_sandcastle_if(IS_WINDOWS, "NCCL doesn't support Windows")
-    @skip_if_lt_x_gpu(2)
-    def test_nccl_symmem_collective(self):
-        symm_mem.set_backend("NCCL")
-        torch.cuda.set_device(self.rank)
-        # Need this all_reduce to initialize NCCL communicator. Otherwise, the
-        # test will hang. TODO: investigate how NCCLSymmetricMemory can
-        # initialize NCCL communicator.
-        c10d.all_reduce(torch.ones(1, device=self.device))
-        group_name = c10d.group.WORLD.group_name
-        symm_mem.enable_symm_mem_for_group(group_name)
-
-        dtype = torch.float
-        numel = 1024
-
-        out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(self.rank)
-        symm_mem.rendezvous(out, group=group_name)
-        c10d.all_reduce(out)
-        torch.cuda.synchronize()
-        self.assertEqual(
-            out, torch.full_like(out, (self.world_size - 1) * self.world_size / 2)
-        )
-
-        inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(self.rank)
-        symm_mem.rendezvous(inp, group=group_name)
-        res = torch.ops.symm_mem.one_shot_all_reduce(inp, "sum", group_name)
-        self.assertEqual(out, res)
-
-    @skip_but_pass_in_sandcastle_if(TEST_WITH_ROCM, "Skip NCCL tests for ROCm")
-    @skip_but_pass_in_sandcastle_if(IS_WINDOWS, "NCCL doesn't support Windows")
-    @skip_if_lt_x_gpu(2)
-    def test_nccl_symmem_put(self):
-        symm_mem.set_backend("NCCL")
-        torch.cuda.set_device(self.rank)
-        # Need this all_reduce to initialize NCCL communicator. Otherwise, the
-        # test will hang. TODO: investigate how NCCLSymmetricMemory can
-        # initialize NCCL communicator.
-        c10d.all_reduce(torch.ones(1, device=self.device))
-        group_name = c10d.group.WORLD.group_name
-        symm_mem.enable_symm_mem_for_group(group_name)
-
-        dtype = torch.float
-        numel = 1024
-        tensor = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(self.rank)
-        # This is needed to make sure we don't get blocked the second time we call rendezvous
-        # for the same tensor because it will be cached by that moment.
-        symm_mem.rendezvous(tensor, group=group_name)
-        signal_val = 5
-        c10d.barrier()
-
-        if self.rank == 1:
-            torch.ops.symm_mem.nccl_put_with_signal(tensor, signal_val, 0)
-        elif self.rank == 0:
-            torch.ops.symm_mem.nccl_wait_for_signal(tensor, signal_val)
-            torch.testing.assert_close(
-                tensor, torch.ones(numel, dtype=dtype, device=self.device)
-            )
-        c10d.barrier()
-        if self.rank == 1:
-            tensor *= 2
-            torch.ops.symm_mem.nccl_put(tensor, 0)
-            c10d.barrier()
-        else:
-            c10d.barrier()
-            if self.rank == 0:
-                torch.testing.assert_close(
-                    tensor, torch.ones(numel, dtype=dtype, device=self.device) * 2
-                )
-
-    @skip_but_pass_in_sandcastle_if(TEST_WITH_ROCM, "Skip NCCL tests for ROCm")
-    @skip_but_pass_in_sandcastle_if(IS_WINDOWS, "NCCL doesn't support Windows")
-    @skip_if_lt_x_gpu(2)
-    def test_nccl_symmem_get(self):
-        symm_mem.set_backend("NCCL")
-        torch.cuda.set_device(self.rank)
-        # Need this all_reduce to initialize NCCL communicator. Otherwise, the
-        # test will hang. TODO: investigate how NCCLSymmetricMemory can
-        # initialize NCCL communicator.
-        c10d.all_reduce(torch.ones(1, device=self.device))
-        group_name = c10d.group.WORLD.group_name
-        symm_mem.enable_symm_mem_for_group(group_name)
-
-        dtype = torch.float
-        numel = 1024
-        tensor = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(self.rank)
-        # This is needed to make sure we don't get blocked the second time we call rendezvous
-        # for the same tensor because it will be cached by that moment.
-        symm_mem.rendezvous(tensor, group=group_name)
-        c10d.barrier()
-        if self.rank == 0:
-            torch.ops.symm_mem.nccl_get(tensor, 1)
-            # TODO: remove after we have wait_signal
-            c10d.barrier()
-            torch.testing.assert_close(
-                tensor, torch.ones(numel, dtype=dtype, device=self.device)
-            )
-        else:
-            # handle.wait_signal(src_rank=0)
-            # TODO: remove after we have wait_signal
-            c10d.barrier()
-

 instantiate_device_type_tests(TestNCCL, globals(), only_for="cuda")

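Aside on the gate removed in the first hunk: requires_nccl_backend_for_symmem() is an instance of the usual conditional-skip decorator pattern, letting the whole NCCLSymmetricMemoryTest class be skipped in one place when the NCCL backend is too old. Below is a minimal, self-contained sketch of that pattern, for illustration only: plain unittest.skipUnless stands in for the internal skip_but_pass_in_sandcastle_if helper, and backend_available(), requires_backend(), and ExampleTest are hypothetical names (the real check in the removed lines is symm_mem.is_nccl_symmem_available()).

import unittest


def backend_available() -> bool:
    # Hypothetical probe; the removed decorator calls
    # symm_mem.is_nccl_symmem_available() instead.
    return False


def requires_backend():
    # Return a decorator that skips the decorated test class (or method)
    # whenever the probe reports the feature is unavailable.
    return unittest.skipUnless(
        backend_available(), "requires NCCL symmetric memory, skipping tests"
    )


@requires_backend()
class ExampleTest(unittest.TestCase):
    def test_placeholder(self):
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()

Decorating the class rather than each method keeps the version requirement in one spot, which matches how the removed @requires_nccl_backend_for_symmem() gated the entire test class alongside @requires_cuda_p2p_access().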