@@ -229,6 +229,105 @@ void ComputePoseColoredICPCUDA(const core::Tensor &source_points,
     DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
 }
 
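+// Accumulates, per valid correspondence, the symmetric ICP contributions to
+// the 29-element reduction: the 21 upper-triangular entries of JtJ, the 6
+// entries of Jtr, the sum of squared residuals, and the inlier count.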
+template <typename scalar_t, typename func_t>
+__global__ void ComputePoseSymmetricKernelCUDA(
+        const scalar_t *source_points_ptr,
+        const scalar_t *target_points_ptr,
+        const scalar_t *source_normals_ptr,
+        const scalar_t *target_normals_ptr,
+        const int64_t *correspondence_indices,
+        const int n,
+        scalar_t *global_sum,
+        func_t GetWeightFromRobustKernel) {
+    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
+    // Create shared memory.
+    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    ReduceVec local_sum(static_cast<scalar_t>(0));
+
+    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (workload_idx < n) {
+        scalar_t J_ij[12] = {0};  // 6 for each term in symmetric ICP.
+        scalar_t r1 = 0, r2 = 0;
+        const bool valid = GetJacobianSymmetric<scalar_t>(
+                workload_idx, source_points_ptr, target_points_ptr,
+                source_normals_ptr, target_normals_ptr,
+                correspondence_indices, J_ij, r1, r2);
+
+        if (valid) {
+            const scalar_t w1 = GetWeightFromRobustKernel(r1);
+            const scalar_t w2 = GetWeightFromRobustKernel(r2);
+
+            // Accumulate JtJ and Jtr for both terms.
+            int i = 0;
+            for (int j = 0; j < 6; ++j) {
+                for (int k = 0; k <= j; ++k) {
+                    // Contribution from first term (source to target).
+                    local_sum[i] += J_ij[j] * w1 * J_ij[k];
+                    // Contribution from second term (target to source).
+                    local_sum[i] += J_ij[j + 6] * w2 * J_ij[k + 6];
+                    ++i;
+                }
+                // Jtr contributions.
+                local_sum[21 + j] += J_ij[j] * w1 * r1 + J_ij[j + 6] * w2 * r2;
+            }
+            local_sum[27] += r1 * r1 + r2 * r2;
+            local_sum[28] += 1;
+        }
+    }
+
+    // Reduction.
+    auto result = BlockReduce(temp_storage).Sum(local_sum);
+
+    // Add result to global_sum.
+    if (threadIdx.x == 0) {
+#pragma unroll
+        for (int i = 0; i < kReduceDim; ++i) {
+            atomicAdd(&global_sum[i], result[i]);
+        }
+    }
+}
+
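+// Host-side wrapper: dispatches on the floating-point dtype and the robust
+// kernel, launches the reduction kernel above into a 29-element global sum,
+// and solves the 6x6 system for the pose, residual, and inlier count.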
+void ComputePoseSymmetricCUDA(const core::Tensor &source_points,
+                              const core::Tensor &target_points,
+                              const core::Tensor &source_normals,
+                              const core::Tensor &target_normals,
+                              const core::Tensor &correspondence_indices,
+                              core::Tensor &pose,
+                              float &residual,
+                              int &inlier_count,
+                              const core::Dtype &dtype,
+                              const core::Device &device,
+                              const registration::RobustKernel &kernel) {
+    core::CUDAScopedDevice scoped_device(source_points.GetDevice());
+    int n = source_points.GetLength();
+
+    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
+    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
+    const dim3 threads(kThread1DUnit);
+
+    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
+        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();
+
+        DISPATCH_ROBUST_KERNEL_FUNCTION(
+                kernel.type_, scalar_t, kernel.scaling_parameter_,
+                kernel.shape_parameter_, [&]() {
+                    ComputePoseSymmetricKernelCUDA<<<blocks, threads, 0,
+                                                     core::cuda::GetStream()>>>(
+                            source_points.GetDataPtr<scalar_t>(),
+                            target_points.GetDataPtr<scalar_t>(),
+                            source_normals.GetDataPtr<scalar_t>(),
+                            target_normals.GetDataPtr<scalar_t>(),
+                            correspondence_indices.GetDataPtr<int64_t>(), n,
+                            global_sum_ptr, GetWeightFromRobustKernel);
+                });
+    });
+
+    core::cuda::Synchronize();
+
+    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
+}
+
 template <typename scalar_t, typename funct1_t, typename funct2_t>
 __global__ void ComputePoseDopplerICPKernelCUDA(
         const scalar_t *source_points_ptr,