Skip to content

Commit d441e69

Browse files
committed
Implements Symmetric ICP registration on CUDA
Adds CUDA implementation for the Symmetric ICP registration algorithm, enabling it to run on GPUs. This improves performance for point cloud registration tasks by leveraging parallel processing capabilities. Includes device consistency tests between CPU and GPU, and supports different robust kernels.
1 parent 9ac96bb commit d441e69

File tree

4 files changed

+368
-0
lines changed

4 files changed

+368
-0
lines changed

cpp/open3d/t/pipelines/kernel/Registration.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@ core::Tensor ComputePoseSymmetric(const core::Tensor &source_points,
6868
source_normals.Contiguous(), target_normals.Contiguous(),
6969
correspondence_indices.Contiguous(), pose, residual,
7070
inlier_count, source_points.GetDtype(), device, kernel);
71+
} else if (source_points.IsCUDA()) {
72+
core::CUDAScopedDevice scoped_device(source_points.GetDevice());
73+
CUDA_CALL(ComputePoseSymmetricCUDA, source_points.Contiguous(),
74+
target_points.Contiguous(), source_normals.Contiguous(),
75+
target_normals.Contiguous(),
76+
correspondence_indices.Contiguous(), pose, residual,
77+
inlier_count, source_points.GetDtype(), device, kernel);
7178
} else {
7279
utility::LogError("Unimplemented device.");
7380
}

cpp/open3d/t/pipelines/kernel/RegistrationCUDA.cu

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,105 @@ void ComputePoseColoredICPCUDA(const core::Tensor &source_points,
229229
DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
230230
}
231231

232+
// Per-correspondence Gauss-Newton accumulation kernel for Symmetric ICP.
//
// Launch layout: 1D grid of kThread1DUnit-wide blocks, one thread per source
// point (threads past n are masked out of the Jacobian computation but still
// join the block reduction). Each valid thread evaluates two point-to-plane
// residuals for its correspondence — r1 along the target normal and r2 along
// the source normal — and accumulates a kReduceDim-wide partial vector:
// the lower triangle of JtJ (indices 0..20), Jtr (21..26), the squared
// residual (27) and the inlier count (28). The block total is formed with
// cub::BlockReduce in shared memory and folded into global_sum by thread 0
// via atomicAdd.
//
// NOTE(review): assumes kReduceDim == 29 to match the {29}-element global_sum
// allocated by the host wrapper and decoded by DecodeAndSolve6x6 — confirm
// against the constant's definition.
template <typename scalar_t, typename func_t>
__global__ void ComputePoseSymmetricKernelCUDA(
        const scalar_t *source_points_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *source_normals_ptr,
        const scalar_t *target_normals_ptr,
        const int64_t *correspondence_indices,
        const int n,
        scalar_t *global_sum,
        func_t GetWeightFromRobustKernel) {
    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
    // Create shared memory for the block-wide reduction.
    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    ReduceVec local_sum(static_cast<scalar_t>(0));

    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (workload_idx < n) {
        scalar_t J_ij[12] = {0};  // 6 for each term in symmetric ICP
        scalar_t r1 = 0, r2 = 0;
        const bool valid = GetJacobianSymmetric<scalar_t>(
                workload_idx, source_points_ptr, target_points_ptr,
                source_normals_ptr, target_normals_ptr, correspondence_indices,
                J_ij, r1, r2);

        if (valid) {
            // Robust-kernel weights, one per residual term.
            const scalar_t w1 = GetWeightFromRobustKernel(r1);
            const scalar_t w2 = GetWeightFromRobustKernel(r2);

            // Accumulate JtJ and Jtr for both terms.
            // i walks the 21 lower-triangular entries of the 6x6 JtJ.
            int i = 0;
            for (int j = 0; j < 6; ++j) {
                for (int k = 0; k <= j; ++k) {
                    // Contribution from first term (source to target)
                    local_sum[i] += J_ij[j] * w1 * J_ij[k];
                    // Contribution from second term (target to source)
                    local_sum[i] += J_ij[j + 6] * w2 * J_ij[k + 6];
                    ++i;
                }
                // Jtr contributions
                local_sum[21 + j] += J_ij[j] * w1 * r1 + J_ij[j + 6] * w2 * r2;
            }
            // Unweighted squared residual and inlier counter.
            local_sum[27] += r1 * r1 + r2 * r2;
            local_sum[28] += 1;
        }
    }

    // Reduction. All threads of the block must reach this point.
    auto result = BlockReduce(temp_storage).Sum(local_sum);

    // Add result to global_sum (one atomicAdd per component, thread 0 only).
    if (threadIdx.x == 0) {
#pragma unroll
        for (int i = 0; i < kReduceDim; ++i) {
            atomicAdd(&global_sum[i], result[i]);
        }
    }
}
290+
291+
// Host-side driver for Symmetric ICP pose estimation on CUDA.
//
// Accumulates the normal equations over all correspondences with
// ComputePoseSymmetricKernelCUDA, then solves the 6x6 system on the host
// via DecodeAndSolve6x6, writing the pose, the residual and the inlier
// count into the output arguments. Dispatches on the floating-point dtype
// and on the robust-kernel type via the project macros.
void ComputePoseSymmetricCUDA(const core::Tensor &source_points,
                              const core::Tensor &target_points,
                              const core::Tensor &source_normals,
                              const core::Tensor &target_normals,
                              const core::Tensor &correspondence_indices,
                              core::Tensor &pose,
                              float &residual,
                              int &inlier_count,
                              const core::Dtype &dtype,
                              const core::Device &device,
                              const registration::RobustKernel &kernel) {
    core::CUDAScopedDevice scoped_device(source_points.GetDevice());

    const int num_points = source_points.GetLength();
    // 29 accumulators: 21 JtJ (lower triangle) + 6 Jtr + residual + count.
    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);

    const dim3 block_dim(kThread1DUnit);
    const dim3 grid_dim((num_points + kThread1DUnit - 1) / kThread1DUnit);

    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();
        DISPATCH_ROBUST_KERNEL_FUNCTION(
                kernel.type_, scalar_t, kernel.scaling_parameter_,
                kernel.shape_parameter_, [&]() {
                    ComputePoseSymmetricKernelCUDA<<<grid_dim, block_dim, 0,
                                                     core::cuda::GetStream()>>>(
                            source_points.GetDataPtr<scalar_t>(),
                            target_points.GetDataPtr<scalar_t>(),
                            source_normals.GetDataPtr<scalar_t>(),
                            target_normals.GetDataPtr<scalar_t>(),
                            correspondence_indices.GetDataPtr<int64_t>(),
                            num_points, global_sum_ptr,
                            GetWeightFromRobustKernel);
                });
    });
    // Block until the accumulation kernel has finished before decoding.
    core::cuda::Synchronize();

    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
}
330+
232331
template <typename scalar_t, typename funct1_t, typename funct2_t>
233332
__global__ void ComputePoseDopplerICPKernelCUDA(
234333
const scalar_t *source_points_ptr,

cpp/open3d/t/pipelines/kernel/RegistrationImpl.h

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,18 @@ void ComputePosePointToPlaneCUDA(const core::Tensor &source_points,
9999
const core::Device &device,
100100
const registration::RobustKernel &kernel);
101101

102+
/// Computes the 6-dof pose update for Symmetric ICP on a CUDA device.
/// Accumulates both point-to-plane terms (residuals along the target and the
/// source normals) over all valid correspondences, reweighted by the given
/// robust kernel, and solves the resulting 6x6 system.
/// \param correspondence_indices Per-source-point target index; -1 marks an
///        invalid correspondence that is skipped.
/// \param pose Output pose tensor solved from the accumulated system.
/// \param residual Output sum of squared residuals over the inliers.
/// \param inlier_count Output number of valid correspondences used.
void ComputePoseSymmetricCUDA(const core::Tensor &source_points,
                              const core::Tensor &target_points,
                              const core::Tensor &source_normals,
                              const core::Tensor &target_normals,
                              const core::Tensor &correspondence_indices,
                              core::Tensor &pose,
                              float &residual,
                              int &inlier_count,
                              const core::Dtype &dtype,
                              const core::Device &device,
                              const registration::RobustKernel &kernel);
113+
102114
void ComputePoseColoredICPCUDA(const core::Tensor &source_points,
103115
const core::Tensor &source_colors,
104116
const core::Tensor &target_points,
@@ -215,6 +227,82 @@ template bool GetJacobianPointToPlane(int64_t workload_idx,
215227
double *J_ij,
216228
double &r);
217229

230+
// Evaluates the two symmetric point-to-plane residuals and their 6-dof
// Jacobians for one source point / target correspondence pair.
//
// r1 projects the point difference onto the target normal (source-to-target
// plane) and r2 onto the source normal (target-to-source plane). J_ij
// receives 12 entries: [0..5] for the r1 term and [6..11] for the r2 term,
// each laid out as (source point x normal, normal). Returns false when the
// point has no valid correspondence (index == -1), leaving the outputs
// untouched.
template <typename scalar_t>
OPEN3D_HOST_DEVICE inline bool GetJacobianSymmetric(
        int64_t workload_idx,
        const scalar_t *source_points_ptr,
        const scalar_t *target_points_ptr,
        const scalar_t *source_normals_ptr,
        const scalar_t *target_normals_ptr,
        const int64_t *correspondence_indices,
        scalar_t *J_ij,
        scalar_t &r1,
        scalar_t &r2) {
    const int64_t corres = correspondence_indices[workload_idx];
    if (corres == -1) {
        return false;
    }

    const scalar_t *s = source_points_ptr + 3 * workload_idx;
    const scalar_t *t = target_points_ptr + 3 * corres;
    const scalar_t *ns = source_normals_ptr + 3 * workload_idx;
    const scalar_t *nt = target_normals_ptr + 3 * corres;

    // Point difference, shared by both residuals.
    const scalar_t dx = s[0] - t[0];
    const scalar_t dy = s[1] - t[1];
    const scalar_t dz = s[2] - t[2];

    // Symmetric ICP: minimize distances along both the target normal
    // (source-to-target) and the source normal (target-to-source).
    r1 = dx * nt[0] + dy * nt[1] + dz * nt[2];
    r2 = dx * ns[0] + dy * ns[1] + dz * ns[2];

    // First term: rotational part (s x n_t) followed by translational n_t.
    J_ij[0] = nt[2] * s[1] - nt[1] * s[2];
    J_ij[1] = nt[0] * s[2] - nt[2] * s[0];
    J_ij[2] = nt[1] * s[0] - nt[0] * s[1];
    J_ij[3] = nt[0];
    J_ij[4] = nt[1];
    J_ij[5] = nt[2];

    // Second term: same structure with the source normal n_s.
    J_ij[6] = ns[2] * s[1] - ns[1] * s[2];
    J_ij[7] = ns[0] * s[2] - ns[2] * s[0];
    J_ij[8] = ns[1] * s[0] - ns[0] * s[1];
    J_ij[9] = ns[0];
    J_ij[10] = ns[1];
    J_ij[11] = ns[2];

    return true;
}
285+
286+
// Explicit instantiations of GetJacobianSymmetric for the two floating-point
// scalar types dispatched at runtime (float for Float32, double for Float64).
template bool GetJacobianSymmetric(int64_t workload_idx,
                                   const float *source_points_ptr,
                                   const float *target_points_ptr,
                                   const float *source_normals_ptr,
                                   const float *target_normals_ptr,
                                   const int64_t *correspondence_indices,
                                   float *J_ij,
                                   float &r1,
                                   float &r2);

template bool GetJacobianSymmetric(int64_t workload_idx,
                                   const double *source_points_ptr,
                                   const double *target_points_ptr,
                                   const double *source_normals_ptr,
                                   const double *target_normals_ptr,
                                   const int64_t *correspondence_indices,
                                   double *J_ij,
                                   double &r1,
                                   double &r2);
305+
218306
template <typename scalar_t>
219307
OPEN3D_HOST_DEVICE inline bool GetJacobianColoredICP(
220308
const int64_t workload_idx,

0 commit comments

Comments
 (0)