@@ -229,6 +229,105 @@ void ComputePoseColoredICPCUDA(const core::Tensor &source_points,
     DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
 }
 
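+// Accumulates, per valid correspondence, the symmetric ICP contributions to
+// the 29-element reduction: the 21 upper-triangular entries of JtJ, the 6
+// entries of Jtr, the sum of squared residuals, and the inlier count.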
+template <typename scalar_t, typename func_t>
+__global__ void ComputePoseSymmetricKernelCUDA(
+        const scalar_t *source_points_ptr,
+        const scalar_t *target_points_ptr,
+        const scalar_t *source_normals_ptr,
+        const scalar_t *target_normals_ptr,
+        const int64_t *correspondence_indices,
+        const int n,
+        scalar_t *global_sum,
+        func_t GetWeightFromRobustKernel) {
+    typedef utility::MiniVec<scalar_t, kReduceDim> ReduceVec;
+    // Create shared memory.
+    typedef cub::BlockReduce<ReduceVec, kThread1DUnit> BlockReduce;
+    __shared__ typename BlockReduce::TempStorage temp_storage;
+    ReduceVec local_sum(static_cast<scalar_t>(0));
+
+    const int workload_idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (workload_idx < n) {
+        scalar_t J_ij[12] = {0};  // 6 for each term in symmetric ICP.
+        scalar_t r1 = 0, r2 = 0;
+        const bool valid = GetJacobianSymmetric<scalar_t>(
+                workload_idx, source_points_ptr, target_points_ptr,
+                source_normals_ptr, target_normals_ptr,
+                correspondence_indices, J_ij, r1, r2);
+
+        if (valid) {
+            const scalar_t w1 = GetWeightFromRobustKernel(r1);
+            const scalar_t w2 = GetWeightFromRobustKernel(r2);
+
+            // Accumulate JtJ and Jtr for both terms.
+            int i = 0;
+            for (int j = 0; j < 6; ++j) {
+                for (int k = 0; k <= j; ++k) {
+                    // Contribution from first term (source to target).
+                    local_sum[i] += J_ij[j] * w1 * J_ij[k];
+                    // Contribution from second term (target to source).
+                    local_sum[i] += J_ij[j + 6] * w2 * J_ij[k + 6];
+                    ++i;
+                }
+                // Jtr contributions.
+                local_sum[21 + j] += J_ij[j] * w1 * r1 + J_ij[j + 6] * w2 * r2;
+            }
+            local_sum[27] += r1 * r1 + r2 * r2;
+            local_sum[28] += 1;
+        }
+    }
+
+    // Reduction.
+    auto result = BlockReduce(temp_storage).Sum(local_sum);
+
+    // Add result to global_sum.
+    if (threadIdx.x == 0) {
+#pragma unroll
+        for (int i = 0; i < kReduceDim; ++i) {
+            atomicAdd(&global_sum[i], result[i]);
+        }
+    }
+}
+
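+// Host-side wrapper: dispatches on the floating-point dtype and the robust
+// kernel, launches the reduction kernel above into a 29-element global sum,
+// and solves the 6x6 system for the pose, residual, and inlier count.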
+void ComputePoseSymmetricCUDA(const core::Tensor &source_points,
+                              const core::Tensor &target_points,
+                              const core::Tensor &source_normals,
+                              const core::Tensor &target_normals,
+                              const core::Tensor &correspondence_indices,
+                              core::Tensor &pose,
+                              float &residual,
+                              int &inlier_count,
+                              const core::Dtype &dtype,
+                              const core::Device &device,
+                              const registration::RobustKernel &kernel) {
+    core::CUDAScopedDevice scoped_device(source_points.GetDevice());
+    int n = source_points.GetLength();
+
+    core::Tensor global_sum = core::Tensor::Zeros({29}, dtype, device);
+    const dim3 blocks((n + kThread1DUnit - 1) / kThread1DUnit);
+    const dim3 threads(kThread1DUnit);
+
+    DISPATCH_FLOAT_DTYPE_TO_TEMPLATE(dtype, [&]() {
+        scalar_t *global_sum_ptr = global_sum.GetDataPtr<scalar_t>();
+
+        DISPATCH_ROBUST_KERNEL_FUNCTION(
+                kernel.type_, scalar_t, kernel.scaling_parameter_,
+                kernel.shape_parameter_, [&]() {
+                    ComputePoseSymmetricKernelCUDA<<<blocks, threads, 0,
+                                                     core::cuda::GetStream()>>>(
+                            source_points.GetDataPtr<scalar_t>(),
+                            target_points.GetDataPtr<scalar_t>(),
+                            source_normals.GetDataPtr<scalar_t>(),
+                            target_normals.GetDataPtr<scalar_t>(),
+                            correspondence_indices.GetDataPtr<int64_t>(), n,
+                            global_sum_ptr, GetWeightFromRobustKernel);
+                });
+    });
+
+    core::cuda::Synchronize();
+
+    DecodeAndSolve6x6(global_sum, pose, residual, inlier_count);
+}
+
 template <typename scalar_t, typename funct1_t, typename funct2_t>
 __global__ void ComputePoseDopplerICPKernelCUDA(
         const scalar_t *source_points_ptr,