From 6317e854b856f3b0869c873b7b3049027778591c Mon Sep 17 00:00:00 2001
From: qbp758 <qbp758@futharkhpa03fl.unicph.domain>
Date: Mon, 2 Oct 2023 19:23:13 +0200
Subject: [PATCH 1/3] add docstring

---
 .../collision_detection/compute_contacts.py   | 303 +++++++++++-------
 .../prox_soft_bodies/collision_detection.py   |  51 ++-
 2 files changed, 232 insertions(+), 122 deletions(-)

diff --git a/python/rainbow/cuda/collision_detection/compute_contacts.py b/python/rainbow/cuda/collision_detection/compute_contacts.py
index 0be6f2a..e63c958 100644
--- a/python/rainbow/cuda/collision_detection/compute_contacts.py
+++ b/python/rainbow/cuda/collision_detection/compute_contacts.py
@@ -10,6 +10,14 @@
 
 @cuda.jit(device=True)
 def xform_triangle_to_model_space_device(P, X, X0, P0):
+    """ Converts a world space triangle into the material space of a tetrahedron.
+
+    Args:
+        P (float64[:, :]): The input triangle corner points in world space.
+        X (float64[:, :]): The input tetrahedron corners in world space.
+        X0 (float64[:, :]): The input tetrahedron corners in material space.
+        P0 (float64[:, :]): Output. The triangle corner points in the tetrahedron's material coordinate space.
+    """
     Matrix.mat33_zero(P0)
 
     w0 = cuda.local.array(4, dtype=float64)
@@ -29,144 +37,202 @@ def xform_triangle_to_model_space_device(P, X, X0, P0):
 
 
 @cuda.jit(device=True)
-def compute_omegaB_device(p, X0B, omegaB):
-    BC.compute_barycentric_tetrahedron_device(X0B[0], X0B[1], X0B[2], X0B[3], p, omegaB)
+def compute_omega_device(p, X, omega):
+    """ Compute the barycentric coordinates of a point p with respect to a tetrahedron.
 
+    Args:
+        p (float64[:]): The point to express in barycentric coordinates, given in the same coordinate space as X.
+        X (float64[:, :]): The corner points of a tetrahedron, in world or material coordinates.
+        omega (float64[:]): Output. The barycentric coordinates w1, w2, w3 and w4.
+    """
+    BC.compute_barycentric_tetrahedron_device(X[0], X[1], X[2], X[3], p, omega)
+
+
+# @cuda.jit(device=True)
+# def compute_omegaB_device(p, X0B, omegaB):
+#     BC.compute_barycentric_tetrahedron_device(X0B[0], X0B[1], X0B[2], X0B[3], p, omegaB)
 
-@cuda.jit(device=True)
-def compute_omegaA_device(p, XA, omegaA):
-    BC.compute_barycentric_tetrahedron_device(XA[0], XA[1], XA[2], XA[3], p, omegaA)
+
+# @cuda.jit(device=True)
+# def compute_omegaA_device(p, XA, omegaA):
+#     BC.compute_barycentric_tetrahedron_device(XA[0], XA[1], XA[2], XA[3], p, omegaA)
 
 
 @cuda.jit(device=True)
-def compute_p_device(p, XB, omegaB, result):
+def compute_p_device(p, X, omega, result):
+    """ Compute the contact point in the world space of a body
+
+    Args:
+        p (float64[:]): The contact point in the model space of a body (not read by the computation).
+        X (float64[:, :]): The corner points of a tetrahedron in world coordinates.
+        omega (float64[:]): The barycentric coordinates w1, w2, w3 and w4 of the contact point.
+        result (float64[:]): Output. The contact point in the world space of a body.
+    """
     t = cuda.local.array((3, 4), dtype=float64)
-    Matrix.mat43_T(XB, t)
-    Matrix.mat34_dot_vec4(t, omegaB, result)
+    Matrix.mat43_T(X, t)
+    Matrix.mat34_dot_vec4(t, omega, result)
 
 
 @cuda.jit(device=True)
-def compute_n_device(n, XB, X0B, result):
+def compute_n_device(n, X, X0, result):
+    """ Compute the contact normal in the world space of a body
+
+    Args:
+        n (float64[:]): The contact normal in the model space of a body
+        X (float64[:, :]): The corner points of a tetrahedron in world coordinates.
+        X0 (float64[:, :]): The corner points of a tetrahedron in material coordinates.
+        result (float64[:]): The contact normal in the world space of a body
+    """
     D = cuda.local.array((3,3), dtype=float64)
     D0 = cuda.local.array((3, 3), dtype=float64)
     for i in range(3):
         for j in range(3):
-            D[i, j] = XB[i, j] - XB[3, j]
-            D0[i, j] = X0B[i, j] - X0B[3, j]
+            D[i, j] = X[i, j] - X[3, j]
+            D0[i, j] = X0[i, j] - X0[3, j]
     solve_res = cuda.local.array(3, dtype=float64)
     LinAlg.cramer_solver(D0, n, solve_res)
     Matrix.mat33_dot_vec3(D, solve_res, result)
 
 
 @cuda.jit(device=True)
-def compute_contacts_device(idx_triA, idx_triB, A_owners, B_owners, A_x, B_x, B_x0, A_surface, A_T, B_T,  B_grid_min_coord, B_grid_max_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, max_iteration, optimization_tolerance, envelope, boundary, result):
-        idx_tetA = A_owners[idx_triA][0]
-        idx_tetB = B_owners[idx_triB][0]
+def compute_contacts_device( idx_triA, idx_triB, A_owners, B_owners,
+                             A_x, B_x, B_x0, A_surface, A_T, B_T,  
+                             B_grid_min_coord, B_grid_max_coord, B_grid_spacing, 
+                             B_grid_I, B_grid_J, B_grid_K, B_grid_values, 
+                             max_iteration, optimization_tolerance, envelope, boundary, result):
+    """ Compute the contact points between a triangle and a tetrahedron.
+
+    Args:
+        idx_triA (int32): The index of the triangle in the triangle array of body A.
+        idx_triB (int32): The index of the triangle in the triangle array of body B.
+        A_owners (dict): The dict of the tetrahedra of body A that own the triangle.
+        B_owners (dict): The dict of the tetrahedra of body B that own the triangle.
+        A_x (float64[:, :]): The vertices in deformed coordinates of body A.
+        B_x (float64[:, :]): The vertices in deformed coordinates of body B.
+        B_x0 (float64[:, :]): The vertices in material coordinates of body B.
+        A_surface (int32[:, :]): The indices of the vertices of the triangle of body A.
+        A_T (int32[:, :]): The indices of the vertices of the tetrahedra of body A.
+        B_T (int32[:, :]): The indices of the vertices of the tetrahedra of body B.
+        B_grid_min_coord (float64): The minimum coordinate of the grid of body B.
+        B_grid_max_coord (float64): The maximum coordinate of the grid of body B.
+        B_grid_spacing (float64[:]): The spacing of the grid of body B.
+        B_grid_I (int32): The number of cells in the x-direction of the grid.
+        B_grid_J (int32): The number of cells in the y-direction of the grid.
+        B_grid_K (int32): The number of cells in the z-direction of the grid.
+        B_grid_values (float64[:]): It is an array storing signed distances of grid points to a given mesh.
+        max_iteration (int32): Maximum number of Frank-Wolfe iterations used by the contact optimization.
+        optimization_tolerance (float64): The tolerance for the Frank-Wolfe collision detection algorithm.
+        envelope (float64): Any geometry within this distance generates a contact point.
+        boundary (float64): The boundary width. This specifies how far inside the grid bounding box the point has to be to be considered inside. NOTE: the inside test currently passes a hard-coded 0.5 instead of this value.
+        result (record): Output. The data of the contact point computation for creating a ContactPoint instance on CPU.
+    """
+    idx_tetA = A_owners[idx_triA][0]
+    idx_tetB = B_owners[idx_triB][0]
+
+    P = cuda.local.array((3, 3), dtype=float64)
+    XB = cuda.local.array((4, 3), dtype=float64)
+    X0B = cuda.local.array((4, 3), dtype=float64)
+
+    surface_indices = A_surface[idx_triA] 
+    for j in range(3):
+        for k in range(3): 
+            P[j, k] = A_x[surface_indices[j], k]
+        
+    tet_indices = B_T[idx_tetB] 
+    for j in range(4):
+        for k in range(3): 
+            XB[j, k] = B_x[tet_indices[j], k]
+            X0B[j, k] = B_x0[tet_indices[j], k]
+
+    P0 = cuda.local.array((3, 3), dtype=float64)
+    xform_triangle_to_model_space_device(P, XB, X0B, P0) 
+        
+    gradient_norms = cuda.local.array(3, dtype=float64)
+    gradient0 = cuda.local.array(3, float64)
+    gradient1 = cuda.local.array(3, float64)
+    gradient2 = cuda.local.array(3, float64)
+
+    Grid3.get_gradient_device(P0[0], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient0)
+    Grid3.get_gradient_device(P0[1], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient1)
+    Grid3.get_gradient_device(P0[2], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient2)
+
+    gradient_norms[0] = Vector.vec3_norm(gradient0)
+    gradient_norms[1] = Vector.vec3_norm(gradient1)
+    gradient_norms[2] = Vector.vec3_norm(gradient2)
+
+    x_i = cuda.local.array(3, dtype=float64)
+    for i in range(3):
+        x_i[i] = P0[Vector.argmin(gradient_norms, 3)][i]
 
-        P = cuda.local.array((3, 3), dtype=float64)
-        XB = cuda.local.array((4, 3), dtype=float64)
-        X0B = cuda.local.array((4, 3), dtype=float64)
+    for i in range(max_iteration):
+        t = cuda.local.array(3, dtype=float64)
+        Grid3.get_gradient_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, t)
 
-        surface_indices = A_surface[idx_triA] 
+        objectives = cuda.local.array(3, dtype=float64)
         for j in range(3):
-            for k in range(3): 
-                P[j, k] = A_x[surface_indices[j], k]
-        
-        tet_indices = B_T[idx_tetB] 
-        for j in range(4):
-            for k in range(3): 
-                XB[j, k] = B_x[tet_indices[j], k]
-                X0B[j, k] = B_x0[tet_indices[j], k]
-
-        P0 = cuda.local.array((3, 3), dtype=float64)
-        xform_triangle_to_model_space_device(P, XB, X0B, P0) 
-        
-        gradient_norms = cuda.local.array(3, dtype=float64)
-        gradient0 = cuda.local.array(3, float64)
-        gradient1 = cuda.local.array(3, float64)
-        gradient2 = cuda.local.array(3, float64)
-
-        Grid3.get_gradient_device(P0[0], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient0)
-        Grid3.get_gradient_device(P0[1], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient1)
-        Grid3.get_gradient_device(P0[2], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient2)
-
-        gradient_norms[0] = Vector.vec3_norm(gradient0)
-        gradient_norms[1] = Vector.vec3_norm(gradient1)
-        gradient_norms[2] = Vector.vec3_norm(gradient2)
-
-        x_i = cuda.local.array(3, dtype=float64)
-        for i in range(3):
-            x_i[i] = P0[Vector.argmin(gradient_norms, 3)][i]
-
-        for i in range(max_iteration):
-            t = cuda.local.array(3, dtype=float64)
-            Grid3.get_gradient_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, t)
-
-            objectives = cuda.local.array(3, dtype=float64)
-            for j in range(3):
-                objectives[j] = Vector.vec3_dot(P0[j], t)
+            objectives[j] = Vector.vec3_dot(P0[j], t)
             
-            vertex = Vector.argmin(objectives, 3)
-            s_i = P0[vertex]
-            alpha = 2 / (i + 2)
+        vertex = Vector.argmin(objectives, 3)
+        s_i = P0[vertex]
+        alpha = 2 / (i + 2)
 
-            sx_sub = cuda.local.array(3, dtype=float64)
-            Vector.vec3_sub(s_i, x_i, sx_sub)
+        sx_sub = cuda.local.array(3, dtype=float64)
+        Vector.vec3_sub(s_i, x_i, sx_sub)
 
-            sx_sub_alpha = cuda.local.array(3, dtype=float64)
-            Vector.vec3_mut_scalar(sx_sub, alpha, sx_sub_alpha)
+        sx_sub_alpha = cuda.local.array(3, dtype=float64)
+        Vector.vec3_mut_scalar(sx_sub, alpha, sx_sub_alpha)
 
-            x_i_new = cuda.local.array(3, dtype=float64)
-            Vector.vec3_add(x_i, sx_sub_alpha, x_i_new)
+        x_i_new = cuda.local.array(3, dtype=float64)
+        Vector.vec3_add(x_i, sx_sub_alpha, x_i_new)
 
-            for j in range(3):
-                x_i[j] = x_i_new[j]
+        for j in range(3):
+            x_i[j] = x_i_new[j]
 
-            if objectives[vertex] > optimization_tolerance:
-                break
+        if objectives[vertex] > optimization_tolerance:
+            break
         
         # contact point generation
-        if Grid3.is_inside_device(x_i, B_grid_min_coord, B_grid_max_coord, 0.5):
-            phi = Grid3.get_value_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values)
-            if phi < envelope:
-                gap = phi
-                n_res = cuda.local.array(3, dtype=float64)
-                Grid3.get_gradient_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, n_res)
-                if Vector.vec3_norm(n_res) > 0:
-                    XA = cuda.local.array((4, 3), dtype=float64)
-                    indices = A_T[idx_tetA]
-                    for i in range(4):
-                        for j in range(3):
-                            XA[i][j] = A_x[indices[i], j]
+    if Grid3.is_inside_device(x_i, B_grid_min_coord, B_grid_max_coord, 0.5):
+        phi = Grid3.get_value_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values)
+        if phi < envelope:
+            gap = phi
+            n_res = cuda.local.array(3, dtype=float64)
+            Grid3.get_gradient_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, n_res)
+            if Vector.vec3_norm(n_res) > 0:
+                XA = cuda.local.array((4, 3), dtype=float64)
+                indices = A_T[idx_tetA]
+                for i in range(4):
+                    for j in range(3):
+                        XA[i][j] = A_x[indices[i], j]
                     
-                    omegaB = cuda.local.array(4, dtype=float64)
-                    compute_omegaB_device(x_i, X0B, omegaB)
-                    p = cuda.local.array(3, dtype=float64)
-                    compute_p_device(x_i, XB, omegaB, p)
-                    omegaA = cuda.local.array(4, dtype=float64)
-                    compute_omegaA_device(p, XA, omegaA)
-                    n_res_new = cuda.local.array(3, dtype=float64)
-                    compute_n_device(n_res, XB, X0B, n_res_new)
-                    unit_n = cuda.local.array(3, dtype=float64)
-                    Vector.vec3_unit(n_res_new, unit_n)
-
-                    result['idx_tetB'] = idx_tetB
-                    result['idx_tetA'] = idx_tetA
-                    result['omegaB'] = omegaB
-                    result['omegaA'] = omegaA
-                    result['p'] = p
-                    result['unit_n'] = unit_n
-                    result['gap'] = gap
-
-                else:
-                    result['idx_tetB'] = -3
-                    result['idx_tetA'] = -3
+                omegaB = cuda.local.array(4, dtype=float64)
+                compute_omega_device(x_i, X0B, omegaB)
+                p = cuda.local.array(3, dtype=float64)
+                compute_p_device(x_i, XB, omegaB, p)
+                omegaA = cuda.local.array(4, dtype=float64)
+                compute_omega_device(p, XA, omegaA)
+                n_res_new = cuda.local.array(3, dtype=float64)
+                compute_n_device(n_res, XB, X0B, n_res_new)
+                unit_n = cuda.local.array(3, dtype=float64)
+                Vector.vec3_unit(n_res_new, unit_n)
+
+                result['idx_tetB'] = idx_tetB
+                result['idx_tetA'] = idx_tetA
+                result['omegaB'] = omegaB
+                result['omegaA'] = omegaA
+                result['p'] = p
+                result['unit_n'] = unit_n
+                result['gap'] = gap
+
             else:
-                result['idx_tetB'] = -2
-                result['idx_tetA'] = -2
+                result['idx_tetB'] = -1
+                result['idx_tetA'] = -1
         else:
             result['idx_tetB'] = -1
             result['idx_tetA'] = -1
+    else:
+        result['idx_tetB'] = -1
+        result['idx_tetA'] = -1
 
 
 @cuda.jit(lineinfo=True)
@@ -176,6 +242,30 @@ def contact_points_computing_kernel( d_bodyA_idxs, d_bodyB_idxs, d_overlaps,
                                     d_B_grid_min_coords, d_B_grid_max_coords, d_B_grid_spacings,
                                     d_B_grid_Is, d_B_grid_Js, d_B_grid_Ks,
                                     max_iterations, tolerance, envelope, boundary, result_gpu):
+    """ This is a kernel function for computing contact points between a triangle and a tetrahedron.
+
+    Args:
+        d_B_values (float64[:, :]): It is an array storing signed distances of grid points to a given mesh.
+        d_A_owners (list(dict)): The dict of the tetrahedra of body A that own the triangle.
+        d_B_owners (list(dict)): The dict of the tetrahedra of body B that own the triangle.
+        d_A_xs (float64[:, :, :]): The vertices in deformed coordinates of body A.
+        d_B_xs (float64[:, :, :]): The vertices in deformed coordinates of body B.
+        d_B_x0s (float64[:, :, :]): The vertices in material coordinates of body B.
+        d_A_surfaces (int32[:]): The indices of the vertices of the triangle of body A.
+        d_A_Ts (int32[:, :]): The indices of the vertices of the tetrahedra of body A.
+        d_B_Ts (int32[:, :]): The indices of the vertices of the tetrahedra of body B.
+        d_B_grid_min_coords (float64[:]): The minimum coordinate of the grid of body B.
+        d_B_grid_max_coords (float64[:]): The maximum coordinate of the grid of body B.
+        d_B_grid_spacings (float64[:]):  The spacing of the grid of body B.
+        d_B_grid_Is (int32[:]): The number of cells in the x-direction of the grid.
+        d_B_grid_Js (int32[:]): The number of cells in the y-direction of the grid.
+        d_B_grid_Ks (int32[:]): The number of cells in the z-direction of the grid.
+        max_iterations (int32): Maximum number of Frank-Wolfe iterations used by the contact optimization.
+        tolerance (float64): The tolerance for the Frank-Wolfe collision detection algorithm.
+        envelope (float64): Any geometry within this distance generates a contact point.
+        boundary (float64): The boundary width. This specifies how far inside the grid bounding box the point has to be to be considered inside.
+        result_gpu (list(dict)): The data of the contact point computation for creating a ContactPoint instance on CPU.
+    """
     gid = cuda.grid(1)
 
     if gid < d_overlaps.shape[0]:
@@ -197,10 +287,5 @@ def contact_points_computing_kernel( d_bodyA_idxs, d_bodyB_idxs, d_overlaps,
         B_grid_I = d_B_grid_Is[gid]
         B_grid_J = d_B_grid_Js[gid]
         B_grid_K = d_B_grid_Ks[gid]
-        
-
-        ## Those data maybe need to tansform to Shared Memory, but I just test two small bodies in scene, those data size total is over 30KB except the 'grid_values', and the 'grid_values' is over 4MB. Those data size is too large to put in Shared Memory. Maybe we need to consider other ways to optimize this.
-        
-
-        compute_contacts_device(idx_triA, idx_triB, A_owners, B_owners, A_x, B_x, B_x0, A_surface, A_T, B_T,  B_grid_min_coord, B_grid_max_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, max_iterations, tolerance, envelope, boundary, result_gpu[gid])
 
+        compute_contacts_device(idx_triA, idx_triB, A_owners, B_owners, A_x, B_x, B_x0, A_surface, A_T, B_T,  B_grid_min_coord, B_grid_max_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, max_iterations, tolerance, envelope, boundary, result_gpu[gid])
\ No newline at end of file
diff --git a/python/rainbow/simulators/prox_soft_bodies/collision_detection.py b/python/rainbow/simulators/prox_soft_bodies/collision_detection.py
index 702800b..cd88ff0 100644
--- a/python/rainbow/simulators/prox_soft_bodies/collision_detection.py
+++ b/python/rainbow/simulators/prox_soft_bodies/collision_detection.py
@@ -243,6 +243,15 @@ def _compute_contacts(engine, stats, bodyA, bodyB, results, debug_on):
 
 
 def _uniform_padding(listss, padding_value):
+    """ Pad the sub lists of listss so that they all have the same length.
+
+    Args:
+        listss (List[List]): A nested list.
+        padding_value (DataType): A padding value, which is used to pad the list.
+
+    Returns:
+        List[List]: The padded listss, in which every sub list has the same length.
+    """
     valid_lists = [l for l in listss if l is not None]
 
     if len(valid_lists) == 0:
@@ -258,6 +267,15 @@ def _uniform_padding(listss, padding_value):
 
 
 def _assemble_body_data_to_gpu(data_lists, bodyA, bodyB, triA, triB):
+    """ Assemble body data to a data list for GPU computing.
+
+    Args:
+        data_lists (List): The data list for GPU computing.
+        bodyA (SoftBody): A SoftBody instance.
+        bodyB (SoftBody): A SoftBody instance.
+        triA (int): The index of triangle in bodyA.
+        triB (int): The index of triangle in bodyB.
+    """
     data_lists['bodyA_idxs'].append(bodyA.idx)
     data_lists['bodyB_idxs'].append(bodyB.idx)
     data_lists['overlap_results'].append((triA, triB))
@@ -277,14 +295,21 @@ def _assemble_body_data_to_gpu(data_lists, bodyA, bodyB, triA, triB):
     data_lists['B_grid_Js'].append(bodyB.grid.J)
     data_lists['B_grid_Ks'].append(bodyB.grid.K)
 
+
 def _contact_point_gpu(overlaps, engine, stats, debug_on):
+    """ The GPU version of contact point computing. It flattens the data, sends it to the GPU, and then calls the kernel function.
 
-    contact_optimization_timer = None
-    model_space_update_timer = None
+    Args:
+        overlaps (dict): A dictionary of triangles from one body that overlaps another body.
+        engine (Engine):  The current engine instance we are working with.
+        stats (dict): A dictionary where to add more profiling and timing measurements.
+        debug_on (bool): Boolean flag for toggling debug (aka profiling) info on and off.
+
+    Returns:
+        dict:  A dictionary with profiling and timing measurements.
+    """
     contact_point_generation_timer = None
     if debug_on:
-        # model_space_update_timer = Timer("model_space_update")
-        # contact_optimization_timer = Timer("contact_optimization")
         contact_point_generation_timer = Timer("contact_point_generation")
 
     data_lists = {
@@ -348,14 +373,17 @@ def _contact_point_gpu(overlaps, engine, stats, debug_on):
         'B_grid_Ks': np.int32
     }
 
+    # copy data to GPU
     d_data = {}
     for key, data in data_lists.items():
         array_data = np.array(data, dtype=type_map.get(key))
         d_data[f'd_{key}'] = cuda.to_device(array_data)
 
+    # setting up GPU computing (grid and block)
     threads_per_block = engine.params.gpu_grid_size
     blocks_per_grid = (data_length + threads_per_block - 1) // threads_per_block
 
+    # setting up result from GPU (data type and size)
     result_dtype = np.dtype([
         ('idx_tetB', np.int32),
         ('idx_tetA', np.int32),
@@ -367,6 +395,7 @@ def _contact_point_gpu(overlaps, engine, stats, debug_on):
     ])
     result_gpu = cuda.device_array(data_length, dtype=result_dtype)
 
+    # call GPU kernel function
     CUDA_COMPUTE_CONTACTS.contact_points_computing_kernel[blocks_per_grid, threads_per_block](
         d_data['d_bodyA_idxs'], d_data['d_bodyB_idxs'], d_data['d_overlap_results'],
         d_data['d_B_values'], d_data['d_A_owners'], d_data['d_B_owners'],
@@ -378,8 +407,9 @@ def _contact_point_gpu(overlaps, engine, stats, debug_on):
         engine.params.contact_optimization_tolerance,
         engine.params.envelope, 0.5, result_gpu)
 
-    cuda.synchronize() ## wait for GPU data 
-    result_to_cpu = result_gpu.copy_to_host() ## copy GPU data to CPU
+    # wait for GPU data and copy to CPU
+    cuda.synchronize() 
+    result_to_cpu = result_gpu.copy_to_host() 
 
     ## generate contact points
     for res in result_to_cpu:
@@ -394,12 +424,6 @@ def _contact_point_gpu(overlaps, engine, stats, debug_on):
         engine.contact_points.append(cp)
 
     if debug_on:
-        # if "model_space_update" not in stats:
-        #     stats["model_space_update"] = 0
-        # stats["model_space_update"] += model_space_update_timer.total
-        # if "contact_optimization" not in stats:
-        #     stats["contact_optimization"] = 0
-        # stats["contact_optimization"] += contact_optimization_timer.total
         if "contact_point_generation" not in stats:
             stats["contact_point_generation"] = 0
         stats["contact_point_generation"] += contact_point_generation_timer.total
@@ -430,8 +454,9 @@ def _contact_determination(overlaps, engine, stats, debug_on):
         if debug_on:
             contact_determination_timer.end()
             stats["contact_determination"] = contact_determination_timer.elapsed
-
         return stats
+    
+    # contact points computing on CPU, if GPU is not available or the flag is False
     for key, results in overlaps.items():
         # TODO 2022-12-31 Kenny: The code currently computes a lot of redundant contacts due
         #  to BVH traversal may return a triangle as part of several pairs. We only need

From 3594641779415a2631f0800fef31a2fb63f4db8d Mon Sep 17 00:00:00 2001
From: qbp758 <qbp758@futharkhpa03fl.unicph.domain>
Date: Mon, 2 Oct 2023 19:26:31 +0200
Subject: [PATCH 2/3] remove outer comments

---
 .../cuda/collision_detection/compute_contacts.py       | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/python/rainbow/cuda/collision_detection/compute_contacts.py b/python/rainbow/cuda/collision_detection/compute_contacts.py
index e63c958..a4ad4d0 100644
--- a/python/rainbow/cuda/collision_detection/compute_contacts.py
+++ b/python/rainbow/cuda/collision_detection/compute_contacts.py
@@ -48,16 +48,6 @@ def compute_omega_device(p, X, omega):
     BC.compute_barycentric_tetrahedron_device(X[0], X[1], X[2], X[3], p, omega)
 
 
-# @cuda.jit(device=True)
-# def compute_omegaB_device(p, X0B, omegaB):
-#     BC.compute_barycentric_tetrahedron_device(X0B[0], X0B[1], X0B[2], X0B[3], p, omegaB)
-
-
-# @cuda.jit(device=True)
-# def compute_omegaA_device(p, XA, omegaA):
-#     BC.compute_barycentric_tetrahedron_device(XA[0], XA[1], XA[2], XA[3], p, omegaA)
-
-
 @cuda.jit(device=True)
 def compute_p_device(p, X, omega, result):
     """ Compute the contact point in the world space of a body

From 7037884d0e6430bf58e39ba5b7df5703722745c4 Mon Sep 17 00:00:00 2001
From: qbp758 <qbp758@futharkhpa03fl.unicph.domain>
Date: Mon, 2 Oct 2023 19:36:19 +0200
Subject: [PATCH 3/3] unittest check the cuda is available or not

---
 python/rainbow/simulators/prox_soft_bodies/types.py | 2 +-
 python/unit_tests/test_cuda_geometry_barycentric.py | 1 +
 python/unit_tests/test_cuda_geometry_grid3.py       | 1 +
 python/unit_tests/test_cuda_math_linalg.py          | 1 +
 python/unit_tests/test_cuda_math_matrix.py          | 1 +
 python/unit_tests/test_cuda_math_vec.py             | 1 +
 6 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/rainbow/simulators/prox_soft_bodies/types.py b/python/rainbow/simulators/prox_soft_bodies/types.py
index 92e186f..2fb5884 100644
--- a/python/rainbow/simulators/prox_soft_bodies/types.py
+++ b/python/rainbow/simulators/prox_soft_bodies/types.py
@@ -252,7 +252,7 @@ def __init__(self):
             0.1  # Any geometry within this distance generates a contact point.
         )
         self.resolution = 64  # The number of grid cells along each axis in the signed distance fields.
-        self.use_gpu = True # Boolean flag that indicates if we should use the GPU or not.
+        self.use_gpu = False # Boolean flag that indicates if we should use the GPU or not.
         self.gpu_grid_size = 256 # The number of threads per block to use on the GPU.
 
 
diff --git a/python/unit_tests/test_cuda_geometry_barycentric.py b/python/unit_tests/test_cuda_geometry_barycentric.py
index e54be9d..4188ba2 100644
--- a/python/unit_tests/test_cuda_geometry_barycentric.py
+++ b/python/unit_tests/test_cuda_geometry_barycentric.py
@@ -11,6 +11,7 @@
 import rainbow.cuda.unit_tests.test_geometry_barycentric_kernel as TEST_BCK
 
 
+@unittest.skipIf(not cuda.is_available(), "CUDA not available")
 class TestGrid3Cuda(unittest.TestCase):
     
     def test_compute_barycentric_tetrahedron(self):
diff --git a/python/unit_tests/test_cuda_geometry_grid3.py b/python/unit_tests/test_cuda_geometry_grid3.py
index 726c3ac..288aec9 100644
--- a/python/unit_tests/test_cuda_geometry_grid3.py
+++ b/python/unit_tests/test_cuda_geometry_grid3.py
@@ -11,6 +11,7 @@
 import rainbow.cuda.unit_tests.test_geometry_grid3_kernel as TEST_GRID3K
 
 
+@unittest.skipIf(not cuda.is_available(), "CUDA not available")
 def simpelfunc(coord):
     _, _, z = coord[0], coord[1], coord[2]
     return z
diff --git a/python/unit_tests/test_cuda_math_linalg.py b/python/unit_tests/test_cuda_math_linalg.py
index abce04e..238426c 100644
--- a/python/unit_tests/test_cuda_math_linalg.py
+++ b/python/unit_tests/test_cuda_math_linalg.py
@@ -10,6 +10,7 @@
 import rainbow.util.test_tools as TEST
 
 
+@unittest.skipIf(not cuda.is_available(), "CUDA not available")
 class TestLinAlgCuda(unittest.TestCase):
     
     def test_cramer_solver(self):
diff --git a/python/unit_tests/test_cuda_math_matrix.py b/python/unit_tests/test_cuda_math_matrix.py
index e5bc91e..679c076 100644
--- a/python/unit_tests/test_cuda_math_matrix.py
+++ b/python/unit_tests/test_cuda_math_matrix.py
@@ -10,6 +10,7 @@
 import rainbow.cuda.unit_tests.test_math_matrix_kernel as TEST_MK
 
 
+@unittest.skipIf(not cuda.is_available(), "CUDA not available")
 class TestMatrixCuda(unittest.TestCase):
     
     def test_mat33_T(self):
diff --git a/python/unit_tests/test_cuda_math_vec.py b/python/unit_tests/test_cuda_math_vec.py
index 31ff070..75687f3 100644
--- a/python/unit_tests/test_cuda_math_vec.py
+++ b/python/unit_tests/test_cuda_math_vec.py
@@ -10,6 +10,7 @@
 import rainbow.util.test_tools as TEST
 
 
+@unittest.skipIf(not cuda.is_available(), "CUDA not available")
 class TestVec3Cuda(unittest.TestCase):
     
     def test_vec3_add(self):