Skip to content

Commit

Permalink
merge numba cuda feature
Browse files Browse the repository at this point in the history
  • Loading branch information
qbp758 committed Oct 2, 2023
2 parents b79f5c0 + dd8d88a commit eb3d2b0
Show file tree
Hide file tree
Showing 19 changed files with 1,859 additions and 1 deletion.
1 change: 1 addition & 0 deletions .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ jobs:
conda install -c anaconda ipython_genutils
conda install -c conda-forge meshplot
pip install usd-core
conda install numba
coverage run -m unittest python/unit_tests/test_*.py
coverage report
- name: Format the code
Expand Down
206 changes: 206 additions & 0 deletions python/rainbow/cuda/collision_detection/compute_contacts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
import numpy as np
from numba import cuda, float64

import rainbow.cuda.math.vec as Vector
import rainbow.cuda.math.matrix as Matrix
import rainbow.cuda.math.linalg as LinAlg
import rainbow.cuda.geometry.barycentric as BC
import rainbow.cuda.geometry.grid3 as Grid3


@cuda.jit(device=True)
def xform_triangle_to_model_space_device(P, X, X0, P0):
Matrix.mat33_zero(P0)

w0 = cuda.local.array(4, dtype=float64)
w1 = cuda.local.array(4, dtype=float64)
w2 = cuda.local.array(4, dtype=float64)

BC.compute_barycentric_tetrahedron_device(X[0], X[1], X[2], X[3], P[0], w0)
BC.compute_barycentric_tetrahedron_device(X[0], X[1], X[2], X[3], P[1], w1)
BC.compute_barycentric_tetrahedron_device(X[0], X[1], X[2], X[3], P[2], w2)

t = cuda.local.array((3, 4), dtype=float64)
Matrix.mat43_T(X0, t)

Matrix.mat34_dot_vec4(t, w0, P0[0])
Matrix.mat34_dot_vec4(t, w1, P0[1])
Matrix.mat34_dot_vec4(t, w2, P0[2])


@cuda.jit(device=True)
def compute_omegaB_device(p, X0B, omegaB):
BC.compute_barycentric_tetrahedron_device(X0B[0], X0B[1], X0B[2], X0B[3], p, omegaB)


@cuda.jit(device=True)
def compute_omegaA_device(p, XA, omegaA):
BC.compute_barycentric_tetrahedron_device(XA[0], XA[1], XA[2], XA[3], p, omegaA)


@cuda.jit(device=True)
def compute_p_device(p, XB, omegaB, result):
t = cuda.local.array((3, 4), dtype=float64)
Matrix.mat43_T(XB, t)
Matrix.mat34_dot_vec4(t, omegaB, result)


@cuda.jit(device=True)
def compute_n_device(n, XB, X0B, result):
D = cuda.local.array((3,3), dtype=float64)
D0 = cuda.local.array((3, 3), dtype=float64)
for i in range(3):
for j in range(3):
D[i, j] = XB[i, j] - XB[3, j]
D0[i, j] = X0B[i, j] - X0B[3, j]
solve_res = cuda.local.array(3, dtype=float64)
LinAlg.cramer_solver(D0, n, solve_res)
Matrix.mat33_dot_vec3(D, solve_res, result)


@cuda.jit(device=True)
def compute_contacts_device(idx_triA, idx_triB, A_owners, B_owners, A_x, B_x, B_x0, A_surface, A_T, B_T, B_grid_min_coord, B_grid_max_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, max_iteration, optimization_tolerance, envelope, boundary, result):
idx_tetA = A_owners[idx_triA][0]
idx_tetB = B_owners[idx_triB][0]

P = cuda.local.array((3, 3), dtype=float64)
XB = cuda.local.array((4, 3), dtype=float64)
X0B = cuda.local.array((4, 3), dtype=float64)

surface_indices = A_surface[idx_triA]
for j in range(3):
for k in range(3):
P[j, k] = A_x[surface_indices[j], k]

tet_indices = B_T[idx_tetB]
for j in range(4):
for k in range(3):
XB[j, k] = B_x[tet_indices[j], k]
X0B[j, k] = B_x0[tet_indices[j], k]

P0 = cuda.local.array((3, 3), dtype=float64)
xform_triangle_to_model_space_device(P, XB, X0B, P0)

gradient_norms = cuda.local.array(3, dtype=float64)
gradient0 = cuda.local.array(3, float64)
gradient1 = cuda.local.array(3, float64)
gradient2 = cuda.local.array(3, float64)

Grid3.get_gradient_device(P0[0], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient0)
Grid3.get_gradient_device(P0[1], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient1)
Grid3.get_gradient_device(P0[2], B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, gradient2)

gradient_norms[0] = Vector.vec3_norm(gradient0)
gradient_norms[1] = Vector.vec3_norm(gradient1)
gradient_norms[2] = Vector.vec3_norm(gradient2)

x_i = cuda.local.array(3, dtype=float64)
for i in range(3):
x_i[i] = P0[Vector.argmin(gradient_norms, 3)][i]

for i in range(max_iteration):
t = cuda.local.array(3, dtype=float64)
Grid3.get_gradient_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, t)

objectives = cuda.local.array(3, dtype=float64)
for j in range(3):
objectives[j] = Vector.vec3_dot(P0[j], t)

vertex = Vector.argmin(objectives, 3)
s_i = P0[vertex]
alpha = 2 / (i + 2)

sx_sub = cuda.local.array(3, dtype=float64)
Vector.vec3_sub(s_i, x_i, sx_sub)

sx_sub_alpha = cuda.local.array(3, dtype=float64)
Vector.vec3_mut_scalar(sx_sub, alpha, sx_sub_alpha)

x_i_new = cuda.local.array(3, dtype=float64)
Vector.vec3_add(x_i, sx_sub_alpha, x_i_new)

for j in range(3):
x_i[j] = x_i_new[j]

if objectives[vertex] > optimization_tolerance:
break

# contact point generation
if Grid3.is_inside_device(x_i, B_grid_min_coord, B_grid_max_coord, 0.5):
phi = Grid3.get_value_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values)
if phi < envelope:
gap = phi
n_res = cuda.local.array(3, dtype=float64)
Grid3.get_gradient_device(x_i, B_grid_min_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, n_res)
if Vector.vec3_norm(n_res) > 0:
XA = cuda.local.array((4, 3), dtype=float64)
indices = A_T[idx_tetA]
for i in range(4):
for j in range(3):
XA[i][j] = A_x[indices[i], j]

omegaB = cuda.local.array(4, dtype=float64)
compute_omegaB_device(x_i, X0B, omegaB)
p = cuda.local.array(3, dtype=float64)
compute_p_device(x_i, XB, omegaB, p)
omegaA = cuda.local.array(4, dtype=float64)
compute_omegaA_device(p, XA, omegaA)
n_res_new = cuda.local.array(3, dtype=float64)
compute_n_device(n_res, XB, X0B, n_res_new)
unit_n = cuda.local.array(3, dtype=float64)
Vector.vec3_unit(n_res_new, unit_n)

result['idx_tetB'] = idx_tetB
result['idx_tetA'] = idx_tetA
result['omegaB'] = omegaB
result['omegaA'] = omegaA
result['p'] = p
result['unit_n'] = unit_n
result['gap'] = gap

else:
result['idx_tetB'] = -3
result['idx_tetA'] = -3
else:
result['idx_tetB'] = -2
result['idx_tetA'] = -2
else:
result['idx_tetB'] = -1
result['idx_tetA'] = -1


@cuda.jit(lineinfo=True)
def contact_points_computing_kernel( d_bodyA_idxs, d_bodyB_idxs, d_overlaps,
d_B_values, d_A_owners, d_B_owners, d_A_xs,
d_B_xs, d_B_x0s, d_A_surfaces, d_A_Ts, d_B_Ts,
d_B_grid_min_coords, d_B_grid_max_coords, d_B_grid_spacings,
d_B_grid_Is, d_B_grid_Js, d_B_grid_Ks,
max_iterations, tolerance, envelope, boundary, result_gpu):
gid = cuda.grid(1)

if gid < d_overlaps.shape[0]:
bodyA_idx = d_bodyA_idxs[gid]
bodyB_idx = d_bodyB_idxs[gid]
idx_triA, idx_triB = d_overlaps[gid]
B_grid_values = d_B_values[gid]
A_owners = d_A_owners[gid]
B_owners = d_B_owners[gid]
A_x = d_A_xs[gid]
B_x = d_B_xs[gid]
B_x0 = d_B_x0s[gid]
A_surface = d_A_surfaces[gid]
A_T = d_A_Ts[gid]
B_T = d_B_Ts[gid]
B_grid_min_coord = d_B_grid_min_coords[gid]
B_grid_max_coord = d_B_grid_max_coords[gid]
B_grid_spacing = d_B_grid_spacings[gid]
B_grid_I = d_B_grid_Is[gid]
B_grid_J = d_B_grid_Js[gid]
B_grid_K = d_B_grid_Ks[gid]


## Those data maybe need to tansform to Shared Memory, but I just test two small bodies in scene, those data size total is over 30KB except the 'grid_values', and the 'grid_values' is over 4MB. Those data size is too large to put in Shared Memory. Maybe we need to consider other ways to optimize this.


compute_contacts_device(idx_triA, idx_triB, A_owners, B_owners, A_x, B_x, B_x0, A_surface, A_T, B_T, B_grid_min_coord, B_grid_max_coord, B_grid_spacing, B_grid_I, B_grid_J, B_grid_K, B_grid_values, max_iterations, tolerance, envelope, boundary, result_gpu[gid])

60 changes: 60 additions & 0 deletions python/rainbow/cuda/geometry/barycentric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from numba import cuda, float64
import rainbow.cuda.math.vec as Vector
import rainbow.cuda.math.linalg as LinAlg
import rainbow.cuda.math.matrix as Matrix


@cuda.jit(device=True)
def compute_barycentric_tetrahedron_device(x1: float64[:], x2: float64[:], x3: float64[:], x4: float64[:], p: float64[:], result: float64[:]):
""" This method computes the barycentric coordinates for a point p of a tetrahedron
given by the points x1,x2,x3, x4 (in right-hand order).
Args:
x1 (float64[:]): The first point of the tetrahedron.
x2 (float64[:]): The second point of the tetrahedron.
x3 (float64[:]): The third point of the tetrahedron.
x4 (float64[:]): The fourth point of the tetrahedron.
p (float64[:]): The point for which the barycentric coordinates should be computed.
result (float64[:]): A quadruplet containing the first, second, third abd fourth
barycentric coordinates w1, w2, w3 and w4.
"""
b1 = cuda.local.array(3, dtype=float64)
b2 = cuda.local.array(3, dtype=float64)
b3 = cuda.local.array(3, dtype=float64)

Vector.vec3_sub(x2, x1, b1)
Vector.vec3_sub(x3, x1, b2)
Vector.vec3_sub(x4, x1, b3)

cross_result = cuda.local.array(3, dtype=float64)
Vector.vec3_cross(b2, b3, cross_result)

if Vector.vec3_dot(b1, cross_result) > 0:
basic = cuda.local.array((3, 3), dtype=float64)
Matrix.mat33_make_from_cols(b1, b2, b3, basic)
p_sub_x1 = cuda.local.array(3, dtype=float64)
Vector.vec3_sub(p, x1, p_sub_x1)
q = cuda.local.array(3, dtype=float64)
LinAlg.cramer_solver(basic, p_sub_x1, q)
w1 = 1.0 - q[0] - q[1] - q[2]
w2 = q[0]
w3 = q[1]
w4 = q[2]
else:
basic = cuda.local.array((3, 3), dtype=float64)
Matrix.mat33_make_from_cols(b1, b3, b2, basic)
p_sub_x1 = cuda.local.array(3, dtype=float64)
Vector.vec3_sub(p, x1, p_sub_x1)
q = cuda.local.array(3, dtype=float64)
LinAlg.cramer_solver(basic, p_sub_x1, q)

w1 = 1.0 - q[0] - q[1] - q[2]
w2 = q[0]
w3 = q[2]
w4 = q[1]

result[0] = w1
result[1] = w2
result[2] = w3
result[3] = w4

Loading

0 comments on commit eb3d2b0

Please sign in to comment.