-
I am working on (hopefully) fast implementations of batched forward kinematics for articulated robots. As such, I need a way to swiftly compute the cumulative product of a chain/series of 4x4 matrices (representing rigid-body transformations). Apart from the obvious choice of looping and multiplying over […], I noticed that the resulting program consists mainly of FMA operations, with no Tensor Core ops. Would ditching the […]

Best,
Beta Was this translation helpful? Give feedback.
Replies: 1 comment
-
Just ran a quick test using a single environment:

import warp as wp
import numpy as np
# Turn off Warp's backward-pass (adjoint) code generation; this script only
# launches the forward kernels.
wp.config.enable_backward = False
# Number of 4x4 matrices in the chain. Used as the loop bound inside both
# kernels and as the size of the host-side input tensor.
N = 10
# Cumulative (scan) product of a chain of 4x4 matrices using Warp tile
# primitives. After the kernel runs, prod_scan[i] holds
# x[i] @ x[i - 1] @ ... @ x[0] for every i in [0, N).
@wp.kernel
def matmul_test(
    x: wp.array3d(dtype=float),
    prod_scan: wp.array3d(dtype=float),
):
    # Seed the scan with the first matrix of the chain.
    accum = wp.tile_load(x[0], shape=(4, 4))
    wp.tile_store(prod_scan[0], accum)

    for link in range(1, N):
        current = wp.tile_load(x[link], shape=(4, 4))
        # Left-multiply the running product by the current matrix and carry
        # the result forward as the new running product.
        accum = wp.tile_matmul(current, accum)
        wp.tile_store(prod_scan[link], accum)
# Same cumulative product as the tile kernel, but expressed with Warp's
# built-in mat44 type. The kernel never reads a thread index, so every thread
# that executes it walks the whole chain sequentially.
@wp.kernel
def matrix_test(
    x: wp.array(dtype=wp.mat44),
    prod_scan: wp.array(dtype=wp.mat44),
):
    # Seed the scan with the first matrix of the chain.
    accum = x[0]
    prod_scan[0] = accum

    for link in range(1, N):
        # Left-multiply the running product by the current matrix.
        accum = x[link] * accum
        prod_scan[link] = accum
def create_tensor(N):
    """Return an (N, 4, 4) float64 array of constant-valued matrices.

    Every entry of the i-th 4x4 matrix equals ``1.0 + i / 10``.
    """
    stack = np.zeros((N, 4, 4))
    # One fill value per matrix, broadcast across its 4x4 entries.
    fill_values = 1.0 + np.arange(N) / 10
    stack[:] = fill_values[:, None, None]
    return stack
# Host-side benchmark: time repeated launches of the tile-based kernel and of
# the mat44 kernel on the same chain of N matrices.
num_launches = 100

# --- Tile kernel: the chain is passed as an (N, 4, 4) float array. ---
x_np = create_tensor(N)
x = wp.array(x_np, dtype=float)
prod_scan = wp.zeros_like(x)

# Warm-up launch so that one-time module compilation/loading is not counted
# in the timed region below.
wp.launch_tiled(matmul_test, dim=1, inputs=[x], outputs=[prod_scan], block_dim=64)

# synchronize=True makes the timer wait for queued device work on entry and
# exit; without it only the host-side launch overhead is reliably measured.
with wp.ScopedTimer("matmul", synchronize=True):
    for _ in range(num_launches):
        prod_scan.zero_()
        wp.launch_tiled(matmul_test, dim=1, inputs=[x], outputs=[prod_scan], block_dim=64)
# print(prod_scan.numpy())

# --- mat44 kernel: the chain is passed as a length-N array of mat44. ---
x_np = create_tensor(N)
x = wp.array(x_np, dtype=wp.mat44)
prod_scan = wp.zeros_like(x)

# matrix_test is a regular (non-tile) kernel, so it is launched with wp.launch
# on a single thread. wp.launch_tiled(..., block_dim=64) would run the same
# sequential scan redundantly on all 64 threads of the block and skew the
# comparison.
wp.launch(matrix_test, dim=1, inputs=[x], outputs=[prod_scan])

with wp.ScopedTimer("matrix", synchronize=True):
    for _ in range(num_launches):
        # Clear before launching (same order as the tile benchmark) so the
        # output still holds the result once the loop finishes.
        prod_scan.zero_()
        wp.launch(matrix_test, dim=1, inputs=[x], outputs=[prod_scan])
# print(prod_scan.numpy())

The mat44 approach takes about 50% longer.
Beta Was this translation helpful? Give feedback.
Just ran a quick test using a single environment: