Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set(CMAKE_CXX_STANDARD 20)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -Xptxas -O3")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm adding the -Xptxas -O3 flag because in the past it's been giving me better performance... If you're running into build issue, you might need to clear out the build cache once (or also feel free to delete this)


################
# Core Library #
Expand Down
13 changes: 12 additions & 1 deletion genmetaballs/src/cuda/bindings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,17 @@ NB_MODULE(_genmetaballs_bindings, m) {
bind_array2d<float, MemoryLocation::HOST>(utils, "CPUFloatArray2D");
bind_array2d<float, MemoryLocation::DEVICE>(utils, "GPUFloatArray2D");

// bind dim3, which is used to specify the launch configuration for the kernel
nb::class_<dim3>(utils, "dim3")
.def(nb::init<uint32_t, uint32_t, uint32_t>(), nb::arg("x") = 1, nb::arg("y") = 1,
nb::arg("z") = 1)
.def_prop_ro("x", [](const dim3& self) { return self.x; })
.def_prop_ro("y", [](const dim3& self) { return self.y; })
.def_prop_ro("z", [](const dim3& self) { return self.z; })
.def("__repr__", [](const dim3& self) {
return nb::str("dim3(x={}, y={}, z={})").format(self.x, self.y, self.z);
});

} // NB_MODULE(_genmetaballs_bindings)

template <typename T, MemoryLocation location>
Expand Down Expand Up @@ -319,5 +330,5 @@ void bind_render_fmbs(nb::module_& m, const char* name) {
&render_fmbs<AllGetter<MemoryLocation::DEVICE>, LinearIntersector, Blender, Confidence>,
"Render the given FMB scene into the provided image view", nb::arg("fmbs"),
nb::arg("blender"), nb::arg("confidence"), nb::arg("intr"), nb::arg("extr"),
nb::arg("img"));
nb::arg("img"), nb::arg("grid_size"), nb::arg("block_size"));
}
5 changes: 3 additions & 2 deletions genmetaballs/src/cuda/core/camera.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ CUDA_CALLABLE PixelCoordRange::Iterator& PixelCoordRange::Iterator::operator++()
return *this;
}

CUDA_CALLABLE bool PixelCoordRange::Sentinel::operator==(const Iterator& it) const {
CUDA_CALLABLE bool operator!=(const PixelCoordRange::Iterator& it,
const PixelCoordRange::Sentinel& sentinel) {
// stop if we reach the end of rows, or if the range is empty
return it.py >= py_end || it.px_start >= it.px_end || it.py_start >= py_end;
return it.py < sentinel.py_end && it.px_start < it.px_end && it.py_start < sentinel.py_end;
}

CUDA_CALLABLE PixelCoordRange::Iterator PixelCoordRange::begin() const {
Expand Down
6 changes: 3 additions & 3 deletions genmetaballs/src/cuda/core/camera.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ struct PixelCoordRange {
// the Sentinel class only needs to hold the stop value (i.e. final row)
struct Sentinel {
uint32_t py_end;

// stopping criterion: true if current row (py) reaches py_end
CUDA_CALLABLE bool operator==(const Iterator& it) const;
};

// stopping criterion: true if current row (py) reaches py_end
friend CUDA_CALLABLE bool operator!=(const Iterator& it, const Sentinel& sentinel);

// range methods
CUDA_CALLABLE Iterator begin() const;
CUDA_CALLABLE Sentinel end() const;
Expand Down
9 changes: 3 additions & 6 deletions genmetaballs/src/cuda/core/forward.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@
#include "image.cuh"
#include "utils.cuh"

// TODO: tune this number
constexpr auto NUM_BLOCKS = dim3(4, 4);
constexpr auto THREADS_PER_BLOCK = dim3(16, 16);

CUDA_CALLABLE PixelCoordRange get_pixel_coords(const dim3 thread_idx, const dim3 block_idx,
const dim3 block_dim, const dim3 grid_dim,
const Intrinsics& intr);
Expand Down Expand Up @@ -49,7 +45,8 @@ __global__ void render_kernel(const FMBScene<MemoryLocation::DEVICE>& fmbs, cons
template <typename Getter, typename Intersector, typename Blender, typename Confidence>
void render_fmbs(const FMBScene<MemoryLocation::DEVICE>& fmbs, const Blender& blender,
const Confidence& confidence, const Intrinsics& intr, const Pose& extr,
ImageView<MemoryLocation::DEVICE> img) {
ImageView<MemoryLocation::DEVICE> img, const dim3 grid_size,
const dim3 block_size) {
render_kernel<Getter, Intersector, Blender, Confidence>
<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(fmbs, blender, confidence, intr, extr, img);
<<<grid_size, block_size>>>(fmbs, blender, confidence, intr, extr, img);
}
12 changes: 10 additions & 2 deletions genmetaballs/src/genmetaballs/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
)
from genmetaballs._genmetaballs_bindings.fmb import FMB, CPUFMBScene, GPUFMBScene
from genmetaballs._genmetaballs_bindings.image import CPUImage, GPUImage
from genmetaballs._genmetaballs_bindings.utils import CPUFloatArray2D, GPUFloatArray2D, sigmoid
from genmetaballs._genmetaballs_bindings.utils import (
CPUFloatArray2D,
GPUFloatArray2D,
dim3,
sigmoid,
)

type DeviceType = Literal["cpu", "gpu"]

Expand Down Expand Up @@ -83,6 +88,8 @@ def render_fmbs(
intr: Intrinsics,
extr: geometry.Pose,
img: GPUImage | None = None,
grid_size: dim3 = dim3(4, 4),
block_size: dim3 = dim3(16, 16),
) -> GPUImage:
"""Render the given FMB scene into the provided image view.

Expand All @@ -105,7 +112,7 @@ def render_fmbs(
else:
raise TypeError("Unsupported blender and confidence combination.")

render_func(fmbs, blender, confidence, intr, extr, img.as_view())
render_func(fmbs, blender, confidence, intr, extr, img.as_view(), grid_size, block_size)
return img


Expand All @@ -122,6 +129,7 @@ def render_fmbs(
"Camera",
"FourParameterBlender",
"FMB",
"dim3",
"Intrinsics",
"ThreeParameterBlender",
"TwoParameterConfidence",
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp_tests/test_confidence.cu
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ TEST(GpuConfidenceTest, ConfidenceMultipleValuesGPU_AllTypes) {

std::vector<float> actual;
if (conf_case.is_two_param) {
TwoParameterConfidence conf(conf_case.beta4, conf_case.beta5);
TwoParameterConfidence conf{conf_case.beta4, conf_case.beta5};
actual = gpu_get_confidence(sumexpd_vec, conf);
} else {
ZeroParameterConfidence conf;
Expand Down