diff --git a/CMakeLists.txt b/CMakeLists.txt index 45321f8..61bee1e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,15 +18,17 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") add_library(genmetaballs_core genmetaballs/src/cuda/core/camera.cu genmetaballs/src/cuda/core/camera.cuh - genmetaballs/src/cuda/core/utils.cu - genmetaballs/src/cuda/core/utils.cuh - genmetaballs/src/cuda/core/fmb.cuh + genmetaballs/src/cuda/core/confidence.cuh genmetaballs/src/cuda/core/fmb.cu - genmetaballs/src/cuda/core/geometry.cuh + genmetaballs/src/cuda/core/fmb.cuh + genmetaballs/src/cuda/core/forward.cu + genmetaballs/src/cuda/core/forward.cuh genmetaballs/src/cuda/core/geometry.cu - genmetaballs/src/cuda/core/confidence.cuh + genmetaballs/src/cuda/core/geometry.cuh genmetaballs/src/cuda/core/image.cuh genmetaballs/src/cuda/core/intersector.cuh + genmetaballs/src/cuda/core/utils.cu + genmetaballs/src/cuda/core/utils.cuh ) # Set include directories for the core library diff --git a/genmetaballs/src/cuda/core/camera.cuh b/genmetaballs/src/cuda/core/camera.cuh index 428acf0..37e79ab 100644 --- a/genmetaballs/src/cuda/core/camera.cuh +++ b/genmetaballs/src/cuda/core/camera.cuh @@ -20,6 +20,8 @@ struct Intrinsics { CUDA_CALLABLE Vec3D get_ray_direction(uint32_t px, uint32_t py) const; }; +using PixelCoord = cuda::std::pair; + struct PixelCoordRange { uint32_t px_start; uint32_t px_end; @@ -38,7 +40,7 @@ struct PixelCoordRange { uint32_t py; // Returns the (px, py) coordinates of the current pixel - CUDA_CALLABLE cuda::std::pair operator*() const; + CUDA_CALLABLE PixelCoord operator*() const; // pre-increment operator that advances to the next pixel CUDA_CALLABLE Iterator& operator++(); diff --git a/genmetaballs/src/cuda/core/forward.cu b/genmetaballs/src/cuda/core/forward.cu index eb4d349..18cd82d 100644 --- a/genmetaballs/src/cuda/core/forward.cu +++ b/genmetaballs/src/cuda/core/forward.cu @@ -1,55 +1,20 @@ #include #include -#include -constexpr auto NUM_BLOCKS = dim3(10); // XXX madeup -constexpr auto THREADS_PER_BLOCK = dim3(10); - -namespace FMB { - -CUDA_CALLABLE std::vector> get_pixel_coords_and_rays( - const dim3 thread_idx, const dim3 block_idx) { - std::vector> res; - - uint32_t i_beg = 0; // XXX TODO - uint32_t i_end = 0; // XXX TODO - - for (int i = i_beg; i < i_end; i += blockDim.x) { - //... - } - - return res; +#include "camera.cuh" +#include "forward.cuh" +#include "utils.cuh" + +CUDA_CALLABLE PixelCoordRange get_pixel_coords(const dim3 thread_idx, const dim3 block_idx, + const dim3 block_dim, const dim3 grid_dim, + const Intrinsics& intr) { + // compute the number of pixels each thread should process + const auto num_pixels_x = int_ceil_div(intr.height, grid_dim.x * block_dim.x); + const auto num_pixels_y = int_ceil_div(intr.width, grid_dim.y * block_dim.y); + const auto start_x = (block_idx.x * block_dim.x + thread_idx.x) * num_pixels_x; + const auto start_y = (block_idx.y * block_dim.y + thread_idx.y) * num_pixels_y; + return PixelCoordRange{.px_start = start_x, + .px_end = min(start_x + num_pixels_x, intr.height), + .py_start = start_y, + .py_end = min(start_y + num_pixels_y, intr.width)}; } - -template -__global__ render_kernel(const Getter fmb_getter, const Blender blender, - Confidence const* confidence, Intrinsics const* intr, Pose const* extr, - Image* img) { - // TODO how to find the relevant chunk of computation from threadIdx, - // blockIdx, etc - auto pixel_coords_and_rays = - get_pixel_coords_and_rays(threadIdx, blockIdx, blockDim, gridDim, intr, extr); - - for (const auto& [pixel_coords, ray] : pixel_coords_and_rays) { - float w0 = 0.0f, tf = 0.0f, sumexpd = 0.0f; - for (const auto& fmb : fmb_getter->get_metaballs(ray)) { - const auto& [t, d] = Intersector::intersect(fmb, ray, extr); - w = blender->blend(t, d, fmb, ray); - sumexpd += exp(d); // numerically unstable. use logsumexp - tf += t; - w0 += w; - } - img->confidence.at(pixel_coords) = confidence->get_confidence(sumexpd); - img->depth.at(pixel_coords) = tf / w0; - } -} - -template -void render_fmbs(const FMBs& fmbs, const Intrinsics& intr, const Pose& extr) { - // initialize the fmb_getter - typename Getter::Getter fmb_getter(fmbs, extr); - auto kernel = render_kernel; - kernel<<>>(fmb_getter, fmbs, intr, extr); -} - -}; // namespace FMB diff --git a/genmetaballs/src/cuda/core/forward.cuh b/genmetaballs/src/cuda/core/forward.cuh new file mode 100644 index 0000000..9634127 --- /dev/null +++ b/genmetaballs/src/cuda/core/forward.cuh @@ -0,0 +1,48 @@ +#pragma once + +#include +#include + +#include "camera.cuh" +#include "fmb.cuh" +#include "geometry.cuh" +#include "image.cuh" +#include "utils.cuh" + +// TODO: tune this number +constexpr auto NUM_BLOCKS = dim3(4, 4); +constexpr auto THREADS_PER_BLOCK = dim3(16, 16); + +CUDA_CALLABLE PixelCoordRange get_pixel_coords(const dim3 thread_idx, const dim3 block_idx, + const dim3 block_dim, const dim3 grid_dim, + const Intrinsics& intr); + +template +__global__ void render_kernel(const Getter fmb_getter, const Blender blender, + Confidence const* confidence, Intrinsics const intr, Pose const* extr, + ImageView img) { + auto pixel_coords = get_pixel_coords(threadIdx, blockIdx, blockDim, gridDim, intr); + + for (const auto& [px, py] : pixel_coords) { + float w0 = 0.0f, tf = 0.0f, sumexpd = 0.0f; + auto ray = intr.get_ray_direction(px, py); + for (const auto& fmb : fmb_getter->get_metaballs(ray)) { + const auto& [t, d] = Intersector::intersect(fmb, ray, extr); + auto w = blender->blend(t, d, fmb, ray); + sumexpd += exp(d); // numerically unstable. use logsumexp + tf += t; + w0 += w; + } + img.confidence[px][py] = confidence->get_confidence(sumexpd); + img.depth[px][py] = tf / w0; + } +} + +template +void render_fmbs(const FMBScene& fmbs, const Intrinsics& intr, + const Pose& extr) { + // initialize the fmb_getter + auto fmb_getter = Getter(fmbs, extr); + auto& kernel = render_kernel; + kernel<<>>(fmb_getter, fmbs, intr, extr); +} diff --git a/genmetaballs/src/cuda/core/utils.cuh b/genmetaballs/src/cuda/core/utils.cuh index 1a567df..296a288 100644 --- a/genmetaballs/src/cuda/core/utils.cuh +++ b/genmetaballs/src/cuda/core/utils.cuh @@ -21,6 +21,12 @@ CUDA_CALLABLE __forceinline__ float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); } +// Integer ceiling division +template +CUDA_CALLABLE constexpr T int_ceil_div(T a, T b) { + return (a + b - 1) / b; +} + enum class MemoryLocation { HOST, DEVICE }; // Non-owning 2D view into a contiguous array in either host or device memory diff --git a/tests/cpp_tests/test_forward.cu b/tests/cpp_tests/test_forward.cu new file mode 100644 index 0000000..1b3b579 --- /dev/null +++ b/tests/cpp_tests/test_forward.cu @@ -0,0 +1,34 @@ +#include +#include + +#include "core/forward.cuh" +#include "core/utils.cuh" +#include "thrust/device_vector.h" +#include "thrust/host_vector.h" + +namespace get_pixel_coords_tests { +// A simple kernel that fills an Array2D with 1.0f in parallel +__global__ void fill_with_ones_kernel(Array2D output, + const Intrinsics& intr) { + auto pixel_coords = get_pixel_coords(threadIdx, blockIdx, blockDim, gridDim, intr); + for (const auto [px, py] : pixel_coords) { + output[px][py] = 1.0f; + } +} +} // namespace get_pixel_coords_tests + +// Test if fmb::get_pixel_coords correctly covers all image pixels +TEST(ForwardTest, GetPixelCoordsCoverage) { + const auto intrinsic = + Intrinsics{.height = 100, .width = 200, .fx = 1.0f, .fy = 1.0f, .cx = 50.0f, .cy = 100.0f}; + auto buffer = thrust::device_vector(intrinsic.height * intrinsic.width, 0.0f); + auto array2d = + Array2D(buffer.data(), intrinsic.height, intrinsic.width); + constexpr dim3 block_dim(12, 8); + constexpr dim3 grid_dim(16, 24); + get_pixel_coords_tests::fill_with_ones_kernel<<>>(array2d, intrinsic); + auto host_buffer = thrust::host_vector(buffer); + for (size_t i = 0; i < host_buffer.size(); ++i) { + EXPECT_EQ(host_buffer[i], 1.0f); + } +} diff --git a/tests/cpp_tests/test_utils.cu b/tests/cpp_tests/test_utils.cu index 32c476f..023fdba 100644 --- a/tests/cpp_tests/test_utils.cu +++ b/tests/cpp_tests/test_utils.cu @@ -207,3 +207,10 @@ TYPED_TEST(Array2DTestFixture, MultipleViewsOfSameData) { EXPECT_FLOAT_EQ(view1[1][2], 200.0f); } } + +TEST(CeilDivTests, TestBasicCeillDivCorrectness) { + EXPECT_EQ(int_ceil_div(10, 3), 4); + EXPECT_EQ(int_ceil_div(9, 3), 3); + EXPECT_EQ(int_ceil_div(0, 5), 0); + EXPECT_EQ(int_ceil_div(1, 1), 1); +};