Commit 0890e54

Support HWC IO formats

Parent: 05b633c
15 files changed: +494 / -822 lines
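
For context: CHW ("planar", all of one channel stored before the next) and HWC ("interleaved", the channels of one pixel adjacent) are the two layouts a TensorRT engine may want for its image bindings; this commit lets the pipeline feed and read either directly instead of always reformatting to planar. A minimal sketch of the indexing difference, with illustrative names (not from this repo):

#include <cstddef>

// Offset of channel c at pixel (y, x) in each layout.
// In CHW (planar) the channel is the slowest-moving axis;
// in HWC (interleaved) it is the fastest-moving one.
inline size_t chw_offset(size_t C, size_t H, size_t W,
                         size_t c, size_t y, size_t x) {
  return c * H * W + y * W + x;
}

inline size_t hwc_offset(size_t C, size_t H, size_t W,
                         size_t c, size_t y, size_t x) {
  return (y * W + x) * C + c;
}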

CMakeLists.txt

Lines changed: 6 additions & 5 deletions

@@ -11,11 +11,7 @@ set(CMAKE_CUDA_STANDARD 20)
 project(TRT-NNScaler LANGUAGES C CXX)

 if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    if (CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)
-        set(CMAKE_CUDA_ARCHITECTURES 62 72 87)
-    else ()
-        set(CMAKE_CUDA_ARCHITECTURES 61 70 75 80 86 89 90)
-    endif ()
+    set(CMAKE_CUDA_ARCHITECTURES 61 70 75 80 86 89 90a 100a 120a)
 endif ()

 if (MSVC)
@@ -114,6 +110,11 @@ enable_language(CUDA)
 # 20208: double_for_long_double // long double is required for user-defined literal. We are fine with float precision
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=997,20208>")

+option(CUDA_DEVICE_DEBUG "Enable CUDA Device Debug" OFF)
+if (CUDA_DEVICE_DEBUG)
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
+endif()
+
 add_library(reformat_cuda STATIC reformat/reformat_cuda.h reformat/reformat.cu)

 add_library(reformat INTERFACE reformat/reformat.h reformat/reformat_cuda.h)
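
Two notes on this hunk. The default architecture list drops the Jetson (aarch64) branch and extends through Ada (89) and the architecture-specific "a" variants for Hopper (90a) and Blackwell (100a, 120a), which enable features outside the forward-compatible baseline. The new CUDA_DEVICE_DEBUG option passes -G to nvcc, embedding device-side debug info and disabling device code optimization as cuda-gdb needs for stepping through kernels; it defaults to OFF since -G kernels run markedly slower. Enable it at configure time with, e.g., cmake -B build -DCUDA_DEVICE_DEBUG=ON.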

cmd_common.h

Lines changed: 13 additions & 41 deletions

@@ -30,17 +30,12 @@ InferenceSession *session = nullptr;

 int using_io = 0;

-pixel_importer_cpu *importer_cpu = nullptr;
-pixel_exporter_cpu *exporter_cpu = nullptr;
-
 pixel_importer_gpu<float> *importer_gpu = nullptr;
 pixel_exporter_gpu<float> *exporter_gpu = nullptr;

 pixel_importer_gpu<half> *importer_gpu_fp16 = nullptr;
 pixel_exporter_gpu<half> *exporter_gpu_fp16 = nullptr;

-int32_t h_scale, w_scale;
-
 #if defined(__GNUC__)
 extern "C" __attribute__((weak)) int32_t getInferLibVersion() noexcept {
   return NV_TENSORRT_VERSION;
@@ -56,11 +51,11 @@ static Logger gLogger;

 ABSL_FLAG(bool, fp16, false, "use FP16 processing, allow FP16 in engine");
 ABSL_FLAG(bool, int8, false, "allow INT8 in engine");
+ABSL_FLAG(bool, strongly_typed, true, "enable strongly typed network definition");
 ABSL_FLAG(bool, force_precision, false, "Force precision config in model");
 ABSL_FLAG(bool, external, false, "use external algorithms from cuDNN and cuBLAS");
 ABSL_FLAG(bool, low_mem, false, "tweak configs to reduce memory consumption");
 ABSL_FLAG(int32_t, aux_stream, -1, "Auxiliary streams to use");
-ABSL_FLAG(std::string, reformatter, "auto", "reformatter used to import and export pixels: cpu, gpu, auto");

 ABSL_FLAG(uint32_t, tile_width, 512, "tile width");
 ABSL_FLAG(uint32_t, tile_height, 512, "tile height");
@@ -149,6 +144,7 @@ void setup_session(bool handle_alpha) {
       int(max_height)},
      1,
      absl::GetFlag(FLAGS_aux_stream),
+     absl::GetFlag(FLAGS_strongly_typed),
      absl::GetFlag(FLAGS_fp16),
      absl::GetFlag(FLAGS_int8),
      absl::GetFlag(FLAGS_force_precision),
@@ -183,47 +179,23 @@ void setup_session(bool handle_alpha) {
   if (!err.empty()) {
     LOG(QFATAL) << "Failed allocate memory for context: " << err;
   }
-  std::tie(h_scale, w_scale) = session->detect_scale();
-  if (h_scale == -1 || w_scale == -1) {
-    LOG(QFATAL) << "Bad model, can't detect scale ratio.";
-  }
-
-  if (h_scale != w_scale) {
-    LOG(QFATAL) << "different width and height scale ratio unimplemented.";
-  }

   // ------------------------------
   // Import & Export
-  auto max_size = size_t(max_width) * max_height;
+  auto max_size = size_t(max_width) * max_height * (handle_alpha ? 4 : 3);
+  auto max_size_out = max_size * session->scale_w * session->scale_h;

-  if (absl::GetFlag(FLAGS_reformatter) == "auto") {
-    absl::SetFlag(&FLAGS_reformatter, absl::GetFlag(FLAGS_fp16) ? "gpu" : "cpu");
-  }
-  if (absl::GetFlag(FLAGS_fp16) && absl::GetFlag(FLAGS_reformatter) == "cpu") {
-    LOG(QFATAL) << "CPU reformatter can not handle FP16.";
-  }
-
-  if (absl::GetFlag(FLAGS_reformatter) == "cpu") {
-    importer_cpu = new pixel_importer_cpu(max_size, handle_alpha);
-    exporter_cpu = new pixel_exporter_cpu(h_scale * w_scale * max_size, handle_alpha);
-    using_io = 0;
-  }
-  else if (absl::GetFlag(FLAGS_reformatter) == "gpu") {
-    if (absl::GetFlag(FLAGS_fp16)) {
-      importer_gpu_fp16 = new pixel_importer_gpu<half>(max_size, handle_alpha);
-      exporter_gpu_fp16 =
-          new pixel_exporter_gpu<half>(h_scale * w_scale * max_size, handle_alpha);
-      using_io = 2;
-    }
-    else {
-      importer_gpu = new pixel_importer_gpu<float>(max_size, handle_alpha);
-      exporter_gpu =
-          new pixel_exporter_gpu<float>(h_scale * w_scale * max_size, handle_alpha);
-      using_io = 1;
-    }
+  if (absl::GetFlag(FLAGS_fp16)) {
+    importer_gpu_fp16 = new pixel_importer_gpu<half>(max_size, 1);
+    exporter_gpu_fp16 =
+        new pixel_exporter_gpu<half>(max_size_out, 1);
+    using_io = 1;
   }
   else {
-    LOG(QFATAL) << "Unknown reformatter.";
+    importer_gpu = new pixel_importer_gpu<float>(max_size, 1);
+    exporter_gpu =
+        new pixel_exporter_gpu<float>(max_size_out, 1);
+    using_io = 0;
   }
 }
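
With scale detection moved into the session (session->scale_w / scale_h) and the CPU reformatter dropped, import and export now always run on the GPU; using_io only distinguishes the float path (0) from the half path (1). Buffer sizes are also counted in elements including the channel dimension, rather than multiplying pixel count by the scale ratio at each allocation site. A worked example of the new sizing, as a minimal sketch (512x512 matches the tile_width/tile_height defaults; a 2x model is assumed):

#include <cstddef>
#include <cstdio>

int main() {
  size_t max_width = 512, max_height = 512;
  bool handle_alpha = true;
  int scale_w = 2, scale_h = 2;  // session->detect_scale() fills these in

  // Input buffer: one element per pixel per channel (RGBA when alpha is kept).
  size_t max_size = max_width * max_height * (handle_alpha ? 4 : 3);
  // Output buffer: input element count times the upscale factor on each axis.
  size_t max_size_out = max_size * scale_w * scale_h;

  std::printf("in: %zu elements, out: %zu elements\n", max_size, max_size_out);
  // in: 1048576 elements, out: 4194304 elements
}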

image_io.h

Lines changed: 43 additions & 6 deletions

@@ -4,30 +4,67 @@
 #include <utility>
 #include <string>
 #include <variant>
-#include <filesystem>
+
+#include <cuda_runtime_api.h>

 #include "nn-scaler.h"
 #include "md_view.h"

 std::string init_image_io();

-typedef std::unique_ptr<uint8_t[]> mem_owner;
+struct pinned_deleter {
+  void operator()(void* p) const {
+    cudaFreeHost(p);
+  }
+};
+
+struct pinned_memory : std::unique_ptr<uint8_t[], pinned_deleter> {
+  struct alloc_flag {
+    uint32_t flags;
+  };
+
+  constexpr static alloc_flag alloc_default {cudaHostAllocDefault};
+  constexpr static alloc_flag alloc_h2d {cudaHostAllocWriteCombined};
+
+  pinned_memory() = default;
+
+  explicit pinned_memory(size_t count, alloc_flag flags=alloc_default) {
+    void* mem{};
+    auto result = cudaHostAlloc(&mem, count, flags.flags);
+    if (result != cudaSuccess) {
+      throw std::bad_alloc();
+    }
+    this->reset(static_cast<uint8_t*>(mem));
+  }
+};
+
+using mem_owner = pinned_memory;

 template<typename U, size_t DIMS>
-static std::pair<md_view<U, int32_t, DIMS>, mem_owner> alloc_buffer(shape_t<int32_t, DIMS> s) {
-  auto ptr = std::make_unique<uint8_t[]>(s.count() * sizeof(U));
+static std::pair<md_view<U, int32_t, DIMS>, mem_owner> alloc_buffer(mem_owner::alloc_flag flags, shape_t<int32_t, DIMS> s) {
+  auto ptr = pinned_memory(s.count() * sizeof(U), flags);
   md_view<U, int32_t, DIMS> view = {reinterpret_cast<U *>(ptr.get()), s};
   return {view, std::move(ptr)};
 }

 template<typename U, typename ...D>
-static std::pair<md_view<U, int32_t, sizeof...(D)>, mem_owner> alloc_buffer(D... d) {
+static std::pair<md_view<U, int32_t, sizeof...(D)>, mem_owner> alloc_buffer(mem_owner::alloc_flag flags, D... d) {
   shape_t<int32_t, sizeof...(D)> s{static_cast<int32_t>(d)...};
-  auto ptr = std::make_unique<uint8_t[]>(s.count() * sizeof(U));
+  auto ptr = pinned_memory(s.count() * sizeof(U), flags);
   md_view<U, int32_t, sizeof...(D)> view = {reinterpret_cast<U *>(ptr.get()), s};
   return {view, std::move(ptr)};
 }

+template<typename U, size_t DIMS>
+static std::pair<md_view<U, int32_t, DIMS>, mem_owner> alloc_buffer(shape_t<int32_t, DIMS> s) {
+  return alloc_buffer<U>(pinned_memory::alloc_default, s);
+}
+
+template<typename U, typename ...D>
+static std::pair<md_view<U, int32_t, sizeof...(D)>, mem_owner> alloc_buffer(D... d) {
+  return alloc_buffer<U>(pinned_memory::alloc_default, d...);
+}
+
 std::variant<std::pair<shape_t<int32_t, 3>, mem_owner>, std::string>
 load_image(Work::input_t file, bool ignore_alpha);
 std::string save_image(Work::output_t file, md_view<uint8_t, int32_t, 3> data);
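
Switching mem_owner from a plain new[]-backed unique_ptr to cudaHostAlloc'd pinned (page-locked) memory is what lets decoded images be copied to the GPU asynchronously: cudaMemcpyAsync from pageable host memory silently degrades to a staged, host-synchronous copy. A minimal sketch of the pattern this enables (names are illustrative, not from this repo):

#include <cuda_runtime_api.h>

// Upload a pinned host buffer without blocking the host thread.
// 'host' must come from cudaHostAlloc, as pinned_memory now guarantees.
cudaError_t upload_async(const void *host, void *device, size_t bytes,
                         cudaStream_t stream) {
  return cudaMemcpyAsync(device, host, bytes, cudaMemcpyHostToDevice, stream);
}

The alloc_h2d flag additionally requests write-combined memory, intended for buffers the CPU only writes and the GPU only reads.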

image_wic.cpp

Lines changed: 4 additions & 4 deletions

@@ -88,7 +88,7 @@ load_image(Work::input_t file, bool ignore_alpha) {
   IWICBitmapFrameDecode *pFrame = nullptr;
   IWICFormatConverter *pConverter = nullptr;
   HRESULT hr;
-  md_view<uint8_t, int32_t, 3> view;
+  md_view<uint8_t, int32_t, 3> view{};
   mem_owner pixels;

   if (file.index() == 0) {
@@ -120,7 +120,7 @@ load_image(Work::input_t file, bool ignore_alpha) {
   bool use_opaque = (ignore_alpha || !has_alpha);
   desire_format = use_opaque ? desire_format_opaque : desire_format_alpha;

-  std::tie(view, pixels) = alloc_buffer<uint8_t>(height, width, use_opaque ? 3 : 4);
+  std::tie(view, pixels) = alloc_buffer<uint8_t>(mem_owner::alloc_h2d, height, width, use_opaque ? 3 : 4);
   if (input_format != desire_format) {
     HR_CHECK(pFactory->CreateFormatConverter(&pConverter));
     HR_CHECK(pConverter->Initialize(pFrame,
@@ -133,13 +133,13 @@ load_image(Work::input_t file, bool ignore_alpha) {
     HR_CHECK(pConverter->CopyPixels(nullptr,
                                     view.at(0).size(),
                                     view.size(),
-                                    reinterpret_cast<BYTE *>(pixels.get())));
+                                    pixels.get()));
   }
   else {
     HR_CHECK(pFrame->CopyPixels(nullptr,
                                 view.at(0).size(),
                                 view.size(),
-                                reinterpret_cast<BYTE *>(pixels.get())));
+                                pixels.get()));
   }
 }
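
The WIC decode target is now allocated with mem_owner::alloc_h2d: the decoder fills it sequentially and the GPU reads it once across the bus, which is the one access pattern write-combined memory is fast for (CPU reads of such memory are very slow). The reinterpret_casts could be dropped because pixels.get() now yields uint8_t *, the same underlying type as WIC's BYTE.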

image_wuffs.cpp

Lines changed: 10 additions & 7 deletions

@@ -26,6 +26,8 @@
 #include <optional>
 #include <vector>

+#include "cuda_runtime.h"
+
 #include "jpeglib.h"
 #include <setjmp.h>

@@ -56,19 +58,19 @@ class MyDecodeImageCallbacks : public wuffs_aux::DecodeImageCallbacks {
     if ((len == 0) || (SIZE_MAX < len)) {
       return {wuffs_aux::DecodeImage_UnsupportedPixelConfiguration};
     }
-    auto mem = std::make_unique<uint8_t[]>(len);
+    auto mem_ = alloc_buffer<uint8_t>(mem_owner::alloc_h2d, len);
+    auto mem = wuffs_aux::MemOwner{mem_.second.release(), [](void *ptr) noexcept { cudaFreeHost(ptr); }};
     if (!mem) {
       return {wuffs_aux::DecodeImage_OutOfMemory};
     }
     wuffs_base__pixel_buffer pixbuf;
     wuffs_base__status status = pixbuf.set_from_slice(
         &image_config.pixcfg,
-        wuffs_base__make_slice_u8(mem.get(), (size_t) len));
+        wuffs_base__make_slice_u8(static_cast<uint8_t *>(mem.get()), (size_t) len));
     if (!status.is_ok()) {
       return {status.message()};
     }
-    wuffs_aux::MemOwner owner {mem.release(), operator delete[]};
-    return {std::move(owner), pixbuf};
+    return {std::move(mem), pixbuf};
   }

 std::string //
@@ -249,11 +251,12 @@ load_image(Work::input_t file, bool ignore_alpha) {
     }
   }

-  md_view<uint8_t, int32_t, 3> in_view{reinterpret_cast<uint8_t *>(res.pixbuf_mem_owner.get()),
+  md_view in_view{static_cast<uint8_t *>(res.pixbuf_mem_owner.get()),
                   {static_cast<int>(res.pixbuf.pixcfg.height()),
                    static_cast<int>(res.pixbuf.pixcfg.width()),
                    res.pixbuf.pixcfg.pixel_format().transparency() ? 4 : 3}};

-  std::unique_ptr<uint8_t[]> in_ptr(reinterpret_cast<uint8_t*>(res.pixbuf_mem_owner.release()));
-  return std::make_pair(in_view.shape, std::move(in_ptr));
+  mem_owner owner;
+  owner.reset(static_cast<uint8_t*>(res.pixbuf_mem_owner.release()));
+  return std::make_pair(in_view.shape, std::move(owner));
 }
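
The wuffs path gets the same treatment: the decode buffer now comes from alloc_buffer with the write-combined flag and is wrapped in a wuffs_aux::MemOwner whose deleter is cudaFreeHost, so at the end of load_image ownership can be transferred back into a mem_owner (pinned_memory) without a copy. One subtlety: since pinned_memory's constructor throws std::bad_alloc on failure, the if (!mem) / DecodeImage_OutOfMemory branch is now effectively unreachable.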

infer_engine.h

Lines changed: 53 additions & 13 deletions

@@ -6,6 +6,8 @@

 #include "NvInfer.h"

+#include "md_view.h"
+
 struct optimization_axis {
   optimization_axis(int32_t min, int32_t opt, int32_t max) : min(min), opt(opt), max(max) {}
   optimization_axis(int32_t same) : min(same), opt(same), max(same) {}
@@ -24,6 +26,7 @@ struct ScalerConfig {
   optimization_axis batch;

   int32_t aux_stream;
+  bool use_strong_type;
   bool use_fp16;
   bool use_int8;
   bool force_precision;
@@ -33,11 +36,15 @@ struct ScalerConfig {
   [[nodiscard]] std::string engine_name() const {
     std::stringstream ss;
     ss << "_w" << input_width << "_h" << input_height << "_b" << batch << "_a" << aux_stream;
-    if (use_fp16) {
-      ss << "_fp16";
-    }
-    if (use_int8) {
-      ss << "_int8";
+    if (use_strong_type) {
+      ss << "_stype";
+    } else {
+      if (use_fp16) {
+        ss << "_fp16";
+      }
+      if (use_int8) {
+        ss << "_int8";
+      }
     }
     if (force_precision) {
       ss << "_force_prec";
@@ -87,7 +94,7 @@ class InferenceContext {
 public:
  ScalerConfig config;
  InferenceContext(ScalerConfig config, nvinfer1::ILogger &logger, const std::filesystem::path& path_prefix);
-  bool has_file();
+  bool has_file() const;
  std::string load_engine();

  bool good() {
@@ -99,14 +106,17 @@ class InferenceSession {
  InferenceContext ctx;

  nvinfer1::IExecutionContext *context;
-  void *execution_memory;
-  int32_t last_batch, last_height, last_width;
+  void *execution_memory{};
+  int32_t last_batch=-1, last_height=-1, last_width=-1;
  std::atomic<bool> good_;
+  void *input_ptr{}, *output_ptr{};
+  bool input_interleaved{}, output_interleaved{};
+  int32_t input_channel_stride{}, output_channel_stride{};

 public:
-  cudaStream_t stream;
-  cudaEvent_t input_consumed;
-  void *input, *output;
+  cudaStream_t stream{};
+  cudaEvent_t input_consumed{};
+  int32_t scale_w=-1, scale_h=-1;

  explicit InferenceSession(InferenceContext &ctx);
  ~InferenceSession();
@@ -117,7 +127,37 @@ class InferenceSession {
  std::string allocation();
  std::string deallocation();
  void config(int32_t batch, int32_t height, int32_t width);
-  std::pair<int32_t, int32_t> detect_scale();
+  void detect_scale();
+
+  bool inference() const;
+
+  template<typename F>
+  md_uview<F, int32_t, 3, int64_t> input(int32_t height, int32_t width) const {
+    shape_t shape {3, height, width};
+
+    shape_t stride_shape {input_channel_stride, height, width};
+    if (input_interleaved) {
+      stride_shape = stride_shape.gather<1, 2, 0>();
+    }
+    stride_t stride = stride_shape.stride<int64_t>();
+    if (input_interleaved) {
+      stride = stride.gather<2, 0, 1>();
+    }
+    return {static_cast<F *>(input_ptr), shape, stride};
+  }
+
+  template<typename F>
+  md_uview<F, int32_t, 3, int64_t> output(int32_t height, int32_t width) const {
+    shape_t shape {3, height * scale_h, width * scale_w};

-  bool inference();
+    shape_t stride_shape {output_channel_stride, height * scale_h, width * scale_w};
+    if (output_interleaved) {
+      stride_shape = stride_shape.gather<1, 2, 0>();
+    }
+    stride_t stride = stride_shape.stride<int64_t>();
+    if (output_interleaved) {
+      stride = stride.gather<2, 0, 1>();
+    }
+    return {static_cast<F *>(output_ptr), shape, stride};
+  }
 };
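
The templated input()/output() accessors are where HWC support actually lands. The returned view always has shape {C, H, W}, but for an interleaved binding the strides are derived from the layout shape {H, W, C} (the first gather), then permuted back into channel-first axis order (the second gather), yielding a non-contiguous md_uview over the same buffer; input_channel_stride / output_channel_stride stand in for the literal channel count so a padded interleaved format can back the 3-channel view. A standalone sketch of that stride arithmetic with plain arrays (md_uview, shape_t, and gather are this repo's md_view library; the code below only illustrates the computation):

#include <array>
#include <cstdint>
#include <cstdio>

using shape3 = std::array<int64_t, 3>;

// Row-major strides: the last axis is contiguous.
shape3 strides_of(shape3 s) {
  return {s[1] * s[2], s[2], 1};
}

int main() {
  int64_t C = 3, H = 720, W = 1280;

  // CHW (planar): strides follow the {C, H, W} shape directly.
  shape3 chw = strides_of({C, H, W});          // {H*W, W, 1}

  // HWC (interleaved): lay the data out as {H, W, C}...
  shape3 hwc_layout = strides_of({H, W, C});   // {W*C, C, 1}
  // ...then permute the strides back into {C, H, W} axis order,
  // so index (c, y, x) still means channel, row, column.
  shape3 hwc = {hwc_layout[2], hwc_layout[0], hwc_layout[1]};  // {1, W*C, C}

  std::printf("CHW strides: %lld %lld %lld\n",
              (long long)chw[0], (long long)chw[1], (long long)chw[2]);
  std::printf("HWC strides: %lld %lld %lld\n",
              (long long)hwc[0], (long long)hwc[1], (long long)hwc[2]);
}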
