Commit 0890e54

Support HWC IO formats

Parent: 05b633c
15 files changed: +494 / -822 lines
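
For context: CHW ("planar", all of one channel stored before the next) and HWC ("interleaved", the channels of one pixel adjacent) are the two layouts a TensorRT engine may want for its image bindings; this commit lets the pipeline feed and read either directly instead of always reformatting to planar. A minimal sketch of the indexing difference, with illustrative names (not from this repo):

#include <cstddef>

// Offset of channel c at pixel (y, x) in each layout.
// In CHW (planar) the channel is the slowest-moving axis;
// in HWC (interleaved) it is the fastest-moving one.
inline size_t chw_offset(size_t C, size_t H, size_t W,
                         size_t c, size_t y, size_t x) {
  return c * H * W + y * W + x;
}

inline size_t hwc_offset(size_t C, size_t H, size_t W,
                         size_t c, size_t y, size_t x) {
  return (y * W + x) * C + c;
}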

CMakeLists.txt

Lines changed: 6 additions & 5 deletions

@@ -11,11 +11,7 @@ set(CMAKE_CUDA_STANDARD 20)
 project(TRT-NNScaler LANGUAGES C CXX)

 if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    if (CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)
-        set(CMAKE_CUDA_ARCHITECTURES 62 72 87)
-    else ()
-        set(CMAKE_CUDA_ARCHITECTURES 61 70 75 80 86 89 90)
-    endif ()
+    set(CMAKE_CUDA_ARCHITECTURES 61 70 75 80 86 89 90a 100a 120a)
 endif ()

 if (MSVC)
@@ -114,6 +110,11 @@ enable_language(CUDA)
 # 20208: double_for_long_double // long double is required for user-defined literal. We are fine with float precision
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=997,20208>")

+option(CUDA_DEVICE_DEBUG "Enable CUDA Device Debug" OFF)
+if (CUDA_DEVICE_DEBUG)
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
+endif()
+
 add_library(reformat_cuda STATIC reformat/reformat_cuda.h reformat/reformat.cu)

 add_library(reformat INTERFACE reformat/reformat.h reformat/reformat_cuda.h)
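
Two notes on this hunk. The default architecture list drops the Jetson (aarch64) branch and extends through Ada (89) and the architecture-specific "a" variants for Hopper (90a) and Blackwell (100a, 120a), which enable features outside the forward-compatible baseline. The new CUDA_DEVICE_DEBUG option passes -G to nvcc, embedding device-side debug info and disabling device code optimization as cuda-gdb needs for stepping through kernels; it defaults to OFF since -G kernels run markedly slower. Enable it at configure time with, e.g., cmake -B build -DCUDA_DEVICE_DEBUG=ON.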

cmd_common.h

Lines changed: 13 additions & 41 deletions

@@ -30,17 +30,12 @@ InferenceSession *session = nullptr;

 int using_io = 0;

-pixel_importer_cpu *importer_cpu = nullptr;
-pixel_exporter_cpu *exporter_cpu = nullptr;
-
 pixel_importer_gpu<float> *importer_gpu = nullptr;
 pixel_exporter_gpu<float> *exporter_gpu = nullptr;

 pixel_importer_gpu<half> *importer_gpu_fp16 = nullptr;
 pixel_exporter_gpu<half> *exporter_gpu_fp16 = nullptr;

-int32_t h_scale, w_scale;
-
 #if defined(__GNUC__)
 extern "C" __attribute__((weak)) int32_t getInferLibVersion() noexcept {
   return NV_TENSORRT_VERSION;
@@ -56,11 +51,11 @@ static Logger gLogger;

 ABSL_FLAG(bool, fp16, false, "use FP16 processing, allow FP16 in engine");
 ABSL_FLAG(bool, int8, false, "allow INT8 in engine");
+ABSL_FLAG(bool, strongly_typed, true, "enable strongly typed network definition");
 ABSL_FLAG(bool, force_precision, false, "Force precision config in model");
 ABSL_FLAG(bool, external, false, "use external algorithms from cuDNN and cuBLAS");
 ABSL_FLAG(bool, low_mem, false, "tweak configs to reduce memory consumption");
 ABSL_FLAG(int32_t, aux_stream, -1, "Auxiliary streams to use");
-ABSL_FLAG(std::string, reformatter, "auto", "reformatter used to import and export pixels: cpu, gpu, auto");

 ABSL_FLAG(uint32_t, tile_width, 512, "tile width");
 ABSL_FLAG(uint32_t, tile_height, 512, "tile height");
@@ -149,6 +144,7 @@ void setup_session(bool handle_alpha) {
       int(max_height)},
      1,
      absl::GetFlag(FLAGS_aux_stream),
+     absl::GetFlag(FLAGS_strongly_typed),
      absl::GetFlag(FLAGS_fp16),
      absl::GetFlag(FLAGS_int8),
      absl::GetFlag(FLAGS_force_precision),
@@ -183,47 +179,23 @@ void setup_session(bool handle_alpha) {
   if (!err.empty()) {
     LOG(QFATAL) << "Failed allocate memory for context: " << err;
   }
-  std::tie(h_scale, w_scale) = session->detect_scale();
-  if (h_scale == -1 || w_scale == -1) {
-    LOG(QFATAL) << "Bad model, can't detect scale ratio.";
-  }
-
-  if (h_scale != w_scale) {
-    LOG(QFATAL) << "different width and height scale ratio unimplemented.";
-  }

   // ------------------------------
   // Import & Export
-  auto max_size = size_t(max_width) * max_height;
+  auto max_size = size_t(max_width) * max_height * (handle_alpha ? 4 : 3);
+  auto max_size_out = max_size * session->scale_w * session->scale_h;

-  if (absl::GetFlag(FLAGS_reformatter) == "auto") {
-    absl::SetFlag(&FLAGS_reformatter, absl::GetFlag(FLAGS_fp16) ? "gpu" : "cpu");
-  }
-  if (absl::GetFlag(FLAGS_fp16) && absl::GetFlag(FLAGS_reformatter) == "cpu") {
-    LOG(QFATAL) << "CPU reformatter can not handle FP16.";
-  }
-
-  if (absl::GetFlag(FLAGS_reformatter) == "cpu") {
-    importer_cpu = new pixel_importer_cpu(max_size, handle_alpha);
-    exporter_cpu = new pixel_exporter_cpu(h_scale * w_scale * max_size, handle_alpha);
-    using_io = 0;
-  }
-  else if (absl::GetFlag(FLAGS_reformatter) == "gpu") {
-    if (absl::GetFlag(FLAGS_fp16)) {
-      importer_gpu_fp16 = new pixel_importer_gpu<half>(max_size, handle_alpha);
-      exporter_gpu_fp16 =
-          new pixel_exporter_gpu<half>(h_scale * w_scale * max_size, handle_alpha);
-      using_io = 2;
-    }
-    else {
-      importer_gpu = new pixel_importer_gpu<float>(max_size, handle_alpha);
-      exporter_gpu =
-          new pixel_exporter_gpu<float>(h_scale * w_scale * max_size, handle_alpha);
-      using_io = 1;
-    }
+  if (absl::GetFlag(FLAGS_fp16)) {
+    importer_gpu_fp16 = new pixel_importer_gpu<half>(max_size, 1);
+    exporter_gpu_fp16 =
+        new pixel_exporter_gpu<half>(max_size_out, 1);
+    using_io = 1;
   }
   else {
-    LOG(QFATAL) << "Unknown reformatter.";
+    importer_gpu = new pixel_importer_gpu<float>(max_size, 1);
+    exporter_gpu =
+        new pixel_exporter_gpu<float>(max_size_out, 1);
+    using_io = 0;
   }
 }
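
With scale detection moved into the session (session->scale_w / scale_h) and the CPU reformatter dropped, import and export now always run on the GPU; using_io only distinguishes the float path (0) from the half path (1). Buffer sizes are also counted in elements including the channel dimension, rather than multiplying pixel count by the scale ratio at each allocation site. A worked example of the new sizing, as a minimal sketch (512x512 matches the tile_width/tile_height defaults; a 2x model is assumed):

#include <cstddef>
#include <cstdio>

int main() {
  size_t max_width = 512, max_height = 512;
  bool handle_alpha = true;
  int scale_w = 2, scale_h = 2;  // session->detect_scale() fills these in

  // Input buffer: one element per pixel per channel (RGBA when alpha is kept).
  size_t max_size = max_width * max_height * (handle_alpha ? 4 : 3);
  // Output buffer: input element count times the upscale factor on each axis.
  size_t max_size_out = max_size * scale_w * scale_h;

  std::printf("in: %zu elements, out: %zu elements\n", max_size, max_size_out);
  // in: 1048576 elements, out: 4194304 elements
}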

image_io.h

Lines changed: 43 additions & 6 deletions

@@ -4,30 +4,67 @@
 #include <utility>
 #include <string>
 #include <variant>
-#include <filesystem>
+
+#include <cuda_runtime_api.h>

 #include "nn-scaler.h"
 #include "md_view.h"

 std::string init_image_io();

-typedef std::unique_ptr<uint8_t[]> mem_owner;
+struct pinned_deleter {
+  void operator()(void* p) const {
+    cudaFreeHost(p);
+  }
+};
+
+struct pinned_memory : std::unique_ptr<uint8_t[], pinned_deleter> {
+  struct alloc_flag {
+    uint32_t flags;
+  };
+
+  constexpr static alloc_flag alloc_default {cudaHostAllocDefault};
+  constexpr static alloc_flag alloc_h2d {cudaHostAllocWriteCombined};
+
+  pinned_memory() = default;
+
+  explicit pinned_memory(size_t count, alloc_flag flags=alloc_default) {
+    void* mem{};
+    auto result = cudaHostAlloc(&mem, count, flags.flags);
+    if (result != cudaSuccess) {
+      throw std::bad_alloc();
+    }
+    this->reset(static_cast<uint8_t*>(mem));
+  }
+};
+
+using mem_owner = pinned_memory;

 template<typename U, size_t DIMS>
-static std::pair<md_view<U, int32_t, DIMS>, mem_owner> alloc_buffer(shape_t<int32_t, DIMS> s) {
-  auto ptr = std::make_unique<uint8_t[]>(s.count() * sizeof(U));
+static std::pair<md_view<U, int32_t, DIMS>, mem_owner> alloc_buffer(mem_owner::alloc_flag flags, shape_t<int32_t, DIMS> s) {
+  auto ptr = pinned_memory(s.count() * sizeof(U), flags);
   md_view<U, int32_t, DIMS> view = {reinterpret_cast<U *>(ptr.get()), s};
   return {view, std::move(ptr)};
 }

 template<typename U, typename ...D>
-static std::pair<md_view<U, int32_t, sizeof...(D)>, mem_owner> alloc_buffer(D... d) {
+static std::pair<md_view<U, int32_t, sizeof...(D)>, mem_owner> alloc_buffer(mem_owner::alloc_flag flags, D... d) {
   shape_t<int32_t, sizeof...(D)> s{static_cast<int32_t>(d)...};
-  auto ptr = std::make_unique<uint8_t[]>(s.count() * sizeof(U));
+  auto ptr = pinned_memory(s.count() * sizeof(U), flags);
   md_view<U, int32_t, sizeof...(D)> view = {reinterpret_cast<U *>(ptr.get()), s};
   return {view, std::move(ptr)};
 }

+template<typename U, size_t DIMS>
+static std::pair<md_view<U, int32_t, DIMS>, mem_owner> alloc_buffer(shape_t<int32_t, DIMS> s) {
+  return alloc_buffer<U>(pinned_memory::alloc_default, s);
+}
+
+template<typename U, typename ...D>
+static std::pair<md_view<U, int32_t, sizeof...(D)>, mem_owner> alloc_buffer(D... d) {
+  return alloc_buffer<U>(pinned_memory::alloc_default, d...);
+}
+
 std::variant<std::pair<shape_t<int32_t, 3>, mem_owner>, std::string>
 load_image(Work::input_t file, bool ignore_alpha);
 std::string save_image(Work::output_t file, md_view<uint8_t, int32_t, 3> data);
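
Switching mem_owner from a plain new[]-backed unique_ptr to cudaHostAlloc'd pinned (page-locked) memory is what lets decoded images be copied to the GPU asynchronously: cudaMemcpyAsync from pageable host memory silently degrades to a staged, host-synchronous copy. A minimal sketch of the pattern this enables (names are illustrative, not from this repo):

#include <cuda_runtime_api.h>

// Upload a pinned host buffer without blocking the host thread.
// 'host' must come from cudaHostAlloc, as pinned_memory now guarantees.
cudaError_t upload_async(const void *host, void *device, size_t bytes,
                         cudaStream_t stream) {
  return cudaMemcpyAsync(device, host, bytes, cudaMemcpyHostToDevice, stream);
}

The alloc_h2d flag additionally requests write-combined memory, intended for buffers the CPU only writes and the GPU only reads.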

image_wic.cpp

Lines changed: 4 additions & 4 deletions

@@ -88,7 +88,7 @@ load_image(Work::input_t file, bool ignore_alpha) {
   IWICBitmapFrameDecode *pFrame = nullptr;
   IWICFormatConverter *pConverter = nullptr;
   HRESULT hr;
-  md_view<uint8_t, int32_t, 3> view;
+  md_view<uint8_t, int32_t, 3> view{};
   mem_owner pixels;

   if (file.index() == 0) {
@@ -120,7 +120,7 @@ load_image(Work::input_t file, bool ignore_alpha) {
   bool use_opaque = (ignore_alpha || !has_alpha);
   desire_format = use_opaque ? desire_format_opaque : desire_format_alpha;

-  std::tie(view, pixels) = alloc_buffer<uint8_t>(height, width, use_opaque ? 3 : 4);
+  std::tie(view, pixels) = alloc_buffer<uint8_t>(mem_owner::alloc_h2d, height, width, use_opaque ? 3 : 4);
   if (input_format != desire_format) {
     HR_CHECK(pFactory->CreateFormatConverter(&pConverter));
     HR_CHECK(pConverter->Initialize(pFrame,
@@ -133,13 +133,13 @@ load_image(Work::input_t file, bool ignore_alpha) {
     HR_CHECK(pConverter->CopyPixels(nullptr,
                                     view.at(0).size(),
                                     view.size(),
-                                    reinterpret_cast<BYTE *>(pixels.get())));
+                                    pixels.get()));
   }
   else {
     HR_CHECK(pFrame->CopyPixels(nullptr,
                                 view.at(0).size(),
                                 view.size(),
-                                reinterpret_cast<BYTE *>(pixels.get())));
+                                pixels.get()));
   }
 }
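
The WIC decode target is now allocated with mem_owner::alloc_h2d: the decoder fills it sequentially and the GPU reads it once across the bus, which is the one access pattern write-combined memory is fast for (CPU reads of such memory are very slow). The reinterpret_casts could be dropped because pixels.get() now yields uint8_t *, the same underlying type as WIC's BYTE.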

image_wuffs.cpp

Lines changed: 10 additions & 7 deletions

@@ -26,6 +26,8 @@
 #include <optional>
 #include <vector>

+#include "cuda_runtime.h"
+
 #include "jpeglib.h"
 #include <setjmp.h>

@@ -56,19 +58,19 @@ class MyDecodeImageCallbacks : public wuffs_aux::DecodeImageCallbacks {
     if ((len == 0) || (SIZE_MAX < len)) {
       return {wuffs_aux::DecodeImage_UnsupportedPixelConfiguration};
     }
-    auto mem = std::make_unique<uint8_t[]>(len);
+    auto mem_ = alloc_buffer<uint8_t>(mem_owner::alloc_h2d, len);
+    auto mem = wuffs_aux::MemOwner{mem_.second.release(), [](void *ptr) noexcept { cudaFreeHost(ptr); }};
     if (!mem) {
       return {wuffs_aux::DecodeImage_OutOfMemory};
     }
     wuffs_base__pixel_buffer pixbuf;
     wuffs_base__status status = pixbuf.set_from_slice(
         &image_config.pixcfg,
-        wuffs_base__make_slice_u8(mem.get(), (size_t) len));
+        wuffs_base__make_slice_u8(static_cast<uint8_t *>(mem.get()), (size_t) len));
     if (!status.is_ok()) {
       return {status.message()};
     }
-    wuffs_aux::MemOwner owner {mem.release(), operator delete[]};
-    return {std::move(owner), pixbuf};
+    return {std::move(mem), pixbuf};
   }

 std::string //
@@ -249,11 +251,12 @@ load_image(Work::input_t file, bool ignore_alpha) {
     }
   }

-  md_view<uint8_t, int32_t, 3> in_view{reinterpret_cast<uint8_t *>(res.pixbuf_mem_owner.get()),
+  md_view in_view{static_cast<uint8_t *>(res.pixbuf_mem_owner.get()),
                   {static_cast<int>(res.pixbuf.pixcfg.height()),
                    static_cast<int>(res.pixbuf.pixcfg.width()),
                    res.pixbuf.pixcfg.pixel_format().transparency() ? 4 : 3}};

-  std::unique_ptr<uint8_t[]> in_ptr(reinterpret_cast<uint8_t*>(res.pixbuf_mem_owner.release()));
-  return std::make_pair(in_view.shape, std::move(in_ptr));
+  mem_owner owner;
+  owner.reset(static_cast<uint8_t*>(res.pixbuf_mem_owner.release()));
+  return std::make_pair(in_view.shape, std::move(owner));
 }
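
The wuffs path gets the same treatment: the decode buffer now comes from alloc_buffer with the write-combined flag and is wrapped in a wuffs_aux::MemOwner whose deleter is cudaFreeHost, so at the end of load_image ownership can be transferred back into a mem_owner (pinned_memory) without a copy. One subtlety: since pinned_memory's constructor throws std::bad_alloc on failure, the if (!mem) / DecodeImage_OutOfMemory branch is now effectively unreachable.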

infer_engine.h

Lines changed: 53 additions & 13 deletions

@@ -6,6 +6,8 @@

 #include "NvInfer.h"

+#include "md_view.h"
+
 struct optimization_axis {
   optimization_axis(int32_t min, int32_t opt, int32_t max) : min(min), opt(opt), max(max) {}
   optimization_axis(int32_t same) : min(same), opt(same), max(same) {}
@@ -24,6 +26,7 @@ struct ScalerConfig {
   optimization_axis batch;

   int32_t aux_stream;
+  bool use_strong_type;
   bool use_fp16;
   bool use_int8;
   bool force_precision;
@@ -33,11 +36,15 @@ struct ScalerConfig {
   [[nodiscard]] std::string engine_name() const {
     std::stringstream ss;
     ss << "_w" << input_width << "_h" << input_height << "_b" << batch << "_a" << aux_stream;
-    if (use_fp16) {
-      ss << "_fp16";
-    }
-    if (use_int8) {
-      ss << "_int8";
+    if (use_strong_type) {
+      ss << "_stype";
+    } else {
+      if (use_fp16) {
+        ss << "_fp16";
+      }
+      if (use_int8) {
+        ss << "_int8";
+      }
     }
     if (force_precision) {
       ss << "_force_prec";
@@ -87,7 +94,7 @@ class InferenceContext {
 public:
  ScalerConfig config;
  InferenceContext(ScalerConfig config, nvinfer1::ILogger &logger, const std::filesystem::path& path_prefix);
-  bool has_file();
+  bool has_file() const;
  std::string load_engine();

  bool good() {
@@ -99,14 +106,17 @@ class InferenceSession {
  InferenceContext ctx;

  nvinfer1::IExecutionContext *context;
-  void *execution_memory;
-  int32_t last_batch, last_height, last_width;
+  void *execution_memory{};
+  int32_t last_batch=-1, last_height=-1, last_width=-1;
  std::atomic<bool> good_;
+  void *input_ptr{}, *output_ptr{};
+  bool input_interleaved{}, output_interleaved{};
+  int32_t input_channel_stride{}, output_channel_stride{};

 public:
-  cudaStream_t stream;
-  cudaEvent_t input_consumed;
-  void *input, *output;
+  cudaStream_t stream{};
+  cudaEvent_t input_consumed{};
+  int32_t scale_w=-1, scale_h=-1;

  explicit InferenceSession(InferenceContext &ctx);
  ~InferenceSession();
@@ -117,7 +127,37 @@ class InferenceSession {
  std::string allocation();
  std::string deallocation();
  void config(int32_t batch, int32_t height, int32_t width);
-  std::pair<int32_t, int32_t> detect_scale();
+  void detect_scale();
+
+  bool inference() const;
+
+  template<typename F>
+  md_uview<F, int32_t, 3, int64_t> input(int32_t height, int32_t width) const {
+    shape_t shape {3, height, width};
+
+    shape_t stride_shape {input_channel_stride, height, width};
+    if (input_interleaved) {
+      stride_shape = stride_shape.gather<1, 2, 0>();
+    }
+    stride_t stride = stride_shape.stride<int64_t>();
+    if (input_interleaved) {
+      stride = stride.gather<2, 0, 1>();
+    }
+    return {static_cast<F *>(input_ptr), shape, stride};
+  }
+
+  template<typename F>
+  md_uview<F, int32_t, 3, int64_t> output(int32_t height, int32_t width) const {
+    shape_t shape {3, height * scale_h, width * scale_w};

-  bool inference();
+    shape_t stride_shape {output_channel_stride, height * scale_h, width * scale_w};
+    if (output_interleaved) {
+      stride_shape = stride_shape.gather<1, 2, 0>();
+    }
+    stride_t stride = stride_shape.stride<int64_t>();
+    if (output_interleaved) {
+      stride = stride.gather<2, 0, 1>();
+    }
+    return {static_cast<F *>(output_ptr), shape, stride};
+  }
 };
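
The templated input()/output() accessors are where HWC support actually lands. The returned view always has shape {C, H, W}, but for an interleaved binding the strides are derived from the layout shape {H, W, C} (the first gather), then permuted back into channel-first axis order (the second gather), yielding a non-contiguous md_uview over the same buffer; input_channel_stride / output_channel_stride stand in for the literal channel count so a padded interleaved format can back the 3-channel view. A standalone sketch of that stride arithmetic with plain arrays (md_uview, shape_t, and gather are this repo's md_view library; the code below only illustrates the computation):

#include <array>
#include <cstdint>
#include <cstdio>

using shape3 = std::array<int64_t, 3>;

// Row-major strides: the last axis is contiguous.
shape3 strides_of(shape3 s) {
  return {s[1] * s[2], s[2], 1};
}

int main() {
  int64_t C = 3, H = 720, W = 1280;

  // CHW (planar): strides follow the {C, H, W} shape directly.
  shape3 chw = strides_of({C, H, W});          // {H*W, W, 1}

  // HWC (interleaved): lay the data out as {H, W, C}...
  shape3 hwc_layout = strides_of({H, W, C});   // {W*C, C, 1}
  // ...then permute the strides back into {C, H, W} axis order,
  // so index (c, y, x) still means channel, row, column.
  shape3 hwc = {hwc_layout[2], hwc_layout[0], hwc_layout[1]};  // {1, W*C, C}

  std::printf("CHW strides: %lld %lld %lld\n",
              (long long)chw[0], (long long)chw[1], (long long)chw[2]);
  std::printf("HWC strides: %lld %lld %lld\n",
              (long long)hwc[0], (long long)hwc[1], (long long)hwc[2]);
}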
