Skip to content

Commit ab35e3e

Browse files
committed
[Refactor] AlexNet
1 parent 33d54ea commit ab35e3e

File tree

7 files changed

+539
-388
lines changed

7 files changed

+539
-388
lines changed

alexnet/CMakeLists.txt

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,45 @@
1-
cmake_minimum_required(VERSION 2.6)
1+
cmake_minimum_required(VERSION 3.14)
22

3-
project(alexnet)
3+
project(
4+
alexnet
5+
VERSION 0.1
6+
LANGUAGES C CXX CUDA)
47

5-
add_definitions(-std=c++11)
8+
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
9+
set(CMAKE_CUDA_ARCHITECTURES
10+
60
11+
70
12+
72
13+
75
14+
80
15+
86
16+
89)
17+
endif()
618

7-
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
8-
set(CMAKE_CXX_STANDARD 11)
9-
set(CMAKE_BUILD_TYPE Debug)
19+
set(CMAKE_CXX_STANDARD 17)
20+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
21+
set(CMAKE_CUDA_STANDARD 17)
22+
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
23+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
24+
set(CMAKE_INCLUDE_CURRENT_DIR TRUE)
25+
set(CMAKE_BUILD_TYPE
26+
"Debug"
27+
CACHE STRING "Build type for this project" FORCE)
1028

11-
include_directories(${PROJECT_SOURCE_DIR}/include)
12-
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
13-
# cuda
14-
include_directories(/usr/local/cuda/include)
15-
link_directories(/usr/local/cuda/lib64)
16-
# tensorrt
17-
include_directories(/usr/include/x86_64-linux-gnu/)
18-
link_directories(/usr/lib/x86_64-linux-gnu/)
29+
option(CUDA_USE_STATIC_CUDA_RUNTIME "Use static cudaruntime library" OFF)
1930

20-
add_executable(alexnet ${PROJECT_SOURCE_DIR}/alex.cpp)
21-
target_link_libraries(alexnet nvinfer)
22-
target_link_libraries(alexnet cudart)
31+
find_package(Threads REQUIRED)
32+
find_package(CUDAToolkit REQUIRED)
2333

24-
add_definitions(-O2 -pthread)
34+
if(NOT TARGET TensorRT::TensorRT)
35+
include(FindTensorRT.cmake)
36+
else()
37+
message("TensorRT has been found, skipping for ${PROJECT_NAME}")
38+
endif()
2539

40+
add_executable(${PROJECT_NAME} alex.cpp)
41+
42+
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_SOURCE_DIR}/include)
43+
44+
target_link_libraries(${PROJECT_NAME} PUBLIC Threads::Threads m
45+
TensorRT::TensorRT CUDA::cudart)

alexnet/FindTensorRT.cmake

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
# Locates TensorRT headers and libraries and exposes them as the
# imported target TensorRT::TensorRT. Requires the TRT_VERSION
# environment variable (read at configure time) to pick the module set.
cmake_minimum_required(VERSION 3.18.0)

set(TRT_VERSION
    $ENV{TRT_VERSION}
    CACHE STRING
          "TensorRT version, e.g. \"8.6.1.6\" or \"8.6.1.6+cuda12.0.1.011\"")

# find TensorRT include folder
if(NOT TensorRT_INCLUDE_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    set(TensorRT_INCLUDE_DIR
        "/usr/local/cuda/targets/aarch64-linux/include"
        CACHE PATH "TensorRT_INCLUDE_DIR")
  else()
    set(TensorRT_INCLUDE_DIR
        "/usr/include/x86_64-linux-gnu"
        CACHE PATH "TensorRT_INCLUDE_DIR")
  endif()
  message(STATUS "TensorRT include dir: ${TensorRT_INCLUDE_DIR}")
endif()

# find TensorRT library folder
if(NOT TensorRT_LIBRARY_DIR)
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
    set(TensorRT_LIBRARY_DIR
        "/usr/lib/aarch64-linux-gnu/tegra"
        CACHE PATH "TensorRT_LIBRARY_DIR")
  else()
    # Fixed: libraries live under /usr/lib/..., not /usr/include/...
    set(TensorRT_LIBRARY_DIR
        "/usr/lib/x86_64-linux-gnu"
        CACHE PATH "TensorRT_LIBRARY_DIR")
  endif()
  message(STATUS "TensorRT library dir: ${TensorRT_LIBRARY_DIR}")
endif()

set(TensorRT_LIBRARIES)

# pick the library module list for the detected TensorRT major version
if(DEFINED TRT_VERSION AND NOT TRT_VERSION STREQUAL "")
  # quoted so an empty or ;-containing value cannot break the call
  string(REGEX MATCH "([0-9]+)" _match "${TRT_VERSION}")
  set(TRT_MAJOR_VERSION "${_match}")
  set(_modules nvinfer nvinfer_plugin)

  if(TRT_MAJOR_VERSION GREATER_EQUAL 8)
    list(APPEND _modules nvinfer_vc_plugin nvinfer_dispatch nvinfer_lean)
  endif()
else()
  message(FATAL_ERROR "Please set an environment variable \"TRT_VERSION\"")
endif()

# find all modules of TensorRT and collect them into the list;
# fail loudly instead of silently appending a -NOTFOUND entry
foreach(lib IN LISTS _modules)
  find_library(
    TensorRT_${lib}_LIBRARY
    NAMES ${lib}
    HINTS ${TensorRT_LIBRARY_DIR})
  if(NOT TensorRT_${lib}_LIBRARY)
    message(FATAL_ERROR
            "TensorRT module \"${lib}\" not found in ${TensorRT_LIBRARY_DIR}")
  endif()
  list(APPEND TensorRT_LIBRARIES ${TensorRT_${lib}_LIBRARY})
endforeach()

# report AFTER the search so the message shows the actual results
# (previously this printed an always-empty list before the loop ran)
message(STATUS "Found TensorRT libs: ${TensorRT_LIBRARIES}")

# make the "TensorRT" target; GLOBAL so the ALIAS below is legal for an
# imported target (requires CMake >= 3.18)
add_library(TensorRT INTERFACE IMPORTED GLOBAL)
add_library(TensorRT::TensorRT ALIAS TensorRT)
target_link_libraries(TensorRT INTERFACE ${TensorRT_LIBRARIES})

set_target_properties(
  TensorRT
  PROPERTIES C_STANDARD 17
             CXX_STANDARD 17
             POSITION_INDEPENDENT_CODE ON
             SKIP_BUILD_RPATH TRUE
             BUILD_WITH_INSTALL_RPATH TRUE
             # $ORIGIN must reach the linker literally; the previous
             # "$\{ORIGIN\}" embedded backslashes in the rpath
             INSTALL_RPATH "$ORIGIN"
             INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIR}")

unset(TRT_MAJOR_VERSION)
unset(_modules)

alexnet/README.md

Lines changed: 53 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,67 @@
1-
# alexnet
1+
# AlexNet
22

3-
AlexNet model architecture from the "One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.
3+
## Introduction
44

5-
For the details, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet)
5+
AlexNet model architecture comes from this paper: [One weird trick for parallelizing convolutional neural networks](https://arxiv.org/abs/1404.5997). To generate `.wts` file, you can refer to [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet). To check the pytorch implementation of AlexNet, refer to [HERE](https://github.com/pytorch/vision/blob/main/torchvision/models/alexnet.py#L17)
66

7-
This alexnet is just several `conv-relu-pool` blocks followed by several `fc-relu`, nothing special. All layers can be implemented by tensorrt api, including `addConvolution`, `addActivation`, `addPooling`, `addFullyConnected`.
7+
AlexNet consists of 3 major parts: features, adaptive average pooling, and classifier:
8+
* features: just several stacked `CRP`(conv-relu-pool) and `CR` layers
9+
* adaptive average pooling: PyTorch derives its internal parameters automatically, but with the TensorRT API we need to calculate them ourselves
10+
* classifier: just several `fc-relu` layers. All layers can be implemented by tensorrt api, including `addConvolution`, `addActivation`, `addPooling`, `addMatrixMultiply`, `addElementWise` etc.
811

9-
```
10-
// 1. generate alexnet.wts from [pytorchx/alexnet](https://github.com/wang-xinyu/pytorchx/tree/master/alexnet)
11-
12-
// 2. put alexnet.wts into tensorrtx/alexnet
13-
14-
// 3. build and run
15-
16-
cd tensorrtx/alexnet
12+
## Use AlexNet from PyTorch
1713

18-
mkdir build
14+
We can use torchvision to load the pretrained alexnet model:
1915

20-
cd build
16+
```python
17+
alexnet = torchvision.models.alexnet(pretrained=True)
18+
```
2119

22-
cmake ..
20+
The model structure is:
21+
22+
```txt
23+
AlexNet(
24+
(features): Sequential(
25+
(0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
26+
(1): ReLU(inplace=True)
27+
(2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
28+
(3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
29+
(4): ReLU(inplace=True)
30+
(5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
31+
(6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
32+
(7): ReLU(inplace=True)
33+
(8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
34+
(9): ReLU(inplace=True)
35+
(10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
36+
(11): ReLU(inplace=True)
37+
(12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
38+
)
39+
(avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
40+
(classifier): Sequential(
41+
(0): Dropout(p=0.5, inplace=False)
42+
(1): Linear(in_features=9216, out_features=4096, bias=True)
43+
(2): ReLU(inplace=True)
44+
(3): Dropout(p=0.5, inplace=False)
45+
(4): Linear(in_features=4096, out_features=4096, bias=True)
46+
(5): ReLU(inplace=True)
47+
(6): Linear(in_features=4096, out_features=1000, bias=True)
48+
)
49+
)
50+
```
2351

24-
make
52+
## FAQ
2553

26-
sudo ./alexnet -s // serialize model to plan file i.e. 'alexnet.engine'
54+
### How to align the output with Pytorch?
2755

28-
sudo ./alexnet -d // deserialize plan file and run inference
56+
If your output differs from PyTorch's, you have to find out which TensorRT API call or part of your code causes this. A simple approach is to check the `.engine` output part by part; e.g., you can set an early layer of AlexNet as the output:
2957

30-
// 4. see if the output is same as pytorchx/alexnet
58+
```c++
59+
fc3_1->getOutput(0)->setName(OUTPUT_NAME);
60+
network->markOutput(*pool3->getOutput(0)); // original is: "*fc3_1->getOutput(0)"
3161
```
3262
63+
With this line of code, I use the output from the "features" part of AlexNet and ignore the rest of the model. Then don't forget to change the `OUTPUT_SIZE` macro at the top of the file; lastly, rebuild the `.engine` file to apply the changes.
64+
65+
You can sum up all outputs in the C++ code and compare the result with the PyTorch output; in PyTorch you can do this with `torch.sum(x)` while debugging. The acceptable deviation between the two sums would be around $[10^{-2}, 10^{-1}]$; for this example, since the "features" output has $256 \times 6 \times 6$ elements (batch = 1), the per-element error would roughly be $10^{-4}$.
3366
67+
Note: this is only a quick check; for a more accurate comparison you have to save the output tensors to a file and compare them value by value, but that is rarely necessary.

0 commit comments

Comments
 (0)