Updates for Offline Deployment and General Bug/QoL Fixes (#349)

* Adding all Thirdparty packages to CPM for ease of deployment * Updating CPM and adding as git subtree of project * Documentation on all of the above (in progress) * Fixes for NVTX Macros and CMake flags * Update includes for einsum * Fix for size checks on matvec * Documentation on linkage/usage
NVIDIA · Jan 11, 2023 · 20e00a2 · 20e00a2
1 parent 74442fa
commit 20e00a2
Show file tree

Hide file tree

Showing 155 changed files with 5,206 additions and 326 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -68,7 +68,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CUDA_CXX_STANDARD 17)
 
 # CPM is required for all package management
-include(cmake/GetCPM.cmake)
+include(public/cpm-cmake/cmake/CPM.cmake)
 # Helper for selecting build type
 include(cmake/BuildType.cmake)
 
@@ -244,6 +244,7 @@ endif()
 
 if (MATX_NVTX_FLAGS)
     add_definitions(-DMATX_NVTX_FLAGS)
+    target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
 endif()
 if (MATX_BUILD_32_BIT)
     add_definitions(-DINDEX_32_BIT)

diff --git a/cmake/FindcuTENSOR.cmake b/cmake/FindcuTENSOR.cmake
@@ -84,13 +84,17 @@ if(NOT cuTENSOR_FOUND)
 
   message(STATUS "cuTENSOR not found. Downloading library. By continuing this download you accept to the license terms of cuTENSOR")
 
-  file(DOWNLOAD https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-${CUTENSOR_VERSION}-archive.tar.xz
-      ${CMAKE_BINARY_DIR}/${CUTENSOR_FILENAME}.tar.xz)
+  CPMAddPackage(
+    NAME cutensor
+    VERSION ${CUTENSOR_VERSION}
+    URL https://developer.download.nvidia.com/compute/cutensor/redist/libcutensor/linux-x86_64/libcutensor-linux-x86_64-${CUTENSOR_VERSION}-archive.tar.xz
+    # Prebuilt binary archive: only download and extract it, there is nothing to configure
+    DOWNLOAD_ONLY YES 
+  )
+
+  set(cuTENSOR_LIBRARY ${cutensor_SOURCE_DIR}/lib/11/libcutensor.so)
+  set(cuTENSOR_INCLUDE_DIR ${cutensor_SOURCE_DIR}/include) 
 
-  file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/${CUTENSOR_FILENAME}.tar.xz DESTINATION ${CMAKE_BINARY_DIR}/cutensor/)    
-
-  set(cuTENSOR_LIBRARY ${CMAKE_BINARY_DIR}/cutensor/${CUTENSOR_FILENAME}/lib/11/libcutensor.so) 
-  set(cuTENSOR_INCLUDE_DIR ${CMAKE_BINARY_DIR}/cutensor/${CUTENSOR_FILENAME}/include) 
 
   set(cuTENSOR_FOUND TRUE)
 endif()

diff --git a/cmake/FindcuTensorNet.cmake b/cmake/FindcuTensorNet.cmake
@@ -83,14 +83,17 @@ if(NOT cuTensorNet_FOUND)
 
   set(CUTENSORNET_VERSION 22.03.0.40)
   set(CUTENSORNET_FILENAME cuquantum-linux-x86_64-${CUTENSORNET_VERSION}-archive)
-
-  file(DOWNLOAD https://developer.download.nvidia.com/compute/cuquantum/redist/cuquantum/linux-x86_64/${CUTENSORNET_FILENAME}.tar.xz
-       ${CMAKE_BINARY_DIR}/${CUTENSORNET_FILENAME}.tar.xz)
-
-  file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/${CUTENSORNET_FILENAME}.tar.xz DESTINATION ${CMAKE_BINARY_DIR}/cutensornet/)
 
-  set(cuTensorNet_LIBRARY ${CMAKE_BINARY_DIR}/cutensornet/${CUTENSORNET_FILENAME}/lib/libcutensornet.so) 
-  set(cuTensorNet_INCLUDE_DIR ${CMAKE_BINARY_DIR}/cutensornet/${CUTENSORNET_FILENAME}/include) 
+  CPMAddPackage(
+               NAME cutensornet
+               VERSION ${CUTENSORNET_VERSION}
+               URL https://developer.download.nvidia.com/compute/cuquantum/redist/cuquantum/linux-x86_64/${CUTENSORNET_FILENAME}.tar.xz
+               # Prebuilt binary archive: only download and extract it, there is nothing to configure
+               DOWNLOAD_ONLY YES 
+               )
+
+  set(cuTensorNet_LIBRARY ${cutensornet_SOURCE_DIR}/lib/libcutensornet.so) 
+  set(cuTensorNet_INCLUDE_DIR ${cutensornet_SOURCE_DIR}/include) 
 
   set(cuTensorNet_FOUND TRUE)
 endif()

diff --git a/docs/_sources/api/fft.rst.txt b/docs/_sources/api/fft.rst.txt
@@ -2,8 +2,8 @@ FFT
 ###
 The API below provides transformations for Fast Fourier Transforms (FFTs) of both 1D and 2D types with batching.
 
-.. doxygenfunction:: fft(OutputTensor &o, const InputTensor &i, index_t fft_size = 0, cudaStream_t stream = 0)
-.. doxygenfunction:: ifft(OutputTensor &o, const InputTensor &i, index_t fft_size = 0, cudaStream_t stream = 0)
-.. doxygenfunction:: fft2(OutputTensor &o, const InputTensor &i, cudaStream_t stream = 0)
-.. doxygenfunction:: ifft2(OutputTensor &o, const InputTensor &i, cudaStream_t stream = 0)
+.. doxygenfunction:: fft(OutputTensor o, const InputTensor i, index_t fft_size = 0, cudaStream_t stream = 0)
+.. doxygenfunction:: ifft(OutputTensor o, const InputTensor i, index_t fft_size = 0, cudaStream_t stream = 0)
+.. doxygenfunction:: fft2(OutputTensor o, const InputTensor i, cudaStream_t stream = 0)
+.. doxygenfunction:: ifft2(OutputTensor o, const InputTensor i, cudaStream_t stream = 0)
 .. doxygenfunction:: dct(OutputTensor &out, const InputTensor &in, const cudaStream_t stream = 0)
diff --git a/docs/_sources/api/index.rst.txt b/docs/_sources/api/index.rst.txt
@@ -15,3 +15,4 @@ API Reference
    utilities.rst
    type_traits.rst
    einsum.rst
+   nvtx.rst
diff --git a/docs/_sources/api/matmul.rst.txt b/docs/_sources/api/matmul.rst.txt
@@ -6,5 +6,6 @@ is supported for any tensor with a rank higher than 2.
 
 Cached API
 ----------
-.. doxygenfunction:: matmul
+.. doxygenfunction:: matmul(TensorTypeC C, const TensorTypeA A, const TensorTypeB B, const int32_t (&axis)[2], cudaStream_t stream = 0, float alpha = 1.0, float beta = 0.0)
+.. doxygenfunction:: matmul(TensorTypeC C, const TensorTypeA A, const TensorTypeB B, cudaStream_t stream = 0, float alpha = 1.0, float beta = 0.0)
 
diff --git a/docs/_sources/api/nvtx.rst.txt b/docs/_sources/api/nvtx.rst.txt
@@ -0,0 +1,70 @@
+NVTX Profiling
+##############
+
+Overview
+--------
+MatX provides an NVTX API to enable native compile-in profiling capabilities. The MatX NVTX API enables a user to 
+easily profile all MatX calls using built-in NVTX ranges, while also providing a convenient API for the user to insert 
+custom ranges in their own code. This API provides many convenience features such as:
+
+- A convenient compile-in/compile-out macro-based API
+- Verbosity levels allowing varying levels of profiling detail
+- Built-in color rotation
+- Automatic scope management and range naming 
+- Overloaded API for manual range specification
+
+The MatX NVTX API is implemented as a set of C++ Macros, allowing the user to compile all calls out of the project for 
+maximum performance when profiling is not needed. 
+
+Enabling NVTX API and Setting Log Level
+---------------------------------------
+To enable the NVTX Profiling API, simply compile with ``MATX_NVTX_FLAGS=ON`` set in the cmake command.
+Once the flags are enabled at compile time, the project defaults to logging at the API level, which will provide NVTX
+ranges for all MatX API calls. If another logging level is desired, this can be changed using the ``matx::setNVTXLogLevel()`` call. 
+Possible log levels are defined in ``matx_nvxtLogLevels``.
+
+Using the NVTX API
+------------------
+The MatX NVTX API consists of two modes: auto managed, and manual range. The auto-managed API will automatically match the NVTX range to 
+the scope in which it is declared, establishing the NVTX range from the call’s instantiation to the end of its parent scope. Only a single 
+call is needed, with optional inputs defined below. If no message is provided, the call defaults to using the calling function’s name as 
+the NVTX range’s message.
+
+The Manual Range NVTX API requires the user to make a call to the NVTX API at both the beginning and end of the desired range. The Manual 
+Range API uses a user defined handle (int) to reference the NVTX range. A Manual NVTX Range must be fully qualified on every instantiation. 
+
+NVTX Examples
+-------------
+
+.. list-table::
+  :widths: 60 40
+  :header-rows: 1
+
+  * - Command 
+    - Result
+  * - MATX_NVTX_START("")
+    - NVTX range scoped to this function, named after the calling function, with log level of Internal
+  * - MATX_NVTX_START("MY_MESSAGE")
+    - NVTX range scoped to this function, named “MY_MESSAGE” with log level of Internal
+  * - MATX_NVTX_START("MY_MESSAGE", matx::MATX_NVTX_LOG_API )
+    - NVTX range scoped to this function, named “MY_MESSAGE” with log level of API
+  * - MATX_NVTX_START_RANGE( "MY_MESSAGE", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1 )
+    - NVTX range with manual scope, named “MY_MESSAGE”, log level of USER, and handle ID of 1
+  * - MATX_NVTX_END_RANGE(1)
+    - Ends the NVTX range with a handle of 1 that was started by MATX_NVTX_START_RANGE
+
+Code examples are provided in the ``simple_radar_pipeline`` code to show user utilization of the MatX NVTX API. 
+
+MatX NVTX API 
+-------------
+.. doxygenfunction:: matx::setNVTXLogLevel
+.. doxygenfunction:: matx::registerEvent
+.. doxygenfunction:: matx::endEvent
+
+MatX NVTX Logging Levels
+------------------------
+.. doxygenenum:: matx::matx_nvxtLogLevels
+
+MatX NVTX Auto Range Colors
+---------------------------
+.. doxygenvariable:: matx::nvtxColors    
diff --git a/docs/_sources/api/reduce.rst.txt b/docs/_sources/api/reduce.rst.txt
@@ -5,16 +5,25 @@ The reductions API provides functions for reducing data from a higher rank to a
 are listed in the :ref:`statistics` API guide section. Note that to avoid collisions with the C++ standard library, 
 ``min`` and ``max`` are called ``rmin`` and ``rmax`` in MatX.
 
-.. doxygenfunction:: reduce(OutType dest, [[maybe_unused]] TensorIndexType idest, InType in, ReduceOp op, cudaStream_t stream = 0, bool init = true)
+.. doxygenfunction:: reduce(OutType dest, [[maybe_unused]] TensorIndexType idest, const InType &in, ReduceOp op, cudaStream_t stream = 0, bool init = true)
 .. doxygenfunction:: reduce(OutType dest, const InType &in, ReduceOp op, cudaStream_t stream = 0, [[maybe_unused]] bool init = true)
-.. doxygenfunction:: any
-.. doxygenfunction:: all
-.. doxygenfunction:: rmin
-.. doxygenfunction:: rmax
-.. doxygenfunction:: sum  
-.. doxygenfunction:: argmin
-.. doxygenfunction:: argmax
+.. doxygenfunction:: any(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: any(OutType dest, const InType &in, cudaStream_t stream = 0)
+.. doxygenfunction:: all(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: all(OutType dest, const InType &in, cudaStream_t stream = 0)
+.. doxygenfunction:: rmin(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: rmin(OutType dest, const InType &in, cudaStream_t stream = 0)
+.. doxygenfunction:: rmax(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: rmax(OutType dest, const InType &in, cudaStream_t stream = 0)
+.. doxygenfunction:: sum(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: sum(OutType dest, const InType &in, cudaStream_t stream = 0)
+.. doxygenfunction:: argmin(OutType dest, TensorIndexType &idest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: argmin(OutType dest, TensorIndexType &idest, const InType &in, cudaStream_t stream = 0)
+.. doxygenfunction:: argmax(OutType dest, const TensorIndexType &idest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: argmax(OutType dest, TensorIndexType &idest, const InType &in, cudaStream_t stream = 0)  
 .. doxygenfunction:: trace
 .. doxygenfunction:: find
 .. doxygenfunction:: find_idx
 .. doxygenfunction:: unique
+.. doxygenfunction:: softmax(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)  
+.. doxygenfunction:: softmax(OutType dest, const InType &in, cudaStream_t stream = 0)
diff --git a/docs/_sources/api/stats.rst.txt b/docs/_sources/api/stats.rst.txt
@@ -6,7 +6,11 @@ The API below provides methods for statistics functions.
 
 .. doxygenfunction:: cumsum
 .. doxygenfunction:: hist
-.. doxygenfunction:: mean
-.. doxygenfunction:: median
-.. doxygenfunction:: var 
-.. doxygenfunction:: stdd
+.. doxygenfunction:: mean(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: mean(OutType dest, const InType &in, cudaStream_t stream = 0)  
+.. doxygenfunction:: median(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: median(OutType dest, const InType &in, cudaStream_t stream = 0)  
+.. doxygenfunction:: var(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: var(OutType dest, const InType &in, cudaStream_t stream = 0)  
+.. doxygenfunction:: stdd(OutType dest, const InType &in, const int (&dims)[D], cudaStream_t stream = 0)
+.. doxygenfunction:: stdd(OutType dest, const InType &in, cudaStream_t stream = 0)  
diff --git a/docs/_sources/api/tensorgenerators.rst.txt b/docs/_sources/api/tensorgenerators.rst.txt
@@ -30,8 +30,7 @@ since their only purpose is to compute a single value at a particular location b
 .. doxygenfunction:: matx::linspace(const index_t (&s)[RANK], T first, T last)
 .. doxygenfunction:: matx::logspace(ShapeType &&s, T first, T last)
 .. doxygenfunction:: matx::logspace(const index_t (&s)[RANK], T first, T last)
-.. doxygenfunction:: matx::meshgrid_x(const std::array<T, 3> &x, const std::array<T, 3> &y)
-.. doxygenfunction:: matx::meshgrid_y(const std::array<T, 3> &x, const std::array<T, 3> &y)
+.. doxygenfunction:: matx::meshgrid(Ts&&... ts)
 .. doxygenfunction:: matx::chirp(SpaceOp t, FreqType f0, typename SpaceOp::scalar_type t1, FreqType f1, ChirpMethod method )
 .. doxygenfunction:: matx::chirp(index_t num, TimeType last, FreqType f0, TimeType t1, FreqType f1, ChirpMethod method)
 .. doxygenfunction:: matx::cchirp(SpaceOp t, FreqType f0, typename SpaceOp::scalar_type t1, FreqType f1, ChirpMethod method)

diff --git a/docs/_sources/api/tensorops.rst.txt b/docs/_sources/api/tensorops.rst.txt
@@ -75,7 +75,7 @@ Casting Operators
 Advanced Operators
 ------------------
 
-.. doxygenclass:: matx::IF 
+.. doxygenfunction:: matx::IF
 .. doxygenclass:: matx::IFELSE
 .. doxygenfunction:: reverse(Op t)
 .. doxygenfunction:: reverse(Op_type t)
@@ -85,19 +85,26 @@ Advanced Operators
 .. doxygenfunction:: repmat(T1 t, index_t reps)    
 .. doxygenfunction:: repmat(T1 t, const index_t(&reps)[])
 .. doxygenfunction:: repmat(T1 t, const index_t *reps)
+.. doxygenfunction:: cart2sph
+.. doxygenfunction:: sph2cart
 .. doxygenfunction:: kron
+.. doxygenfunction:: legendre(T1 n, T2 m, const T3 in)
+.. doxygenfunction:: legendre(T1 n, T2 m, const T3 in, int (&axis)[2])
+.. doxygenfunction:: legendre(T1 n, T2 m, const T3 in, std::array<int, 2> axis)  
 .. doxygenfunction:: hermitianT
-.. doxygenfunction:: r2cop
+.. doxygenfunction:: r2c(T1 t, index_t orig) 
 .. doxygenfunction:: flatten
 .. doxygenfunction:: remap(Op t, Ind idx)
 .. doxygenfunction:: remap(Op t, Ind idx, Inds... inds)
 .. doxygenfunction:: rcollapse
 .. doxygenfunction:: lcollapse
-.. doxygenfunction:: clone
-.. doxygenfunction:: slice( const T_wStrideT op, const typename T_wStrideT::shape_type (&starts)[T_wStrideT::Rank()], const typename T_wStrideT::shape_type (&ends)[T_wStrideT::Rank()], const typename T_wStrideT::stride_type (&strides)[T_wStrideT::Rank()])                 
-.. doxygenfunction:: slice( const T_wShapeT op, const typename T_wShapeT::shape_type (&starts)[T_wShapeT::Rank()], const typename T_wShapeT::shape_type (&ends)[T_wShapeT::Rank()])
-.. doxygenfunction:: slice( const T_wStridet_2 op, const typename T_wStridet_2::shape_type (&starts)[T_wStridet_2::Rank()], const typename T_wStridet_2::shape_type (&ends)[T_wStridet_2::Rank()], const typename T_wStridet_2::stride_type (&strides)[T_wStridet_2::Rank()])
-.. doxygenfunction:: slice( const T_wShapet_2 op, const typename T_wShapet_2::shape_type (&starts)[T_wShapet_2::Rank()], const typename T_wShapet_2::shape_type (&ends)[T_wShapet_2::Rank()])
-
+.. doxygenfunction:: clone(Op t, const index_t (&shape)[Rank])
+.. doxygenfunction:: clone(Op t, const std::array<index_t, Rank> &shape)
+.. doxygenfunction:: stack 
+.. doxygenfunction:: slice(const OpType opIn, const index_t (&starts)[OpType::Rank()], const index_t (&ends)[OpType::Rank()])
+.. doxygenfunction:: slice(const OpType op, const index_t (&starts)[OpType::Rank()], const index_t (&ends)[OpType::Rank()], const index_t (&strides)[OpType::Rank()])
 .. doxygenfunction:: permute(detail::tensor_impl_t<T, Rank> &out, const detail::tensor_impl_t<T, Rank> &in, const std::initializer_list<uint32_t> &dims, const cudaStream_t stream)
-.. doxygenfunction:: permute(const T op, const int32_t (&dims)[T::Rank()])  
+.. doxygenfunction:: permute(const T &op, const int32_t (&dims)[T::Rank()])
+.. doxygenfunction:: permute(const T &op, const std::array<int32_t, T::Rank()> &dims)
+.. doxygenfunction:: reshape(const T &op, ShapeType &&s)
+.. doxygenfunction:: reshape(const T &op, const int32_t (&sizes)[RANK])  
diff --git a/docs/_sources/api/tensorview.rst.txt b/docs/_sources/api/tensorview.rst.txt
@@ -1,7 +1,7 @@
 .. _tensorview_api:
 
 Tensors
-########
+#######
 
 tensor_t is the main tensor class in MatX used for tensor operations. The member functions allow different ways
 to manipulate how the same data region is viewed without modifying the data. 

diff --git a/docs/_sources/api/transformations.rst.txt b/docs/_sources/api/transformations.rst.txt
@@ -21,4 +21,4 @@ below provide both cached and non-cached interfaces.
   filter.rst
   reduce.rst
   sort.rst
-
+.. doxygenfunction:: permute(detail::tensor_impl_t<T, Rank> &out, const detail::tensor_impl_t<T, Rank> &in, const std::initializer_list<uint32_t> &dims, const cudaStream_t stream)