Merge remote-tracking branch 'origin' into llvm-backend
guilhermeleobas committed May 27, 2021
2 parents 731ed20 + ad13006 commit 3198ba7
Showing 29 changed files with 648 additions and 167 deletions.
30 changes: 0 additions & 30 deletions .github/workflows/buildandtest.yml
@@ -77,33 +77,3 @@ jobs:
CTEST_OUTPUT_ON_FAILURE: 1
CTEST_PARALLEL_LEVEL: 2
working-directory: build

build-gpu:
name: build taco for gpu, but does not run tests
runs-on: ubuntu-18.04

steps:
- uses: actions/checkout@v2
- name: download cuda
run: wget http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
- name: install cuda
run: sudo sh cuda_10.2.89_440.33.01_linux.run --silent --toolkit --installpath="$GITHUB_WORKSPACE/cuda"
- name: add path
run: echo "$GITHUB_WORKSPACE/cuda/bin" >> $GITHUB_PATH
- name: set ld_library_path
run: echo "LD_LIBRARY_PATH=$GITHUB_WORKSPACE/cuda/lib64" >> $GITHUB_ENV
- name: set library_path
run: echo "LIBRARY_PATH=$GITHUB_WORKSPACE/cuda/lib64" >> $GITHUB_ENV
- name: print environment
run: |
echo ${PATH}
echo ${LD_LIBRARY_PATH}
echo ${LIBRARY_PATH}
- name: create_build
run: mkdir build
- name: cmake
run: cmake -DCUDA=ON ..
working-directory: build
- name: make
run: make -j2
working-directory: build
42 changes: 42 additions & 0 deletions .github/workflows/cuda-test-manual.yml
@@ -0,0 +1,42 @@
name: "CUDA build and test (manual)"

# Note: This workflow is triggered by hand by TACO developers.
# It should be run after the code has been reviewed by humans.
# This review step is important to ensure the safety of the
# self-hosted runner.

on:
workflow_dispatch:
inputs:
CMAKE_BUILD_TYPE:
description: CMAKE_BUILD_TYPE
required: true
default: Debug
OPENMP:
description: OPENMP
required: true
default: 'ON'
PYTHON:
description: PYTHON
required: true
default: 'OFF'
jobs:
ubuntu1604-cuda:
name: tests ubuntu 16.04 with CUDA 9
runs-on: [self-hosted, ubuntu-16.04, cuda]
steps:
- uses: actions/checkout@v2
- name: create_build
run: mkdir build
- name: cmake
run: cmake -DCMAKE_BUILD_TYPE=${{ github.event.inputs.CMAKE_BUILD_TYPE }} -DCUDA=ON -DOPENMP=${{ github.event.inputs.OPENMP }} -DPYTHON=${{ github.event.inputs.PYTHON }} ..
working-directory: build
- name: make
run: make -j8
working-directory: build
- name: test
run: make test
env:
CTEST_OUTPUT_ON_FAILURE: 1
CTEST_PARALLEL_LEVEL: 8
working-directory: build
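For maintainers, a `workflow_dispatch` trigger like the one above can also be fired programmatically instead of through the web UI. The sketch below uses the GitHub REST API's workflow-dispatch endpoint; the repository slug, branch, and token handling are illustrative assumptions, not part of this commit:

    # Sketch: dispatch the manual CUDA workflow via the GitHub REST API.
    # Assumes a personal access token with "workflow" scope in GITHUB_TOKEN.
    import os
    import requests

    def dispatch_cuda_workflow(repo="tensor-compiler/taco", ref="master",
                               build_type="Debug", openmp="ON", python="OFF"):
        url = (f"https://api.github.com/repos/{repo}"
               "/actions/workflows/cuda-test-manual.yml/dispatches")
        resp = requests.post(
            url,
            headers={"Authorization": f"token {os.environ['GITHUB_TOKEN']}",
                     "Accept": "application/vnd.github.v3+json"},
            json={"ref": ref,
                  "inputs": {"CMAKE_BUILD_TYPE": build_type,
                             "OPENMP": openmp,
                             "PYTHON": python}})
        resp.raise_for_status()  # GitHub returns 204 No Content on success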
14 changes: 2 additions & 12 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
cmake_minimum_required(VERSION 3.4.0 FATAL_ERROR)
if(POLICY CMP0048)
cmake_policy(SET CMP0048 NEW)
endif()
@@ -12,14 +12,10 @@ project(taco
option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF)
option(PYTHON "Build TACO for python environment" OFF)
option(OPENMP "Build with OpenMP execution support" OFF)

option(LLVM "Build with LLVM backend support")
option(ENABLE_TESTS "Enable tests" ON)
option(COVERAGE "Build with code coverage analysis" OFF)
set(TACO_FEATURE_CUDA 0)
set(TACO_FEATURE_OPENMP 0)
set(TACO_FEATURE_PYTHON 0)

if(CUDA)
message("-- Searching for CUDA Installation")
find_package(CUDA REQUIRED)
@@ -99,13 +95,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")

set(OPTIMIZE "-O3" CACHE STRING "Optimization level")

if(CUDA)
set(C_CXX_FLAGS "$ENV{CXXFLAGS} -lcudart -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -Wmissing-declarations -Woverloaded-virtual -pedantic-errors -Wno-deprecated")
else()
set(C_CXX_FLAGS "$ENV{CXXFLAGS} -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -Wmissing-declarations -Woverloaded-virtual -pedantic-errors -Wno-deprecated")
endif(CUDA)

set(C_CXX_FLAGS "-Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -Wmissing-declarations -Woverloaded-virtual -pedantic-errors -Wno-deprecated")
if(OPENMP)
set(C_CXX_FLAGS "-fopenmp ${C_CXX_FLAGS}")
endif(OPENMP)
22 changes: 14 additions & 8 deletions README.md
@@ -29,14 +29,18 @@ TL;DR build taco using CMake. Run `make test`.
# Build and test
![Build and Test](https://github.com/RSenApps/taco/workflows/Build%20and%20Test/badge.svg?branch=master)

Build taco using CMake 2.8.12 or greater:
Build taco using CMake 3.4.0 or greater:

cd <taco-directory>
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j8

Building taco requires `gcc` 5.0 or newer, or `clang` 3.9 or newer. You can
use a specific compiler or version by setting the `CC` and `CXX` environment
variables before running `cmake`.

## Building Python API
To build taco with the Python API (pytaco), add `-DPYTHON=ON` to the cmake line above. For example:

@@ -46,13 +50,20 @@ You will then need to add the pytaco module to PYTHONPATH:

export PYTHONPATH=<taco-directory>/build/lib:$PYTHONPATH

pytaco requires NumPy and SciPy to be installed.
This requires Python 3.x and some development libraries. It also requires
NumPy and SciPy to be installed. For Debian/Ubuntu, the following packages
are needed: `python3 libpython3-dev python3-distutils python3-numpy python3-scipy`.
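Once the module is importable, a short smoke test confirms the bindings load and round-trip data. This is a minimal sketch rather than part of this commit; `pt.to_array` and operator overloading on tensors are assumed from the pytaco documentation:

    # Sketch: end-to-end check of a pytaco build.
    import numpy as np
    import pytaco as pt

    a = pt.from_array(np.arange(6, dtype=np.float32).reshape(2, 3))
    b = pt.from_array(np.ones((2, 3), dtype=np.float32))
    c = a + b                # elementwise add, evaluated lazily by taco
    print(pt.to_array(c))    # copy the result back into a NumPy array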

## Building for OpenMP
To build taco with support for parallel execution (using OpenMP), add `-DOPENMP=ON` to the cmake line above. For example:

cmake -DCMAKE_BUILD_TYPE=Release -DOPENMP=ON ..

If you are building with the `clang` compiler, you may need to ensure that
the `libomp` development headers are installed. For Debian/Ubuntu, these are
provided by `libomp-dev`; one of the more specific versioned packages such as
`libomp-13-dev` may also work.

## Building for CUDA
To build taco for NVIDIA CUDA, add `-DCUDA=ON` to the cmake line above. For example:

@@ -66,12 +77,7 @@ Please also make sure that you have CUDA installed properly and that the following

If you do not have CUDA installed, you can still use the taco cli to generate CUDA code with the -cuda flag.

## Building for LLVM Backend
To build taco for LLVM, add `-DLLVM=ON` to the cmake line above. For example:

cmake -DCMAKE_BUILD_TYPE=Release -DLLVM=ON ..

Make sure you have LLVM properly installed.
The generated CUDA code will require compute capability 6.1 or higher to run.

## Running tests
To run all tests:
11 changes: 9 additions & 2 deletions include/taco/lower/lowerer_impl.h
@@ -351,12 +351,13 @@ class LowererImpl : public util::Uncopyable {

/// Initializes a temporary workspace
std::vector<ir::Stmt> codeToInitializeTemporary(Where where);

std::vector<ir::Stmt> codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit);
std::vector<ir::Stmt> codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit);
/// Gets the size of a temporary tensorVar in the where statement
ir::Expr getTemporarySize(Where where);

/// Initializes helper arrays to give dense workspaces sparse acceleration
std::vector<ir::Stmt> codeToInitializeDenseAcceleratorArrays(Where where);
std::vector<ir::Stmt> codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false);

/// Recovers a derived indexvar from an underived variable.
ir::Stmt codeToRecoverDerivedIndexVar(IndexVar underived, IndexVar indexVar, bool emitVarDecl);
@@ -443,6 +444,12 @@ class LowererImpl : public util::Uncopyable {
/// Map used to hoist temporary workspace initialization
std::map<Forall, Where> temporaryInitialization;

/// Maps used to hoist parallel temporary workspaces. Maps the workspace shared by all threads to its where statement
std::map<Where, TensorVar> whereToTemporaryVar;
std::map<Where, ir::Expr> whereToIndexListAll;
std::map<Where, ir::Expr> whereToIndexListSizeAll;
std::map<Where, ir::Expr> whereToBitGuardAll;

/// Map from tensor variables in index notation to variables in the IR
std::map<TensorVar, ir::Expr> tensorVars;

3 changes: 1 addition & 2 deletions include/taco/storage/typed_value.h
@@ -20,7 +20,7 @@ class TypedComponent {

protected:
/// Gets the value of this TypedComponent as a size_t (for use in indexing)
size_t getAsIndex(const ComponentTypeUnion mem) const;
size_t getAsIndex(const ComponentTypeUnion &mem) const;
/// Sets mem to value (ensure that it does not write to bytes past the size of the type in the union)
void set(ComponentTypeUnion& mem, const ComponentTypeUnion& value);
/// Sets mem to casted value of integer
@@ -237,4 +237,3 @@ bool operator!=(const TypedComponentVal& a, const int other);

}
#endif

3 changes: 3 additions & 0 deletions python_bindings/CMakeLists.txt
@@ -5,6 +5,9 @@ add_subdirectory(pybind11)
include_directories(${PYTHON_INCLUDE_DIRS})
include_directories(${TACO_INCLUDE_DIR})
include_directories(${TACO_PROJECT_DIR}/python_bindings/include)
if(CUDA)
include_directories(${CUDA_INCLUDE_DIRS})
endif(CUDA)

set(PY_SRC_DIRS src)

13 changes: 12 additions & 1 deletion python_bindings/pytaco/pytensor/taco_tensor.py
@@ -537,6 +537,11 @@ def from_sp_csr(matrix, copy=True):
copy: boolean, optional
If true, taco copies the data from scipy and stores it. Otherwise, taco points to the same data as scipy.
Notes
--------
The copy flag is ignored if the GPU backend is enabled.
(This restriction will be lifted in future versions of taco.)
Returns
--------
t: tensor
@@ -560,6 +565,11 @@ def from_sp_csc(matrix, copy=True):
copy: boolean, optional
If true, taco copies the data from scipy and stores it. Otherwise, taco points to the same data as scipy.
Notes
--------
The copy flag is ignored if the GPU backend is enabled.
(This restriction will be lifted in future versions of taco.)
Returns
--------
t: tensor
@@ -594,7 +604,8 @@ def from_array(array, copy=True):
--------
The copy flag is ignored if the input array is not C contiguous or F contiguous (so for most transposed views).
If taco detects an array that is not contiguous, it will always copy the numpy array into a C contiguous format.
This restriction will be lifted in future versions of taco.
Additionally, if the GPU backend is enabled, taco will always copy the numpy array to CUDA unified memory.
These restrictions will be lifted in future versions of taco.
Taco is mainly intended to operate on sparse tensors. As a result, it buffers inserts since inserting into sparse
structures is very costly. This means that when the full tensor structure is needed, taco will copy the tensor to
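The copy semantics documented in the notes above can be exercised directly. The sketch below assumes a CPU (non-CUDA) build, where `copy=False` shares the caller's buffer:

    # Sketch: copy vs. no-copy construction from NumPy and SciPy (CPU build).
    import numpy as np
    import scipy.sparse as sp
    import pytaco as pt

    dense = np.eye(3, dtype=np.float64)
    t_ref = pt.from_array(dense, copy=False)   # shares the NumPy buffer
    t_own = pt.from_array(dense, copy=True)    # taco owns a private copy

    csr = sp.random(4, 4, density=0.25, format="csr", dtype=np.float64)
    t_csr = pt.from_sp_csr(csr, copy=True)     # flag is ignored on GPU builds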
2 changes: 1 addition & 1 deletion python_bindings/src/pyDatatypes.cpp
@@ -8,7 +8,7 @@ namespace py = pybind11;


std::string getNpType(const taco::Datatype& dtype) {
if (dtype.isBool()) return "bool";
if (dtype.isBool()) return "bool_";
else if (dtype.isInt()) return "int" + std::to_string(dtype.getNumBits());
else if (dtype.isUInt()) return "uint" + std::to_string(dtype.getNumBits());
else if (dtype.isFloat()) return "float" + std::to_string(dtype.getNumBits());
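The one-character rename matters because the returned string is presumably looked up among NumPy's scalar type names, where the boolean type is spelled `bool_`; the bare `np.bool` was only a deprecated alias for Python's built-in `bool` (deprecated in NumPy 1.20 and later removed). A small sketch of the distinction:

    # Sketch: np.bool_ is NumPy's boolean scalar type; np.bool was merely a
    # deprecated alias for Python's built-in bool.
    import numpy as np

    x = np.array([True, False])
    assert x.dtype.type is np.bool_
    assert isinstance(x[0], np.bool_)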
25 changes: 21 additions & 4 deletions python_bindings/src/pyTensor.cpp
@@ -9,6 +9,10 @@
#include "taco/type.h"
#include "taco/tensor.h"

#if CUDA_BUILT
#include <cuda_runtime_api.h>
#endif

// Add Python dictionary initializer with {tuple(coordinate) : data} pairs


@@ -53,7 +57,17 @@ static Tensor<T> fromNpArr(py::buffer_info& array_buffer, Format& fmt, bool copy
TensorStorage& storage = tensor.getStorage();
void *buf_data = array_buffer.ptr;
Array::Policy policy = Array::Policy::UserOwns;
if(copy){
if(should_use_CUDA_codegen()){
#if CUDA_BUILT
taco_iassert(should_use_CUDA_unified_memory());
buf_data = cuda_unified_alloc(size * array_buffer.itemsize);
cudaMemcpy(buf_data, array_buffer.ptr, size * array_buffer.itemsize, cudaMemcpyDefault);
policy = Array::Policy::Free;
#else
taco_iassert(false);
#endif
}
else if(copy){
buf_data = new T[size];
memcpy(buf_data, array_buffer.ptr, size*array_buffer.itemsize);
policy = Array::Policy::Delete;
@@ -106,7 +120,12 @@ static Tensor<T> fromSpMatrix(py::array_t<IdxType> &ind_ptr, py::array_t<IdxType
T *mat_data = static_cast<T *>(data_buf.ptr);
Array::Policy policy = Array::Policy::UserOwns;

if(copy){
if(should_use_CUDA_codegen()){
taco_iassert(should_use_CUDA_unified_memory());
// TODO: Should copy arrays to unified memory
taco_not_supported_yet;
}
else if(copy){
mat_ptr = new IdxType[ind_ptr_buf.size];
mat_ind = new IdxType[inds_buf.size];
mat_data = new T[data_buf.size];
@@ -129,7 +148,6 @@ static Tensor<T> fromSpMatrix(py::array_t<IdxType> &ind_ptr, py::array_t<IdxType

template<typename T>
static py::tuple toSpMatrix(Tensor<T> &tensor, bool tocsr) {

if(tensor.getOrder() != 2) {
throw py::value_error("Must be a matrix to convert to scipy");
}
@@ -198,7 +216,6 @@ static py::tuple toSpMatrix(Tensor<T> &tensor, bool tocsr) {
py::array_t<T> val_arr({val_arr_size}, {sizeof(T)}, np_vals, free_vals);

return py::make_tuple(ptr_arr, idx_arr, val_arr);

}

template<typename CType, typename idxVar>
6 changes: 4 additions & 2 deletions python_bindings/src/pytaco.cpp
@@ -11,6 +11,8 @@
void addHelpers(py::module &m) {
m.def("unique_name", (std::string(*)(char)) &taco::util::uniqueName);

m.def("should_use_cuda_codegen", &taco::should_use_CUDA_codegen);

py::options options;
options.disable_function_signatures();

@@ -69,7 +71,7 @@ Examples



m.def("set_parallel_schedule", [](std::string sched_type, int chunk_size = 0){
m.def("set_parallel_schedule", [](std::string sched_type, int chunk_size){
std::transform(sched_type.begin(), sched_type.end(), sched_type.begin(), ::tolower);

if(sched_type == "static") {
@@ -106,7 +108,7 @@ Examples
)");
)", py::arg("sched_type"), py::arg("chunk_size") = 1);

m.def("get_parallel_schedule", [](){
taco::ParallelSchedule sched;
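Once built, the bindings in this hunk can be exercised from Python. In the sketch below, the `"dynamic"` schedule name and the shape of the returned value are assumptions based on taco's `ParallelSchedule` options, not guaranteed by this diff:

    # Sketch: configure how taco parallelizes generated loops.
    import pytaco as pt

    pt.set_parallel_schedule("static")        # chunk_size defaults to 1
    pt.set_parallel_schedule("dynamic", 16)   # dynamic schedule, chunks of 16
    print(pt.get_parallel_schedule())         # e.g. ("dynamic", 16)
    print(pt.should_use_cuda_codegen())       # False on a CPU-only build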
7 changes: 6 additions & 1 deletion python_bindings/unit_tests.py
@@ -33,7 +33,7 @@ def test_dtype_inspectors(self):
self.assertEqual(pt.uint64.__repr__(), "pytaco.uint64_t")

def test_dtype_conversion(self):
expected_types = [np.bool, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
expected_types = [np.bool_, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
np.uint8, np.uint16, np.uint32, np.uint64]

for i, dt in enumerate(types):
@@ -168,6 +168,11 @@ def test_tensor_from_numpy(self):
self.assertTrue(np.array_equal(tensor_copy, self.c_array))

def test_array_copy_C_and_F_style(self):
if pt.should_use_cuda_codegen():
# `from_array` always performs a deep copy when the GPU backend is enabled,
# so don't run this test
return

# Getting a reference to taco then back to numpy should return the same data with the read only flag set to true
# only for C and F style arrays. Arrays of different forms will always be copied
c_copy = pt.from_array(self.c_array, copy=False)
7 changes: 7 additions & 0 deletions src/codegen/codegen_c.cpp
@@ -34,6 +34,9 @@ const string cHeaders =
"#include <math.h>\n"
"#include <complex.h>\n"
"#include <string.h>\n"
"#if _OPENMP\n"
"#include <omp.h>\n"
"#endif\n"
"#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n"
"#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n"
"#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n"
@@ -51,6 +54,10 @@ const string cHeaders =
" int32_t vals_size; // values array size\n"
"} taco_tensor_t;\n"
"#endif\n"
"#if !_OPENMP\n"
"int omp_get_thread_num() { return 0; }\n"
"int omp_get_max_threads() { return 1; }\n"
"#endif\n"
"int cmp(const void *a, const void *b) {\n"
" return *((const int*)a) - *((const int*)b);\n"
"}\n"
(Diffs for the remaining 16 changed files are not shown.)