Merge remote-tracking branch 'origin' into llvm-backend
guilhermeleobas committed May 27, 2021
2 parents 731ed20 + ad13006 commit 3198ba7
Showing 29 changed files with 648 additions and 167 deletions.
30 changes: 0 additions & 30 deletions .github/workflows/buildandtest.yml
@@ -77,33 +77,3 @@ jobs:
CTEST_OUTPUT_ON_FAILURE: 1
CTEST_PARALLEL_LEVEL: 2
working-directory: build

build-gpu:
name: build taco for gpu, but does not run tests
runs-on: ubuntu-18.04

steps:
- uses: actions/checkout@v2
- name: download cuda
run: wget http://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
- name: install cuda
run: sudo sh cuda_10.2.89_440.33.01_linux.run --silent --toolkit --installpath="$GITHUB_WORKSPACE/cuda"
- name: add path
run: echo "$GITHUB_WORKSPACE/cuda/bin" >> $GITHUB_PATH
- name: set ld_library_path
run: echo "LD_LIBRARY_PATH=$GITHUB_WORKSPACE/cuda/lib64" >> $GITHUB_ENV
- name: set library_path
run: echo "LIBRARY_PATH=$GITHUB_WORKSPACE/cuda/lib64" >> $GITHUB_ENV
- name: print environment
run: |
echo ${PATH}
echo ${LD_LIBRARY_PATH}
echo ${LIBRARY_PATH}
- name: create_build
run: mkdir build
- name: cmake
run: cmake -DCUDA=ON ..
working-directory: build
- name: make
run: make -j2
working-directory: build
42 changes: 42 additions & 0 deletions .github/workflows/cuda-test-manual.yml
@@ -0,0 +1,42 @@
name: "CUDA build and test (manual)"

# Note: This workflow is triggered by hand by TACO developers.
# It should be run after the code has been reviewed by humans.
# This review step is important to ensure the safety of the
# self-hosted runner.

on:
workflow_dispatch:
inputs:
CMAKE_BUILD_TYPE:
description: CMAKE_BUILD_TYPE
required: true
default: Debug
OPENMP:
description: OPENMP
required: true
default: 'ON'
PYTHON:
description: PYTHON
required: true
default: 'OFF'
jobs:
ubuntu1604-cuda:
name: tests ubuntu 16.04 with CUDA 9
runs-on: [self-hosted, ubuntu-16.04, cuda]
steps:
- uses: actions/checkout@v2
- name: create_build
run: mkdir build
- name: cmake
run: cmake -DCMAKE_BUILD_TYPE=${{ github.event.inputs.CMAKE_BUILD_TYPE }} -DCUDA=ON -DOPENMP=${{ github.event.inputs.OPENMP }} -DPYTHON=${{ github.event.inputs.PYTHON }} ..
working-directory: build
- name: make
run: make -j8
working-directory: build
- name: test
run: make test
env:
CTEST_OUTPUT_ON_FAILURE: 1
CTEST_PARALLEL_LEVEL: 8
working-directory: build
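For maintainers, a `workflow_dispatch` trigger like the one above can also be fired programmatically instead of through the web UI. The sketch below uses the GitHub REST API's workflow-dispatch endpoint; the repository slug, branch, and token handling are illustrative assumptions, not part of this commit:

    # Sketch: dispatch the manual CUDA workflow via the GitHub REST API.
    # Assumes a personal access token with "workflow" scope in GITHUB_TOKEN.
    import os
    import requests

    def dispatch_cuda_workflow(repo="tensor-compiler/taco", ref="master",
                               build_type="Debug", openmp="ON", python="OFF"):
        url = (f"https://api.github.com/repos/{repo}"
               "/actions/workflows/cuda-test-manual.yml/dispatches")
        resp = requests.post(
            url,
            headers={"Authorization": f"token {os.environ['GITHUB_TOKEN']}",
                     "Accept": "application/vnd.github.v3+json"},
            json={"ref": ref,
                  "inputs": {"CMAKE_BUILD_TYPE": build_type,
                             "OPENMP": openmp,
                             "PYTHON": python}})
        resp.raise_for_status()  # GitHub returns 204 No Content on success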
14 changes: 2 additions & 12 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
cmake_minimum_required(VERSION 3.4.0 FATAL_ERROR)
if(POLICY CMP0048)
cmake_policy(SET CMP0048 NEW)
endif()
@@ -12,14 +12,10 @@ project(taco
option(CUDA "Build for NVIDIA GPU (CUDA must be preinstalled)" OFF)
option(PYTHON "Build TACO for python environment" OFF)
option(OPENMP "Build with OpenMP execution support" OFF)

option(LLVM "Build with LLVM backend support")
option(ENABLE_TESTS "Enable tests" ON)
option(COVERAGE "Build with code coverage analysis" OFF)
set(TACO_FEATURE_CUDA 0)
set(TACO_FEATURE_OPENMP 0)
set(TACO_FEATURE_PYTHON 0)

if(CUDA)
message("-- Searching for CUDA Installation")
find_package(CUDA REQUIRED)
@@ -99,13 +95,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")

set(OPTIMIZE "-O3" CACHE STRING "Optimization level")

if(CUDA)
set(C_CXX_FLAGS "$ENV{CXXFLAGS} -lcudart -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -Wmissing-declarations -Woverloaded-virtual -pedantic-errors -Wno-deprecated")
else()
set(C_CXX_FLAGS "$ENV{CXXFLAGS} -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -Wmissing-declarations -Woverloaded-virtual -pedantic-errors -Wno-deprecated")
endif(CUDA)

set(C_CXX_FLAGS "-Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -Wmissing-declarations -Woverloaded-virtual -pedantic-errors -Wno-deprecated")
if(OPENMP)
set(C_CXX_FLAGS "-fopenmp ${C_CXX_FLAGS}")
endif(OPENMP)
22 changes: 14 additions & 8 deletions README.md
@@ -29,14 +29,18 @@ TL;DR build taco using CMake. Run `make test`.
# Build and test
![Build and Test](https://github.com/RSenApps/taco/workflows/Build%20and%20Test/badge.svg?branch=master)

Build taco using CMake 2.8.12 or greater:
Build taco using CMake 3.4.0 or greater:

cd <taco-directory>
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j8

Building taco requires `gcc` 5.0 or newer, or `clang` 3.9 or newer. You can
use a specific compiler or version by setting the `CC` and `CXX` environment
variables before running `cmake`.

## Building Python API
To build taco with the Python API (pytaco), add `-DPYTHON=ON` to the cmake line above. For example:

@@ -46,13 +50,20 @@ You will then need to add the pytaco module to PYTHONPATH:

export PYTHONPATH=<taco-directory>/build/lib:$PYTHONPATH

pytaco requires NumPy and SciPy to be installed.
This requires Python 3.x and some development libraries. It also requires
NumPy and SciPy to be installed. For Debian/Ubuntu, the following packages
are needed: `python3 libpython3-dev python3-distutils python3-numpy python3-scipy`.
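Once the module is importable, a short smoke test confirms the bindings load and round-trip data. This is a minimal sketch rather than part of this commit; `pt.to_array` and operator overloading on tensors are assumed from the pytaco documentation:

    # Sketch: end-to-end check of a pytaco build.
    import numpy as np
    import pytaco as pt

    a = pt.from_array(np.arange(6, dtype=np.float32).reshape(2, 3))
    b = pt.from_array(np.ones((2, 3), dtype=np.float32))
    c = a + b                # elementwise add, evaluated lazily by taco
    print(pt.to_array(c))    # copy the result back into a NumPy array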

## Building for OpenMP
To build taco with support for parallel execution (using OpenMP), add `-DOPENMP=ON` to the cmake line above. For example:

cmake -DCMAKE_BUILD_TYPE=Release -DOPENMP=ON ..

If you are building with the `clang` compiler, you may need to ensure that
the `libomp` development headers are installed. For Debian/Ubuntu, these are
provided by `libomp-dev`; one of the more specific versioned packages such as
`libomp-13-dev` may also work.

## Building for CUDA
To build taco for NVIDIA CUDA, add `-DCUDA=ON` to the cmake line above. For example:

@@ -66,12 +77,7 @@ Please also make sure that you have CUDA installed properly and that the following

If you do not have CUDA installed, you can still use the taco cli to generate CUDA code with the -cuda flag.

## Building for LLVM Backend
To build taco for LLVM, add `-DLLVM=ON` to the cmake line above. For example:

cmake -DCMAKE_BUILD_TYPE=Release -DLLVM=ON ..

Make sure you have LLVM properly installed.
The generated CUDA code will require compute capability 6.1 or higher to run.

## Running tests
To run all tests:
11 changes: 9 additions & 2 deletions include/taco/lower/lowerer_impl.h
@@ -351,12 +351,13 @@ class LowererImpl : public util::Uncopyable {

/// Initializes a temporary workspace
std::vector<ir::Stmt> codeToInitializeTemporary(Where where);

std::vector<ir::Stmt> codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit);
std::vector<ir::Stmt> codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit);
/// Gets the size of a temporary tensorVar in the where statement
ir::Expr getTemporarySize(Where where);

/// Initializes helper arrays to give dense workspaces sparse acceleration
std::vector<ir::Stmt> codeToInitializeDenseAcceleratorArrays(Where where);
std::vector<ir::Stmt> codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false);

/// Recovers a derived indexvar from an underived variable.
ir::Stmt codeToRecoverDerivedIndexVar(IndexVar underived, IndexVar indexVar, bool emitVarDecl);
@@ -443,6 +444,12 @@ class LowererImpl : public util::Uncopyable {
/// Map used to hoist temporary workspace initialization
std::map<Forall, Where> temporaryInitialization;

/// Maps used to hoist parallel temporary workspaces. Maps the workspace shared by all threads to its where statement
std::map<Where, TensorVar> whereToTemporaryVar;
std::map<Where, ir::Expr> whereToIndexListAll;
std::map<Where, ir::Expr> whereToIndexListSizeAll;
std::map<Where, ir::Expr> whereToBitGuardAll;

/// Map from tensor variables in index notation to variables in the IR
std::map<TensorVar, ir::Expr> tensorVars;

3 changes: 1 addition & 2 deletions include/taco/storage/typed_value.h
@@ -20,7 +20,7 @@ class TypedComponent {

protected:
/// Gets the value of this TypedComponent as a size_t (for use in indexing)
size_t getAsIndex(const ComponentTypeUnion mem) const;
size_t getAsIndex(const ComponentTypeUnion &mem) const;
/// Sets mem to value (ensure that it does not write to bytes past the size of the type in the union)
void set(ComponentTypeUnion& mem, const ComponentTypeUnion& value);
/// Sets mem to casted value of integer
@@ -237,4 +237,3 @@ bool operator!=(const TypedComponentVal& a, const int other);

}
#endif

3 changes: 3 additions & 0 deletions python_bindings/CMakeLists.txt
@@ -5,6 +5,9 @@ add_subdirectory(pybind11)
include_directories(${PYTHON_INCLUDE_DIRS})
include_directories(${TACO_INCLUDE_DIR})
include_directories(${TACO_PROJECT_DIR}/python_bindings/include)
if(CUDA)
include_directories(${CUDA_INCLUDE_DIRS})
endif(CUDA)

set(PY_SRC_DIRS src)

13 changes: 12 additions & 1 deletion python_bindings/pytaco/pytensor/taco_tensor.py
@@ -537,6 +537,11 @@ def from_sp_csr(matrix, copy=True):
copy: boolean, optional
If true, taco copies the data from scipy and stores it. Otherwise, taco points to the same data as scipy.
Notes
--------
The copy flag is ignored if the GPU backend is enabled.
(This restriction will be lifted in future versions of taco.)
Returns
--------
t: tensor
@@ -560,6 +565,11 @@ def from_sp_csc(matrix, copy=True):
copy: boolean, optional
If true, taco copies the data from scipy and stores it. Otherwise, taco points to the same data as scipy.
Notes
--------
The copy flag is ignored if the GPU backend is enabled.
(This restriction will be lifted in future versions of taco.)
Returns
--------
t: tensor
@@ -594,7 +604,8 @@ def from_array(array, copy=True):
--------
The copy flag is ignored if the input array is not C contiguous or F contiguous (so for most transposed views).
If taco detects an array that is not contiguous, it will always copy the numpy array into a C contiguous format.
This restriction will be lifted in future versions of taco.
Additionally, if the GPU backend is enabled, taco will always copy the numpy array to CUDA unified memory.
These restrictions will be lifted in future versions of taco.
Taco is mainly intended to operate on sparse tensors. As a result, it buffers inserts since inserting into sparse
structures is very costly. This means that when the full tensor structure is needed, taco will copy the tensor to
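The copy semantics documented in the notes above can be exercised directly. The sketch below assumes a CPU (non-CUDA) build, where `copy=False` shares the caller's buffer:

    # Sketch: copy vs. no-copy construction from NumPy and SciPy (CPU build).
    import numpy as np
    import scipy.sparse as sp
    import pytaco as pt

    dense = np.eye(3, dtype=np.float64)
    t_ref = pt.from_array(dense, copy=False)   # shares the NumPy buffer
    t_own = pt.from_array(dense, copy=True)    # taco owns a private copy

    csr = sp.random(4, 4, density=0.25, format="csr", dtype=np.float64)
    t_csr = pt.from_sp_csr(csr, copy=True)     # flag is ignored on GPU builds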
2 changes: 1 addition & 1 deletion python_bindings/src/pyDatatypes.cpp
@@ -8,7 +8,7 @@ namespace py = pybind11;


std::string getNpType(const taco::Datatype& dtype) {
if (dtype.isBool()) return "bool";
if (dtype.isBool()) return "bool_";
else if (dtype.isInt()) return "int" + std::to_string(dtype.getNumBits());
else if (dtype.isUInt()) return "uint" + std::to_string(dtype.getNumBits());
else if (dtype.isFloat()) return "float" + std::to_string(dtype.getNumBits());
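The one-character rename matters because the returned string is presumably looked up among NumPy's scalar type names, where the boolean type is spelled `bool_`; the bare `np.bool` was only a deprecated alias for Python's built-in `bool` (deprecated in NumPy 1.20 and later removed). A small sketch of the distinction:

    # Sketch: np.bool_ is NumPy's boolean scalar type; np.bool was merely a
    # deprecated alias for Python's built-in bool.
    import numpy as np

    x = np.array([True, False])
    assert x.dtype.type is np.bool_
    assert isinstance(x[0], np.bool_)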
25 changes: 21 additions & 4 deletions python_bindings/src/pyTensor.cpp
@@ -9,6 +9,10 @@
#include "taco/type.h"
#include "taco/tensor.h"

#if CUDA_BUILT
#include <cuda_runtime_api.h>
#endif

// Add Python dictionary initializer with {tuple(coordinate) : data} pairs


@@ -53,7 +57,17 @@ static Tensor<T> fromNpArr(py::buffer_info& array_buffer, Format& fmt, bool copy
TensorStorage& storage = tensor.getStorage();
void *buf_data = array_buffer.ptr;
Array::Policy policy = Array::Policy::UserOwns;
if(copy){
if(should_use_CUDA_codegen()){
#if CUDA_BUILT
taco_iassert(should_use_CUDA_unified_memory());
buf_data = cuda_unified_alloc(size * array_buffer.itemsize);
cudaMemcpy(buf_data, array_buffer.ptr, size * array_buffer.itemsize, cudaMemcpyDefault);
policy = Array::Policy::Free;
#else
taco_iassert(false);
#endif
}
else if(copy){
buf_data = new T[size];
memcpy(buf_data, array_buffer.ptr, size*array_buffer.itemsize);
policy = Array::Policy::Delete;
@@ -106,7 +120,12 @@ static Tensor<T> fromSpMatrix(py::array_t<IdxType> &ind_ptr, py::array_t<IdxType
T *mat_data = static_cast<T *>(data_buf.ptr);
Array::Policy policy = Array::Policy::UserOwns;

if(copy){
if(should_use_CUDA_codegen()){
taco_iassert(should_use_CUDA_unified_memory());
// TODO: Should copy arrays to unified memory
taco_not_supported_yet;
}
else if(copy){
mat_ptr = new IdxType[ind_ptr_buf.size];
mat_ind = new IdxType[inds_buf.size];
mat_data = new T[data_buf.size];
@@ -129,7 +148,6 @@ static Tensor<T> fromSpMatrix(py::array_t<IdxType> &ind_ptr, py::array_t<IdxType

template<typename T>
static py::tuple toSpMatrix(Tensor<T> &tensor, bool tocsr) {

if(tensor.getOrder() != 2) {
throw py::value_error("Must be a matrix to convert to scipy");
}
@@ -198,7 +216,6 @@ static py::tuple toSpMatrix(Tensor<T> &tensor, bool tocsr) {
py::array_t<T> val_arr({val_arr_size}, {sizeof(T)}, np_vals, free_vals);

return py::make_tuple(ptr_arr, idx_arr, val_arr);

}

template<typename CType, typename idxVar>
6 changes: 4 additions & 2 deletions python_bindings/src/pytaco.cpp
@@ -11,6 +11,8 @@
void addHelpers(py::module &m) {
m.def("unique_name", (std::string(*)(char)) &taco::util::uniqueName);

m.def("should_use_cuda_codegen", &taco::should_use_CUDA_codegen);

py::options options;
options.disable_function_signatures();

@@ -69,7 +71,7 @@ Examples



m.def("set_parallel_schedule", [](std::string sched_type, int chunk_size = 0){
m.def("set_parallel_schedule", [](std::string sched_type, int chunk_size){
std::transform(sched_type.begin(), sched_type.end(), sched_type.begin(), ::tolower);

if(sched_type == "static") {
@@ -106,7 +108,7 @@ Examples
)");
)", py::arg("sched_type"), py::arg("chunk_size") = 1);

m.def("get_parallel_schedule", [](){
taco::ParallelSchedule sched;
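Once built, the bindings in this hunk can be exercised from Python. In the sketch below, the `"dynamic"` schedule name and the shape of the returned value are assumptions based on taco's `ParallelSchedule` options, not guaranteed by this diff:

    # Sketch: configure how taco parallelizes generated loops.
    import pytaco as pt

    pt.set_parallel_schedule("static")        # chunk_size defaults to 1
    pt.set_parallel_schedule("dynamic", 16)   # dynamic schedule, chunks of 16
    print(pt.get_parallel_schedule())         # e.g. ("dynamic", 16)
    print(pt.should_use_cuda_codegen())       # False on a CPU-only build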
7 changes: 6 additions & 1 deletion python_bindings/unit_tests.py
@@ -33,7 +33,7 @@ def test_dtype_inspectors(self):
self.assertEqual(pt.uint64.__repr__(), "pytaco.uint64_t")

def test_dtype_conversion(self):
expected_types = [np.bool, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
expected_types = [np.bool_, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64,
np.uint8, np.uint16, np.uint32, np.uint64]

for i, dt in enumerate(types):
@@ -168,6 +168,11 @@ def test_tensor_from_numpy(self):
self.assertTrue(np.array_equal(tensor_copy, self.c_array))

def test_array_copy_C_and_F_style(self):
if pt.should_use_cuda_codegen():
# `from_array` always performs a deep copy when the GPU backend is enabled,
# so don't run this test
return

# Getting a reference to taco then back to numpy should return the same data with the read only flag set to true
# only for C and F style arrays. Arrays of different forms will always be copied
c_copy = pt.from_array(self.c_array, copy=False)
7 changes: 7 additions & 0 deletions src/codegen/codegen_c.cpp
@@ -34,6 +34,9 @@ const string cHeaders =
"#include <math.h>\n"
"#include <complex.h>\n"
"#include <string.h>\n"
"#if _OPENMP\n"
"#include <omp.h>\n"
"#endif\n"
"#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n"
"#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n"
"#define TACO_DEREF(_a) (((___context___*)(*__ctx__))->_a)\n"
@@ -51,6 +54,10 @@ const string cHeaders =
" int32_t vals_size; // values array size\n"
"} taco_tensor_t;\n"
"#endif\n"
"#if !_OPENMP\n"
"int omp_get_thread_num() { return 0; }\n"
"int omp_get_max_threads() { return 1; }\n"
"#endif\n"
"int cmp(const void *a, const void *b) {\n"
" return *((const int*)a) - *((const int*)b);\n"
"}\n"
(Diffs for the remaining 16 changed files are not shown.)