Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement separate lowerers for C and CUDA #515

Open
wants to merge 13 commits into
base: split-cuda-lowerer
Choose a base branch
from
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,6 @@ lib/
*cmake_install.cmake
CMakeCache.txt
doc

.idea/
apps/tensor_times_vector/tensor_times_vector
cmake-build-debug/
42 changes: 42 additions & 0 deletions include/taco/lower/lowerer_impl_c.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
//
// Created by 张 on 2022/3/10.
//

#ifndef TACO_LOWERER_IMPL_C_H
#define TACO_LOWERER_IMPL_C_H

#include <memory>
#include "taco/lower/lowerer_impl_imperative.h"
namespace taco {
class LowererImplC: public LowererImplImperative {
public:
LowererImplC();
virtual ~LowererImplC() = default;

protected:
std::vector<ir::Stmt> codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false);
std::vector<ir::Stmt> codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit);
std::vector<ir::Stmt> codeToInitializeTemporary(Where where);
std::pair<bool,bool> canAccelerateDenseTemp(Where where);
std::vector<ir::Stmt> codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit);
/**
* Generate code to initialize values array in range
* [begin * size, (begin + 1) * size) with the fill value.
*/
ir::Stmt initValues(ir::Expr tensor, ir::Expr initVal, ir::Expr begin, ir::Expr size);
ir::Stmt lowerWhere(Where where);
ir::Stmt lowerForall(Forall forall);

/// Lower a forall that needs to be cloned so that one copy does not have guards
/// used for vectorized and unrolled loops
ir::Stmt lowerForallCloned(Forall forall);

private:
class Visitor;
friend class Visitor;
std::shared_ptr<Visitor> visitor;
};
}


#endif //TACO_LOWERER_IMPL_C_H
44 changes: 44 additions & 0 deletions include/taco/lower/lowerer_impl_cuda.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
//
// Created by 张 on 2022/3/12.
//

#ifndef TACO_LOWERER_IMPL_CUDA_H
#define TACO_LOWERER_IMPL_CUDA_H

#include <memory>
#include "taco/lower/lowerer_impl_imperative.h"



namespace taco {
class LowererImplCUDA: public LowererImplImperative {
public:
LowererImplCUDA();
virtual ~LowererImplCUDA() = default;

protected:
std::vector<ir::Stmt> codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false);
std::vector<ir::Stmt> codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit);
std::vector<ir::Stmt> codeToInitializeTemporary(Where where);
std::pair<bool,bool> canAccelerateDenseTemp(Where where);
std::vector<ir::Stmt> codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit);
/**
* Generate code to initialize values array in range
* [begin * size, (begin + 1) * size) with the fill value.
*/
ir::Stmt initValues(ir::Expr tensor, ir::Expr initVal, ir::Expr begin, ir::Expr size);
ir::Stmt lowerWhere(Where where);
ir::Stmt lowerForall(Forall forall);

/// Lower a forall that needs to be cloned so that one copy does not have guards
/// used for vectorized and unrolled loops
ir::Stmt lowerForallCloned(Forall forall);

private:
class Visitor;
friend class Visitor;
std::shared_ptr<Visitor> visitor;
};
}

#endif //TACO_LOWERER_IMPL_CUDA_H
59 changes: 45 additions & 14 deletions include/taco/lower/lowerer_impl_imperative.h
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ class LowererImplImperative : public LowererImpl {
* Generate code to initialize values array in range
* [begin * size, (begin + 1) * size) with the fill value.
*/
ir::Stmt initValues(ir::Expr tensor, ir::Expr initVal, ir::Expr begin, ir::Expr size);
virtual ir::Stmt initValues(ir::Expr tensor, ir::Expr initVal, ir::Expr begin, ir::Expr size);

/// Declare position variables and initialize them with a locate.
ir::Stmt declLocatePosVars(std::vector<Iterator> iterators);
Expand All @@ -367,17 +367,17 @@ class LowererImplImperative : public LowererImpl {

/// Returns true iff the temporary used in the where statement is dense and sparse iteration over that
/// temporary can be automatically supported by the compiler.
std::pair<bool,bool> canAccelerateDenseTemp(Where where);
virtual std::pair<bool,bool> canAccelerateDenseTemp(Where where);

/// Initializes a temporary workspace
std::vector<ir::Stmt> codeToInitializeTemporary(Where where);
std::vector<ir::Stmt> codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit);
std::vector<ir::Stmt> codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit);
virtual std::vector<ir::Stmt> codeToInitializeTemporary(Where where);
virtual std::vector<ir::Stmt> codeToInitializeTemporaryParallel(Where where, ParallelUnit parallelUnit);
virtual std::vector<ir::Stmt> codeToInitializeLocalTemporaryParallel(Where where, ParallelUnit parallelUnit);
/// Gets the size of a temporary tensorVar in the where statement
ir::Expr getTemporarySize(Where where);

/// Initializes helper arrays to give dense workspaces sparse acceleration
std::vector<ir::Stmt> codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false);
virtual std::vector<ir::Stmt> codeToInitializeDenseAcceleratorArrays(Where where, bool parallel = false);

/// Recovers a derived indexvar from an underived variable.
ir::Stmt codeToRecoverDerivedIndexVar(IndexVar underived, IndexVar indexVar, bool emitVarDecl);
Expand Down Expand Up @@ -498,12 +498,38 @@ class LowererImplImperative : public LowererImpl {
/// loop iterator variable should be incremented when the guard is fired.
ir::Stmt strideBoundsGuard(Iterator iterator, ir::Expr access, bool incrementPosVar);

private:
util::ScopedSet<Iterator> accessibleIterators;
int inParallelLoopDepth = 0;
/// Map used to hoist parallel temporary workspaces. Maps workspace shared by all threads to where statement
std::map<Where, TensorVar> whereToTemporaryVar;
std::map<Where, ir::Expr> whereToIndexListAll;
std::map<Where, ir::Expr> whereToIndexListSizeAll;
std::map<Where, ir::Expr> whereToBitGuardAll;
/// Map from temporary to indexList var if accelerating dense workspace
std::map<TensorVar, ir::Expr> tempToIndexList;

/// Map from temporary to indexListSize if accelerating dense workspace
std::map<TensorVar, ir::Expr> tempToIndexListSize;

/// Map from temporary to bitGuard var if accelerating dense workspace
std::map<TensorVar, ir::Expr> tempToBitGuard;

std::set<TensorVar> needCompute;

struct TemporaryArrays {
ir::Expr values;
};
std::map<TensorVar, TemporaryArrays> temporaryArrays;
std::set<TensorVar> guardedTemps;
ProvenanceGraph provGraph;
bool emitUnderivedGuards = true;


bool assemble;
bool compute;
bool loopOrderAllowsShortCircuit = false;

std::set<TensorVar> needCompute;
// std::set<TensorVar> needCompute;

int markAssignsAtomicDepth = 0;
ParallelUnit atomicParallelUnit;
Expand All @@ -515,15 +541,17 @@ class LowererImplImperative : public LowererImpl {
/// Map used to hoist temporary workspace initialization
std::map<Forall, Where> temporaryInitialization;

/*
/// Map used to hoist parallel temporary workspaces. Maps workspace shared by all threads to where statement
std::map<Where, TensorVar> whereToTemporaryVar;
std::map<Where, ir::Expr> whereToIndexListAll;
std::map<Where, ir::Expr> whereToIndexListSizeAll;
std::map<Where, ir::Expr> whereToBitGuardAll;
*/

/// Map from tensor variables in index notation to variables in the IR
std::map<TensorVar, ir::Expr> tensorVars;

/*
struct TemporaryArrays {
ir::Expr values;
};
Expand All @@ -539,7 +567,7 @@ class LowererImplImperative : public LowererImpl {
std::map<TensorVar, ir::Expr> tempToBitGuard;

std::set<TensorVar> guardedTemps;

*/
/// Map from result tensors to variables tracking values array capacity.
std::map<ir::Expr, ir::Expr> capacityVars;

Expand All @@ -556,7 +584,7 @@ class LowererImplImperative : public LowererImpl {
Iterators iterators;

/// Keep track of relations between IndexVars
ProvenanceGraph provGraph;
// ProvenanceGraph provGraph;

bool ignoreVectorize = false; // already being taken into account

Expand All @@ -573,9 +601,9 @@ class LowererImplImperative : public LowererImpl {
bool captureNextLocatePos = false;
ir::Stmt capturedLocatePos; // used for whereConsumer when want to replicate same locating

bool emitUnderivedGuards = true;
// bool emitUnderivedGuards = true;

int inParallelLoopDepth = 0;
// int inParallelLoopDepth = 0;

std::map<ParallelUnit, ir::Expr> parallelUnitSizes;
std::map<ParallelUnit, IndexVar> parallelUnitIndexVars;
Expand All @@ -588,14 +616,17 @@ class LowererImplImperative : public LowererImpl {
std::map<Access, ir::Expr> reducedValueVars;

/// Set of locate-capable iterators that can be legally accessed.
util::ScopedSet<Iterator> accessibleIterators;


/// Visitor methods can add code to emit it to the function header.
std::vector<ir::Stmt> header;

/// Visitor methods can add code to emit it to the function footer.
std::vector<ir::Stmt> footer;


private:

class Visitor;
friend class Visitor;
std::shared_ptr<Visitor> visitor;
Expand Down
4 changes: 3 additions & 1 deletion src/lower/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "taco/lower/lowerer_impl.h"
#include "taco/lower/lowerer_impl_imperative.h"
#include "taco/lower/lowerer_impl_c.h"
#include "taco/lower/iterator.h"
#include "mode_access.h"

Expand All @@ -36,7 +37,8 @@ namespace taco {
// class Lowerer
/// Default constructor: allocates a LowererImplImperative and stores it in
/// `impl` as the lowering implementation.
Lowerer::Lowerer() : impl(new LowererImplImperative()) {
}

// Disabled alternative default constructor that would use LowererImplC as the
// implementation instead. It is kept commented out; enabling it alongside the
// constructor above would define Lowerer::Lowerer() twice.
//Lowerer::Lowerer() :impl(new LowererImplC()) {
//}
/// Constructs a Lowerer around a caller-supplied implementation.
/// NOTE(review): whether `impl` takes ownership of the pointer depends on the
/// member's type, which is not visible in this hunk -- confirm.
Lowerer::Lowerer(LowererImpl* impl) : impl(impl) {
}

Expand Down
Loading