Dense shared memory workspaces #302

Open · wants to merge 6 commits into base: gpu-workspaces
6 changes: 6 additions & 0 deletions include/taco/codegen/module.h
@@ -25,6 +25,12 @@ class Module {

/// Compile the source into a library, returning its full path
std::string compile();

/// Recompile. Given the path to a temporary source file generated by taco,
/// compiles that file into a library and returns its full path. This is a
/// development and debugging aid: it lets developers hand-modify the
/// generated code and compile it again without re-lowering.
std::string recompile(std::string file_path);

/// Compile the module into a source file located at the specified location
/// path and prefix. The generated source will be path/prefix.{.c|.bc, .h}
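
As a rough illustration of the intended workflow (a sketch, not code from this PR; the file path below is a made-up placeholder), recompile is meant to be called after compile has already written the temporary sources:

#include "taco/codegen/module.h"

void rebuildEditedKernel(taco::ir::Module& module) {
  // Assumes the lowered functions were already added to the module elsewhere.
  // compile() emits <tmpdir>/<prefix>{.c|.cu} and builds <prefix>.so once.
  std::string libPath = module.compile();

  // After hand-editing the emitted source on disk, rebuild that same file.
  // recompile() takes the path prefix of the temporary file, without extension.
  module.recompile("/tmp/taco_tmp_XXXX/abc123");  // hypothetical path
}
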
5 changes: 4 additions & 1 deletion include/taco/index_notation/index_notation.h
@@ -850,7 +850,10 @@ class TensorVar : public util::Comparable<TensorVar> {
TensorVar(const Type& type);
TensorVar(const std::string& name, const Type& type);
TensorVar(const Type& type, const Format& format);
TensorVar(const std::string& name, const Type& type, const Format& format);
TensorVar(const std::string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace=GPUWorkspace::None);

/// Returns the kind of GPU workspace this TensorVar uses (None by default).
GPUWorkspace getGPUWorkspace();

/// Returns the name of the tensor variable.
std::string getName() const;
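
A hedged sketch of how the new constructor parameter might be used when declaring a workspace tensor, for example one later handed to a precompute schedule (the name, size, and format are illustrative, not taken from this PR):

#include "taco/index_notation/index_notation.h"
using namespace taco;

// A 32-element dense workspace that the CUDA backend should place in shared memory.
TensorVar w("w", Type(Float64, {Dimension(32)}), Format({Dense}),
            GPUWorkspace::DenseSharedMemory);

// w.getGPUWorkspace() returns GPUWorkspace::DenseSharedMemory; every other
// constructor keeps the default of GPUWorkspace::None.
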
3 changes: 2 additions & 1 deletion include/taco/ir/ir.h
@@ -254,9 +254,10 @@ struct Var : public ExprNode<Var> {
std::string name;
bool is_ptr;
bool is_tensor;
GPUWorkspace gpuworkspace;

static Expr make(std::string name, Datatype type, bool is_ptr=false,
bool is_tensor=false);
bool is_tensor=false, GPUWorkspace gpuworkspace=GPUWorkspace::None);

static const IRNodeType _type_info = IRNodeType::Var;
};
6 changes: 6 additions & 0 deletions include/taco/ir_tags.h
@@ -29,4 +29,10 @@ enum class BoundType {
extern const char *BoundType_NAMES[];
}

/* TODO: Not sure if this is the right place for these. */
enum class GPUWorkspace {
None, DenseSharedMemory
};
extern const char *GPUWorkspace_NAMES[];

#endif //TACO_IR_TAGS_H
6 changes: 6 additions & 0 deletions include/taco/tensor.h
@@ -413,12 +413,18 @@ class TensorBase {

void compile(IndexStmt stmt, bool assembleWhileCompute=false);

void recompile(std::string file_path);

/// Assemble the tensor storage, including index and value arrays.
void assemble();

void reassemble();

/// Compute the given expression and put the values in the tensor storage.
void compute();

void recompute();

/// Compile, assemble and compute as needed.
void evaluate();

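A hedged sketch of the edit-recompile-recompute loop these hooks enable at the tensor level (the expression is only illustrative and the file path is a made-up placeholder):

#include "taco.h"
using namespace taco;

int main() {
  Format dv({Dense});
  Tensor<double> a("a", {16}, dv), b("b", {16}, dv), c("c", {16}, dv);
  for (int j = 0; j < 16; j++) { b.insert({j}, 1.0); c.insert({j}, 2.0); }
  b.pack(); c.pack();

  IndexVar i;
  a(i) = b(i) + c(i);
  a.compile();    // generates and builds the kernel, leaving the source in taco's tmp dir
  a.assemble();
  a.compute();

  // Hand-edit the generated source, then rebuild and rerun without re-lowering.
  a.recompile("/tmp/taco_tmp_XXXX/abc123");  // hypothetical path prefix
  a.recompute();
  return 0;
}
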
156 changes: 120 additions & 36 deletions src/codegen/codegen_cuda.cpp
@@ -295,6 +295,8 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
vector<pair<string, Expr>> warpIDVars;
vector<Expr> numThreads;
vector<Expr> numWarps;
vector<Expr> sizeSharedMemory;
std::string typeSharedMemory;

CodeGen_CUDA *codeGen;
// copy inputs and outputs into the map
@@ -359,6 +361,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {

threadFors.push_back(op);
threadIDVars.push_back(pair<string, Expr>(scopeMap[op->var], op->var));

Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment));
numThreads.push_back(blockSize);
}
@@ -378,6 +381,16 @@
}

virtual void visit(const Var *op) {

if (isa<Var>(op)){
if (to<Var>(op)->gpuworkspace == GPUWorkspace::DenseSharedMemory)
{
string elementType = printCUDAType( op->type, false);
typeSharedMemory = elementType;
// sizeSharedMemory.push_back(Mul::make(to<Expr>()->num_elements, Literal::make(256)));
}
}

if (scopeMap.count(op) == 0) {
string name = codeGen->genUniqueName(op->name);
if (!inDeviceFunction) {
@@ -580,7 +593,23 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
gridSize.accept(this);
stream << ", ";
blockSize.accept(this);
stream << ">>>";

if (usesshared == false)
{
stream << ">>>";
}
else
{
// BIG TODO: no hard code 2048
// Should be num_threads * num_prec_elems * sizeof(prec_type)
stream << ", " ;
sizeofshared.accept(this);
stream << " * sizeof(" << typeofshared << ")>>>";
// 2048*sizeof(double)>>>";
}
stream << "(";

string delimiter = "";
@@ -627,6 +656,9 @@ void CodeGen_CUDA::compile(Stmt stmt, bool isFirst) {
parentParallelUnits = {};
parallelUnitSizes = {};
parallelUnitIDVars = {};
sizeofshared = Expr();
typeofshared = "";
usesshared = false;
emittedTimerStartCode = false;
isHostFunction = true;
if (isFirst) {
@@ -1021,51 +1053,103 @@ void CodeGen_CUDA::visit(const Max* op) {

void CodeGen_CUDA::visit(const Allocate* op) {
string elementType = printCUDAType(op->var.type(), false);

if (!isHostFunction) {
if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
// double w_GPUThread[num];
// for threads allocate thread local memory

if (to<Var>(op->var)->gpuworkspace == GPUWorkspace::DenseSharedMemory)
{
taco_iassert(!op->is_realloc);
doIndent();
stream << elementType << " ";
stream << "__shared__ " << elementType << " ";
op->var.accept(this);
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
stream << "_ALL";
}
stream << "[";
op->num_elements.accept(this);
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
Expr numElements = Mul::make(op->num_elements,
parallelUnitSizes[ParallelUnit::GPUBlock]);

sizeofshared = numElements;
typeofshared = elementType;
usesshared = true;

ir::simplify(numElements).accept(this);
}
else {
doIndent();
stream << elementType << " ";
op->var.accept(this);
stream << "[";
op->num_elements.accept(this);
stream << "];" << endl;
return;
}
stream << "];" << endl;
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
doIndent();
stream << elementType << " * ";
op->var.accept(this);

stream << " = ";
op->var.accept(this);
stream << "_ALL + threadIdx.x";
// parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
stream << " * ";
op->num_elements.accept(this);
// parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
stream << ";" << endl;
}
return;
}
// __shared__ double w_GPUThread[32]; if no warps
// __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps
// double * w_GPUThread = w_GPUThread_ALL + warp_id * 32;
taco_iassert(!op->is_realloc);
doIndent();
stream << "__shared__ " << elementType << " ";
op->var.accept(this);
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
stream << "_ALL";
}
stream << "[";
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp]));
ir::simplify(numElements).accept(this);
}
else {
op->num_elements.accept(this);
}
stream << "];" << endl;
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
else
{
if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
// double w_GPUThread[num];
// for threads allocate thread local memory
doIndent();
stream << elementType << " ";
op->var.accept(this);
stream << "[";
op->num_elements.accept(this);
stream << "];" << endl;
return;
}
// __shared__ double w_GPUThread[32]; if no warps
// __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps
// double * w_GPUThread = w_GPUThread_ALL + warp_id * 32;
taco_iassert(!op->is_realloc);
doIndent();
stream << elementType << " * ";
op->var.accept(this);

stream << " = ";
stream << "__shared__ " << elementType << " ";
op->var.accept(this);
stream << "_ALL + ";
parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
stream << " * ";
parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
stream << ";" << endl;
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
stream << "_ALL";
}
stream << "[";
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp]));
ir::simplify(numElements).accept(this);
}
else {
op->num_elements.accept(this);
}
stream << "];" << endl;
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
doIndent();
stream << elementType << " * ";
op->var.accept(this);

stream << " = ";
op->var.accept(this);
stream << "_ALL + ";
parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
stream << " * ";
parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
stream << ";" << endl;
}
return;
}
return;

}
string variable_name;
if (op->is_realloc) {
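
Putting the collector, the launch printing, and the Allocate case together, the emitted code is expected to look roughly like the following (a hand-written illustration, not actual taco output; the workspace size of 8 doubles and the block size of 256 are made up):

__global__ void computeDeviceKernel0(taco_tensor_t *A, taco_tensor_t *B) {
  // one DenseSharedMemory workspace slice per thread, allocated block-wide
  __shared__ double w_GPUThread_ALL[8 * 256];
  double * w_GPUThread = w_GPUThread_ALL + threadIdx.x * 8;
  // ... compute into w_GPUThread, then write the results out to A ...
}

// Host side: printDeviceFuncCall now appends the shared-memory size to the launch.
computeDeviceKernel0<<<gridSize, blockSize, (8 * 256) * sizeof(double)>>>(A, B);
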
5 changes: 5 additions & 0 deletions src/codegen/codegen_cuda.h
@@ -73,6 +73,11 @@ class CodeGen_CUDA : public CodeGen {
std::map<ParallelUnit, Expr> parallelUnitSizes;
std::map<ParallelUnit, Expr> parallelUnitIDVars;


Expr sizeofshared;
std::string typeofshared;
bool usesshared;

bool emittedTimerStartCode = false;

std::ostream &out;
45 changes: 45 additions & 0 deletions src/codegen/module.cpp
@@ -164,6 +164,51 @@ string Module::compile() {
return fullpath;
}

string Module::recompile(string file_path) {
string prefix = file_path;
string fullpath = prefix + ".so";

string cc;
string cflags;
string file_ending;
string shims_file;
if (should_use_CUDA_codegen()) {
cc = "nvcc";
cflags = util::getFromEnv("TACO_NVCCFLAGS",
get_default_CUDA_compiler_flags());
file_ending = ".cu";
shims_file = prefix + "_shims.cpp";
}
else {
cc = util::getFromEnv(target.compiler_env, target.compiler);
cflags = util::getFromEnv("TACO_CFLAGS",
"-O3 -ffast-math -std=c99") + " -shared -fPIC";
file_ending = ".c";
shims_file = "";
}
#if USE_OPENMP
cflags += " -fopenmp";
#endif

string cmd = cc + " " + cflags + " " +
prefix + file_ending + " " + shims_file + " " +
"-o " + fullpath + " -lm";

// now compile it
int err = system(cmd.data());
taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
<< "\nreturned " << err;

// use dlsym() to open the compiled library
if (lib_handle) {
dlclose(lib_handle);
}
lib_handle = dlopen(fullpath.data(), RTLD_NOW | RTLD_LOCAL);
taco_uassert(lib_handle) << "Failed to load generated code";

return fullpath;
}

void Module::setSource(string source) {
this->source << source;
moduleFromUserSource = true;
8 changes: 7 additions & 1 deletion src/index_notation/index_notation.cpp
@@ -1819,6 +1819,7 @@ struct TensorVar::Content {
Type type;
Format format;
Schedule schedule;
GPUWorkspace gpuworkspace;
};

TensorVar::TensorVar() : content(nullptr) {
@@ -1840,11 +1841,16 @@ TensorVar::TensorVar(const Type& type, const Format& format)
: TensorVar(util::uniqueName('A'), type, format) {
}

TensorVar::TensorVar(const string& name, const Type& type, const Format& format)
TensorVar::TensorVar(const string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace)
: content(new Content) {
content->name = name;
content->type = type;
content->format = format;
content->gpuworkspace = gpuworkspace;
}

GPUWorkspace TensorVar::getGPUWorkspace() {
return content->gpuworkspace;
}

std::string TensorVar::getName() const {
1 change: 1 addition & 0 deletions src/index_notation/transformations.cpp
@@ -239,6 +239,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const {

IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}}));
IndexStmt producer = forall(iw, ws(iw) = replace(e, {{i,iw}}));

Where where(consumer, producer);

stmt = where;
3 changes: 2 additions & 1 deletion src/ir/ir.cpp
@@ -241,14 +241,15 @@ bool Literal::equalsScalar(double scalar) const {
return false;
}

Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) {
Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, GPUWorkspace gpuworkspace) {
Var *var = new Var;
var->type = type;
var->name = name;

// TODO: is_ptr and is_tensor should be part of type
var->is_ptr = is_ptr;
var->is_tensor = is_tensor;
var->gpuworkspace = gpuworkspace;

return var;
}
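
A hedged sketch of how a lowering pass could tag a workspace variable with the new argument so that CodeGen_CUDA's Allocate visitor places it in shared memory (the variable name and element type are illustrative):

using namespace taco;

ir::Expr w = ir::Var::make("w_GPUThread", Float64,
                           /*is_ptr=*/true, /*is_tensor=*/false,
                           GPUWorkspace::DenseSharedMemory);
// Later, to<ir::Var>(w)->gpuworkspace == GPUWorkspace::DenseSharedMemory.
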
1 change: 1 addition & 0 deletions src/ir_tags.cpp
@@ -4,4 +4,5 @@ namespace taco {
const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"};
const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"};
const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"};
const char *GPUWorkspace_NAMES[] = {"None", "DenseSharedMemory"};
}