From 13e51dc6f9b66a304a7f5d1016a74c62240ff9ae Mon Sep 17 00:00:00 2001 From: Changwan Hong Date: Sun, 4 Oct 2020 19:10:01 -0400 Subject: [PATCH 1/7] init update --- include/taco/format.h | 3 ++- include/taco/lower/iterator.h | 1 + include/taco/lower/mode_format_compressed.h | 2 +- include/taco/lower/mode_format_dense.h | 2 +- include/taco/lower/mode_format_impl.h | 3 ++- include/taco/lower/mode_format_singleton.h | 2 +- src/format.cpp | 15 +++++++++++++++ src/lower/iterator.cpp | 6 ++++++ src/lower/lowerer_impl.cpp | 14 +++++++++++--- src/lower/mode_format_compressed.cpp | 15 +++++++++++---- src/lower/mode_format_dense.cpp | 15 +++++++++++---- src/lower/mode_format_impl.cpp | 4 ++-- src/lower/mode_format_singleton.cpp | 15 +++++++++++---- test/tests-merge_lattice.cpp | 3 ++- 14 files changed, 77 insertions(+), 23 deletions(-) diff --git a/include/taco/format.h b/include/taco/format.h index 29f6cbd70..ec1028d5e 100644 --- a/include/taco/format.h +++ b/include/taco/format.h @@ -95,7 +95,7 @@ class ModeFormat { /// Properties of a mode format enum Property { FULL, NOT_FULL, ORDERED, NOT_ORDERED, UNIQUE, NOT_UNIQUE, BRANCHLESS, - NOT_BRANCHLESS, COMPACT, NOT_COMPACT + NOT_BRANCHLESS, COMPACT, NOT_COMPACT, ZEROLESS, NOT_ZEROLESS }; /// Instantiates an undefined mode format @@ -126,6 +126,7 @@ class ModeFormat { bool isUnique() const; bool isBranchless() const; bool isCompact() const; + bool isZeroless() const; /// Returns true if a mode format has a specific capability, false otherwise bool hasCoordValIter() const; diff --git a/include/taco/lower/iterator.h b/include/taco/lower/iterator.h index 1d871ffaa..63609439e 100644 --- a/include/taco/lower/iterator.h +++ b/include/taco/lower/iterator.h @@ -69,6 +69,7 @@ class Iterator : public util::Comparable { bool isUnique() const; bool isBranchless() const; bool isCompact() const; + bool isZeroless() const; /// Capabilities supported by levels being iterated. bool hasCoordIter() const; diff --git a/include/taco/lower/mode_format_compressed.h b/include/taco/lower/mode_format_compressed.h index 62a4c173b..55f7c42a7 100644 --- a/include/taco/lower/mode_format_compressed.h +++ b/include/taco/lower/mode_format_compressed.h @@ -9,7 +9,7 @@ class CompressedModeFormat : public ModeFormatImpl { public: CompressedModeFormat(); CompressedModeFormat(bool isFull, bool isOrdered, - bool isUnique, long long allocSize = DEFAULT_ALLOC_SIZE); + bool isUnique, bool isZeroless, long long allocSize = DEFAULT_ALLOC_SIZE); ~CompressedModeFormat() override {} diff --git a/include/taco/lower/mode_format_dense.h b/include/taco/lower/mode_format_dense.h index 3426d548c..b99c9b416 100644 --- a/include/taco/lower/mode_format_dense.h +++ b/include/taco/lower/mode_format_dense.h @@ -8,7 +8,7 @@ namespace taco { class DenseModeFormat : public ModeFormatImpl { public: DenseModeFormat(); - DenseModeFormat(const bool isOrdered, const bool isUnique); + DenseModeFormat(const bool isOrdered, const bool isUnique, const bool isZeroless); ~DenseModeFormat() override {} diff --git a/include/taco/lower/mode_format_impl.h b/include/taco/lower/mode_format_impl.h index a0ac74fce..6d4254bd6 100644 --- a/include/taco/lower/mode_format_impl.h +++ b/include/taco/lower/mode_format_impl.h @@ -59,7 +59,7 @@ std::ostream& operator<<(std::ostream&, const ModeFunction&); class ModeFormatImpl { public: ModeFormatImpl(std::string name, bool isFull, bool isOrdered, bool isUnique, - bool isBranchless, bool isCompact, bool hasCoordValIter, + bool isBranchless, bool isCompact, bool isZeroless, bool hasCoordValIter, bool hasCoordPosIter, bool hasLocate, bool hasInsert, bool hasAppend); @@ -162,6 +162,7 @@ class ModeFormatImpl { const bool isUnique; const bool isBranchless; const bool isCompact; + const bool isZeroless; const bool hasCoordValIter; const bool hasCoordPosIter; diff --git a/include/taco/lower/mode_format_singleton.h b/include/taco/lower/mode_format_singleton.h index a370478dd..9122a66a4 100644 --- a/include/taco/lower/mode_format_singleton.h +++ b/include/taco/lower/mode_format_singleton.h @@ -9,7 +9,7 @@ class SingletonModeFormat : public ModeFormatImpl { public: SingletonModeFormat(); SingletonModeFormat(bool isFull, bool isOrdered, - bool isUnique, long long allocSize = DEFAULT_ALLOC_SIZE); + bool isUnique, bool isZeroless, long long allocSize = DEFAULT_ALLOC_SIZE); ~SingletonModeFormat() override {} diff --git a/src/format.cpp b/src/format.cpp index e1a7b4444..27a4abfb5 100644 --- a/src/format.cpp +++ b/src/format.cpp @@ -182,6 +182,11 @@ bool ModeFormat::hasProperties(const std::vector& properties) const { return false; } break; + case ZEROLESS: + if (!isZeroless()) { + return false; + } + break; case NOT_FULL: if (isFull()) { return false; @@ -207,6 +212,11 @@ bool ModeFormat::hasProperties(const std::vector& properties) const { return false; } break; + case NOT_ZEROLESS: + if (isZeroless()) { + return false; + } + break; } } return true; @@ -237,6 +247,11 @@ bool ModeFormat::isCompact() const { return impl->isCompact; } +bool ModeFormat::isZeroless() const { + taco_iassert(defined()); + return impl->isZeroless; +} + bool ModeFormat::hasCoordValIter() const { taco_iassert(defined()); return impl->hasCoordValIter; diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index 4d1ebd644..f4641fcfd 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -191,6 +191,12 @@ bool Iterator::isCompact() const { return getMode().defined() && getMode().getModeFormat().isCompact(); } +bool Iterator::isZeroless() const { + taco_iassert(defined()); + if (isDimensionIterator()) return true; + return getMode().defined() && getMode().getModeFormat().isZeroless(); +} + bool Iterator::hasCoordIter() const { taco_iassert(defined()); if (isDimensionIterator()) return false; diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp index 7a0af13b3..1186495b2 100644 --- a/src/lower/lowerer_impl.cpp +++ b/src/lower/lowerer_impl.cpp @@ -1384,9 +1384,17 @@ Expr LowererImpl::lowerAccess(Access access) { return getTensorVar(var); } - return getIterators(access).back().isUnique() - ? Load::make(getValuesArray(var), generateValueLocExpr(access)) - : getReducedValueVar(access); + + if (getIterators(access).back().isUnique()) { + if (var.getType().getDataType() == Datatype::Bool && getIterators(access).back().isZeroless()) { + return true; + } else { + return Load::make(getValuesArray(var), generateValueLocExpr(access)); + } + } else { + return getReducedValueVar(access); + } + } diff --git a/src/lower/mode_format_compressed.cpp b/src/lower/mode_format_compressed.cpp index 366b34e8d..19679553e 100644 --- a/src/lower/mode_format_compressed.cpp +++ b/src/lower/mode_format_compressed.cpp @@ -10,12 +10,12 @@ using namespace taco::ir; namespace taco { CompressedModeFormat::CompressedModeFormat() : - CompressedModeFormat(false, true, true) { + CompressedModeFormat(false, true, true, false) { } CompressedModeFormat::CompressedModeFormat(bool isFull, bool isOrdered, - bool isUnique, long long allocSize) : - ModeFormatImpl("compressed", isFull, isOrdered, isUnique, false, true, + bool isUnique, bool isZeroless, long long allocSize) : + ModeFormatImpl("compressed", isFull, isOrdered, isUnique, false, true, isZeroless, false, true, false, false, true), allocSize(allocSize) { } @@ -25,6 +25,7 @@ ModeFormat CompressedModeFormat::copy( bool isFull = this->isFull; bool isOrdered = this->isOrdered; bool isUnique = this->isUnique; + bool isZeroless = this->isZeroless; for (const auto property : properties) { switch (property) { case ModeFormat::FULL: @@ -45,12 +46,18 @@ ModeFormat CompressedModeFormat::copy( case ModeFormat::NOT_UNIQUE: isUnique = false; break; + case ModeFormat::ZEROLESS: + isZeroless = true; + break; + case ModeFormat::NOT_ZEROLESS: + isZeroless = false; + break; default: break; } } const auto compressedVariant = - std::make_shared(isFull, isOrdered, isUnique); + std::make_shared(isFull, isOrdered, isUnique, isZeroless); return ModeFormat(compressedVariant); } diff --git a/src/lower/mode_format_dense.cpp b/src/lower/mode_format_dense.cpp index 9ed9a78db..63c7ac01e 100644 --- a/src/lower/mode_format_dense.cpp +++ b/src/lower/mode_format_dense.cpp @@ -5,11 +5,11 @@ using namespace taco::ir; namespace taco { -DenseModeFormat::DenseModeFormat() : DenseModeFormat(true, true) { +DenseModeFormat::DenseModeFormat() : DenseModeFormat(true, true, false) { } -DenseModeFormat::DenseModeFormat(const bool isOrdered, const bool isUnique) : - ModeFormatImpl("dense", true, isOrdered, isUnique, false, true, false, +DenseModeFormat::DenseModeFormat(const bool isOrdered, const bool isUnique, const bool isZeroless) : + ModeFormatImpl("dense", true, isOrdered, isUnique, false, true, isZeroless, false, false, true, true, false) { } @@ -17,6 +17,7 @@ ModeFormat DenseModeFormat::copy( std::vector properties) const { bool isOrdered = this->isOrdered; bool isUnique = this->isUnique; + bool isZeroless = this->isZeroless; for (const auto property : properties) { switch (property) { case ModeFormat::ORDERED: @@ -31,11 +32,17 @@ ModeFormat DenseModeFormat::copy( case ModeFormat::NOT_UNIQUE: isUnique = false; break; + case ModeFormat::ZEROLESS: + isZeroless = true; + break; + case ModeFormat::NOT_ZEROLESS: + isZeroless = false; + break; default: break; } } - return ModeFormat(std::make_shared(isOrdered, isUnique)); + return ModeFormat(std::make_shared(isOrdered, isUnique, isZeroless)); } ModeFunction DenseModeFormat::locate(ir::Expr parentPos, diff --git a/src/lower/mode_format_impl.cpp b/src/lower/mode_format_impl.cpp index bd5332150..8b27c8999 100644 --- a/src/lower/mode_format_impl.cpp +++ b/src/lower/mode_format_impl.cpp @@ -56,11 +56,11 @@ std::ostream& operator<<(std::ostream& os, const ModeFunction& modeFunction) { // class ModeTypeImpl ModeFormatImpl::ModeFormatImpl(const std::string name, bool isFull, bool isOrdered, bool isUnique, bool isBranchless, - bool isCompact, bool hasCoordValIter, + bool isCompact, bool isZeroless, bool hasCoordValIter, bool hasCoordPosIter, bool hasLocate, bool hasInsert, bool hasAppend) : name(name), isFull(isFull), isOrdered(isOrdered), isUnique(isUnique), - isBranchless(isBranchless), isCompact(isCompact), + isBranchless(isBranchless), isCompact(isCompact), isZeroless(isZeroless), hasCoordValIter(hasCoordValIter), hasCoordPosIter(hasCoordPosIter), hasLocate(hasLocate), hasInsert(hasInsert), hasAppend(hasAppend) { } diff --git a/src/lower/mode_format_singleton.cpp b/src/lower/mode_format_singleton.cpp index 402fcaa80..ae4fc5328 100644 --- a/src/lower/mode_format_singleton.cpp +++ b/src/lower/mode_format_singleton.cpp @@ -10,12 +10,12 @@ using namespace taco::ir; namespace taco { SingletonModeFormat::SingletonModeFormat() : - SingletonModeFormat(false, true, true) { + SingletonModeFormat(false, true, true, false) { } SingletonModeFormat::SingletonModeFormat(bool isFull, bool isOrdered, - bool isUnique, long long allocSize) : - ModeFormatImpl("singleton", isFull, isOrdered, isUnique, true, true, + bool isUnique, bool isZeroless, long long allocSize) : + ModeFormatImpl("singleton", isFull, isOrdered, isUnique, true, true, isZeroless, false, true, false, false, true), allocSize(allocSize) { } @@ -25,6 +25,7 @@ ModeFormat SingletonModeFormat::copy( bool isFull = this->isFull; bool isOrdered = this->isOrdered; bool isUnique = this->isUnique; + bool isZeroless = this->isZeroless; for (const auto property : properties) { switch (property) { case ModeFormat::FULL: @@ -45,12 +46,18 @@ ModeFormat SingletonModeFormat::copy( case ModeFormat::NOT_UNIQUE: isUnique = false; break; + case ModeFormat::ZEROLESS: + isZeroless = true; + break; + case ModeFormat::NOT_ZEROLESS: + isZeroless = false; + break; default: break; } } const auto singletonVariant = - std::make_shared(isFull, isOrdered, isUnique); + std::make_shared(isFull, isOrdered, isUnique, isZeroless); return ModeFormat(singletonVariant); } diff --git a/test/tests-merge_lattice.cpp b/test/tests-merge_lattice.cpp index a05e8db73..b07cd6baa 100644 --- a/test/tests-merge_lattice.cpp +++ b/test/tests-merge_lattice.cpp @@ -24,7 +24,8 @@ namespace tests { class HashedModeFormat : public ModeFormatImpl { public: HashedModeFormat() : ModeFormatImpl("hashed", false, false, true, false, - false, false, true, true, true, false) {} + false, false, false, true, true, true, false) {} + ModeFormat copy(std::vector properties) const { return ModeFormat(std::make_shared()); From 1fae902e77413e8ee41682ae067f9195e60b514c Mon Sep 17 00:00:00 2001 From: Changwan Hong Date: Mon, 5 Oct 2020 02:16:33 -0400 Subject: [PATCH 2/7] fix a bug --- test/tests-storage_alloc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tests-storage_alloc.cpp b/test/tests-storage_alloc.cpp index ed5a33923..611b50d3d 100644 --- a/test/tests-storage_alloc.cpp +++ b/test/tests-storage_alloc.cpp @@ -57,7 +57,7 @@ TEST_P(alloc, storage) { } IndexVar i("i"), j("j"), m("m"), n("n"), k("k"), l("l"); -ModeFormat SparseSmall(std::make_shared(false, true, true, +ModeFormat SparseSmall(std::make_shared(false, true, true, false, 32)); IndexArray dlab_indices() { From 0b21e535771a7434edce8304597e0685b51988a6 Mon Sep 17 00:00:00 2001 From: Changwan Hong Date: Sat, 17 Oct 2020 14:21:54 -0400 Subject: [PATCH 3/7] update Stephen's suggestion --- src/lower/iterator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index f4641fcfd..f5ddc31e4 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -193,7 +193,7 @@ bool Iterator::isCompact() const { bool Iterator::isZeroless() const { taco_iassert(defined()); - if (isDimensionIterator()) return true; + if (isDimensionIterator()) return false; return getMode().defined() && getMode().getModeFormat().isZeroless(); } From 1dcdd0952af700fd18529e8776f5fd8047914319 Mon Sep 17 00:00:00 2001 From: Changwan Hong Date: Thu, 19 Nov 2020 04:56:24 -0500 Subject: [PATCH 4/7] update for memcpy --- src/codegen/codegen.cpp | 93 ++++++++++++++++++++---- src/codegen/codegen.h | 8 ++- src/codegen/codegen_c.cpp | 2 +- src/codegen/codegen_cuda.cpp | 135 +++++++++++++++++++++++++++++++---- src/codegen/codegen_cuda.h | 2 +- 5 files changed, 208 insertions(+), 32 deletions(-) diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index f0c09d98a..1015c133f 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -230,16 +230,47 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool } string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, - bool is_output_prop) { + bool is_output_prop, int flag, string output_tensor) { stringstream ret; ret << " "; auto tensor = op->tensor.as(); if (op->property == TensorProperty::Values) { // for the values, it's in the last slot - ret << printType(tensor->type, true); - ret << " " << restrictKeyword() << " " << varname << " = (" << printType(tensor->type, true) << ")("; - ret << tensor->name << "->vals);\n"; + switch(flag) { + case PRINT_FUNC: + ret << printType(tensor->type, true); + ret << " " << restrictKeyword() << " " << varname << " = (" << printType(tensor->type, true) << ")("; + ret << tensor->name << "->vals);\n"; + break; + case PRINT_MEM_HOST_TO_DEV: + ret << "gpuErrchk(cudaMalloc((void **)&"; + ret << tensor->name << "_dev" << "->vals, "; + ret << "malloc_usable_size("; + ret << tensor->name << "->vals)));\n"; + + ret << " "; + ret << "cudaMemcpy("; + ret << tensor->name << "_dev" << "->vals, "; + ret << tensor->name << "->vals, "; + ret << "malloc_usable_size("; + ret << tensor->name << "->vals), "; + ret << "cudaMemcpyHostToDevice);\n"; + break; + case PRINT_MEM_DEV_TO_HOST: + if(output_tensor == tensor->name) { + ret << "cudaMemcpy("; + ret << tensor->name << "->vals, "; + ret << tensor->name << "_dev->vals, "; + ret << "malloc_usable_size("; + ret << tensor->name << "->vals), "; + ret << "cudaMemcpyDevicetToHost);\n"; + ret << " "; + } + ret << "cudaFree("; + ret << tensor->name << "_dev" << "->vals);\n"; + break; + } return ret.str(); } else if (op->property == TensorProperty::ValuesSize) { ret << "int " << varname << " = " << tensor->name << "->vals_size;\n"; @@ -252,18 +283,54 @@ string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, // for a Fixed level, ptr is an int // all others are int* if (op->property == TensorProperty::Dimension) { - tp = "int"; - ret << tp << " " << varname << " = (int)(" << tensor->name - << "->dimensions[" << op->mode << "]);\n"; + switch(flag) { + case PRINT_FUNC: + tp = "int"; + ret << tp << " " << varname << " = (int)(" << tensor->name + << "->dimensions[" << op->mode << "]);\n"; + break; + case PRINT_MEM_HOST_TO_DEV: + ret << tensor->name << "_dev->dimension[" << op->mode << "] = " << tensor->name << "->dimension[" << op->mode << "];\n"; + break; + } } else { taco_iassert(op->property == TensorProperty::Indices); tp = "int*"; auto nm = op->index; - ret << tp << " " << restrictKeyword() << " " << varname << " = "; - ret << "(int*)(" << tensor->name << "->indices[" << op->mode; - ret << "][" << nm << "]);\n"; + switch(flag) { + case PRINT_FUNC: + ret << tp << " " << restrictKeyword() << " " << varname << " = "; + ret << "(int*)(" << tensor->name << "->indices[" << op->mode; + ret << "][" << nm << "]);\n"; + break; + case PRINT_MEM_HOST_TO_DEV: + ret << "gpuErrchk(cudaMalloc((void **)&"; + ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "], "; + ret << "malloc_usable_size("; + ret << tensor->name << "->indices[" << op->mode << "][" << nm << "])));\n"; + + ret << " "; + ret << "cudaMemcpy("; + ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "], "; + ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], "; + ret << "malloc_usable_size("; + ret << tensor->name << "->indices[" << op->mode << "][" << nm << "]), "; + ret << "cudaMemcpyHostToDevice);\n"; + break; + case PRINT_MEM_DEV_TO_HOST: + if(output_tensor == tensor->name) { + ret << "cudaMemcpy("; + ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], "; + ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], "; + ret << "malloc_usable_size("; + ret << tensor->name << "_dev->indices[" << op->mode << "][" << nm << "]), "; + ret << "cudaMemcpyDevicetToHost);\n"; + } + ret << "cudaFree("; + ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "]);\n"; + break; + } } - return ret.str(); } @@ -312,7 +379,7 @@ string CodeGen::pointTensorProperty(std::string varname) { // helper to print declarations string CodeGen::printDecls(map varMap, - vector inputs, vector outputs) { + vector inputs, vector outputs, int flag, string output_tensor) { stringstream ret; unordered_set propsAlreadyGenerated; @@ -367,7 +434,7 @@ string CodeGen::printDecls(map varMap, break; } } else { - ret << unpackTensorProperty(varMap[prop], prop, isOutputProp); + ret << unpackTensorProperty(varMap[prop], prop, isOutputProp, flag, output_tensor); } propsAlreadyGenerated.insert(varMap[prop]); } diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index cc25c80d6..b73cbbb25 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -1,6 +1,10 @@ #ifndef TACO_CODEGEN_H #define TACO_CODEGEN_H +#define PRINT_FUNC 0 +#define PRINT_MEM_HOST_TO_DEV 1 +#define PRINT_MEM_DEV_TO_HOST 2 + #include #include "taco/ir/ir.h" #include "taco/ir/ir_printer.h" @@ -43,7 +47,7 @@ class CodeGen : public IRPrinter { std::vector localVars, int labels, std::string funcName); std::string printDecls(std::map varMap, - std::vector inputs, std::vector outputs); + std::vector inputs, std::vector outputs, int flag, std::string output_tensor); std::string printPack(std::map, std::string> outputProperties, std::vector outputs); std::string printCoroutineFinish(int numYields, std::string funcName); @@ -63,7 +67,7 @@ class CodeGen : public IRPrinter { std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr); std::string unpackTensorProperty(std::string varname, const GetProperty* op, - bool is_output_prop); + bool is_output_prop, int flag, std::string output_tensor); std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property, int mode, int index); std::string pointTensorProperty(std::string varname); diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp index 204aa1e2d..d9cb7bdcf 100644 --- a/src/codegen/codegen_c.cpp +++ b/src/codegen/codegen_c.cpp @@ -290,7 +290,7 @@ void CodeGen_C::visit(const Function* func) { localVars = varFinder.localVars; // Print variable declarations - out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + out << printDecls(varFinder.varDecls, func->inputs, func->outputs, PRINT_FUNC, "") << endl; if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 5eb57c7ad..578b4e01a 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -282,6 +282,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { vector threadFors; // contents is device function vector warpFors; map scopeMap; + string output_tensor; // the variables to pass to each device function vector>> functionParameters; @@ -312,7 +313,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { taco_iassert(var) << "Outputs must be vars in codegen"; taco_iassert(scopeMap.count(var) == 0) << "Duplicate output found in codegen"; - + output_tensor = var->name; // Isn't there only one output? scopeMap[var] = var->name; } } @@ -436,10 +437,20 @@ Stmt CodeGen_CUDA::simplifyFunctionBodies(Stmt stmt) { return FunctionBodySimplifier().rewrite(stmt); } -string CodeGen_CUDA::printDeviceFuncName(const vector> currentParameters, int index) { +string CodeGen_CUDA::printDeviceFuncName(const vector> currentParameters, int index, int flag) { stringstream ret; - ret << "__global__" << endl; - ret << "void " << funcName << "DeviceKernel" << index << "("; + switch(flag) { + case PRINT_FUNC: + ret << "__global__" << endl; + ret << "void " << funcName << "DeviceKernel" << index << "("; + break; + case PRINT_MEM_HOST_TO_DEV: + ret << "void " << funcName << "MemcpyHostToDev" << index << "("; + break; + case PRINT_MEM_DEV_TO_HOST: + ret << "void " << funcName << "MemcpyDevToHost" << index << "("; + break; + } string delimiter = ""; for (size_t i=0; i> curren // No non-tensor parameters delimiter = ", "; } + if(flag == PRINT_MEM_HOST_TO_DEV || flag == PRINT_MEM_DEV_TO_HOST) { + ret << ", "; + string delimiter = ""; + for (size_t i=0; i(); + taco_iassert(var) << "Unable to convert output " << currentParameters[i].second + << " to Var"; + string varName = currentParameters[i].first; + + if (var->is_tensor) { + ret << delimiter << "taco_tensor_t * __restrict__ " << varName << "_dev"; + } + else { + auto tp = printCUDAType(var->type, var->is_ptr); + ret << delimiter << tp << " "; + if (!var->is_ptr) { + ret << "&"; + } + ret << var->name; + } + // No non-tensor parameters + delimiter = ", "; + } + } ret << ")"; + return ret.str(); } @@ -574,7 +610,33 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP emittedTimerStartCode = true; } + // for malloc + string delimiter = ""; + for (size_t i=0; i()) << "Unable to convert output " << currentParameters[i].second + << " to Var"; + string varName = currentParameters[i].first; + stream << "taco_tensor_t *"<< varName << "_dev = (taco_tensor_t *)malloc(sizeof(taco_tensor_t *));\n"; + doIndent(); + } + + // for MemcpyHostToDev + stream << funcName << "MemcpyHostToDev" << index << "("; + for (size_t l=0; l<2; l++) { + for (size_t i=0; i()) << "Unable to convert output " << currentParameters[i].second + << " to Var"; + string varName = currentParameters[i].first; + stream << delimiter << varName; + if(l == 1) stream << "_dev"; + + delimiter = ", "; + } + } + stream << ");\n\n"; + doIndent(); + // for DeviceKernel stream << funcName << "DeviceKernel" << index << "<<<"; gridSize = ir::simplify(gridSize); gridSize.accept(this); @@ -583,7 +645,7 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP stream << ">>>"; stream << "("; - string delimiter = ""; + delimiter = ""; for (size_t i=0; i()) << "Unable to convert output " << currentParameters[i].second << " to Var"; @@ -605,8 +667,30 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP stream << "cudaEventElapsedTime(&tot_ms, event1, event2);\n"; } doIndent(); - stream << "cudaDeviceSynchronize();\n"; + stream << "cudaDeviceSynchronize();\n\n"; + // for MemcpyDevToHost + doIndent(); + stream << funcName << "DeviceFree" << index << "("; + delimiter = ""; + for (size_t i=0; i()) << "Unable to convert output " << currentParameters[i].second + << " to Var"; + string varName = currentParameters[i].first; + stream << delimiter << varName << "_dev"; + + delimiter = ", "; + } + stream << ");\n"; + + // for free + for (size_t i=0; i()) << "Unable to convert output " << currentParameters[i].second + << " to Var"; + string varName = currentParameters[i].first; + doIndent(); + stream << "free("<< varName << "_dev);\n"; + } } @@ -679,12 +763,6 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) { } } - // Generate device function header - doIndent(); - out << printDeviceFuncName(parameters, i); - out << "{\n"; - indent++; - // Generate device function code resetUniqueNameCounters(); vector inputs; @@ -710,8 +788,35 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) { blockloop->accept(&varFinder); varMap = varFinder.varMap; + + + // Print MemcpyHostToDev function + out << printDeviceFuncName(parameters, i, PRINT_MEM_HOST_TO_DEV); + out << "{\n"; + indent++; + out << printDecls(varFinder.varDecls, inputs, {}, PRINT_MEM_HOST_TO_DEV, deviceFunctionCollector.output_tensor) << endl; + indent--; + doIndent(); + out << "}\n\n"; + + // Print MemcpyDevtToHost function + out << printDeviceFuncName(parameters, i, PRINT_MEM_DEV_TO_HOST); + out << "{\n"; + indent++; + out << printDecls(varFinder.varDecls, inputs, {}, PRINT_MEM_DEV_TO_HOST, deviceFunctionCollector.output_tensor) << endl; + indent--; + doIndent(); + out << "}\n\n"; + + + // Generate device function header + doIndent(); + out << printDeviceFuncName(parameters, i, PRINT_FUNC); + out << "{\n"; + indent++; + // Print variable declarations - out << printDecls(varFinder.varDecls, inputs, {}) << endl; + out << printDecls(varFinder.varDecls, inputs, {}, PRINT_FUNC, deviceFunctionCollector.output_tensor) << endl; doIndent(); printBlockIDVariable(deviceFunctionCollector.blockIDVars[i], blockloop->start, blockloop->increment); doIndent(); @@ -779,7 +884,7 @@ void CodeGen_CUDA::visit(const Function* func) { localVars = varFinder.localVars; // Print variable declarations - out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl; + out << printDecls(varFinder.varDecls, func->inputs, func->outputs, PRINT_FUNC, "") << endl; if (emittingCoroutine) { out << printContextDeclAndInit(varMap, localVars, numYields, func->name) @@ -1082,7 +1187,7 @@ void CodeGen_CUDA::visit(const Allocate* op) { } doIndent(); - stream << "gpuErrchk(cudaMallocManaged((void**)&"; + stream << "gpuErrchk(cudaMalloc((void**)&"; if (op->is_realloc) { stream << variable_name; } diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h index 2bc8e000d..53f116ebd 100644 --- a/src/codegen/codegen_cuda.h +++ b/src/codegen/codegen_cuda.h @@ -48,7 +48,7 @@ class CodeGen_CUDA : public CodeGen { void visit(const Assign*); void visit(const Break*); void visit(const Free* op); - std::string printDeviceFuncName(const std::vector> currentParameters, int index); + std::string printDeviceFuncName(const std::vector> currentParameters, int index, int flag); void printDeviceFuncCall(const std::vector> currentParameters, Expr blockSize, int index, Expr gridSize); void printThreadIDVariable(std::pair threadIDVar, Expr start, Expr increment, Expr numThreads); void printBlockIDVariable(std::pair blockIDVar, Expr start, Expr increment); From d6235cfd89583c363e2da1fde84fc784904e69ba Mon Sep 17 00:00:00 2001 From: Changwan Hong Date: Fri, 20 Nov 2020 08:34:49 -0500 Subject: [PATCH 5/7] update --- src/codegen/codegen.h | 7 ++++--- src/codegen/codegen_cuda.cpp | 10 ++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h index b73cbbb25..af501ca82 100644 --- a/src/codegen/codegen.h +++ b/src/codegen/codegen.h @@ -1,9 +1,10 @@ #ifndef TACO_CODEGEN_H #define TACO_CODEGEN_H -#define PRINT_FUNC 0 -#define PRINT_MEM_HOST_TO_DEV 1 -#define PRINT_MEM_DEV_TO_HOST 2 +//#define PRINT_FUNC 0 +//#define PRINT_MEM_HOST_TO_DEV 1 +//#define PRINT_MEM_DEV_TO_HOST 2 +enum func_selector {PRINT_FUNC, PRINT_MEM_HOST_TO_DEV, PRINT_MEM_DEV_TO_HOST}; #include #include "taco/ir/ir.h" diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 578b4e01a..312930b50 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -313,6 +313,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor { taco_iassert(var) << "Outputs must be vars in codegen"; taco_iassert(scopeMap.count(var) == 0) << "Duplicate output found in codegen"; + taco_iassert(outputs.size() == 1) << "The number of outputs should be 1"; output_tensor = var->name; // Isn't there only one output? scopeMap[var] = var->name; } @@ -616,10 +617,13 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP taco_iassert(currentParameters[i].second.as()) << "Unable to convert output " << currentParameters[i].second << " to Var"; string varName = currentParameters[i].first; - stream << "taco_tensor_t *"<< varName << "_dev = (taco_tensor_t *)malloc(sizeof(taco_tensor_t *));\n"; + //stream << "taco_tensor_t *"<< varName << "_dev = (taco_tensor_t *)malloc(sizeof(taco_tensor_t *));\n"; + stream << "taco_tensor_t *"<< varName << "_dev = init_taco_tensor_t(" << varName << "->order, " << varName << "->csize, " << varName << "->dimensions, " + << varName << "->mode_ordering, " << varName << "->mode_types);\n"; doIndent(); } + // for MemcpyHostToDev stream << funcName << "MemcpyHostToDev" << index << "("; for (size_t l=0; l<2; l++) { @@ -689,7 +693,9 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP << " to Var"; string varName = currentParameters[i].first; doIndent(); - stream << "free("<< varName << "_dev);\n"; + + stream << "deinit_taco_tensor_t(" << varName << "->dev);\n"; + //stream << "free("<< varName << "_dev);\n"; } } From c73cb105230298ba90f30e2232588c357b0d7e81 Mon Sep 17 00:00:00 2001 From: Changwan Hong Date: Fri, 20 Nov 2020 10:27:18 -0500 Subject: [PATCH 6/7] update --- src/codegen/codegen.cpp | 4 ++-- src/codegen/codegen_cuda.cpp | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp index 1015c133f..1154d0acc 100644 --- a/src/codegen/codegen.cpp +++ b/src/codegen/codegen.cpp @@ -290,7 +290,7 @@ string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, << "->dimensions[" << op->mode << "]);\n"; break; case PRINT_MEM_HOST_TO_DEV: - ret << tensor->name << "_dev->dimension[" << op->mode << "] = " << tensor->name << "->dimension[" << op->mode << "];\n"; + ret << tensor->name << "_dev->dimensions[" << op->mode << "] = " << tensor->name << "->dimensions[" << op->mode << "];\n"; break; } } else { @@ -324,7 +324,7 @@ string CodeGen::unpackTensorProperty(string varname, const GetProperty* op, ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], "; ret << "malloc_usable_size("; ret << tensor->name << "_dev->indices[" << op->mode << "][" << nm << "]), "; - ret << "cudaMemcpyDevicetToHost);\n"; + ret << "cudaMemcpyDeviceToHost);\n"; } ret << "cudaFree("; ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "]);\n"; diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index 312930b50..b22644599 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -34,6 +34,7 @@ const string cHeaders = "#include \n" "#include \n" "#include \n" + "#include \n" "#include \n" "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n" "#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n" @@ -675,7 +676,7 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP // for MemcpyDevToHost doIndent(); - stream << funcName << "DeviceFree" << index << "("; + stream << funcName << "DMemcpyDevToHost" << index << "("; delimiter = ""; for (size_t i=0; i()) << "Unable to convert output " << currentParameters[i].second @@ -694,7 +695,7 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector> currentP string varName = currentParameters[i].first; doIndent(); - stream << "deinit_taco_tensor_t(" << varName << "->dev);\n"; + stream << "deinit_taco_tensor_t(" << varName << "_dev);\n"; //stream << "free("<< varName << "_dev);\n"; } } @@ -1193,7 +1194,8 @@ void CodeGen_CUDA::visit(const Allocate* op) { } doIndent(); - stream << "gpuErrchk(cudaMalloc((void**)&"; + stream << "gpuErrchk(cudaMallocManaged((void**)&"; + //stream << "gpuErrchk(cudaMalloc((void**)&"; if (op->is_realloc) { stream << variable_name; } From 91bafbe95cd4bcae0f7588d50ff30a6c11e391ac Mon Sep 17 00:00:00 2001 From: Changwan Hong Date: Fri, 20 Nov 2020 16:15:52 -0500 Subject: [PATCH 7/7] update --- src/codegen/codegen_cuda.cpp | 61 +++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp index b22644599..d31717241 100644 --- a/src/codegen/codegen_cuda.cpp +++ b/src/codegen/codegen_cuda.cpp @@ -158,6 +158,48 @@ const string gpuAssertMacro = " }\n" "}\n"; +const string tensor_allocation= +"taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n" +" int32_t* dimensions, int32_t* modeOrdering,\n" +" taco_mode_t* mode_types) {\n" +" taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n" +" t->order = order;\n" +" t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));\n" +" t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n" +" t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n" +" t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n" +" t->csize = csize;\n" +"\n" +" for (int32_t i = 0; i < order; i++) {\n" +" t->dimensions[i] = dimensions[i];\n" +" t->mode_ordering[i] = modeOrdering[i];\n" +" t->mode_types[i] = mode_types[i];\n" +" switch (t->mode_types[i]) {\n" +" case taco_mode_dense:\n" +" t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n" +" break;\n" +" case taco_mode_sparse:\n" +" t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n" +" break;\n" +" }\n" +" }\n" +" return t;\n" +"}\n" +"\n" +"void deinit_taco_tensor_t(taco_tensor_t* t) {\n" +" for (int i = 0; i < t->order; i++) {\n" +" free(t->indices[i]);\n" +" }\n" +" free(t->indices);\n" +"\n" +" free(t->dimensions);\n" +" free(t->mode_ordering);\n" +" free(t->mode_types);\n" +" free(t);\n" +"}\n"; + + + const std::string blue="\033[38;5;67m"; const std::string nc="\033[0m"; } // anonymous namespace @@ -725,6 +767,7 @@ void CodeGen_CUDA::compile(Stmt stmt, bool isFirst) { out << cHeaders; if (outputKind == ImplementationGen) { out << endl << gpuAssertMacro; + out << endl << tensor_allocation; } } out << endl; @@ -1194,8 +1237,9 @@ void CodeGen_CUDA::visit(const Allocate* op) { } doIndent(); - stream << "gpuErrchk(cudaMallocManaged((void**)&"; //stream << "gpuErrchk(cudaMalloc((void**)&"; + + stream << "gpuErrchk(cudaMallocManaged((void**)&"; if (op->is_realloc) { stream << variable_name; } @@ -1210,6 +1254,21 @@ void CodeGen_CUDA::visit(const Allocate* op) { parentPrecedence = TOP; stream << "));" << endl; + /* + if (op->is_realloc) { + stream << variable_name; + } + else { + op->var.accept(this); + } + stream << "= (" << elementType << "*)malloc("; + stream << "sizeof(" << elementType << ")"; + stream << " * "; + parentPrecedence = MUL; + op->num_elements.accept(this); + parentPrecedence = TOP; + stream << ");" << endl;*/ + if(op->is_realloc) { doIndent(); stream << "memcpy(" << variable_name << ", ";