diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h
index 788156fdb..3c26cc9f1 100644
--- a/include/taco/codegen/module.h
+++ b/include/taco/codegen/module.h
@@ -25,6 +25,12 @@ class Module {
   /// Compile the source into a library, returning its full path
   std::string compile();
+
+  /// Recompile a previously generated source file. This is a debugging aid:
+  /// given the path to the temporary file generated by taco, it compiles
+  /// that file into a library, which lets developers modify the generated
+  /// code and compile it again during development.
+  std::string recompile(std::string file_path);
 
   /// Compile the module into a source file located at the specified location
   /// path and prefix. The generated source will be path/prefix.{.c|.bc, .h}
diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index a96d56090..331a3c70b 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -850,7 +850,10 @@ class TensorVar : public util::Comparable<TensorVar> {
   TensorVar(const Type& type);
   TensorVar(const std::string& name, const Type& type);
   TensorVar(const Type& type, const Format& format);
-  TensorVar(const std::string& name, const Type& type, const Format& format);
+  TensorVar(const std::string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace=GPUWorkspace::None);
+
+  /// Returns the kind of GPU workspace this TensorVar represents; None by default.
+  GPUWorkspace getGPUWorkspace();
 
   /// Returns the name of the tensor variable.
   std::string getName() const;
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index 15dbdc7aa..5445f91ea 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -254,9 +254,10 @@ struct Var : public ExprNode<Var> {
   std::string name;
   bool is_ptr;
   bool is_tensor;
+  GPUWorkspace gpuworkspace;
 
   static Expr make(std::string name, Datatype type, bool is_ptr=false,
-                   bool is_tensor=false);
+                   bool is_tensor=false, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
   static const IRNodeType _type_info = IRNodeType::Var;
 };
diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h
index 2b1a4a4f1..39ed25eb4 100644
--- a/include/taco/ir_tags.h
+++ b/include/taco/ir_tags.h
@@ -29,4 +29,10 @@ enum class BoundType {
 extern const char *BoundType_NAMES[];
+
+/* TODO: ir_tags.h may not be the right home for these GPU workspace tags. */
+enum class GPUWorkspace {
+  None, DenseSharedMemory
+};
+extern const char *GPUWorkspace_NAMES[];
 }
 
 #endif //TACO_IR_TAGS_H
diff --git a/include/taco/tensor.h b/include/taco/tensor.h
index 86ea4120c..3327eeced 100644
--- a/include/taco/tensor.h
+++ b/include/taco/tensor.h
@@ -413,12 +413,22 @@ class TensorBase {
 
   void compile(IndexStmt stmt, bool assembleWhileCompute=false);
 
+  /// Recompile the (possibly hand-edited) generated source at file_path;
+  /// a debugging aid that forwards to Module::recompile.
+  void recompile(std::string file_path);
+
   /// Assemble the tensor storage, including index and value arrays.
   void assemble();
 
+  /// Re-run the assemble kernel in the currently loaded module (see recompile).
+  void reassemble();
+
   /// Compute the given expression and put the values in the tensor storage.
   void compute();
 
+  /// Re-run the compute kernel in the currently loaded module (see recompile).
+  void recompute();
+
   /// Compile, assemble and compute as needed.
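Taken together, the declarations above wrap a debug loop around taco's usual compile/assemble/compute flow: compile once, hand-edit the generated source, then rebuild and rerun it without regenerating code. A minimal sketch of the intended usage, based only on the signatures in this diff; the temporary path is hypothetical (taco writes generated sources to a per-run directory under /tmp):

```cpp
// Sketch of the recompile debug workflow added by this patch.
// Assumes y(i) = A(i,j) * x(j) has already been defined and scheduled as stmt.
y.compile(stmt);   // generates kernel source in a temp dir and builds it
y.assemble();
y.compute();

// ... hand-edit the generated .cu/.c file under taco's temp directory ...

y.recompile("/tmp/taco_tmp_XXXX/kernel_prefix");  // hypothetical path, no extension
y.reassemble();    // rerun the freshly built assemble kernel
y.recompute();     // rerun the freshly built compute kernel
```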
  void evaluate();
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index 5eb57c7ad..fabc0d5ca 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -295,6 +295,8 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
   vector<pair<string, Expr>> warpIDVars;
   vector<Expr> numThreads;
   vector<Expr> numWarps;
+  vector<Expr> sizeSharedMemory;
+  std::string typeSharedMemory;
 
   CodeGen_CUDA *codeGen;
   // copy inputs and outputs into the map
@@ -359,6 +361,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
       threadFors.push_back(op);
       threadIDVars.push_back(pair<string, Expr>(scopeMap[op->var], op->var));
+
       Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment));
       numThreads.push_back(blockSize);
     }
@@ -378,6 +381,13 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
   }
 
   virtual void visit(const Var *op) {
+    if (op->gpuworkspace == GPUWorkspace::DenseSharedMemory) {
+      string elementType = printCUDAType(op->type, false);
+      typeSharedMemory = elementType;
+      // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256)));
+    }
+
     if (scopeMap.count(op) == 0) {
       string name = codeGen->genUniqueName(op->name);
       if (!inDeviceFunction) {
@@ -580,7 +593,19 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
   gridSize.accept(this);
   stream << ", ";
   blockSize.accept(this);
-  stream << ">>>";
+
+  if (!usesshared) {
+    stream << ">>>";
+  }
+  else {
+    // TODO: verify this size expression; it should always equal
+    // num_threads * num_prec_elems * sizeof(prec_type),
+    // e.g. 2048 * sizeof(double) for the default SpMV schedule.
+    stream << ", ";
+    sizeofshared.accept(this);
+    stream << " * sizeof(" << typeofshared << ")>>>";
+  }
+
   stream << "(";
 
   string delimiter = "";
@@ -627,6 +656,9 @@ void CodeGen_CUDA::compile(Stmt stmt, bool isFirst) {
   parentParallelUnits = {};
   parallelUnitSizes = {};
   parallelUnitIDVars = {};
+  sizeofshared = Expr();
+  typeofshared = "";
+  usesshared = false;
   emittedTimerStartCode = false;
   isHostFunction = true;
   if (isFirst) {
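When a kernel uses a DenseSharedMemory workspace, printDeviceFuncCall above now emits CUDA's optional third launch-configuration argument, the dynamic shared-memory size in bytes. With the default SpMV schedule (2048 workspace doubles per block), the launch changes roughly as sketched below; the kernel and argument names are illustrative, not actual generator output:

```cpp
// Before: no shared-memory size in the launch configuration.
computeDeviceKernel0<<<gridSize, blockSize>>>(A, x, y);

// After: sizeofshared * sizeof(typeofshared) appended as the third argument.
computeDeviceKernel0<<<gridSize, blockSize, 2048 * sizeof(double)>>>(A, x, y);
```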
@@ -1021,51 +1053,103 @@ void CodeGen_CUDA::visit(const Max* op) {
 
 void CodeGen_CUDA::visit(const Allocate* op) {
   string elementType = printCUDAType(op->var.type(), false);
+
   if (!isHostFunction) {
-    if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
-      // double w_GPUThread[num];
-      // for threads allocate thread local memory
+    if (to<Var>(op->var)->gpuworkspace == GPUWorkspace::DenseSharedMemory) {
+      taco_iassert(!op->is_realloc);
       doIndent();
-      stream << elementType << " ";
+      stream << "__shared__ " << elementType << " ";
       op->var.accept(this);
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        stream << "_ALL";
+      }
       stream << "[";
-      op->num_elements.accept(this);
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        Expr numElements = Mul::make(op->num_elements,
+                                     parallelUnitSizes[ParallelUnit::GPUBlock]);
+        sizeofshared = numElements;
+        typeofshared = elementType;
+        usesshared = true;
+        ir::simplify(numElements).accept(this);
+      }
+      else {
+        op->num_elements.accept(this);
+      }
       stream << "];" << endl;
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        doIndent();
+        stream << elementType << " * ";
+        op->var.accept(this);
+        stream << " = ";
+        op->var.accept(this);
+        // Each thread gets its own num_elements-sized slice of the block's
+        // shared array, indexed by thread rather than by warp.
+        stream << "_ALL + threadIdx.x";
+        stream << " * ";
+        op->num_elements.accept(this);
+        stream << ";" << endl;
+      }
       return;
     }
-    // __shared__ double w_GPUThread[32]; if no warps
-    // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps
-    // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32;
-    taco_iassert(!op->is_realloc);
-    doIndent();
-    stream << "__shared__ " << elementType << " ";
-    op->var.accept(this);
-    if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
-      stream << "_ALL";
-    }
-    stream << "[";
-    if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
-      Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp]));
-      ir::simplify(numElements).accept(this);
-    }
-    else {
-      op->num_elements.accept(this);
-    }
-    stream << "];" << endl;
-    if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+    else {
+      if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
+        // double w_GPUThread[num];
+        // for threads, allocate thread-local memory
+        doIndent();
+        stream << elementType << " ";
+        op->var.accept(this);
+        stream << "[";
+        op->num_elements.accept(this);
+        stream << "];" << endl;
+        return;
+      }
+      // __shared__ double w_GPUThread[32];                 if no warps
+      // __shared__ double w_GPUThread_ALL[32 * num_warps]; if warps
+      // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32;
+      taco_iassert(!op->is_realloc);
       doIndent();
-      stream << elementType << " * ";
-      op->var.accept(this);
-
-      stream << " = ";
+      stream << "__shared__ " << elementType << " ";
       op->var.accept(this);
-      stream << "_ALL + ";
-      parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
-      stream << " * ";
-      parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
-      stream << ";" << endl;
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        stream << "_ALL";
+      }
+      stream << "[";
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp]));
+        ir::simplify(numElements).accept(this);
+      }
+      else {
+        op->num_elements.accept(this);
+      }
+      stream << "];" << endl;
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        doIndent();
+        stream << elementType << " * ";
+        op->var.accept(this);
+        stream << " = ";
+        op->var.accept(this);
+        stream << "_ALL + ";
+        parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
+        stream << " * ";
+        parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
+        stream << ";" << endl;
+      }
+      return;
     }
-    return;
   }
+
   string variable_name;
   if (op->is_realloc) {
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index 2bc8e000d..654344b39 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -73,6 +73,13 @@ class CodeGen_CUDA : public CodeGen {
   std::map<ParallelUnit, Expr> parallelUnitSizes;
   std::map<ParallelUnit, Expr> parallelUnitIDVars;
 
+  // Dynamic shared-memory state for the current kernel, set while lowering
+  // a DenseSharedMemory workspace allocation.
+  Expr sizeofshared;
+  std::string typeofshared;
+  bool usesshared;
+
   bool emittedTimerStartCode = false;
 
   std::ostream &out;
diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp
index fc52c409d..6c84bc56d 100644
--- a/src/codegen/module.cpp
+++ b/src/codegen/module.cpp
@@ -164,6 +164,51 @@ string Module::compile() {
   return fullpath;
 }
 
+string Module::recompile(string file_path) {
+  string prefix = file_path;
+  string fullpath = prefix + ".so";
+
+  string cc;
+  string cflags;
+  string file_ending;
+  string shims_file;
+  if (should_use_CUDA_codegen()) {
+    cc = "nvcc";
+    cflags = util::getFromEnv("TACO_NVCCFLAGS",
+                              get_default_CUDA_compiler_flags());
+    file_ending = ".cu";
+    shims_file = prefix + "_shims.cpp";
+  }
+  else {
+    cc = util::getFromEnv(target.compiler_env, target.compiler);
+    cflags = util::getFromEnv("TACO_CFLAGS",
+                              "-O3 -ffast-math -std=c99") + " -shared -fPIC";
+    file_ending = ".c";
+    shims_file = "";
+  }
+#if USE_OPENMP
+  cflags += " -fopenmp";
+#endif
+
+  string cmd = cc + " " + cflags + " " +
+               prefix + file_ending + " " + shims_file + " " +
+               "-o " + fullpath + " -lm";
+
+  // now compile it
+  int err = system(cmd.data());
+  taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
+                         << "\nreturned " << err;
+
+  // use dlopen() to load the recompiled library
+  if (lib_handle) {
+    dlclose(lib_handle);
+  }
+  lib_handle = dlopen(fullpath.data(), RTLD_NOW | RTLD_LOCAL);
+  taco_uassert(lib_handle) << "Failed to load generated code";
+
+  return fullpath;
+}
+
 void Module::setSource(string source) {
   this->source << source;
   moduleFromUserSource = true;
 }
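Reconstructing the stream << calls in the DenseSharedMemory branch of the Allocate visitor above: for an 8-element-per-thread workspace in a 256-thread block, the emitted device code should look roughly like the following (hand-assembled from the visitor, not copied from real generator output). Note that each thread offsets into the block's shared array by threadIdx.x, unlike the pre-existing warp-scoped path, which offsets by warp ID:

```cpp
// Approximate shape of the code the Allocate visitor emits for a
// DenseSharedMemory workspace (8 elements per thread, 256 threads per block):
__shared__ double precomputed_ALL[2048];   // 8 * 256 elements for the block
double * precomputed = precomputed_ALL + threadIdx.x * 8;
```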
".cu"; + shims_file = prefix + "_shims.cpp"; + } + else { + cc = util::getFromEnv(target.compiler_env, target.compiler); + cflags = util::getFromEnv("TACO_CFLAGS", + "-O3 -ffast-math -std=c99") + " -shared -fPIC"; + file_ending = ".c"; + shims_file = ""; + } +#if USE_OPENMP + cflags += " -fopenmp"; +#endif + + string cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + shims_file + " " + + "-o " + fullpath + " -lm"; + + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + // use dlsym() to open the compiled library + if (lib_handle) { + dlclose(lib_handle); + } + lib_handle = dlopen(fullpath.data(), RTLD_NOW | RTLD_LOCAL); + taco_uassert(lib_handle) << "Failed to load generated code"; + + return fullpath; +} + void Module::setSource(string source) { this->source << source; moduleFromUserSource = true; diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index d73aae3dd..7b9c10bf5 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -1819,6 +1819,7 @@ struct TensorVar::Content { Type type; Format format; Schedule schedule; + GPUWorkspace gpuworkspace; }; TensorVar::TensorVar() : content(nullptr) { @@ -1840,11 +1841,16 @@ TensorVar::TensorVar(const Type& type, const Format& format) : TensorVar(util::uniqueName('A'), type, format) { } -TensorVar::TensorVar(const string& name, const Type& type, const Format& format) +TensorVar::TensorVar(const string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace) : content(new Content) { content->name = name; content->type = type; content->format = format; + content->gpuworkspace = gpuworkspace; +} + +GPUWorkspace TensorVar::getGPUWorkspace() { + return content->gpuworkspace; } std::string TensorVar::getName() const { diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 752416955..03ddba8b4 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -239,6 +239,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); IndexStmt producer = forall(iw, ws(iw) = replace(e, {{i,iw}})); + Where where(consumer, producer); stmt = where; diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp index dbe941fe6..60d7d24f6 100644 --- a/src/ir/ir.cpp +++ b/src/ir/ir.cpp @@ -241,7 +241,7 @@ bool Literal::equalsScalar(double scalar) const { return false; } -Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) { +Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, GPUWorkspace gpuworkspace) { Var *var = new Var; var->type = type; var->name = name; @@ -249,6 +249,7 @@ Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) { // TODO: is_ptr and is_tensor should be part of type var->is_ptr = is_ptr; var->is_tensor = is_tensor; + var->gpuworkspace = gpuworkspace; return var; } diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index a7155438c..122805776 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -4,4 +4,5 @@ namespace taco { const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char 
diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp
index acc1b11bf..890aee524 100644
--- a/src/lower/lowerer_impl.cpp
+++ b/src/lower/lowerer_impl.cpp
@@ -126,9 +126,18 @@ LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute)
   // Create variables for temporaries
   // TODO Remove this
   for (auto& temp : temporaries) {
-    ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(),
-                                   true, true);
-    tensorVars.insert({temp, irVar});
+    if (temp.getGPUWorkspace() != GPUWorkspace::None) {
+      ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(),
+                                     true, true, temp.getGPUWorkspace());
+      tensorVars.insert({temp, irVar});
+    }
+    else {
+      ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(),
+                                     true, true);
+      tensorVars.insert({temp, irVar});
+    }
   }
 
   // Create variables for keeping track of result values array capacity
@@ -1278,9 +1287,19 @@ Stmt LowererImpl::lowerWhere(Where where) {
   }
   else {
     if (generateComputeCode()) {
-      Expr values = ir::Var::make(temporary.getName(),
-                                  temporary.getType().getDataType(),
-                                  true, false);
+      Expr values;
+      if (temporary.getGPUWorkspace() != GPUWorkspace::None) {
+        values = ir::Var::make(temporary.getName(),
+                               temporary.getType().getDataType(),
+                               true, false, temporary.getGPUWorkspace());
+      }
+      else {
+        values = ir::Var::make(temporary.getName(),
+                               temporary.getType().getDataType(),
+                               true, false);
+      }
+
       taco_iassert(temporary.getType().getOrder() == 1); // TODO
       Dimension temporarySize = temporary.getType().getShape().getDimension(0);
       Expr size;
@@ -1305,6 +1324,9 @@ Stmt LowererImpl::lowerWhere(Where where) {
       Expr p = Var::make("p" + temporary.getName(), Int());
       Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType()));
+
+      // TODO: Should this zero-initialization loop exist at all for dense
+      // shared-memory precompute workspaces on GPUs?
      Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial);
 
       freeTemporary = Free::make(values);
diff --git a/src/tensor.cpp b/src/tensor.cpp
index bc400ec71..d1bbfdbd9 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -573,6 +573,10 @@ void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) {
   cacheComputeKernel(concretizedAssign, content->module);
 }
 
+void TensorBase::recompile(std::string file_path) {
+  content->module->recompile(file_path);
+}
+
 taco_tensor_t* TensorBase::getTacoTensorT() {
   return getStorage();
 }
@@ -701,6 +705,17 @@ void TensorBase::assemble() {
   }
 }
 
+void TensorBase::reassemble() {
+  auto arguments = packArguments(*this);
+  content->module->callFuncPacked("assemble", arguments.data());
+
+  if (!content->assembleWhileCompute) {
+    taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]);
+    content->valuesSize = unpackTensorData(*tensorData, *this);
+  }
+}
+
 void TensorBase::compute() {
   taco_uassert(!needsCompile()) << error::compute_without_compile;
   if (!needsCompute()) {
@@ -724,6 +739,17 @@ void TensorBase::compute() {
   }
 }
 
+void TensorBase::recompute() {
+  auto arguments = packArguments(*this);
+  this->content->module->callFuncPacked("compute", arguments.data());
+
+  if (content->assembleWhileCompute) {
+    taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]);
+    content->valuesSize = unpackTensorData(*tensorData, *this);
+  }
+}
+
 void TensorBase::evaluate() {
   this->compile();
   if (!getAssignment().getOperator().defined()) {
diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp
index 5be64a4ff..3b5b94c71 100644
--- a/test/tests-scheduling-eval.cpp
+++ b/test/tests-scheduling-eval.cpp
@@ -129,6 +129,24 @@ IndexStmt scheduleSpMVGPU(IndexStmt stmt, Tensor<double> A, IndexExpr precompute
       .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
 }
 
+IndexStmt scheduleSpMVGPU_dsm(IndexStmt stmt, Tensor<double> A, IndexExpr precomputedExpr, int NNZ_PER_THREAD=8, int BLOCK_SIZE=256) {
+  int NNZ_PER_WARP = NNZ_PER_THREAD * WARP_SIZE;
+  int NNZ_PER_TB = NNZ_PER_THREAD * BLOCK_SIZE;
+  IndexVar f("f"), fpos("fpos"), fpos1("fpos1"), fpos2("fpos2"), block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre");
+  TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, GPUWorkspace::DenseSharedMemory);
+  return stmt.fuse(i, j, f)
+          .pos(f, fpos, A(i, j))
+          .split(fpos, block, fpos1, NNZ_PER_TB)
+          .split(fpos1, warp, fpos2, NNZ_PER_WARP)
+          .split(fpos2, thread, thread_nz, NNZ_PER_THREAD)
+          .reorder({block, warp, thread, thread_nz})
+          .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed)
+          .unroll(thread_nz_pre, NNZ_PER_THREAD)
+          .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
+          .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
+          .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
+}
+
 IndexStmt scheduleSpMVRowsGPU(IndexStmt stmt, Tensor<double> A, IndexExpr precomputedExpr, int ROWS_PER_WARP=1, int BLOCK_SIZE=256) {
   int ROWS_PER_TB = ROWS_PER_WARP * BLOCK_SIZE;
   IndexVar block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), i1("i1"), jpos("jpos"), block_row("block_row"), warp_row("warp_row");
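For reference, the schedule's defaults pin down the shared-memory footprint that the codegen TODO alludes to: NNZ_PER_THREAD = 8 doubles per thread times BLOCK_SIZE = 256 threads gives 8 * 256 = 2048 workspace elements per block (the 2048 mentioned in printDeviceFuncCall), or 2048 * sizeof(double) = 16 KB per block, comfortably below the 48 KB per-block shared-memory default on most CUDA GPUs.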
@@ -245,6 +263,25 @@ IndexStmt scheduleTTVGPU(IndexStmt stmt, Tensor<double> B, IndexExpr precomputed
       .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
 }
 
+IndexStmt scheduleTTVGPU_dsm(IndexStmt stmt, Tensor<double> B, IndexExpr precomputedExpr, int NNZ_PER_WARP=8*32, int BLOCK_SIZE=256) {
+  int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
+  IndexVar jk("jk"), f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), fpos2("fpos2"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre");
+  TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, GPUWorkspace::DenseSharedMemory);
+
+  return stmt.fuse(j, k, jk)
+          .fuse(i, jk, f)
+          .pos(f, fpos, B(i,j,k))
+          .split(fpos, block, fpos1, NNZ_PER_TB)
+          .split(fpos1, warp, fpos2, NNZ_PER_WARP)
+          .split(fpos2, thread, thread_nz, NNZ_PER_WARP/WARP_SIZE)
+          .reorder({block, warp, thread, thread_nz})
+          .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed)
+          .unroll(thread_nz_pre, NNZ_PER_WARP/WARP_SIZE)
+          .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
+          .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
+          .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
+}
+
 IndexStmt scheduleMTTKRPGPU(IndexStmt stmt, Tensor<double> B, int NNZ_PER_WARP=16, int BLOCK_SIZE=256) {
   int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
   IndexVar kl("kl"), f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"), dense_val_unbounded("dense_val_unbounded"), dense_val("dense_val"), thread("thread");
@@ -797,6 +834,59 @@ TEST(scheduling_eval, spmvGPU) {
   ASSERT_TENSOR_EQ(expected, y);
 }
 
+TEST(scheduling_eval, spmvGPU_dsm) {
+  if (!should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 425;
+  int NUM_J = 425;
+  float SPARSITY = .19;
+  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+  Tensor<double> x("x", {NUM_J}, {Dense});
+  Tensor<double> y("y", {NUM_I}, {Dense});
+
+  srand(94353);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    x.insert({j}, (double)rand_float);
+  }
+
+  x.pack();
+  A.pack();
+
+  IndexExpr precomputed = A(i, j) * x(j);
+  y(i) = precomputed;
+
+  IndexStmt stmt = y.getAssignment().concretize();
+  stmt = scheduleSpMVGPU_dsm(stmt, A, precomputed);
+  //printToFile("spmv_gpu", stmt);
+
+  y.compile(stmt);
+  y.assemble();
+  y.compute();
+
+  // Example of using "recompile" to debug; the path is whatever taco reports
+  // as its temporary directory for this kernel:
+  // y.recompile("/tmp/taco_tmp_88888/xxxxxxx");
+  // y.reassemble();
+  // y.recompute();
+
+  Tensor<double> expected("expected", {NUM_I}, {Dense});
+  expected(i) = A(i, j) * x(j);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, y);
+}
+
 TEST(scheduling_eval, spmmGPU) {
   if (!should_use_CUDA_codegen()) {
     return;
   }
@@ -1065,6 +1155,58 @@ TEST(scheduling_eval, ttvGPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+TEST(scheduling_eval, ttvGPU_dsm) {
+  if (!should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> c("c", {NUM_K}, {Dense});
+
+  srand(353252);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    c.insert({k}, (double) ((int) (rand_float*3)));
+  }
+
+  B.pack();
+  c.pack();
+
+  IndexExpr precomputedExpr = B(i,j,k) * c(k);
+  A(i,j) = precomputedExpr;
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTVGPU_dsm(stmt, B, precomputedExpr);
+  //printToFile("ttv_gpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  expected(i,j) = B(i,j,k) * c(k);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
 TEST(scheduling_eval, mttkrpGPU) {
   if (!should_use_CUDA_codegen()) {
     return;