diff --git a/include/taco/codegen/module.h b/include/taco/codegen/module.h
index 788156fdb..3c26cc9f1 100644
--- a/include/taco/codegen/module.h
+++ b/include/taco/codegen/module.h
@@ -25,6 +25,12 @@ class Module {
   /// Compile the source into a library, returning its full path
   std::string compile();
+
+  /// Recompile a previously generated source file. This is a debugging aid:
+  /// given the path to the temporary file generated by taco, it compiles
+  /// that file into a library, which lets developers modify the generated
+  /// code and compile it again during development.
+  std::string recompile(std::string file_path);
 
   /// Compile the module into a source file located at the specified location
   /// path and prefix. The generated source will be path/prefix.{.c|.bc, .h}
diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h
index a96d56090..331a3c70b 100644
--- a/include/taco/index_notation/index_notation.h
+++ b/include/taco/index_notation/index_notation.h
@@ -850,7 +850,10 @@ class TensorVar : public util::Comparable<TensorVar> {
   TensorVar(const Type& type);
   TensorVar(const std::string& name, const Type& type);
   TensorVar(const Type& type, const Format& format);
-  TensorVar(const std::string& name, const Type& type, const Format& format);
+  TensorVar(const std::string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace=GPUWorkspace::None);
+
+  /// Returns the kind of GPU workspace this TensorVar represents; None by default.
+  GPUWorkspace getGPUWorkspace();
 
   /// Returns the name of the tensor variable.
   std::string getName() const;
diff --git a/include/taco/ir/ir.h b/include/taco/ir/ir.h
index 15dbdc7aa..5445f91ea 100644
--- a/include/taco/ir/ir.h
+++ b/include/taco/ir/ir.h
@@ -254,9 +254,10 @@ struct Var : public ExprNode<Var> {
   std::string name;
   bool is_ptr;
   bool is_tensor;
+  GPUWorkspace gpuworkspace;
 
   static Expr make(std::string name, Datatype type, bool is_ptr=false,
-                   bool is_tensor=false);
+                   bool is_tensor=false, GPUWorkspace gpuworkspace=GPUWorkspace::None);
 
   static const IRNodeType _type_info = IRNodeType::Var;
 };
diff --git a/include/taco/ir_tags.h b/include/taco/ir_tags.h
index 2b1a4a4f1..39ed25eb4 100644
--- a/include/taco/ir_tags.h
+++ b/include/taco/ir_tags.h
@@ -29,4 +29,10 @@ enum class BoundType {
 extern const char *BoundType_NAMES[];
+
+/* TODO: ir_tags.h may not be the right home for these GPU workspace tags. */
+enum class GPUWorkspace {
+  None, DenseSharedMemory
+};
+extern const char *GPUWorkspace_NAMES[];
 }
 
 #endif //TACO_IR_TAGS_H
diff --git a/include/taco/tensor.h b/include/taco/tensor.h
index 86ea4120c..3327eeced 100644
--- a/include/taco/tensor.h
+++ b/include/taco/tensor.h
@@ -413,12 +413,22 @@ class TensorBase {
 
   void compile(IndexStmt stmt, bool assembleWhileCompute=false);
 
+  /// Recompile the (possibly hand-edited) generated source at file_path;
+  /// a debugging aid that forwards to Module::recompile.
+  void recompile(std::string file_path);
+
   /// Assemble the tensor storage, including index and value arrays.
   void assemble();
 
+  /// Re-run the assemble kernel in the currently loaded module (see recompile).
+  void reassemble();
+
   /// Compute the given expression and put the values in the tensor storage.
   void compute();
 
+  /// Re-run the compute kernel in the currently loaded module (see recompile).
+  void recompute();
+
   /// Compile, assemble and compute as needed.
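Taken together, the declarations above wrap a debug loop around taco's usual compile/assemble/compute flow: compile once, hand-edit the generated source, then rebuild and rerun it without regenerating code. A minimal sketch of the intended usage, based only on the signatures in this diff; the temporary path is hypothetical (taco writes generated sources to a per-run directory under /tmp):

```cpp
// Sketch of the recompile debug workflow added by this patch.
// Assumes y(i) = A(i,j) * x(j) has already been defined and scheduled as stmt.
y.compile(stmt);   // generates kernel source in a temp dir and builds it
y.assemble();
y.compute();

// ... hand-edit the generated .cu/.c file under taco's temp directory ...

y.recompile("/tmp/taco_tmp_XXXX/kernel_prefix");  // hypothetical path, no extension
y.reassemble();    // rerun the freshly built assemble kernel
y.recompute();     // rerun the freshly built compute kernel
```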
  void evaluate();
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index 5eb57c7ad..fabc0d5ca 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -295,6 +295,8 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
   vector<pair<string, Expr>> warpIDVars;
   vector<Expr> numThreads;
   vector<Expr> numWarps;
+  vector<Expr> sizeSharedMemory;
+  std::string typeSharedMemory;
 
   CodeGen_CUDA *codeGen;
   // copy inputs and outputs into the map
@@ -359,6 +361,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
       threadFors.push_back(op);
       threadIDVars.push_back(pair<string, Expr>(scopeMap[op->var], op->var));
+
       Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment));
       numThreads.push_back(blockSize);
     }
@@ -378,6 +381,13 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
   }
 
   virtual void visit(const Var *op) {
+    if (op->gpuworkspace == GPUWorkspace::DenseSharedMemory) {
+      string elementType = printCUDAType(op->type, false);
+      typeSharedMemory = elementType;
+      // sizeSharedMemory.push_back(Mul::make(to()->num_elements, Literal::make(256)));
+    }
+
     if (scopeMap.count(op) == 0) {
       string name = codeGen->genUniqueName(op->name);
       if (!inDeviceFunction) {
@@ -580,7 +593,19 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
   gridSize.accept(this);
   stream << ", ";
   blockSize.accept(this);
-  stream << ">>>";
+
+  if (!usesshared) {
+    stream << ">>>";
+  }
+  else {
+    // TODO: verify this size expression; it should always equal
+    // num_threads * num_prec_elems * sizeof(prec_type),
+    // e.g. 2048 * sizeof(double) for the default SpMV schedule.
+    stream << ", ";
+    sizeofshared.accept(this);
+    stream << " * sizeof(" << typeofshared << ")>>>";
+  }
+
   stream << "(";
 
   string delimiter = "";
@@ -627,6 +656,9 @@ void CodeGen_CUDA::compile(Stmt stmt, bool isFirst) {
   parentParallelUnits = {};
   parallelUnitSizes = {};
   parallelUnitIDVars = {};
+  sizeofshared = Expr();
+  typeofshared = "";
+  usesshared = false;
   emittedTimerStartCode = false;
   isHostFunction = true;
   if (isFirst) {
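When a kernel uses a DenseSharedMemory workspace, printDeviceFuncCall above now emits CUDA's optional third launch-configuration argument, the dynamic shared-memory size in bytes. With the default SpMV schedule (2048 workspace doubles per block), the launch changes roughly as sketched below; the kernel and argument names are illustrative, not actual generator output:

```cpp
// Before: no shared-memory size in the launch configuration.
computeDeviceKernel0<<<gridSize, blockSize>>>(A, x, y);

// After: sizeofshared * sizeof(typeofshared) appended as the third argument.
computeDeviceKernel0<<<gridSize, blockSize, 2048 * sizeof(double)>>>(A, x, y);
```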
@@ -1021,51 +1053,103 @@ void CodeGen_CUDA::visit(const Max* op) {
 
 void CodeGen_CUDA::visit(const Allocate* op) {
   string elementType = printCUDAType(op->var.type(), false);
+
   if (!isHostFunction) {
-    if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
-      // double w_GPUThread[num];
-      // for threads allocate thread local memory
+    if (to<Var>(op->var)->gpuworkspace == GPUWorkspace::DenseSharedMemory) {
+      taco_iassert(!op->is_realloc);
       doIndent();
-      stream << elementType << " ";
+      stream << "__shared__ " << elementType << " ";
       op->var.accept(this);
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        stream << "_ALL";
+      }
       stream << "[";
-      op->num_elements.accept(this);
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        Expr numElements = Mul::make(op->num_elements,
+                                     parallelUnitSizes[ParallelUnit::GPUBlock]);
+        sizeofshared = numElements;
+        typeofshared = elementType;
+        usesshared = true;
+        ir::simplify(numElements).accept(this);
+      }
+      else {
+        op->num_elements.accept(this);
+      }
       stream << "];" << endl;
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        doIndent();
+        stream << elementType << " * ";
+        op->var.accept(this);
+        stream << " = ";
+        op->var.accept(this);
+        // Each thread gets its own num_elements-sized slice of the block's
+        // shared array, indexed by thread rather than by warp.
+        stream << "_ALL + threadIdx.x";
+        stream << " * ";
+        op->num_elements.accept(this);
+        stream << ";" << endl;
+      }
       return;
     }
-    // __shared__ double w_GPUThread[32]; if no warps
-    // __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps
-    // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32;
-    taco_iassert(!op->is_realloc);
-    doIndent();
-    stream << "__shared__ " << elementType << " ";
-    op->var.accept(this);
-    if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
-      stream << "_ALL";
-    }
-    stream << "[";
-    if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
-      Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp]));
-      ir::simplify(numElements).accept(this);
-    }
-    else {
-      op->num_elements.accept(this);
-    }
-    stream << "];" << endl;
-    if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+    else {
+      if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
+        // double w_GPUThread[num];
+        // for threads, allocate thread-local memory
+        doIndent();
+        stream << elementType << " ";
+        op->var.accept(this);
+        stream << "[";
+        op->num_elements.accept(this);
+        stream << "];" << endl;
+        return;
+      }
+      // __shared__ double w_GPUThread[32];                 if no warps
+      // __shared__ double w_GPUThread_ALL[32 * num_warps]; if warps
+      // double * w_GPUThread = w_GPUThread_ALL + warp_id * 32;
+      taco_iassert(!op->is_realloc);
       doIndent();
-      stream << elementType << " * ";
-      op->var.accept(this);
-
-      stream << " = ";
+      stream << "__shared__ " << elementType << " ";
       op->var.accept(this);
-      stream << "_ALL + ";
-      parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
-      stream << " * ";
-      parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
-      stream << ";" << endl;
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        stream << "_ALL";
+      }
+      stream << "[";
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp]));
+        ir::simplify(numElements).accept(this);
+      }
+      else {
+        op->num_elements.accept(this);
+      }
+      stream << "];" << endl;
+      if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
+        doIndent();
+        stream << elementType << " * ";
+        op->var.accept(this);
+        stream << " = ";
+        op->var.accept(this);
+        stream << "_ALL + ";
+        parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
+        stream << " * ";
+        parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
+        stream << ";" << endl;
+      }
+      return;
     }
-    return;
   }
+
   string variable_name;
   if (op->is_realloc) {
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index 2bc8e000d..654344b39 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -73,6 +73,13 @@ class CodeGen_CUDA : public CodeGen {
   std::map<ParallelUnit, Expr> parallelUnitSizes;
   std::map<ParallelUnit, Expr> parallelUnitIDVars;
 
+  // Dynamic shared-memory state for the current kernel, set while lowering
+  // a DenseSharedMemory workspace allocation.
+  Expr sizeofshared;
+  std::string typeofshared;
+  bool usesshared;
+
   bool emittedTimerStartCode = false;
 
   std::ostream &out;
diff --git a/src/codegen/module.cpp b/src/codegen/module.cpp
index fc52c409d..6c84bc56d 100644
--- a/src/codegen/module.cpp
+++ b/src/codegen/module.cpp
@@ -164,6 +164,51 @@ string Module::compile() {
   return fullpath;
 }
 
+string Module::recompile(string file_path) {
+  string prefix = file_path;
+  string fullpath = prefix + ".so";
+
+  string cc;
+  string cflags;
+  string file_ending;
+  string shims_file;
+  if (should_use_CUDA_codegen()) {
+    cc = "nvcc";
+    cflags = util::getFromEnv("TACO_NVCCFLAGS",
+                              get_default_CUDA_compiler_flags());
+    file_ending = ".cu";
+    shims_file = prefix + "_shims.cpp";
+  }
+  else {
+    cc = util::getFromEnv(target.compiler_env, target.compiler);
+    cflags = util::getFromEnv("TACO_CFLAGS",
+                              "-O3 -ffast-math -std=c99") + " -shared -fPIC";
+    file_ending = ".c";
+    shims_file = "";
+  }
+#if USE_OPENMP
+  cflags += " -fopenmp";
+#endif
+
+  string cmd = cc + " " + cflags + " " +
+               prefix + file_ending + " " + shims_file + " " +
+               "-o " + fullpath + " -lm";
+
+  // now compile it
+  int err = system(cmd.data());
+  taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
+                         << "\nreturned " << err;
+
+  // use dlopen() to load the recompiled library
+  if (lib_handle) {
+    dlclose(lib_handle);
+  }
+  lib_handle = dlopen(fullpath.data(), RTLD_NOW | RTLD_LOCAL);
+  taco_uassert(lib_handle) << "Failed to load generated code";
+
+  return fullpath;
+}
+
 void Module::setSource(string source) {
   this->source << source;
   moduleFromUserSource = true;
 }
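Reconstructing the stream << calls in the DenseSharedMemory branch of the Allocate visitor above: for an 8-element-per-thread workspace in a 256-thread block, the emitted device code should look roughly like the following (hand-assembled from the visitor, not copied from real generator output). Note that each thread offsets into the block's shared array by threadIdx.x, unlike the pre-existing warp-scoped path, which offsets by warp ID:

```cpp
// Approximate shape of the code the Allocate visitor emits for a
// DenseSharedMemory workspace (8 elements per thread, 256 threads per block):
__shared__ double precomputed_ALL[2048];   // 8 * 256 elements for the block
double * precomputed = precomputed_ALL + threadIdx.x * 8;
```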
".cu"; + shims_file = prefix + "_shims.cpp"; + } + else { + cc = util::getFromEnv(target.compiler_env, target.compiler); + cflags = util::getFromEnv("TACO_CFLAGS", + "-O3 -ffast-math -std=c99") + " -shared -fPIC"; + file_ending = ".c"; + shims_file = ""; + } +#if USE_OPENMP + cflags += " -fopenmp"; +#endif + + string cmd = cc + " " + cflags + " " + + prefix + file_ending + " " + shims_file + " " + + "-o " + fullpath + " -lm"; + + // now compile it + int err = system(cmd.data()); + taco_uassert(err == 0) << "Compilation command failed:\n" << cmd + << "\nreturned " << err; + + // use dlsym() to open the compiled library + if (lib_handle) { + dlclose(lib_handle); + } + lib_handle = dlopen(fullpath.data(), RTLD_NOW | RTLD_LOCAL); + taco_uassert(lib_handle) << "Failed to load generated code"; + + return fullpath; +} + void Module::setSource(string source) { this->source << source; moduleFromUserSource = true; diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index d73aae3dd..7b9c10bf5 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -1819,6 +1819,7 @@ struct TensorVar::Content { Type type; Format format; Schedule schedule; + GPUWorkspace gpuworkspace; }; TensorVar::TensorVar() : content(nullptr) { @@ -1840,11 +1841,16 @@ TensorVar::TensorVar(const Type& type, const Format& format) : TensorVar(util::uniqueName('A'), type, format) { } -TensorVar::TensorVar(const string& name, const Type& type, const Format& format) +TensorVar::TensorVar(const string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace) : content(new Content) { content->name = name; content->type = type; content->format = format; + content->gpuworkspace = gpuworkspace; +} + +GPUWorkspace TensorVar::getGPUWorkspace() { + return content->gpuworkspace; } std::string TensorVar::getName() const { diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index 752416955..03ddba8b4 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -239,6 +239,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const { IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}})); IndexStmt producer = forall(iw, ws(iw) = replace(e, {{i,iw}})); + Where where(consumer, producer); stmt = where; diff --git a/src/ir/ir.cpp b/src/ir/ir.cpp index dbe941fe6..60d7d24f6 100644 --- a/src/ir/ir.cpp +++ b/src/ir/ir.cpp @@ -241,7 +241,7 @@ bool Literal::equalsScalar(double scalar) const { return false; } -Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) { +Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, GPUWorkspace gpuworkspace) { Var *var = new Var; var->type = type; var->name = name; @@ -249,6 +249,7 @@ Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) { // TODO: is_ptr and is_tensor should be part of type var->is_ptr = is_ptr; var->is_tensor = is_tensor; + var->gpuworkspace = gpuworkspace; return var; } diff --git a/src/ir_tags.cpp b/src/ir_tags.cpp index a7155438c..122805776 100644 --- a/src/ir_tags.cpp +++ b/src/ir_tags.cpp @@ -4,4 +4,5 @@ namespace taco { const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"}; const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"}; const char 
diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp
index acc1b11bf..890aee524 100644
--- a/src/lower/lowerer_impl.cpp
+++ b/src/lower/lowerer_impl.cpp
@@ -126,9 +126,18 @@ LowererImpl::lower(IndexStmt stmt, string name, bool assemble, bool compute)
   // Create variables for temporaries
   // TODO Remove this
   for (auto& temp : temporaries) {
-    ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(),
-                                   true, true);
-    tensorVars.insert({temp, irVar});
+    if (temp.getGPUWorkspace() != GPUWorkspace::None) {
+      ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(),
+                                     true, true, temp.getGPUWorkspace());
+      tensorVars.insert({temp, irVar});
+    }
+    else {
+      ir::Expr irVar = ir::Var::make(temp.getName(), temp.getType().getDataType(),
+                                     true, true);
+      tensorVars.insert({temp, irVar});
+    }
   }
 
   // Create variables for keeping track of result values array capacity
@@ -1278,9 +1287,19 @@ Stmt LowererImpl::lowerWhere(Where where) {
   }
   else {
     if (generateComputeCode()) {
-      Expr values = ir::Var::make(temporary.getName(),
-                                  temporary.getType().getDataType(),
-                                  true, false);
+      Expr values;
+      if (temporary.getGPUWorkspace() != GPUWorkspace::None) {
+        values = ir::Var::make(temporary.getName(),
+                               temporary.getType().getDataType(),
+                               true, false, temporary.getGPUWorkspace());
+      }
+      else {
+        values = ir::Var::make(temporary.getName(),
+                               temporary.getType().getDataType(),
+                               true, false);
+      }
+
       taco_iassert(temporary.getType().getOrder() == 1); // TODO
       Dimension temporarySize = temporary.getType().getShape().getDimension(0);
       Expr size;
@@ -1305,6 +1324,9 @@ Stmt LowererImpl::lowerWhere(Where where) {
       Expr p = Var::make("p" + temporary.getName(), Int());
       Stmt zeroInit = Store::make(values, p, ir::Literal::zero(temporary.getType().getDataType()));
+
+      // TODO: Should this zero-initialization loop exist at all for dense
+      // shared-memory precompute workspaces on GPUs?
      Stmt zeroInitLoop = For::make(p, 0, size, 1, zeroInit, LoopKind::Serial);
 
       freeTemporary = Free::make(values);
diff --git a/src/tensor.cpp b/src/tensor.cpp
index bc400ec71..d1bbfdbd9 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -573,6 +573,10 @@ void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) {
   cacheComputeKernel(concretizedAssign, content->module);
 }
 
+void TensorBase::recompile(std::string file_path) {
+  content->module->recompile(file_path);
+}
+
 taco_tensor_t* TensorBase::getTacoTensorT() {
   return getStorage();
 }
@@ -701,6 +705,17 @@ void TensorBase::assemble() {
   }
 }
 
+void TensorBase::reassemble() {
+  auto arguments = packArguments(*this);
+  content->module->callFuncPacked("assemble", arguments.data());
+
+  if (!content->assembleWhileCompute) {
+    taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]);
+    content->valuesSize = unpackTensorData(*tensorData, *this);
+  }
+}
+
 void TensorBase::compute() {
   taco_uassert(!needsCompile()) << error::compute_without_compile;
   if (!needsCompute()) {
@@ -724,6 +739,17 @@ void TensorBase::compute() {
   }
 }
 
+void TensorBase::recompute() {
+  auto arguments = packArguments(*this);
+  this->content->module->callFuncPacked("compute", arguments.data());
+
+  if (content->assembleWhileCompute) {
+    taco_tensor_t* tensorData = ((taco_tensor_t*)arguments[0]);
+    content->valuesSize = unpackTensorData(*tensorData, *this);
+  }
+}
+
 void TensorBase::evaluate() {
   this->compile();
   if (!getAssignment().getOperator().defined()) {
diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp
index 5be64a4ff..3b5b94c71 100644
--- a/test/tests-scheduling-eval.cpp
+++ b/test/tests-scheduling-eval.cpp
@@ -129,6 +129,24 @@ IndexStmt scheduleSpMVGPU(IndexStmt stmt, Tensor<double> A, IndexExpr precompute
       .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
 }
 
+IndexStmt scheduleSpMVGPU_dsm(IndexStmt stmt, Tensor<double> A, IndexExpr precomputedExpr, int NNZ_PER_THREAD=8, int BLOCK_SIZE=256) {
+  int NNZ_PER_WARP = NNZ_PER_THREAD * WARP_SIZE;
+  int NNZ_PER_TB = NNZ_PER_THREAD * BLOCK_SIZE;
+  IndexVar f("f"), fpos("fpos"), fpos1("fpos1"), fpos2("fpos2"), block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre");
+  TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, GPUWorkspace::DenseSharedMemory);
+  return stmt.fuse(i, j, f)
+          .pos(f, fpos, A(i, j))
+          .split(fpos, block, fpos1, NNZ_PER_TB)
+          .split(fpos1, warp, fpos2, NNZ_PER_WARP)
+          .split(fpos2, thread, thread_nz, NNZ_PER_THREAD)
+          .reorder({block, warp, thread, thread_nz})
+          .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed)
+          .unroll(thread_nz_pre, NNZ_PER_THREAD)
+          .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
+          .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
+          .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
+}
+
 IndexStmt scheduleSpMVRowsGPU(IndexStmt stmt, Tensor<double> A, IndexExpr precomputedExpr, int ROWS_PER_WARP=1, int BLOCK_SIZE=256) {
   int ROWS_PER_TB = ROWS_PER_WARP * BLOCK_SIZE;
   IndexVar block("block"), warp("warp"), thread("thread"), thread_nz("thread_nz"), i1("i1"), jpos("jpos"), block_row("block_row"), warp_row("warp_row");
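For reference, the schedule's defaults pin down the shared-memory footprint that the codegen TODO alludes to: NNZ_PER_THREAD = 8 doubles per thread times BLOCK_SIZE = 256 threads gives 8 * 256 = 2048 workspace elements per block (the 2048 mentioned in printDeviceFuncCall), or 2048 * sizeof(double) = 16 KB per block, comfortably below the 48 KB per-block shared-memory default on most CUDA GPUs.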
@@ -245,6 +263,25 @@ IndexStmt scheduleTTVGPU(IndexStmt stmt, Tensor<double> B, IndexExpr precomputed
       .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
 }
 
+IndexStmt scheduleTTVGPU_dsm(IndexStmt stmt, Tensor<double> B, IndexExpr precomputedExpr, int NNZ_PER_WARP=8*32, int BLOCK_SIZE=256) {
+  int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
+  IndexVar jk("jk"), f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), fpos2("fpos2"), thread("thread"), thread_nz("thread_nz"), thread_nz_pre("thread_nz_pre");
+  TensorVar precomputed("precomputed", Type(Float64, {Dimension(thread_nz)}), taco::dense, GPUWorkspace::DenseSharedMemory);
+
+  return stmt.fuse(j, k, jk)
+          .fuse(i, jk, f)
+          .pos(f, fpos, B(i,j,k))
+          .split(fpos, block, fpos1, NNZ_PER_TB)
+          .split(fpos1, warp, fpos2, NNZ_PER_WARP)
+          .split(fpos2, thread, thread_nz, NNZ_PER_WARP/WARP_SIZE)
+          .reorder({block, warp, thread, thread_nz})
+          .precompute(precomputedExpr, thread_nz, thread_nz_pre, precomputed)
+          .unroll(thread_nz_pre, NNZ_PER_WARP/WARP_SIZE)
+          .parallelize(block, ParallelUnit::GPUBlock, OutputRaceStrategy::IgnoreRaces)
+          .parallelize(warp, ParallelUnit::GPUWarp, OutputRaceStrategy::IgnoreRaces)
+          .parallelize(thread, ParallelUnit::GPUThread, OutputRaceStrategy::Atomics);
+}
+
 IndexStmt scheduleMTTKRPGPU(IndexStmt stmt, Tensor<double> B, int NNZ_PER_WARP=16, int BLOCK_SIZE=256) {
   int NNZ_PER_TB = NNZ_PER_WARP * (BLOCK_SIZE / WARP_SIZE);
   IndexVar kl("kl"), f("f"), fpos("fpos"), block("block"), fpos1("fpos1"), warp("warp"), nnz("nnz"), dense_val_unbounded("dense_val_unbounded"), dense_val("dense_val"), thread("thread");
@@ -797,6 +834,59 @@ TEST(scheduling_eval, spmvGPU) {
   ASSERT_TENSOR_EQ(expected, y);
 }
 
+TEST(scheduling_eval, spmvGPU_dsm) {
+  if (!should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 425;
+  int NUM_J = 425;
+  float SPARSITY = .19;
+  Tensor<double> A("A", {NUM_I, NUM_J}, CSR);
+  Tensor<double> x("x", {NUM_J}, {Dense});
+  Tensor<double> y("y", {NUM_I}, {Dense});
+
+  srand(94353);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      float rand_float = (float)rand()/(float)(RAND_MAX);
+      if (rand_float < SPARSITY) {
+        A.insert({i, j}, (double) ((int) (rand_float * 3 / SPARSITY)));
+      }
+    }
+  }
+
+  for (int j = 0; j < NUM_J; j++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    x.insert({j}, (double)rand_float);
+  }
+
+  x.pack();
+  A.pack();
+
+  IndexExpr precomputed = A(i, j) * x(j);
+  y(i) = precomputed;
+
+  IndexStmt stmt = y.getAssignment().concretize();
+  stmt = scheduleSpMVGPU_dsm(stmt, A, precomputed);
+  //printToFile("spmv_gpu", stmt);
+
+  y.compile(stmt);
+  y.assemble();
+  y.compute();
+
+  // Example of using "recompile" to debug; the path is whatever taco reports
+  // as its temporary directory for this kernel:
+  // y.recompile("/tmp/taco_tmp_88888/xxxxxxx");
+  // y.reassemble();
+  // y.recompute();
+
+  Tensor<double> expected("expected", {NUM_I}, {Dense});
+  expected(i) = A(i, j) * x(j);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, y);
+}
+
 TEST(scheduling_eval, spmmGPU) {
   if (!should_use_CUDA_codegen()) {
     return;
   }
@@ -1065,6 +1155,58 @@ TEST(scheduling_eval, ttvGPU) {
   ASSERT_TENSOR_EQ(expected, A);
 }
 
+TEST(scheduling_eval, ttvGPU_dsm) {
+  if (!should_use_CUDA_codegen()) {
+    return;
+  }
+  int NUM_I = 1021/10;
+  int NUM_J = 1039/10;
+  int NUM_K = 1057/10;
+  float SPARSITY = .3;
+  Tensor<double> A("A", {NUM_I, NUM_J}, {Dense, Dense}); // TODO: change to sparse outputs
+  Tensor<double> B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse});
+  Tensor<double> c("c", {NUM_K}, {Dense});
+
+  srand(353252);
+  for (int i = 0; i < NUM_I; i++) {
+    for (int j = 0; j < NUM_J; j++) {
+      for (int k = 0; k < NUM_K; k++) {
+        float rand_float = (float) rand() / (float) (RAND_MAX);
+        if (rand_float < SPARSITY) {
+          B.insert({i, j, k}, (double) ((int) (rand_float * 3 / SPARSITY)));
+        }
+      }
+    }
+  }
+
+  for (int k = 0; k < NUM_K; k++) {
+    float rand_float = (float)rand()/(float)(RAND_MAX);
+    c.insert({k}, (double) ((int) (rand_float*3)));
+  }
+
+  B.pack();
+  c.pack();
+
+  IndexExpr precomputedExpr = B(i,j,k) * c(k);
+  A(i,j) = precomputedExpr;
+
+  IndexStmt stmt = A.getAssignment().concretize();
+  stmt = scheduleTTVGPU_dsm(stmt, B, precomputedExpr);
+  //printToFile("ttv_gpu", stmt);
+
+  A.compile(stmt);
+  A.assemble();
+  A.compute();
+
+  Tensor<double> expected("expected", {NUM_I, NUM_J}, {Dense, Dense});
+  expected(i,j) = B(i,j,k) * c(k);
+  expected.compile();
+  expected.assemble();
+  expected.compute();
+  ASSERT_TENSOR_EQ(expected, A);
+}
+
 TEST(scheduling_eval, mttkrpGPU) {
   if (!should_use_CUDA_codegen()) {
     return;