diff --git a/include/taco/format.h b/include/taco/format.h
index 29f6cbd70..ec1028d5e 100644
--- a/include/taco/format.h
+++ b/include/taco/format.h
@@ -95,7 +95,7 @@ class ModeFormat {
   /// Properties of a mode format
   enum Property {
     FULL, NOT_FULL, ORDERED, NOT_ORDERED, UNIQUE, NOT_UNIQUE, BRANCHLESS,
-    NOT_BRANCHLESS, COMPACT, NOT_COMPACT
+    NOT_BRANCHLESS, COMPACT, NOT_COMPACT, ZEROLESS, NOT_ZEROLESS
   };
 
   /// Instantiates an undefined mode format
@@ -126,6 +126,7 @@ class ModeFormat {
   bool isUnique() const;
   bool isBranchless() const;
   bool isCompact() const;
+  bool isZeroless() const;
 
   /// Returns true if a mode format has a specific capability, false otherwise
   bool hasCoordValIter() const;
diff --git a/include/taco/lower/iterator.h b/include/taco/lower/iterator.h
index 1d871ffaa..63609439e 100644
--- a/include/taco/lower/iterator.h
+++ b/include/taco/lower/iterator.h
@@ -69,6 +69,7 @@ class Iterator : public util::Comparable<Iterator> {
   bool isUnique() const;
   bool isBranchless() const;
   bool isCompact() const;
+  bool isZeroless() const;
 
   /// Capabilities supported by levels being iterated.
   bool hasCoordIter() const;
diff --git a/include/taco/lower/mode_format_compressed.h b/include/taco/lower/mode_format_compressed.h
index 62a4c173b..55f7c42a7 100644
--- a/include/taco/lower/mode_format_compressed.h
+++ b/include/taco/lower/mode_format_compressed.h
@@ -9,7 +9,7 @@ class CompressedModeFormat : public ModeFormatImpl {
 public:
   CompressedModeFormat();
   CompressedModeFormat(bool isFull, bool isOrdered,
-                       bool isUnique, long long allocSize = DEFAULT_ALLOC_SIZE);
+                       bool isUnique, bool isZeroless, long long allocSize = DEFAULT_ALLOC_SIZE);
 
   ~CompressedModeFormat() override {}
diff --git a/include/taco/lower/mode_format_dense.h b/include/taco/lower/mode_format_dense.h
index 3426d548c..b99c9b416 100644
--- a/include/taco/lower/mode_format_dense.h
+++ b/include/taco/lower/mode_format_dense.h
@@ -8,7 +8,7 @@ namespace taco {
 class DenseModeFormat : public ModeFormatImpl {
 public:
   DenseModeFormat();
-  DenseModeFormat(const bool isOrdered, const bool isUnique);
+  DenseModeFormat(const bool isOrdered, const bool isUnique, const bool isZeroless);
 
   ~DenseModeFormat() override {}
diff --git a/include/taco/lower/mode_format_impl.h b/include/taco/lower/mode_format_impl.h
index a0ac74fce..6d4254bd6 100644
--- a/include/taco/lower/mode_format_impl.h
+++ b/include/taco/lower/mode_format_impl.h
@@ -59,7 +59,7 @@ std::ostream& operator<<(std::ostream&, const ModeFunction&);
 class ModeFormatImpl {
 public:
   ModeFormatImpl(std::string name, bool isFull, bool isOrdered, bool isUnique,
-                 bool isBranchless, bool isCompact, bool hasCoordValIter,
+                 bool isBranchless, bool isCompact, bool isZeroless, bool hasCoordValIter,
                  bool hasCoordPosIter, bool hasLocate, bool hasInsert,
                  bool hasAppend);
@@ -162,6 +162,7 @@ class ModeFormatImpl {
   const bool isUnique;
   const bool isBranchless;
   const bool isCompact;
+  const bool isZeroless;
 
   const bool hasCoordValIter;
   const bool hasCoordPosIter;
diff --git a/include/taco/lower/mode_format_singleton.h b/include/taco/lower/mode_format_singleton.h
index a370478dd..9122a66a4 100644
--- a/include/taco/lower/mode_format_singleton.h
+++ b/include/taco/lower/mode_format_singleton.h
@@ -9,7 +9,7 @@ class SingletonModeFormat : public ModeFormatImpl {
 public:
   SingletonModeFormat();
   SingletonModeFormat(bool isFull, bool isOrdered,
-                      bool isUnique, long long allocSize = DEFAULT_ALLOC_SIZE);
+                      bool isUnique, bool isZeroless, long long allocSize = DEFAULT_ALLOC_SIZE);
 
   ~SingletonModeFormat() override {}
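For context, the new ZEROLESS property composes with the existing property-based `ModeFormat` API. A minimal sketch of declaring a CSR-like format whose compressed mode is marked zeroless (the name `csrZeroless` is illustrative, not part of this patch):

```cpp
// Sketch: a CSR variant whose column mode is declared ZEROLESS, i.e. it
// stores no explicit zeros. Assumes the ModeFormat::operator() property
// interface that taco already exposes for ORDERED/UNIQUE/etc.
#include "taco/format.h"

using namespace taco;

int main() {
  Format csrZeroless({Dense, Compressed({ModeFormat::ZEROLESS})});
  return 0;
}
```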
diff --git a/src/codegen/codegen.cpp b/src/codegen/codegen.cpp
index f0c09d98a..1154d0acc 100644
--- a/src/codegen/codegen.cpp
+++ b/src/codegen/codegen.cpp
@@ -230,16 +230,47 @@ string CodeGen::printTensorProperty(string varname, const GetProperty* op, bool
 }
 
 string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
-                                     bool is_output_prop) {
+                                     bool is_output_prop, int flag, string output_tensor) {
   stringstream ret;
   ret << "  ";
 
   auto tensor = op->tensor.as<Var>();
   if (op->property == TensorProperty::Values) {
     // for the values, it's in the last slot
-    ret << printType(tensor->type, true);
-    ret << " " << restrictKeyword() << " " << varname << " = (" << printType(tensor->type, true) << ")(";
-    ret << tensor->name << "->vals);\n";
+    switch(flag) {
+      case PRINT_FUNC:
+        ret << printType(tensor->type, true);
+        ret << " " << restrictKeyword() << " " << varname << " = (" << printType(tensor->type, true) << ")(";
+        ret << tensor->name << "->vals);\n";
+        break;
+      case PRINT_MEM_HOST_TO_DEV:
+        ret << "gpuErrchk(cudaMalloc((void **)&";
+        ret << tensor->name << "_dev" << "->vals, ";
+        ret << "malloc_usable_size(";
+        ret << tensor->name << "->vals)));\n";
+
+        ret << "  ";
+        ret << "cudaMemcpy(";
+        ret << tensor->name << "_dev" << "->vals, ";
+        ret << tensor->name << "->vals, ";
+        ret << "malloc_usable_size(";
+        ret << tensor->name << "->vals), ";
+        ret << "cudaMemcpyHostToDevice);\n";
+        break;
+      case PRINT_MEM_DEV_TO_HOST:
+        if(output_tensor == tensor->name) {
+          ret << "cudaMemcpy(";
+          ret << tensor->name << "->vals, ";
+          ret << tensor->name << "_dev->vals, ";
+          ret << "malloc_usable_size(";
+          ret << tensor->name << "->vals), ";
+          ret << "cudaMemcpyDeviceToHost);\n";
+          ret << "  ";
+        }
+        ret << "cudaFree(";
+        ret << tensor->name << "_dev" << "->vals);\n";
+        break;
+    }
     return ret.str();
   } else if (op->property == TensorProperty::ValuesSize) {
     ret << "int " << varname << " = " << tensor->name << "->vals_size;\n";
@@ -252,18 +283,54 @@ string CodeGen::unpackTensorProperty(string varname, const GetProperty* op,
   // for a Fixed level, ptr is an int
   // all others are int*
   if (op->property == TensorProperty::Dimension) {
-    tp = "int";
-    ret << tp << " " << varname << " = (int)(" << tensor->name
-        << "->dimensions[" << op->mode << "]);\n";
+    switch(flag) {
+      case PRINT_FUNC:
+        tp = "int";
+        ret << tp << " " << varname << " = (int)(" << tensor->name
+            << "->dimensions[" << op->mode << "]);\n";
+        break;
+      case PRINT_MEM_HOST_TO_DEV:
+        ret << tensor->name << "_dev->dimensions[" << op->mode << "] = " << tensor->name << "->dimensions[" << op->mode << "];\n";
+        break;
+    }
   } else {
     taco_iassert(op->property == TensorProperty::Indices);
     tp = "int*";
     auto nm = op->index;
-    ret << tp << " " << restrictKeyword() << " " << varname << " = ";
-    ret << "(int*)(" << tensor->name << "->indices[" << op->mode;
-    ret << "][" << nm << "]);\n";
+    switch(flag) {
+      case PRINT_FUNC:
+        ret << tp << " " << restrictKeyword() << " " << varname << " = ";
+        ret << "(int*)(" << tensor->name << "->indices[" << op->mode;
+        ret << "][" << nm << "]);\n";
+        break;
+      case PRINT_MEM_HOST_TO_DEV:
+        ret << "gpuErrchk(cudaMalloc((void **)&";
+        ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "], ";
+        ret << "malloc_usable_size(";
+        ret << tensor->name << "->indices[" << op->mode << "][" << nm << "])));\n";
+
+        ret << "  ";
+        ret << "cudaMemcpy(";
+        ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "], ";
+        ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], ";
+        ret << "malloc_usable_size(";
+        ret << tensor->name << "->indices[" << op->mode << "][" << nm << "]), ";
+        ret << "cudaMemcpyHostToDevice);\n";
+        break;
+      case PRINT_MEM_DEV_TO_HOST:
+        if(output_tensor == tensor->name) {
+          ret << "cudaMemcpy(";
+          ret << tensor->name << "->indices[" << op->mode << "][" << nm << "], ";
+          ret << tensor->name << "_dev->indices[" << op->mode << "][" << nm << "], ";
+          ret << "malloc_usable_size(";
+          ret << tensor->name << "->indices[" << op->mode << "][" << nm << "]), ";
+          ret << "cudaMemcpyDeviceToHost);\n";
+        }
+        ret << "cudaFree(";
+        ret << tensor->name << "_dev" << "->indices[" << op->mode << "][" << nm << "]);\n";
+        break;
+    }
   }
-
   return ret.str();
 }
@@ -312,7 +379,7 @@ string CodeGen::pointTensorProperty(std::string varname) {
 
 // helper to print declarations
 string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
-                           vector<Expr> inputs, vector<Expr> outputs) {
+                           vector<Expr> inputs, vector<Expr> outputs, int flag, string output_tensor) {
   stringstream ret;
   unordered_set<string> propsAlreadyGenerated;
@@ -367,7 +434,7 @@ string CodeGen::printDecls(map<Expr, string, ExprCompare> varMap,
         break;
       }
     } else {
-      ret << unpackTensorProperty(varMap[prop], prop, isOutputProp);
+      ret << unpackTensorProperty(varMap[prop], prop, isOutputProp, flag, output_tensor);
     }
     propsAlreadyGenerated.insert(varMap[prop]);
   }
diff --git a/src/codegen/codegen.h b/src/codegen/codegen.h
index cc25c80d6..af501ca82 100644
--- a/src/codegen/codegen.h
+++ b/src/codegen/codegen.h
@@ -1,6 +1,11 @@
 #ifndef TACO_CODEGEN_H
 #define TACO_CODEGEN_H
 
+//#define PRINT_FUNC 0
+//#define PRINT_MEM_HOST_TO_DEV 1
+//#define PRINT_MEM_DEV_TO_HOST 2
+enum func_selector {PRINT_FUNC, PRINT_MEM_HOST_TO_DEV, PRINT_MEM_DEV_TO_HOST};
+
 #include <memory>
 #include "taco/ir/ir.h"
 #include "taco/ir/ir_printer.h"
@@ -43,7 +48,7 @@ class CodeGen : public IRPrinter {
                                  std::vector<Expr> localVars, int labels,
                                  std::string funcName);
   std::string printDecls(std::map<Expr, std::string, ExprCompare> varMap,
-                         std::vector<Expr> inputs, std::vector<Expr> outputs);
+                         std::vector<Expr> inputs, std::vector<Expr> outputs, int flag, std::string output_tensor);
   std::string printPack(std::map<std::tuple<Expr, TensorProperty, int, int>, std::string> outputProperties,
                         std::vector<Expr> outputs);
   std::string printCoroutineFinish(int numYields, std::string funcName);
@@ -63,7 +68,7 @@ class CodeGen : public IRPrinter {
   std::string printTensorProperty(std::string varname, const GetProperty* op, bool is_ptr);
   std::string unpackTensorProperty(std::string varname, const GetProperty* op,
-                                   bool is_output_prop);
+                                   bool is_output_prop, int flag, std::string output_tensor);
   std::string packTensorProperty(std::string varname, Expr tnsr, TensorProperty property,
                                  int mode, int index);
   std::string pointTensorProperty(std::string varname);
diff --git a/src/codegen/codegen_c.cpp b/src/codegen/codegen_c.cpp
index 204aa1e2d..d9cb7bdcf 100644
--- a/src/codegen/codegen_c.cpp
+++ b/src/codegen/codegen_c.cpp
@@ -290,7 +290,7 @@ void CodeGen_C::visit(const Function* func) {
   localVars = varFinder.localVars;
 
   // Print variable declarations
-  out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
+  out << printDecls(varFinder.varDecls, func->inputs, func->outputs, PRINT_FUNC, "") << endl;
 
   if (emittingCoroutine) {
     out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
diff --git a/src/codegen/codegen_cuda.cpp b/src/codegen/codegen_cuda.cpp
index 5eb57c7ad..d31717241 100644
--- a/src/codegen/codegen_cuda.cpp
+++ b/src/codegen/codegen_cuda.cpp
@@ -34,6 +34,7 @@ const string cHeaders =
   "#include <stdio.h>\n"
   "#include <stdlib.h>\n"
   "#include <stdint.h>\n"
+  "#include <malloc.h>\n"
   "#include <math.h>\n"
   "#define TACO_MIN(_a,_b) ((_a) < (_b) ? (_a) : (_b))\n"
   "#define TACO_MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))\n"
@@ -157,6 +158,48 @@ const string gpuAssertMacro =
   "  }\n"
   "}\n";
 
+const string tensor_allocation =
+"taco_tensor_t* init_taco_tensor_t(int32_t order, int32_t csize,\n"
+"                                  int32_t* dimensions, int32_t* modeOrdering,\n"
+"                                  taco_mode_t* mode_types) {\n"
+"  taco_tensor_t* t = (taco_tensor_t *) malloc(sizeof(taco_tensor_t));\n"
+"  t->order = order;\n"
+"  t->dimensions = (int32_t *) malloc(order * sizeof(int32_t));\n"
+"  t->mode_ordering = (int32_t *) malloc(order * sizeof(int32_t));\n"
+"  t->mode_types = (taco_mode_t *) malloc(order * sizeof(taco_mode_t));\n"
+"  t->indices = (uint8_t ***) malloc(order * sizeof(uint8_t***));\n"
+"  t->csize = csize;\n"
+"\n"
+"  for (int32_t i = 0; i < order; i++) {\n"
+"    t->dimensions[i] = dimensions[i];\n"
+"    t->mode_ordering[i] = modeOrdering[i];\n"
+"    t->mode_types[i] = mode_types[i];\n"
+"    switch (t->mode_types[i]) {\n"
+"      case taco_mode_dense:\n"
+"        t->indices[i] = (uint8_t **) malloc(1 * sizeof(uint8_t **));\n"
+"        break;\n"
+"      case taco_mode_sparse:\n"
+"        t->indices[i] = (uint8_t **) malloc(2 * sizeof(uint8_t **));\n"
+"        break;\n"
+"    }\n"
+"  }\n"
+"  return t;\n"
+"}\n"
+"\n"
+"void deinit_taco_tensor_t(taco_tensor_t* t) {\n"
+"  for (int i = 0; i < t->order; i++) {\n"
+"    free(t->indices[i]);\n"
+"  }\n"
+"  free(t->indices);\n"
+"\n"
+"  free(t->dimensions);\n"
+"  free(t->mode_ordering);\n"
+"  free(t->mode_types);\n"
+"  free(t);\n"
+"}\n";
+
+
 const std::string blue="\033[38;5;67m";
 const std::string nc="\033[0m";
 }  // anonymous namespace
@@ -282,6 +325,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
   vector<Stmt> threadFors; // contents is device function
   vector<Stmt> warpFors;
   map<Expr, string, ExprCompare> scopeMap;
+  string output_tensor;
 
   // the variables to pass to each device function
   vector<vector<pair<string, Expr>>> functionParameters;
@@ -312,7 +356,8 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
       taco_iassert(var) << "Outputs must be vars in codegen";
       taco_iassert(scopeMap.count(var) == 0) << "Duplicate output found in codegen";
-
+      taco_iassert(outputs.size() == 1) << "The number of outputs should be 1";
+      output_tensor = var->name;  // Isn't there only one output?
       scopeMap[var] = var->name;
     }
   }
@@ -436,10 +481,20 @@ Stmt CodeGen_CUDA::simplifyFunctionBodies(Stmt stmt) {
   return FunctionBodySimplifier().rewrite(stmt);
 }
 
-string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> currentParameters, int index) {
+string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> currentParameters, int index, int flag) {
   stringstream ret;
-  ret << "__global__" << endl;
-  ret << "void " << funcName << "DeviceKernel" << index << "(";
+  switch(flag) {
+    case PRINT_FUNC:
+      ret << "__global__" << endl;
+      ret << "void " << funcName << "DeviceKernel" << index << "(";
+      break;
+    case PRINT_MEM_HOST_TO_DEV:
+      ret << "void " << funcName << "MemcpyHostToDev" << index << "(";
+      break;
+    case PRINT_MEM_DEV_TO_HOST:
+      ret << "void " << funcName << "MemcpyDevToHost" << index << "(";
+      break;
+  }
 
   string delimiter = "";
   for (size_t i=0; i<currentParameters.size(); i++) {
@@ -459,8 +514,33 @@ string CodeGen_CUDA::printDeviceFuncName(const vector<pair<string, Expr>> curren
     // No non-tensor parameters
     delimiter = ", ";
   }
+  if(flag == PRINT_MEM_HOST_TO_DEV || flag == PRINT_MEM_DEV_TO_HOST) {
+    ret << ", ";
+    string delimiter = "";
+    for (size_t i=0; i<currentParameters.size(); i++) {
+      auto var = currentParameters[i].second.as<Var>();
+      taco_iassert(var) << "Unable to convert output " << currentParameters[i].second
+                        << " to Var";
+      string varName = currentParameters[i].first;
+
+      if (var->is_tensor) {
+        ret << delimiter << "taco_tensor_t * __restrict__ " << varName << "_dev";
+      }
+      else {
+        auto tp = printCUDAType(var->type, var->is_ptr);
+        ret << delimiter << tp << " ";
+        if (!var->is_ptr) {
+          ret << "&";
+        }
+        ret << var->name;
+      }
+      // No non-tensor parameters
+      delimiter = ", ";
+    }
+  }
   ret << ")";
+
   return ret.str();
 }
@@ -574,7 +654,36 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
     emittedTimerStartCode = true;
   }
 
+  // for malloc
+  string delimiter = "";
+  for (size_t i=0; i<currentParameters.size(); i++) {
+    taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
+        << " to Var";
+    string varName = currentParameters[i].first;
+    //stream << "taco_tensor_t *" << varName << "_dev = (taco_tensor_t *)malloc(sizeof(taco_tensor_t *));\n";
+    stream << "taco_tensor_t *" << varName << "_dev = init_taco_tensor_t(" << varName << "->order, " << varName << "->csize, " << varName << "->dimensions, "
+           << varName << "->mode_ordering, " << varName << "->mode_types);\n";
+    doIndent();
+  }
+
+  // for MemcpyHostToDev
+  stream << funcName << "MemcpyHostToDev" << index << "(";
+  for (size_t l=0; l<2; l++) {
+    for (size_t i=0; i<currentParameters.size(); i++) {
+      taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
+          << " to Var";
+      string varName = currentParameters[i].first;
+      stream << delimiter << varName;
+      if(l == 1) stream << "_dev";
+
+      delimiter = ", ";
+    }
+  }
+  stream << ");\n\n";
+  doIndent();
+
+  // for DeviceKernel
   stream << funcName << "DeviceKernel" << index << "<<<";
   gridSize = ir::simplify(gridSize);
   gridSize.accept(this);
@@ -583,7 +692,7 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
   stream << ">>>";
   stream << "(";
 
-  string delimiter = "";
+  delimiter = "";
   for (size_t i=0; i<currentParameters.size(); i++) {
     taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
         << " to Var";
@@ -605,8 +714,32 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
     stream << "cudaEventElapsedTime(&tot_ms, event1, event2);\n";
   }
   doIndent();
-  stream << "cudaDeviceSynchronize();\n";
+  stream << "cudaDeviceSynchronize();\n\n";
+
+  // for MemcpyDevToHost
+  doIndent();
+  stream << funcName << "MemcpyDevToHost" << index << "(";
+  delimiter = "";
+  for (size_t l=0; l<2; l++) {
+    for (size_t i=0; i<currentParameters.size(); i++) {
+      taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
+          << " to Var";
+      string varName = currentParameters[i].first;
+      stream << delimiter << varName;
+      if(l == 1) stream << "_dev";
+
+      delimiter = ", ";
+    }
+  }
+  stream << ");\n";
+
+  // for free
+  for (size_t i=0; i<currentParameters.size(); i++) {
+    taco_iassert(currentParameters[i].second.as<Var>()) << "Unable to convert output " << currentParameters[i].second
+        << " to Var";
+    string varName = currentParameters[i].first;
+    doIndent();
+
+    stream << "deinit_taco_tensor_t(" << varName << "_dev);\n";
+    //stream << "free(" << varName << "_dev);\n";
+  }
 }
@@ -634,6 +767,7 @@ void CodeGen_CUDA::compile(Stmt stmt, bool isFirst) {
     out << cHeaders;
     if (outputKind == ImplementationGen) {
       out << endl << gpuAssertMacro;
+      out << endl << tensor_allocation;
     }
   }
   out << endl;
@@ -679,12 +813,6 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) {
       }
     }
 
-    // Generate device function header
-    doIndent();
-    out << printDeviceFuncName(parameters, i);
-    out << "{\n";
-    indent++;
-
     // Generate device function code
     resetUniqueNameCounters();
     vector<Expr> inputs;
@@ -710,8 +838,35 @@ void CodeGen_CUDA::printDeviceFunctions(const Function* func) {
     blockloop->accept(&varFinder);
     varMap = varFinder.varMap;
 
+
+    // Print MemcpyHostToDev function
+    out << printDeviceFuncName(parameters, i, PRINT_MEM_HOST_TO_DEV);
+    out << "{\n";
+    indent++;
+    out << printDecls(varFinder.varDecls, inputs, {}, PRINT_MEM_HOST_TO_DEV, deviceFunctionCollector.output_tensor) << endl;
+    indent--;
+    doIndent();
+    out << "}\n\n";
+
+    // Print MemcpyDevToHost function
+    out << printDeviceFuncName(parameters, i, PRINT_MEM_DEV_TO_HOST);
+    out << "{\n";
+    indent++;
+    out << printDecls(varFinder.varDecls, inputs, {}, PRINT_MEM_DEV_TO_HOST, deviceFunctionCollector.output_tensor) << endl;
+    indent--;
+    doIndent();
+    out << "}\n\n";
+
+    // Generate device function header
+    doIndent();
+    out << printDeviceFuncName(parameters, i, PRINT_FUNC);
+    out << "{\n";
+    indent++;
+
     // Print variable declarations
-    out << printDecls(varFinder.varDecls, inputs, {}) << endl;
+    out << printDecls(varFinder.varDecls, inputs, {}, PRINT_FUNC, deviceFunctionCollector.output_tensor) << endl;
     doIndent();
     printBlockIDVariable(deviceFunctionCollector.blockIDVars[i], blockloop->start, blockloop->increment);
     doIndent();
@@ -779,7 +934,7 @@ void CodeGen_CUDA::visit(const Function* func) {
   localVars = varFinder.localVars;
 
   // Print variable declarations
-  out << printDecls(varFinder.varDecls, func->inputs, func->outputs) << endl;
+  out << printDecls(varFinder.varDecls, func->inputs, func->outputs, PRINT_FUNC, "") << endl;
 
   if (emittingCoroutine) {
     out << printContextDeclAndInit(varMap, localVars, numYields, func->name)
@@ -1082,6 +1237,8 @@ void CodeGen_CUDA::visit(const Allocate* op) {
   }
   doIndent();
+  //stream << "gpuErrchk(cudaMalloc((void**)&";
+
   stream << "gpuErrchk(cudaMallocManaged((void**)&";
   if (op->is_realloc) {
     stream << variable_name;
@@ -1097,6 +1254,21 @@ void CodeGen_CUDA::visit(const Allocate* op) {
   parentPrecedence = TOP;
   stream << "));" << endl;
 
+  /*
+  if (op->is_realloc) {
+    stream << variable_name;
+  }
+  else {
+    op->var.accept(this);
+  }
+  stream << "= (" << elementType << "*)malloc(";
+  stream << "sizeof(" << elementType << ")";
+  stream << " * ";
+  parentPrecedence = MUL;
+  op->num_elements.accept(this);
+  parentPrecedence = TOP;
+  stream << ");" << endl;*/
+
   if(op->is_realloc) {
     doIndent();
     stream << "memcpy(" << variable_name << ", ";
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index 2bc8e000d..53f116ebd 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -48,7 +48,7 @@ class CodeGen_CUDA : public CodeGen {
   void visit(const Assign*);
   void visit(const Break*);
   void visit(const Free* op);
-  std::string printDeviceFuncName(const std::vector<std::pair<std::string, Expr>> currentParameters, int index);
+  std::string printDeviceFuncName(const std::vector<std::pair<std::string, Expr>> currentParameters, int index, int flag);
   void printDeviceFuncCall(const std::vector<std::pair<std::string, Expr>> currentParameters, Expr blockSize, int index, Expr gridSize);
   void printThreadIDVariable(std::pair<std::string, Expr> threadIDVar, Expr start, Expr increment, Expr numThreads);
   void printBlockIDVariable(std::pair<std::string, Expr> blockIDVar, Expr start, Expr increment);
diff --git a/src/format.cpp b/src/format.cpp
index e1a7b4444..27a4abfb5 100644
--- a/src/format.cpp
+++ b/src/format.cpp
@@ -182,6 +182,11 @@ bool ModeFormat::hasProperties(const std::vector<Property>& properties) const {
           return false;
         }
         break;
+      case ZEROLESS:
+        if (!isZeroless()) {
+          return false;
+        }
+        break;
       case NOT_FULL:
         if (isFull()) {
           return false;
@@ -207,6 +212,11 @@ bool ModeFormat::hasProperties(const std::vector<Property>& properties) const {
           return false;
         }
         break;
+      case NOT_ZEROLESS:
+        if (isZeroless()) {
+          return false;
+        }
+        break;
     }
   }
   return true;
@@ -237,6 +247,11 @@ bool ModeFormat::isCompact() const {
   return impl->isCompact;
 }
 
+bool ModeFormat::isZeroless() const {
+  taco_iassert(defined());
+  return impl->isZeroless;
+}
+
 bool ModeFormat::hasCoordValIter() const {
   taco_iassert(defined());
   return impl->hasCoordValIter;
diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp
index 4d1ebd644..f5ddc31e4 100644
--- a/src/lower/iterator.cpp
+++ b/src/lower/iterator.cpp
@@ -191,6 +191,12 @@ bool Iterator::isCompact() const {
   return getMode().defined() && getMode().getModeFormat().isCompact();
 }
 
+bool Iterator::isZeroless() const {
+  taco_iassert(defined());
+  if (isDimensionIterator()) return false;
+  return getMode().defined() && getMode().getModeFormat().isZeroless();
+}
+
 bool Iterator::hasCoordIter() const {
   taco_iassert(defined());
   if (isDimensionIterator()) return false;
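A quick sketch of how the new property surfaces through the public query API (illustrative usage, not part of the patch):

```cpp
#include "taco/format.h"

using namespace taco;

int main() {
  // Derive a zeroless compressed mode and query it. COMPACT already holds
  // for compressed modes; ZEROLESS is the property added by this patch.
  ModeFormat zc = Compressed({ModeFormat::ZEROLESS});
  bool ok = zc.isZeroless() &&
            zc.hasProperties({ModeFormat::ZEROLESS, ModeFormat::COMPACT});
  return ok ? 0 : 1;
}
```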
diff --git a/src/lower/lowerer_impl.cpp b/src/lower/lowerer_impl.cpp
index 7a0af13b3..1186495b2 100644
--- a/src/lower/lowerer_impl.cpp
+++ b/src/lower/lowerer_impl.cpp
@@ -1384,9 +1384,17 @@ Expr LowererImpl::lowerAccess(Access access) {
     return getTensorVar(var);
   }
 
-  return getIterators(access).back().isUnique()
-         ? Load::make(getValuesArray(var), generateValueLocExpr(access))
-         : getReducedValueVar(access);
+
+  if (getIterators(access).back().isUnique()) {
+    if (var.getType().getDataType() == Datatype::Bool && getIterators(access).back().isZeroless()) {
+      return true;
+    } else {
+      return Load::make(getValuesArray(var), generateValueLocExpr(access));
+    }
+  } else {
+    return getReducedValueVar(access);
+  }
+
 }
diff --git a/src/lower/mode_format_compressed.cpp b/src/lower/mode_format_compressed.cpp
index 366b34e8d..19679553e 100644
--- a/src/lower/mode_format_compressed.cpp
+++ b/src/lower/mode_format_compressed.cpp
@@ -10,12 +10,12 @@ using namespace taco::ir;
 namespace taco {
 
 CompressedModeFormat::CompressedModeFormat() :
-    CompressedModeFormat(false, true, true) {
+    CompressedModeFormat(false, true, true, false) {
 }
 
 CompressedModeFormat::CompressedModeFormat(bool isFull, bool isOrdered,
-                                           bool isUnique, long long allocSize) :
-    ModeFormatImpl("compressed", isFull, isOrdered, isUnique, false, true,
+                                           bool isUnique, bool isZeroless, long long allocSize) :
+    ModeFormatImpl("compressed", isFull, isOrdered, isUnique, false, true, isZeroless,
                    false, true, false, false, true),
     allocSize(allocSize) {
 }
@@ -25,6 +25,7 @@ ModeFormat CompressedModeFormat::copy(
   bool isFull = this->isFull;
   bool isOrdered = this->isOrdered;
   bool isUnique = this->isUnique;
+  bool isZeroless = this->isZeroless;
   for (const auto property : properties) {
     switch (property) {
       case ModeFormat::FULL:
@@ -45,12 +46,18 @@ ModeFormat CompressedModeFormat::copy(
       case ModeFormat::NOT_UNIQUE:
         isUnique = false;
         break;
+      case ModeFormat::ZEROLESS:
+        isZeroless = true;
+        break;
+      case ModeFormat::NOT_ZEROLESS:
+        isZeroless = false;
+        break;
       default:
         break;
     }
   }
   const auto compressedVariant =
-      std::make_shared<CompressedModeFormat>(isFull, isOrdered, isUnique);
+      std::make_shared<CompressedModeFormat>(isFull, isOrdered, isUnique, isZeroless);
   return ModeFormat(compressedVariant);
 }
diff --git a/src/lower/mode_format_dense.cpp b/src/lower/mode_format_dense.cpp
index 9ed9a78db..63c7ac01e 100644
--- a/src/lower/mode_format_dense.cpp
+++ b/src/lower/mode_format_dense.cpp
@@ -5,11 +5,11 @@ using namespace taco::ir;
 
 namespace taco {
 
-DenseModeFormat::DenseModeFormat() : DenseModeFormat(true, true) {
+DenseModeFormat::DenseModeFormat() : DenseModeFormat(true, true, false) {
 }
 
-DenseModeFormat::DenseModeFormat(const bool isOrdered, const bool isUnique) :
-    ModeFormatImpl("dense", true, isOrdered, isUnique, false, true, false,
+DenseModeFormat::DenseModeFormat(const bool isOrdered, const bool isUnique, const bool isZeroless) :
+    ModeFormatImpl("dense", true, isOrdered, isUnique, false, true, isZeroless,
                    false, false, true, true, false) {
 }
 
@@ -17,6 +17,7 @@ ModeFormat DenseModeFormat::copy(
     std::vector<ModeFormat::Property> properties) const {
   bool isOrdered = this->isOrdered;
   bool isUnique = this->isUnique;
+  bool isZeroless = this->isZeroless;
   for (const auto property : properties) {
     switch (property) {
       case ModeFormat::ORDERED:
@@ -31,11 +32,17 @@ ModeFormat DenseModeFormat::copy(
       case ModeFormat::NOT_UNIQUE:
         isUnique = false;
         break;
+      case ModeFormat::ZEROLESS:
+        isZeroless = true;
+        break;
+      case ModeFormat::NOT_ZEROLESS:
+        isZeroless = false;
+        break;
       default:
         break;
     }
   }
-  return ModeFormat(std::make_shared<DenseModeFormat>(isOrdered, isUnique));
+  return ModeFormat(std::make_shared<DenseModeFormat>(isOrdered, isUnique, isZeroless));
 }
 
 ModeFunction DenseModeFormat::locate(ir::Expr parentPos,
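The payoff of the `lowerAccess` change: in a zeroless format every stored entry of a boolean tensor is known to be `true`, so the load from the values array can be folded into that constant. A minimal sketch of a tensor that would take this path (hypothetical example, assuming `Tensor<bool>` packing as in mainline taco):

```cpp
#include "taco/tensor.h"

using namespace taco;

int main() {
  // Boolean adjacency matrix: dense rows, zeroless compressed columns.
  // Kernels generated from expressions over B still walk B's index arrays,
  // but with this patch they never read B's values array.
  Format csrZeroless({Dense, Compressed({ModeFormat::ZEROLESS})});
  Tensor<bool> B("B", {8, 8}, csrZeroless);
  B.insert({0, 3}, true);
  B.pack();
  return 0;
}
```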
diff --git a/src/lower/mode_format_impl.cpp b/src/lower/mode_format_impl.cpp
index bd5332150..8b27c8999 100644
--- a/src/lower/mode_format_impl.cpp
+++ b/src/lower/mode_format_impl.cpp
@@ -56,11 +56,11 @@ std::ostream& operator<<(std::ostream& os, const ModeFunction& modeFunction) {
 // class ModeTypeImpl
 ModeFormatImpl::ModeFormatImpl(const std::string name, bool isFull,
                                bool isOrdered, bool isUnique, bool isBranchless,
-                               bool isCompact, bool hasCoordValIter,
+                               bool isCompact, bool isZeroless, bool hasCoordValIter,
                                bool hasCoordPosIter, bool hasLocate,
                                bool hasInsert, bool hasAppend) :
     name(name), isFull(isFull), isOrdered(isOrdered), isUnique(isUnique),
-    isBranchless(isBranchless), isCompact(isCompact),
+    isBranchless(isBranchless), isCompact(isCompact), isZeroless(isZeroless),
     hasCoordValIter(hasCoordValIter), hasCoordPosIter(hasCoordPosIter),
     hasLocate(hasLocate), hasInsert(hasInsert), hasAppend(hasAppend) {
 }
diff --git a/src/lower/mode_format_singleton.cpp b/src/lower/mode_format_singleton.cpp
index 402fcaa80..ae4fc5328 100644
--- a/src/lower/mode_format_singleton.cpp
+++ b/src/lower/mode_format_singleton.cpp
@@ -10,12 +10,12 @@ using namespace taco::ir;
 namespace taco {
 
 SingletonModeFormat::SingletonModeFormat() :
-    SingletonModeFormat(false, true, true) {
+    SingletonModeFormat(false, true, true, false) {
 }
 
 SingletonModeFormat::SingletonModeFormat(bool isFull, bool isOrdered,
-                                         bool isUnique, long long allocSize) :
-    ModeFormatImpl("singleton", isFull, isOrdered, isUnique, true, true,
+                                         bool isUnique, bool isZeroless, long long allocSize) :
+    ModeFormatImpl("singleton", isFull, isOrdered, isUnique, true, true, isZeroless,
                    false, true, false, false, true),
     allocSize(allocSize) {
 }
@@ -25,6 +25,7 @@ ModeFormat SingletonModeFormat::copy(
   bool isFull = this->isFull;
   bool isOrdered = this->isOrdered;
   bool isUnique = this->isUnique;
+  bool isZeroless = this->isZeroless;
   for (const auto property : properties) {
     switch (property) {
       case ModeFormat::FULL:
@@ -45,12 +46,18 @@ ModeFormat SingletonModeFormat::copy(
       case ModeFormat::NOT_UNIQUE:
         isUnique = false;
         break;
+      case ModeFormat::ZEROLESS:
+        isZeroless = true;
+        break;
+      case ModeFormat::NOT_ZEROLESS:
+        isZeroless = false;
+        break;
       default:
         break;
     }
   }
   const auto singletonVariant =
-      std::make_shared<SingletonModeFormat>(isFull, isOrdered, isUnique);
+      std::make_shared<SingletonModeFormat>(isFull, isOrdered, isUnique, isZeroless);
   return ModeFormat(singletonVariant);
 }
diff --git a/test/tests-merge_lattice.cpp b/test/tests-merge_lattice.cpp
index a05e8db73..b07cd6baa 100644
--- a/test/tests-merge_lattice.cpp
+++ b/test/tests-merge_lattice.cpp
@@ -24,7 +24,8 @@ namespace tests {
 class HashedModeFormat : public ModeFormatImpl {
 public:
   HashedModeFormat() : ModeFormatImpl("hashed", false, false, true, false,
-                                      false, false, true, true, true, false) {}
+                                      false, false, false, true, true, true,
+                                      false) {}
 
   ModeFormat copy(std::vector<ModeFormat::Property> properties) const {
     return ModeFormat(std::make_shared<HashedModeFormat>());
diff --git a/test/tests-storage_alloc.cpp b/test/tests-storage_alloc.cpp
index ed5a33923..611b50d3d 100644
--- a/test/tests-storage_alloc.cpp
+++ b/test/tests-storage_alloc.cpp
@@ -57,7 +57,7 @@ TEST_P(alloc, storage) {
 }
 
 IndexVar i("i"), j("j"), m("m"), n("n"), k("k"), l("l");
-ModeFormat SparseSmall(std::make_shared<CompressedModeFormat>(false, true, true,
+ModeFormat SparseSmall(std::make_shared<CompressedModeFormat>(false, true, true, false,
                                                               32));
 
 IndexArray dlab_indices() {