Dense shared memory workspaces #302

Open · wants to merge 6 commits into base: gpu-workspaces
6 changes: 6 additions & 0 deletions include/taco/codegen/module.h
@@ -25,6 +25,12 @@ class Module {

/// Compile the source into a library, returning its full path
std::string compile();

/// Recompile. Given the path to a temporary source file generated by taco,
/// compiles that file into a library and returns its full path. This is a
/// development and debugging aid: it lets developers hand-modify the
/// generated code and compile it again without re-lowering.
std::string recompile(std::string file_path);

/// Compile the module into a source file located at the specified location
/// path and prefix. The generated source will be path/prefix.{.c|.bc, .h}
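
As a rough illustration of the intended workflow (a sketch, not code from this PR; the file path below is a made-up placeholder), recompile is meant to be called after compile has already written the temporary sources:

#include "taco/codegen/module.h"

void rebuildEditedKernel(taco::ir::Module& module) {
  // Assumes the lowered functions were already added to the module elsewhere.
  // compile() emits <tmpdir>/<prefix>{.c|.cu} and builds <prefix>.so once.
  std::string libPath = module.compile();

  // After hand-editing the emitted source on disk, rebuild that same file.
  // recompile() takes the path prefix of the temporary file, without extension.
  module.recompile("/tmp/taco_tmp_XXXX/abc123");  // hypothetical path
}
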
5 changes: 4 additions & 1 deletion include/taco/index_notation/index_notation.h
@@ -850,7 +850,10 @@ class TensorVar : public util::Comparable<TensorVar> {
TensorVar(const Type& type);
TensorVar(const std::string& name, const Type& type);
TensorVar(const Type& type, const Format& format);
TensorVar(const std::string& name, const Type& type, const Format& format);
TensorVar(const std::string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace=GPUWorkspace::None);

/// Returns the kind of GPU workspace this TensorVar uses (None by default).
GPUWorkspace getGPUWorkspace();

/// Returns the name of the tensor variable.
std::string getName() const;
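
A hedged sketch of how the new constructor parameter might be used when declaring a workspace tensor, for example one later handed to a precompute schedule (the name, size, and format are illustrative, not taken from this PR):

#include "taco/index_notation/index_notation.h"
using namespace taco;

// A 32-element dense workspace that the CUDA backend should place in shared memory.
TensorVar w("w", Type(Float64, {Dimension(32)}), Format({Dense}),
            GPUWorkspace::DenseSharedMemory);

// w.getGPUWorkspace() returns GPUWorkspace::DenseSharedMemory; every other
// constructor keeps the default of GPUWorkspace::None.
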
3 changes: 2 additions & 1 deletion include/taco/ir/ir.h
@@ -254,9 +254,10 @@ struct Var : public ExprNode<Var> {
std::string name;
bool is_ptr;
bool is_tensor;
GPUWorkspace gpuworkspace;

static Expr make(std::string name, Datatype type, bool is_ptr=false,
bool is_tensor=false);
bool is_tensor=false, GPUWorkspace gpuworkspace=GPUWorkspace::None);

static const IRNodeType _type_info = IRNodeType::Var;
};
6 changes: 6 additions & 0 deletions include/taco/ir_tags.h
@@ -29,4 +29,10 @@ enum class BoundType {
extern const char *BoundType_NAMES[];
}

/* TODO: Not sure if this is the right place for these. */
enum class GPUWorkspace {
None, DenseSharedMemory
};
extern const char *GPUWorkspace_NAMES[];

#endif //TACO_IR_TAGS_H
6 changes: 6 additions & 0 deletions include/taco/tensor.h
@@ -413,12 +413,18 @@ class TensorBase {

void compile(IndexStmt stmt, bool assembleWhileCompute=false);

void recompile(std::string file_path);

/// Assemble the tensor storage, including index and value arrays.
void assemble();

void reassemble();

/// Compute the given expression and put the values in the tensor storage.
void compute();

void recompute();

/// Compile, assemble and compute as needed.
void evaluate();

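A hedged sketch of the edit-recompile-recompute loop these hooks enable at the tensor level (the expression is only illustrative and the file path is a made-up placeholder):

#include "taco.h"
using namespace taco;

int main() {
  Format dv({Dense});
  Tensor<double> a("a", {16}, dv), b("b", {16}, dv), c("c", {16}, dv);
  for (int j = 0; j < 16; j++) { b.insert({j}, 1.0); c.insert({j}, 2.0); }
  b.pack(); c.pack();

  IndexVar i;
  a(i) = b(i) + c(i);
  a.compile();    // generates and builds the kernel, leaving the source in taco's tmp dir
  a.assemble();
  a.compute();

  // Hand-edit the generated source, then rebuild and rerun without re-lowering.
  a.recompile("/tmp/taco_tmp_XXXX/abc123");  // hypothetical path prefix
  a.recompute();
  return 0;
}
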
156 changes: 120 additions & 36 deletions src/codegen/codegen_cuda.cpp
@@ -295,6 +295,8 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {
vector<pair<string, Expr>> warpIDVars;
vector<Expr> numThreads;
vector<Expr> numWarps;
vector<Expr> sizeSharedMemory;
std::string typeSharedMemory;

CodeGen_CUDA *codeGen;
// copy inputs and outputs into the map
@@ -359,6 +361,7 @@ class CodeGen_CUDA::DeviceFunctionCollector : public IRVisitor {

threadFors.push_back(op);
threadIDVars.push_back(pair<string, Expr>(scopeMap[op->var], op->var));

Expr blockSize = ir::simplify(ir::Div::make(ir::Sub::make(op->end, op->start), op->increment));
numThreads.push_back(blockSize);
}
@@ -378,6 +381,16 @@
}

virtual void visit(const Var *op) {

if (isa<Var>(op)){
if (to<Var>(op)->gpuworkspace == GPUWorkspace::DenseSharedMemory)
{
string elementType = printCUDAType( op->type, false);
typeSharedMemory = elementType;
// sizeSharedMemory.push_back(Mul::make(to<Expr>()->num_elements, Literal::make(256)));
}
}

if (scopeMap.count(op) == 0) {
string name = codeGen->genUniqueName(op->name);
if (!inDeviceFunction) {
@@ -580,7 +593,23 @@ void CodeGen_CUDA::printDeviceFuncCall(const vector<pair<string, Expr>> currentP
gridSize.accept(this);
stream << ", ";
blockSize.accept(this);
stream << ">>>";

if (usesshared == false)
{
stream << ">>>";
}
else
{
// BIG TODO: no hard code 2048
// Should be num_threads * num_prec_elems * sizeof(prec_type)
stream << ", " ;
sizeofshared.accept(this);
stream << " * sizeof(" << typeofshared << ")>>>";
// 2048*sizeof(double)>>>";
}
stream << "(";

string delimiter = "";
@@ -627,6 +656,9 @@ void CodeGen_CUDA::compile(Stmt stmt, bool isFirst) {
parentParallelUnits = {};
parallelUnitSizes = {};
parallelUnitIDVars = {};
sizeofshared = Expr();
typeofshared = "";
usesshared = false;
emittedTimerStartCode = false;
isHostFunction = true;
if (isFirst) {
@@ -1021,51 +1053,103 @@ void CodeGen_CUDA::visit(const Max* op) {

void CodeGen_CUDA::visit(const Allocate* op) {
string elementType = printCUDAType(op->var.type(), false);

if (!isHostFunction) {
if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
// double w_GPUThread[num];
// for threads allocate thread local memory

if (to<Var>(op->var)->gpuworkspace == GPUWorkspace::DenseSharedMemory)
{
taco_iassert(!op->is_realloc);
doIndent();
stream << elementType << " ";
stream << "__shared__ " << elementType << " ";
op->var.accept(this);
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
stream << "_ALL";
}
stream << "[";
op->num_elements.accept(this);
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
Expr numElements = Mul::make(op->num_elements,
parallelUnitSizes[ParallelUnit::GPUBlock]);

sizeofshared = numElements;
typeofshared = elementType;
usesshared = true;

ir::simplify(numElements).accept(this);
}
else {
doIndent();
stream << elementType << " ";
op->var.accept(this);
stream << "[";
op->num_elements.accept(this);
stream << "];" << endl;
return;
}
stream << "];" << endl;
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
doIndent();
stream << elementType << " * ";
op->var.accept(this);

stream << " = ";
op->var.accept(this);
stream << "_ALL + threadIdx.x";
// parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
stream << " * ";
op->num_elements.accept(this);
// parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
stream << ";" << endl;
}
return;
}
// __shared__ double w_GPUThread[32]; if no warps
// __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps
// double * w_GPUThread = w_GPUThread_ALL + warp_id * 32;
taco_iassert(!op->is_realloc);
doIndent();
stream << "__shared__ " << elementType << " ";
op->var.accept(this);
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
stream << "_ALL";
}
stream << "[";
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp]));
ir::simplify(numElements).accept(this);
}
else {
op->num_elements.accept(this);
}
stream << "];" << endl;
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
else
{
if (parentParallelUnits.count(ParallelUnit::GPUThread)) {
// double w_GPUThread[num];
// for threads allocate thread local memory
doIndent();
stream << elementType << " ";
op->var.accept(this);
stream << "[";
op->num_elements.accept(this);
stream << "];" << endl;
return;
}
// __shared__ double w_GPUThread[32]; if no warps
// __shared__ double w_GPUThread_ALL[32 * # num warps]; if warps
// double * w_GPUThread = w_GPUThread_ALL + warp_id * 32;
taco_iassert(!op->is_realloc);
doIndent();
stream << elementType << " * ";
op->var.accept(this);

stream << " = ";
stream << "__shared__ " << elementType << " ";
op->var.accept(this);
stream << "_ALL + ";
parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
stream << " * ";
parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
stream << ";" << endl;
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
stream << "_ALL";
}
stream << "[";
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
Expr numElements = Mul::make(op->num_elements, Div::make(parallelUnitSizes[ParallelUnit::GPUBlock], parallelUnitSizes[ParallelUnit::GPUWarp]));
ir::simplify(numElements).accept(this);
}
else {
op->num_elements.accept(this);
}
stream << "];" << endl;
if (parentParallelUnits.count(ParallelUnit::GPUWarp)) {
doIndent();
stream << elementType << " * ";
op->var.accept(this);

stream << " = ";
op->var.accept(this);
stream << "_ALL + ";
parallelUnitIDVars[ParallelUnit::GPUWarp].accept(this);
stream << " * ";
parallelUnitSizes[ParallelUnit::GPUWarp].accept(this);
stream << ";" << endl;
}
return;
}
return;

}
string variable_name;
if (op->is_realloc) {
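
Putting the collector, the launch printing, and the Allocate case together, the emitted code is expected to look roughly like the following (a hand-written illustration, not actual taco output; the workspace size of 8 doubles and the block size of 256 are made up):

__global__ void computeDeviceKernel0(taco_tensor_t *A, taco_tensor_t *B) {
  // one DenseSharedMemory workspace slice per thread, allocated block-wide
  __shared__ double w_GPUThread_ALL[8 * 256];
  double * w_GPUThread = w_GPUThread_ALL + threadIdx.x * 8;
  // ... compute into w_GPUThread, then write the results out to A ...
}

// Host side: printDeviceFuncCall now appends the shared-memory size to the launch.
computeDeviceKernel0<<<gridSize, blockSize, (8 * 256) * sizeof(double)>>>(A, B);
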
5 changes: 5 additions & 0 deletions src/codegen/codegen_cuda.h
@@ -73,6 +73,11 @@ class CodeGen_CUDA : public CodeGen {
std::map<ParallelUnit, Expr> parallelUnitSizes;
std::map<ParallelUnit, Expr> parallelUnitIDVars;


Expr sizeofshared;
std::string typeofshared;
bool usesshared;

bool emittedTimerStartCode = false;

std::ostream &out;
45 changes: 45 additions & 0 deletions src/codegen/module.cpp
@@ -164,6 +164,51 @@ string Module::compile() {
return fullpath;
}

string Module::recompile(string file_path) {
string prefix = file_path;
string fullpath = prefix + ".so";

string cc;
string cflags;
string file_ending;
string shims_file;
if (should_use_CUDA_codegen()) {
cc = "nvcc";
cflags = util::getFromEnv("TACO_NVCCFLAGS",
get_default_CUDA_compiler_flags());
file_ending = ".cu";
shims_file = prefix + "_shims.cpp";
}
else {
cc = util::getFromEnv(target.compiler_env, target.compiler);
cflags = util::getFromEnv("TACO_CFLAGS",
"-O3 -ffast-math -std=c99") + " -shared -fPIC";
file_ending = ".c";
shims_file = "";
}
#if USE_OPENMP
cflags += " -fopenmp";
#endif

string cmd = cc + " " + cflags + " " +
prefix + file_ending + " " + shims_file + " " +
"-o " + fullpath + " -lm";

// now compile it
int err = system(cmd.data());
taco_uassert(err == 0) << "Compilation command failed:\n" << cmd
<< "\nreturned " << err;

// use dlsym() to open the compiled library
if (lib_handle) {
dlclose(lib_handle);
}
lib_handle = dlopen(fullpath.data(), RTLD_NOW | RTLD_LOCAL);
taco_uassert(lib_handle) << "Failed to load generated code";

return fullpath;
}

void Module::setSource(string source) {
this->source << source;
moduleFromUserSource = true;
8 changes: 7 additions & 1 deletion src/index_notation/index_notation.cpp
@@ -1819,6 +1819,7 @@ struct TensorVar::Content {
Type type;
Format format;
Schedule schedule;
GPUWorkspace gpuworkspace;
};

TensorVar::TensorVar() : content(nullptr) {
@@ -1840,11 +1841,16 @@ TensorVar::TensorVar(const Type& type, const Format& format)
: TensorVar(util::uniqueName('A'), type, format) {
}

TensorVar::TensorVar(const string& name, const Type& type, const Format& format)
TensorVar::TensorVar(const string& name, const Type& type, const Format& format, GPUWorkspace gpuworkspace)
: content(new Content) {
content->name = name;
content->type = type;
content->format = format;
content->gpuworkspace = gpuworkspace;
}

GPUWorkspace TensorVar::getGPUWorkspace() {
return content->gpuworkspace;
}

std::string TensorVar::getName() const {
1 change: 1 addition & 0 deletions src/index_notation/transformations.cpp
@@ -239,6 +239,7 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const {

IndexStmt consumer = forall(i, replace(s, {{e, ws(i)}}));
IndexStmt producer = forall(iw, ws(iw) = replace(e, {{i,iw}}));

Where where(consumer, producer);

stmt = where;
3 changes: 2 additions & 1 deletion src/ir/ir.cpp
@@ -241,14 +241,15 @@ bool Literal::equalsScalar(double scalar) const {
return false;
}

Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor) {
Expr Var::make(std::string name, Datatype type, bool is_ptr, bool is_tensor, GPUWorkspace gpuworkspace) {
Var *var = new Var;
var->type = type;
var->name = name;

// TODO: is_ptr and is_tensor should be part of type
var->is_ptr = is_ptr;
var->is_tensor = is_tensor;
var->gpuworkspace = gpuworkspace;

return var;
}
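
A hedged sketch of how a lowering pass could tag a workspace variable with the new argument so that CodeGen_CUDA's Allocate visitor places it in shared memory (the variable name and element type are illustrative):

using namespace taco;

ir::Expr w = ir::Var::make("w_GPUThread", Float64,
                           /*is_ptr=*/true, /*is_tensor=*/false,
                           GPUWorkspace::DenseSharedMemory);
// Later, to<ir::Var>(w)->gpuworkspace == GPUWorkspace::DenseSharedMemory.
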
1 change: 1 addition & 0 deletions src/ir_tags.cpp
@@ -4,4 +4,5 @@ namespace taco {
const char *ParallelUnit_NAMES[] = {"NotParallel", "DefaultUnit", "GPUBlock", "GPUWarp", "GPUThread", "CPUThread", "CPUVector", "CPUThreadGroupReduction", "GPUBlockReduction", "GPUWarpReduction"};
const char *OutputRaceStrategy_NAMES[] = {"IgnoreRaces", "NoRaces", "Atomics", "Temporary", "ParallelReduction"};
const char *BoundType_NAMES[] = {"MinExact", "MinConstraint", "MaxExact", "MaxConstraint"};
const char *GPUWorkspace_NAMES[] = {"None", "DenseSharedMemory"};
}