Merge pull request #6 from rohany/parallel-validate

Parallel validate
tensor-compiler · May 30, 2021 · ba8addd · ba8addd
2 parents f8f05ab + 759d534
commit ba8addd
Show file tree

Hide file tree

Showing 8 changed files with 197 additions and 13 deletions.
diff --git a/legion/CMakeLists.txt b/legion/CMakeLists.txt
@@ -20,10 +20,6 @@ if (TACO_USE_LOGGING_MAPPER)
     add_definitions(-DTACO_USE_LOGGING_MAPPER)
 endif()
 
-if (Legion_USE_CUDA)
-    add_definitions(-DTACO_USE_CUDA)
-endif()
-
 function(add_app_folder folder)
     file(GLOB SOURCES "${folder}/main.cpp" "${folder}/taco-generated.cpp" ${LG_SOURCES})
     add_executable("${folder}" ${SOURCES})
@@ -33,6 +29,7 @@ function(add_app_folder folder)
         if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${folder}/taco-generated.cu")
             cuda_add_executable("${folder}-cuda" "${folder}/main.cpp" "${folder}/taco-generated.cu" ${LG_SOURCES} ${LG_CU_SOURCES})
             target_link_libraries("${folder}-cuda" Legion::Legion cublas)
+	    target_compile_definitions("${folder}-cuda" PUBLIC TACO_USE_CUDA)
         endif()
     endif()
 endfunction()
@@ -42,6 +39,7 @@ function(add_cuda_folder folder)
         file(GLOB SOURCES "${folder}/*.cpp" "${folder}/*.cu" ${LG_SOURCES})
 	cuda_add_executable("${folder}" ${SOURCES} ${LG_CU_SOURCES})
         target_link_libraries("${folder}" Legion::Legion)
+	target_compile_definitions("${folder}" PUBLIC TACO_USE_CUDA)
     endif()
 endfunction()
 

diff --git a/legion/cannonMM/main.cpp b/legion/cannonMM/main.cpp
@@ -65,14 +65,8 @@ void top_level_task(const Task* task, const std::vector<PhysicalRegion>& regions
   // Compute on the tensors.
   benchmark([&]() { computeLegion(ctx, runtime, A, B, C, gx, gy); });
 
-  auto a_reg = getRegionToWrite(ctx, runtime, A, A);
-  FieldAccessor<READ_WRITE,valType,2,coord_t, Realm::AffineAccessor<valType, 2, coord_t>> a_rw(a_reg, FID_VAL);
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      assert(a_rw[Point<2>(i, j)] == n);
-    }
-  }
-  runtime->unmap_region(ctx, a_reg);
+  // The result should be equal to 1.
+  tacoValidate<valType>(ctx, runtime, A, valType(n));
 }
 
 TACO_MAIN(valType)
diff --git a/legion/include/fill.cuh b/legion/include/fill.cuh
@@ -4,7 +4,6 @@
 #include "legion.h"
 #include "pitches.h"
 #include "taco_legion_header.h"
-
 #include "fill.h"
 
 const int THREADS_PER_BLOCK = 256;

diff --git a/legion/include/fill.h b/legion/include/fill.h
@@ -6,6 +6,7 @@
 #include "taco_legion_header.h"
 #include "taco/version.h"
 
+// TODO (rohany): Move these predefined tasks to an enum.
 const int TACO_FILL_TASK = 1;
 
 template<int DIM, typename T>

diff --git a/legion/include/legion_utils.h b/legion/include/legion_utils.h
@@ -7,6 +7,7 @@
 #include "mappers/default_mapper.h"
 #include "taco/version.h"
 #include "fill.h"
+#include "validate.h"
 
 #ifdef TACO_USE_CUDA
 #include "cudalibs.h"
@@ -42,6 +43,7 @@ void initCUDA();
       Runtime::preregister_task_variant<top_level_task>(registrar, "top_level"); \
     }                       \
     registerTACOFillTasks<FillType>();             \
+    registerTACOValidateTasks<FillType>();             \
     Runtime::add_registration_callback(register_taco_mapper);     \
     initCUDA(); \
     registerTacoTasks();            \

diff --git a/legion/include/validate.cuh b/legion/include/validate.cuh
@@ -0,0 +1,59 @@
+#ifndef TACO_LG_VALIDATE_CUH
+#define TACO_LG_VALIDATE_CUH
+
+#include "legion.h"
+#include "pitches.h"
+#include "taco_legion_header.h"
+#include "validate.h"
+
+const int THREADS_PER_BLOCK = 256;
+
+template<int DIM, typename T>
+__global__
+void tacoValidateGPUKernel(
+    Legion::FieldAccessor <READ_ONLY, T, DIM, Legion::coord_t, Realm::AffineAccessor<T, DIM, Legion::coord_t>> a,
+    T value, Pitches<DIM - 1> pitches, Legion::Point<DIM> lo, size_t volume) {
+  const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= volume) return;
+  assert(a[pitches.unflatten(idx, lo)] == value);
+}
+
+template<int DIM, typename T>
+void tacoValidateGPU(const Legion::Task* task, Legion::PhysicalRegion r, Legion::Rect<DIM> rect) {
+  typedef Legion::FieldAccessor<READ_ONLY,T,DIM,Legion::coord_t,Realm::AffineAccessor<T,DIM,Legion::coord_t>> Accessor;
+  Accessor ar(r, FID_VAL);
+  Pitches<DIM - 1> pitches;
+  auto volume = pitches.flatten(rect);
+  auto blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  tacoValidateGPUKernel<DIM, T><<<blocks, THREADS_PER_BLOCK>>>(ar, *(T*)(task->args), pitches, rect.lo, volume);
+}
+
+template<typename T>
+void tacoValidateGPUTask(const Legion::Task* task, const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, Legion::Runtime* runtime) {
+  Legion::PhysicalRegion r = regions[0];
+  auto ispace = r.get_logical_region().get_index_space();
+  auto domain = runtime->get_index_space_domain(ispace);
+  switch (ispace.get_dim()) {
+#define BLOCK(DIM) \
+        case DIM:  \
+          {        \
+            tacoValidateGPU<DIM, T>(task, r, domain); \
+            break; \
+          }
+    LEGION_FOREACH_N(BLOCK)
+#undef BLOCK
+    default:
+      assert(false);
+  }
+}
+
+template <typename T>
+void registerGPUValidateTask() {
+  {
+    Legion::TaskVariantRegistrar registrar(TACO_VALIDATE_TASK, "taco_validate");
+    registrar.add_constraint(Legion::ProcessorConstraint(Legion::Processor::TOC_PROC));
+    Legion::Runtime::preregister_task_variant<tacoValidateGPUTask<T>>(registrar, "taco_validate");
+  }
+}
+
+#endif // TACO_LG_VALIDATE_CUH
diff --git a/legion/include/validate.h b/legion/include/validate.h
@@ -0,0 +1,124 @@
+#ifndef TACO_LG_VALIDATE_H
+#define TACO_LG_VALIDATE_H
+
+#include "legion.h"
+#include "pitches.h"
+#include "taco_legion_header.h"
+#include "taco/version.h"
+
+const int TACO_VALIDATE_TASK = 10;
+
+template<int DIM, typename T>
+void tacoValidateCPU(const Legion::Task* task, Legion::PhysicalRegion r, Legion::Rect<DIM> rect) {
+  typedef Legion::FieldAccessor<READ_ONLY,T,DIM,Legion::coord_t,Realm::AffineAccessor<T,DIM,Legion::coord_t>> Accessor;
+  Accessor ar(r, FID_VAL);
+  Pitches<DIM - 1> pitches;
+  auto volume = pitches.flatten(rect);
+  for (size_t i = 0; i < volume; i++) {
+    assert(ar[pitches.unflatten(i, rect.lo)] == *(T*)(task->args));
+  }
+}
+
+template<int DIM, typename T>
+void tacoValidateOMP(const Legion::Task* task, Legion::PhysicalRegion r, Legion::Rect<DIM> rect) {
+  typedef Legion::FieldAccessor<READ_ONLY,T,DIM,Legion::coord_t,Realm::AffineAccessor<T,DIM,Legion::coord_t>> Accessor;
+  Accessor ar(r, FID_VAL);
+  Pitches<DIM - 1> pitches;
+  auto volume = pitches.flatten(rect);
+#pragma omp parallel for schedule(static)
+  for (size_t i = 0; i < volume; i++) {
+    assert(ar[pitches.unflatten(i, rect.lo)] == *(T*)(task->args));
+  }
+}
+
+template<typename T>
+void tacoValidateCPUTask(const Legion::Task* task, const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, Legion::Runtime* runtime) {
+  Legion::PhysicalRegion r = regions[0];
+  auto ispace = r.get_logical_region().get_index_space();
+  auto domain = runtime->get_index_space_domain(ispace);
+  switch (ispace.get_dim()) {
+#define BLOCK(DIM) \
+        case DIM:  \
+          {        \
+            tacoValidateCPU<DIM, T>(task, r, domain); \
+            break; \
+          }
+    LEGION_FOREACH_N(BLOCK)
+#undef BLOCK
+    default:
+      assert(false);
+  }
+}
+
+template<typename T>
+void tacoValidateOMPTask(const Legion::Task* task, const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, Legion::Runtime* runtime) {
+  Legion::PhysicalRegion r = regions[0];
+  auto ispace = r.get_logical_region().get_index_space();
+  auto domain = runtime->get_index_space_domain(ispace);
+  switch (ispace.get_dim()) {
+#define BLOCK(DIM) \
+        case DIM:  \
+          {        \
+            tacoValidateOMP<DIM, T>(task, r, domain); \
+            break; \
+          }
+    LEGION_FOREACH_N(BLOCK)
+#undef BLOCK
+    default:
+      assert(false);
+  }
+}
+
+template<typename T>
+void tacoValidate(Legion::Context ctx, Legion::Runtime* runtime, Legion::LogicalRegion r, T val) {
+  size_t pieces = 0;
+  // Favor TOC proc > OMP proc > CPU proc. The default mapper performs this same heuristic
+  // as well, so there's nothing more we need to do.
+  auto numGPU = runtime->select_tunable_value(ctx, Legion::Mapping::DefaultMapper::DEFAULT_TUNABLE_GLOBAL_GPUS).get<size_t>();
+  auto numOMP = runtime->select_tunable_value(ctx, Legion::Mapping::DefaultMapper::DEFAULT_TUNABLE_GLOBAL_OMPS).get<size_t>();
+  auto numCPU = runtime->select_tunable_value(ctx, Legion::Mapping::DefaultMapper::DEFAULT_TUNABLE_GLOBAL_CPUS).get<size_t>();
+  if (numGPU != 0) {
+    pieces = numGPU;
+  } else if (numOMP != 0) {
+    pieces = numOMP;
+  } else if (numCPU != 0) {
+    pieces = numCPU;
+  } else {
+    assert(false);
+  }
+  auto ispace = runtime->create_index_space(ctx, Legion::Rect<1>(0, pieces - 1));
+  auto ipart = runtime->create_equal_partition(ctx, r.get_index_space(), ispace);
+  auto lpart = runtime->get_logical_partition(ctx, r, ipart);
+  Legion::IndexLauncher l(TACO_VALIDATE_TASK, runtime->get_index_space_domain(ispace), Legion::TaskArgument(&val, sizeof(T)), Legion::ArgumentMap());
+  l.add_region_requirement(Legion::RegionRequirement(lpart, 0, READ_ONLY, EXCLUSIVE, r).add_field(FID_VAL));
+  runtime->execute_index_space(ctx, l);
+}
+
+// TODO (rohany): Do the CUDA version of the validation.
+// If we're building with CUDA, then declare the fill kernel.
+#ifdef TACO_USE_CUDA
+template<typename T>
+void registerGPUValidateTask();
+#endif
+
+template <typename T>
+void registerTACOValidateTasks() {
+  // Register the CPU variant.
+  {
+    Legion::TaskVariantRegistrar registrar(TACO_VALIDATE_TASK, "taco_validate");
+    registrar.add_constraint(Legion::ProcessorConstraint(Legion::Processor::LOC_PROC));
+    Legion::Runtime::preregister_task_variant<tacoValidateCPUTask<T>>(registrar, "taco_validate");
+  }
+  // Register the OMP variant if present.
+  if (TACO_FEATURE_OPENMP) {
+    Legion::TaskVariantRegistrar registrar(TACO_VALIDATE_TASK, "taco_validate");
+    registrar.add_constraint(Legion::ProcessorConstraint(Legion::Processor::OMP_PROC));
+    Legion::Runtime::preregister_task_variant<tacoValidateOMPTask<T>>(registrar, "taco_validate");
+  }
+  // Register a CUDA variant if we have a CUDA build.
+#ifdef TACO_USE_CUDA
+  registerGPUValidateTask<T>();
+#endif
+}
+
+#endif // TACO_LG_VALIDATE_H
diff --git a/legion/src/validate.cu b/legion/src/validate.cu
@@ -0,0 +1,7 @@
+#include "validate.cuh"
+
+// Register the validate task for several common types that we use.
+template void registerGPUValidateTask<int32_t>();
+template void registerGPUValidateTask<int64_t>();
+template void registerGPUValidateTask<float>();
+template void registerGPUValidateTask<double>();