Skip to content

Commit

Permalink
Merge pull request #6 from rohany/parallel-validate
Browse files Browse the repository at this point in the history
Parallel validate
  • Loading branch information
rohany authored May 30, 2021
2 parents f8f05ab + 759d534 commit ba8addd
Show file tree
Hide file tree
Showing 8 changed files with 197 additions and 13 deletions.
6 changes: 2 additions & 4 deletions legion/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ if (TACO_USE_LOGGING_MAPPER)
add_definitions(-DTACO_USE_LOGGING_MAPPER)
endif()

if (Legion_USE_CUDA)
add_definitions(-DTACO_USE_CUDA)
endif()

function(add_app_folder folder)
file(GLOB SOURCES "${folder}/main.cpp" "${folder}/taco-generated.cpp" ${LG_SOURCES})
add_executable("${folder}" ${SOURCES})
Expand All @@ -33,6 +29,7 @@ function(add_app_folder folder)
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${folder}/taco-generated.cu")
cuda_add_executable("${folder}-cuda" "${folder}/main.cpp" "${folder}/taco-generated.cu" ${LG_SOURCES} ${LG_CU_SOURCES})
target_link_libraries("${folder}-cuda" Legion::Legion cublas)
target_compile_definitions("${folder}-cuda" PUBLIC TACO_USE_CUDA)
endif()
endif()
endfunction()
Expand All @@ -42,6 +39,7 @@ function(add_cuda_folder folder)
file(GLOB SOURCES "${folder}/*.cpp" "${folder}/*.cu" ${LG_SOURCES})
cuda_add_executable("${folder}" ${SOURCES} ${LG_CU_SOURCES})
target_link_libraries("${folder}" Legion::Legion)
target_compile_definitions("${folder}" PUBLIC TACO_USE_CUDA)
endif()
endfunction()

Expand Down
10 changes: 2 additions & 8 deletions legion/cannonMM/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,8 @@ void top_level_task(const Task* task, const std::vector<PhysicalRegion>& regions
// Compute on the tensors.
benchmark([&]() { computeLegion(ctx, runtime, A, B, C, gx, gy); });

auto a_reg = getRegionToWrite(ctx, runtime, A, A);
FieldAccessor<READ_WRITE,valType,2,coord_t, Realm::AffineAccessor<valType, 2, coord_t>> a_rw(a_reg, FID_VAL);
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
assert(a_rw[Point<2>(i, j)] == n);
}
}
runtime->unmap_region(ctx, a_reg);
// The result should be equal to 1.
tacoValidate<valType>(ctx, runtime, A, valType(n));
}

TACO_MAIN(valType)
1 change: 0 additions & 1 deletion legion/include/fill.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include "legion.h"
#include "pitches.h"
#include "taco_legion_header.h"

#include "fill.h"

const int THREADS_PER_BLOCK = 256;
Expand Down
1 change: 1 addition & 0 deletions legion/include/fill.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "taco_legion_header.h"
#include "taco/version.h"

// TODO (rohany): Move these predefined tasks to an enum.
const int TACO_FILL_TASK = 1;

template<int DIM, typename T>
Expand Down
2 changes: 2 additions & 0 deletions legion/include/legion_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "mappers/default_mapper.h"
#include "taco/version.h"
#include "fill.h"
#include "validate.h"

#ifdef TACO_USE_CUDA
#include "cudalibs.h"
Expand Down Expand Up @@ -42,6 +43,7 @@ void initCUDA();
Runtime::preregister_task_variant<top_level_task>(registrar, "top_level"); \
} \
registerTACOFillTasks<FillType>(); \
registerTACOValidateTasks<FillType>(); \
Runtime::add_registration_callback(register_taco_mapper); \
initCUDA(); \
registerTacoTasks(); \
Expand Down
59 changes: 59 additions & 0 deletions legion/include/validate.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#ifndef TACO_LG_VALIDATE_CUH
#define TACO_LG_VALIDATE_CUH

#include "legion.h"
#include "pitches.h"
#include "taco_legion_header.h"
#include "validate.h"

const int THREADS_PER_BLOCK = 256;

template<int DIM, typename T>
__global__
void tacoValidateGPUKernel(
Legion::FieldAccessor <READ_ONLY, T, DIM, Legion::coord_t, Realm::AffineAccessor<T, DIM, Legion::coord_t>> a,
T value, Pitches<DIM - 1> pitches, Legion::Point<DIM> lo, size_t volume) {
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= volume) return;
assert(a[pitches.unflatten(idx, lo)] == value);
}

template<int DIM, typename T>
void tacoValidateGPU(const Legion::Task* task, Legion::PhysicalRegion r, Legion::Rect<DIM> rect) {
typedef Legion::FieldAccessor<READ_ONLY,T,DIM,Legion::coord_t,Realm::AffineAccessor<T,DIM,Legion::coord_t>> Accessor;
Accessor ar(r, FID_VAL);
Pitches<DIM - 1> pitches;
auto volume = pitches.flatten(rect);
auto blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
tacoValidateGPUKernel<DIM, T><<<blocks, THREADS_PER_BLOCK>>>(ar, *(T*)(task->args), pitches, rect.lo, volume);
}

template<typename T>
void tacoValidateGPUTask(const Legion::Task* task, const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, Legion::Runtime* runtime) {
Legion::PhysicalRegion r = regions[0];
auto ispace = r.get_logical_region().get_index_space();
auto domain = runtime->get_index_space_domain(ispace);
switch (ispace.get_dim()) {
#define BLOCK(DIM) \
case DIM: \
{ \
tacoValidateGPU<DIM, T>(task, r, domain); \
break; \
}
LEGION_FOREACH_N(BLOCK)
#undef BLOCK
default:
assert(false);
}
}

template <typename T>
void registerGPUValidateTask() {
{
Legion::TaskVariantRegistrar registrar(TACO_VALIDATE_TASK, "taco_validate");
registrar.add_constraint(Legion::ProcessorConstraint(Legion::Processor::TOC_PROC));
Legion::Runtime::preregister_task_variant<tacoValidateGPUTask<T>>(registrar, "taco_validate");
}
}

#endif // TACO_LG_VALIDATE_CUH
124 changes: 124 additions & 0 deletions legion/include/validate.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#ifndef TACO_LG_VALIDATE_H
#define TACO_LG_VALIDATE_H

#include "legion.h"
#include "pitches.h"
#include "taco_legion_header.h"
#include "taco/version.h"

const int TACO_VALIDATE_TASK = 10;

template<int DIM, typename T>
void tacoValidateCPU(const Legion::Task* task, Legion::PhysicalRegion r, Legion::Rect<DIM> rect) {
typedef Legion::FieldAccessor<READ_ONLY,T,DIM,Legion::coord_t,Realm::AffineAccessor<T,DIM,Legion::coord_t>> Accessor;
Accessor ar(r, FID_VAL);
Pitches<DIM - 1> pitches;
auto volume = pitches.flatten(rect);
for (size_t i = 0; i < volume; i++) {
assert(ar[pitches.unflatten(i, rect.lo)] == *(T*)(task->args));
}
}

template<int DIM, typename T>
void tacoValidateOMP(const Legion::Task* task, Legion::PhysicalRegion r, Legion::Rect<DIM> rect) {
typedef Legion::FieldAccessor<READ_ONLY,T,DIM,Legion::coord_t,Realm::AffineAccessor<T,DIM,Legion::coord_t>> Accessor;
Accessor ar(r, FID_VAL);
Pitches<DIM - 1> pitches;
auto volume = pitches.flatten(rect);
#pragma omp parallel for schedule(static)
for (size_t i = 0; i < volume; i++) {
assert(ar[pitches.unflatten(i, rect.lo)] == *(T*)(task->args));
}
}

template<typename T>
void tacoValidateCPUTask(const Legion::Task* task, const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, Legion::Runtime* runtime) {
Legion::PhysicalRegion r = regions[0];
auto ispace = r.get_logical_region().get_index_space();
auto domain = runtime->get_index_space_domain(ispace);
switch (ispace.get_dim()) {
#define BLOCK(DIM) \
case DIM: \
{ \
tacoValidateCPU<DIM, T>(task, r, domain); \
break; \
}
LEGION_FOREACH_N(BLOCK)
#undef BLOCK
default:
assert(false);
}
}

template<typename T>
void tacoValidateOMPTask(const Legion::Task* task, const std::vector<Legion::PhysicalRegion>& regions, Legion::Context ctx, Legion::Runtime* runtime) {
Legion::PhysicalRegion r = regions[0];
auto ispace = r.get_logical_region().get_index_space();
auto domain = runtime->get_index_space_domain(ispace);
switch (ispace.get_dim()) {
#define BLOCK(DIM) \
case DIM: \
{ \
tacoValidateOMP<DIM, T>(task, r, domain); \
break; \
}
LEGION_FOREACH_N(BLOCK)
#undef BLOCK
default:
assert(false);
}
}

template<typename T>
void tacoValidate(Legion::Context ctx, Legion::Runtime* runtime, Legion::LogicalRegion r, T val) {
size_t pieces = 0;
// Favor TOC proc > OMP proc > CPU proc. The default mapper performs this same heuristic
// as well, so there's nothing more we need to do.
auto numGPU = runtime->select_tunable_value(ctx, Legion::Mapping::DefaultMapper::DEFAULT_TUNABLE_GLOBAL_GPUS).get<size_t>();
auto numOMP = runtime->select_tunable_value(ctx, Legion::Mapping::DefaultMapper::DEFAULT_TUNABLE_GLOBAL_OMPS).get<size_t>();
auto numCPU = runtime->select_tunable_value(ctx, Legion::Mapping::DefaultMapper::DEFAULT_TUNABLE_GLOBAL_CPUS).get<size_t>();
if (numGPU != 0) {
pieces = numGPU;
} else if (numOMP != 0) {
pieces = numOMP;
} else if (numCPU != 0) {
pieces = numCPU;
} else {
assert(false);
}
auto ispace = runtime->create_index_space(ctx, Legion::Rect<1>(0, pieces - 1));
auto ipart = runtime->create_equal_partition(ctx, r.get_index_space(), ispace);
auto lpart = runtime->get_logical_partition(ctx, r, ipart);
Legion::IndexLauncher l(TACO_VALIDATE_TASK, runtime->get_index_space_domain(ispace), Legion::TaskArgument(&val, sizeof(T)), Legion::ArgumentMap());
l.add_region_requirement(Legion::RegionRequirement(lpart, 0, READ_ONLY, EXCLUSIVE, r).add_field(FID_VAL));
runtime->execute_index_space(ctx, l);
}

// TODO (rohany): Do the CUDA version of the validation.
// If we're building with CUDA, then declare the fill kernel.
#ifdef TACO_USE_CUDA
template<typename T>
void registerGPUValidateTask();
#endif

template <typename T>
void registerTACOValidateTasks() {
// Register the CPU variant.
{
Legion::TaskVariantRegistrar registrar(TACO_VALIDATE_TASK, "taco_validate");
registrar.add_constraint(Legion::ProcessorConstraint(Legion::Processor::LOC_PROC));
Legion::Runtime::preregister_task_variant<tacoValidateCPUTask<T>>(registrar, "taco_validate");
}
// Register the OMP variant if present.
if (TACO_FEATURE_OPENMP) {
Legion::TaskVariantRegistrar registrar(TACO_VALIDATE_TASK, "taco_validate");
registrar.add_constraint(Legion::ProcessorConstraint(Legion::Processor::OMP_PROC));
Legion::Runtime::preregister_task_variant<tacoValidateOMPTask<T>>(registrar, "taco_validate");
}
// Register a CUDA variant if we have a CUDA build.
#ifdef TACO_USE_CUDA
registerGPUValidateTask<T>();
#endif
}

#endif // TACO_LG_VALIDATE_H
7 changes: 7 additions & 0 deletions legion/src/validate.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#include "validate.cuh"

// Register the validate task for several common types that we use.
template void registerGPUValidateTask<int32_t>();
template void registerGPUValidateTask<int64_t>();
template void registerGPUValidateTask<float>();
template void registerGPUValidateTask<double>();

0 comments on commit ba8addd

Please sign in to comment.