Fix some precompute transformation algorithm bugs that arose #480

Merged
merged 4 commits into from Jul 3, 2021
97 changes: 53 additions & 44 deletions src/index_notation/transformations.cpp
@@ -383,67 +383,76 @@ IndexStmt Precompute::apply(IndexStmt stmt, std::string* reason) const {
    Forall foralli(node);
    std::vector<IndexVar> i_vars = precompute.getIVars();

-    vector<IndexVar> forallIndexVars;
+    bool containsWhere = false;
    match(foralli,
-          function<void(const ForallNode*)>([&](const ForallNode* op) {
-            forallIndexVars.push_back(op->indexVar);
+          function<void(const WhereNode*)>([&](const WhereNode* op) {
+            containsWhere = true;
          })
    );

-    IndexStmt s = foralli.getStmt();
-    TensorVar ws = precompute.getWorkspace();
-    IndexExpr e = precompute.getExpr();
-    std::vector<IndexVar> iw_vars = precompute.getIWVars();
+    if (!containsWhere) {
+      vector<IndexVar> forallIndexVars;
+      match(foralli,
+            function<void(const ForallNode*)>([&](const ForallNode* op) {
+              forallIndexVars.push_back(op->indexVar);
+            })
+      );

-    map<IndexVar, IndexVar> substitutions;
-    taco_iassert(i_vars.size() == iw_vars.size()) << "i_vars and iw_vars lists must be the same size";
+      IndexStmt s = foralli.getStmt();
+      TensorVar ws = precompute.getWorkspace();
+      IndexExpr e = precompute.getExpr();
+      std::vector<IndexVar> iw_vars = precompute.getIWVars();

-    for (int index = 0; index < (int)i_vars.size(); index++) {
-      substitutions[i_vars[index]] = iw_vars[index];
-    }
+      map<IndexVar, IndexVar> substitutions;
+      taco_iassert(i_vars.size() == iw_vars.size()) << "i_vars and iw_vars lists must be the same size";

-    // Build consumer by replacing with temporary (in replacedStmt)
-    IndexStmt replacedStmt = replace(s, {{e, ws(i_vars) }});
-    if (replacedStmt != s) {
-      // Then modify the replacedStmt to have the correct foralls
-      // by concretizing the consumer assignment
+      for (int index = 0; index < (int)i_vars.size(); index++) {
+        substitutions[i_vars[index]] = iw_vars[index];
+      }

-      auto consumerAssignment = getConsumerAssignment(replacedStmt, ws);
-      auto consumerIndexVars = consumerAssignment.getIndexVars();
+      // Build consumer by replacing with temporary (in replacedStmt)
+      IndexStmt replacedStmt = replace(s, {{e, ws(i_vars) }});
+      if (replacedStmt != s) {
+        // Then modify the replacedStmt to have the correct foralls
+        // by concretizing the consumer assignment

-      auto producerAssignment = getProducerAssignment(ws, i_vars, iw_vars, e, substitutions);
-      auto producerIndexVars = producerAssignment.getIndexVars();
+        auto consumerAssignment = getConsumerAssignment(replacedStmt, ws);
+        auto consumerIndexVars = consumerAssignment.getIndexVars();

-      vector<IndexVar> producerForallIndexVars;
-      vector<IndexVar> consumerForallIndexVars;
-      vector<IndexVar> outerForallIndexVars;
+        auto producerAssignment = getProducerAssignment(ws, i_vars, iw_vars, e, substitutions);
+        auto producerIndexVars = producerAssignment.getIndexVars();

-      bool stopForallDistribution = false;
-      for (auto &i : util::reverse(forallIndexVars)) {
-        if (!stopForallDistribution && containsIndexVarScheduled(i_vars, i)) {
-          producerForallIndexVars.push_back(substitutions[i]);
-          consumerForallIndexVars.push_back(i);
-        } else {
-          auto consumerContains = containsIndexVarScheduled(consumerIndexVars, i);
-          auto producerContains = containsIndexVarScheduled(producerIndexVars, i);
-          if (stopForallDistribution || (producerContains && consumerContains)) {
-            outerForallIndexVars.push_back(i);
-            stopForallDistribution = true;
-          } else if (!stopForallDistribution && consumerContains) {
+        vector<IndexVar> producerForallIndexVars;
+        vector<IndexVar> consumerForallIndexVars;
+        vector<IndexVar> outerForallIndexVars;
+
+        bool stopForallDistribution = false;
+        for (auto &i : util::reverse(forallIndexVars)) {
+          if (!stopForallDistribution && containsIndexVarScheduled(i_vars, i)) {
+            producerForallIndexVars.push_back(substitutions[i]);
            consumerForallIndexVars.push_back(i);
-          } else if (!stopForallDistribution && producerContains) {
-            producerForallIndexVars.push_back(i);
+          } else {
+            auto consumerContains = containsIndexVarScheduled(consumerIndexVars, i);
+            auto producerContains = containsIndexVarScheduled(producerIndexVars, i);
+            if (stopForallDistribution || (producerContains && consumerContains)) {
+              outerForallIndexVars.push_back(i);
+              stopForallDistribution = true;
+            } else if (!stopForallDistribution && consumerContains) {
+              consumerForallIndexVars.push_back(i);
+            } else if (!stopForallDistribution && producerContains) {
+              producerForallIndexVars.push_back(i);
+            }
          }
        }
-      }

-      IndexStmt consumer = generateForalls(consumerAssignment, consumerForallIndexVars);
+        IndexStmt consumer = generateForalls(consumerAssignment, consumerForallIndexVars);

-      IndexStmt producer = generateForalls(producerAssignment, producerForallIndexVars);
-      Where where(consumer, producer);
+        IndexStmt producer = generateForalls(producerAssignment, producerForallIndexVars);
+        Where where(consumer, producer);

-      stmt = generateForalls(where, outerForallIndexVars);
-      return;
+        stmt = generateForalls(where, outerForallIndexVars);
+        return;
+      }
    }
    IndexNotationRewriter::visit(node);
  }
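For context on the new containsWhere guard: Precompute::apply rewrites a forall nest into a where(consumer, producer) pair, and a schedule may call precompute several times on the same statement, so later rewrites need to skip foralls that already sit inside a where rather than descend into the producer they just created. Below is a minimal sketch of such a schedule, closely following the tile_dotProduct_3 test added in this PR; N, the 32-wide tiles, and the workspace names are illustrative choices, not part of the fix itself.

    #include "taco.h"
    using namespace taco;

    // Tiled dot product with three stacked precompute calls (sketch mirroring
    // the new tile_dotProduct_3 test).
    void tiledDotProductSketch() {
      const int N = 1024;
      Tensor<double> A("A");
      Tensor<double> B("B", {N}, Format({Dense}));
      Tensor<double> C("C", {N}, Format({Dense}));
      for (int i = 0; i < N; i++) {
        B.insert({i}, (double) i);
        C.insert({i}, (double) i);
      }
      B.pack();
      C.pack();

      IndexVar i("i"), i_bounded("i_bounded"), i0("i0"), i1("i1");
      IndexExpr BExpr = B(i);
      IndexExpr CExpr = C(i);
      IndexExpr precomputedExpr = BExpr * CExpr;
      A() = precomputedExpr;

      IndexStmt stmt = A.getAssignment().concretize();
      TensorVar B_new("B_new", Type(Float64, {(size_t)N}), taco::dense);
      TensorVar C_new("C_new", Type(Float64, {(size_t)N}), taco::dense);
      TensorVar precomputed("precomputed", Type(Float64, {(size_t)N}), taco::dense);

      // Tile the reduction loop, hoist the whole product at the outer tile
      // loop, then hoist each operand read at the inner loop. The second and
      // third precompute calls run over a statement that already contains a
      // where, which is the kind of nesting the containsWhere check guards
      // against.
      stmt = stmt.bound(i, i_bounded, (size_t)N, BoundType::MaxExact)
                 .split(i_bounded, i0, i1, 32);
      stmt = stmt.precompute(precomputedExpr, i0, i0, precomputed);
      stmt = stmt.precompute(BExpr, i1, i1, B_new)
                 .precompute(CExpr, i1, i1, C_new);

      stmt = stmt.concretize();
      A.compile(stmt);
      A.assemble();
      A.compute();
    }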
192 changes: 182 additions & 10 deletions test/tests-workspaces.cpp
@@ -45,7 +45,7 @@ TEST(workspaces, tile_vecElemMul_NoTail) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);
}

TEST(workspaces, tile_vecElemMul_Tail1) {
@@ -83,7 +83,7 @@ TEST(workspaces, tile_vecElemMul_Tail1) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);
}

TEST(workspaces, tile_vecElemMul_Tail2) {
Expand Down Expand Up @@ -121,7 +121,7 @@ TEST(workspaces, tile_vecElemMul_Tail2) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);

// ir::IRPrinter irp = ir::IRPrinter(cout);
//
@@ -171,7 +171,7 @@ TEST(workspaces, tile_denseMatMul) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);

// ir::IRPrinter irp = ir::IRPrinter(cout);
//
@@ -218,7 +218,7 @@ TEST(workspaces, precompute2D_add) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);

}

@@ -263,7 +263,7 @@ TEST(workspaces, precompute4D_add) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);
}

TEST(workspaces, precompute4D_multireduce) {
Expand Down Expand Up @@ -305,7 +305,7 @@ TEST(workspaces, precompute4D_multireduce) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);
}

TEST(workspaces, precompute3D_TspV) {
Expand Down Expand Up @@ -344,7 +344,7 @@ TEST(workspaces, precompute3D_TspV) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);

}

@@ -388,7 +388,7 @@ TEST(workspaces, precompute3D_multipleWS) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);

}

@@ -431,6 +431,178 @@ TEST(workspaces, precompute3D_renamedIVars_TspV) {
expected.compile();
expected.assemble();
expected.compute();
-ASSERT_TENSOR_EQ(A, expected);
+ASSERT_TENSOR_EQ(expected, A);

}

TEST(workspaces, DISABLED_tile_dotProduct_1) {
// FIXME: Disabled because currently the precompute algorithm does not appropriately
// find the correct forall substmt to nest the WhereNode in after i has been
// split into i0 and i1. As an example, the first precompute below is incorrect
// since it should transform
// forall(i0, forall(i1, A() += B(i) * C(i))) -->
// forall(i0, where(forall(i1, A() += ws(i1)), forall(i1, ws(i1) += B(i) * C(i))))
//
// But currently the algorithm does
// forall(i0, forall(i1, A() += B(i) * C(i))) -->
// where(forall(i1, A() += ws(i1)), forall(i0, forall(i1, ws(i1) += B(i) * C(i))))

int N = 1024;
Tensor<double> A("A");
Tensor<double> B("B", {N}, Format({Dense}));
Tensor<double> C("C", {N}, Format({Dense}));

for (int i = 0; i < N; i++) {
B.insert({i}, (double) i);
C.insert({i}, (double) i);
}

B.pack();
C.pack();

IndexVar i("i");
IndexVar i_bounded("i_bounded");
IndexVar i0("i0"), i1("i1");
IndexExpr BExpr = B(i);
IndexExpr CExpr = C(i);
IndexExpr precomputedExpr = (BExpr) * (CExpr);
A() = precomputedExpr;

IndexStmt stmt = A.getAssignment().concretize();
TensorVar B_new("B_new", Type(Float64, {(size_t)N}), taco::dense);
TensorVar C_new("C_new", Type(Float64, {(size_t)N}), taco::dense);
TensorVar precomputed("precomputed", Type(Float64, {(size_t)N}), taco::dense);

stmt = stmt.bound(i, i_bounded, (size_t)N, BoundType::MaxExact)
.split(i_bounded, i0, i1, 32);
stmt = stmt.precompute(precomputedExpr, i1, i1, precomputed);
stmt = stmt.precompute(BExpr, i1, i1, B_new)
.precompute(CExpr, i1, i1, C_new);

stmt = stmt.concretize();

A.compile(stmt);
A.assemble();
A.compute();

ir::IRPrinter irp = ir::IRPrinter(cout);

cout << stmt << endl;

std::shared_ptr<ir::CodeGen> codegen = ir::CodeGen::init_default(cout, ir::CodeGen::ImplementationGen);
ir::Stmt compute = lower(stmt, "compute", false, true);

irp.print(compute);
cout << endl;
codegen->compile(compute, false);

Tensor<double> expected("expected");
expected() = B(i) * C(i);
expected.compile();
expected.assemble();
expected.compute();
ASSERT_TENSOR_EQ(expected, A);
}

TEST(workspaces, DISABLED_tile_dotProduct_2) {
// FIXME: This is also currently disabled since split(...) scheduling commands
// only split on the FIRST INSTANCE of an indexVar (assumes only one).
// This is wrong if the indexVar is not renamed across iw_vars since an indexVar can
// then occur on BOTH the consumer and producer side and should be split across both.

int N = 1024;
Tensor<double> A("A");
Tensor<double> B("B", {N}, Format({Dense}));
Tensor<double> C("C", {N}, Format({Dense}));

for (int i = 0; i < N; i++) {
B.insert({i}, (double) i);
C.insert({i}, (double) i);
}

B.pack();
C.pack();

IndexVar i("i");
IndexVar i_bounded("i_bounded");
IndexVar i0("i0"), i1("i1");
IndexExpr BExpr = B(i);
IndexExpr CExpr = C(i);
IndexExpr precomputedExpr = (BExpr) * (CExpr);
A() = precomputedExpr;

IndexStmt stmt = A.getAssignment().concretize();
TensorVar B_new("B_new", Type(Float64, {(size_t)N}), taco::dense);
TensorVar C_new("C_new", Type(Float64, {(size_t)N}), taco::dense);
TensorVar precomputed("precomputed", Type(Float64, {(size_t)N}), taco::dense);

stmt = stmt.precompute(precomputedExpr, i, i, precomputed);

stmt = stmt.precompute(BExpr, i, i, B_new)
.precompute(CExpr, i, i, C_new);

stmt = stmt.bound(i, i_bounded, (size_t)N, BoundType::MaxExact)
.split(i_bounded, i0, i1, 32);

stmt = stmt.concretize();

A.compile(stmt);
A.assemble();
A.compute();

Tensor<double> expected("expected");
expected() = B(i) * C(i);
expected.compile();
expected.assemble();
expected.compute();
ASSERT_TENSOR_EQ(expected, A);
}

TEST(workspaces, tile_dotProduct_3) {
int N = 1024;
Tensor<double> A("A");
Tensor<double> B("B", {N}, Format({Dense}));
Tensor<double> C("C", {N}, Format({Dense}));

for (int i = 0; i < N; i++) {
B.insert({i}, (double) i);
C.insert({i}, (double) i);
}

B.pack();
C.pack();

IndexVar i("i");
IndexVar i_bounded("i_bounded");
IndexVar i0("i0"), i1("i1");
IndexExpr BExpr = B(i);
IndexExpr CExpr = C(i);
IndexExpr precomputedExpr = (BExpr) * (CExpr);
A() = precomputedExpr;

IndexStmt stmt = A.getAssignment().concretize();
TensorVar B_new("B_new", Type(Float64, {(size_t)N}), taco::dense);
TensorVar C_new("C_new", Type(Float64, {(size_t)N}), taco::dense);
TensorVar precomputed("precomputed", Type(Float64, {(size_t)N}), taco::dense);

stmt = stmt.bound(i, i_bounded, (size_t)N, BoundType::MaxExact)
.split(i_bounded, i0, i1, 32);
stmt = stmt.precompute(precomputedExpr, i0, i0, precomputed);

stmt = stmt.precompute(BExpr, i1, i1, B_new)
.precompute(CExpr, i1, i1, C_new);


stmt = stmt.concretize();

A.compile(stmt);
A.assemble();
A.compute();

Tensor<double> expected("expected");
expected() = B(i) * C(i);
expected.compile();
expected.assemble();
expected.compute();
ASSERT_TENSOR_EQ(expected, A);
}