
Commit 97e3292

Fix hash build memory overuse (facebookincubator#10534) (#500)
Summary: Under parallel join build, each build operator currently reserves duplicate-row memory large enough to accommodate the total number of rows across the hash tables of all build operators. Instead, each build operator should reserve only enough memory for its own hash table's rows. This optimization reduced hash build operator memory usage by 10x, and we saw the total memory usage of some queries drop by 70%.

Pull Request resolved: facebookincubator#10534

Reviewed By: zacw7

Differential Revision: D60131886

Pulled By: tanjialiang

fbshipit-source-id: a8c1c777df557dfcfc754ef31164a116fdb917c3
(cherry picked from commit 3fb9657)
Co-authored-by: Jialiang Tan <[email protected]>
1 parent 4397808 commit 97e3292
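
To make the fix concrete: with N parallel build operators, the old scheme had every operator reserve for the global row total, so the aggregate reservation scaled as roughly N times the table size; the new scheme reserves per operator. A minimal sketch of the arithmetic (illustrative names and a flat per-row cost, not Velox's API):

```cpp
#include <cstdint>
#include <vector>

// Illustrative only. 'rowsPerBuild' holds each build operator's own row count
// and 'bytesPerRow' a rough per-row table cost.
// Before the fix: every one of the N build operators reserved for the total.
uint64_t oldPerOperatorReservation(
    const std::vector<uint64_t>& rowsPerBuild,
    uint64_t bytesPerRow) {
  uint64_t totalRows = 0;
  for (auto n : rowsPerBuild) {
    totalRows += n; // sum across *all* operators
  }
  return totalRows * bytesPerRow; // reserved by each operator
}

// After the fix: each operator reserves only for its own rows, so the
// aggregate reservation drops from N * total to roughly total.
uint64_t newPerOperatorReservation(uint64_t ownRows, uint64_t bytesPerRow) {
  return ownRows * bytesPerRow;
}
```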

File tree

3 files changed: +186 -75 lines

velox/exec/HashBuild.cpp

Lines changed: 90 additions & 62 deletions
@@ -678,7 +678,6 @@ bool HashBuild::finishHashBuild() {
   std::vector<HashBuild*> otherBuilds;
   otherBuilds.reserve(peers.size());
-  uint64_t numRows = table_->rows()->numRows();
   for (auto& peer : peers) {
     auto op = peer->findOperator(planNodeId());
     HashBuild* build = dynamic_cast<HashBuild*>(op);
@@ -696,13 +695,10 @@ bool HashBuild::finishHashBuild() {
           !build->stateCleared_,
           "Internal state for a peer is empty. It might have already"
           " been closed.");
-      numRows += build->table_->rows()->numRows();
     }
     otherBuilds.push_back(build);
   }
 
-  ensureTableFits(numRows);
-
   std::vector<std::unique_ptr<BaseHashTable>> otherTables;
   otherTables.reserve(peers.size());
   SpillPartitionSet spillPartitions;
@@ -723,20 +719,34 @@ bool HashBuild::finishHashBuild() {
       spiller->finishSpill(spillPartitions);
     }
   }
-  bool allowDuplicateRows = table_->rows()->nextOffset() != 0;
-  if (allowDuplicateRows) {
-    ensureNextRowVectorFits(numRows, otherBuilds);
-  }
 
   if (spiller_ != nullptr) {
     spiller_->finishSpill(spillPartitions);
     removeEmptyPartitions(spillPartitions);
   }
 
-  // TODO: re-enable parallel join build with spilling triggered after
-  // https://github.com/facebookincubator/velox/issues/3567 is fixed.
+  // TODO: Get accurate signal if parallel join build is going to be applied
+  // from hash table. Currently there is still a chance inside hash table that
+  // it might decide it is not going to trigger parallel join build.
   const bool allowParallelJoinBuild =
       !otherTables.empty() && spillPartitions.empty();
+  ensureTableFits(otherBuilds, otherTables, allowParallelJoinBuild);
+
+  SCOPE_EXIT {
+    // Make a guard to release the unused memory reservation since we have
+    // finished the merged table build. The guard makes sure we release the
+    // memory reserved for other operators even when exceptions are thrown to
+    // prevent memory leak. We cannot rely on other operator's cleanup mechanism
+    // because when exceptions are thrown, other operator's cleanup mechanism
+    // might already have finished.
+    pool()->release();
+    for (auto* build : otherBuilds) {
+      build->pool()->release();
+    }
+  };
+
+  // TODO: Re-enable parallel join build with spilling triggered after
+  // https://github.com/facebookincubator/velox/issues/3567 is fixed.
   CpuWallTiming timing;
   {
     CpuWallTimer cpuWallTimer{timing};
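
For context, SCOPE_EXIT in the hunk above is folly's scope-guard macro (folly/ScopeGuard.h): its body runs when the enclosing scope unwinds, normally or via exception, which is what guarantees the reservations are released even if the merged table build throws. A minimal standalone sketch of the pattern, with Pool as a hypothetical stand-in for a memory pool:

```cpp
#include <folly/ScopeGuard.h>

#include <iostream>
#include <stdexcept>
#include <vector>

// Hypothetical stand-in for a memory pool with an unused-reservation release.
struct Pool {
  const char* name;
  void release() { std::cout << "released " << name << "\n"; }
};

void finishBuild(Pool& self, std::vector<Pool*>& peers, bool fail) {
  // Runs at scope exit, so reservations made on behalf of peer operators are
  // returned even when the merged table build below throws.
  SCOPE_EXIT {
    self.release();
    for (auto* p : peers) {
      p->release();
    }
  };
  if (fail) {
    throw std::runtime_error("simulated parallel build failure");
  }
}
```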
@@ -757,25 +767,19 @@ bool HashBuild::finishHashBuild() {
   if (spillEnabled()) {
     stateCleared_ = true;
   }
-
-  // Release the unused memory reservation since we have finished the merged
-  // table build.
-  pool()->release();
-  if (allowDuplicateRows) {
-    for (auto* build : otherBuilds) {
-      build->pool()->release();
-    }
-  }
   return true;
 }
 
-void HashBuild::ensureTableFits(uint64_t numRows) {
+void HashBuild::ensureTableFits(
+    const std::vector<HashBuild*>& otherBuilds,
+    const std::vector<std::unique_ptr<BaseHashTable>>& otherTables,
+    bool isParallelJoin) {
   // NOTE: we don't need memory reservation if all the partitions have been
   // spilled as nothing need to be built.
-  if (!spillEnabled() || spiller_ == nullptr || spiller_->isAllSpilled() ||
-      numRows == 0) {
+  if (!spillEnabled() || spiller_ == nullptr || spiller_->isAllSpilled()) {
     return;
   }
+  VELOX_CHECK_EQ(otherBuilds.size(), otherTables.size());
 
   // Test-only spill path.
   if (testingTriggerSpill(pool()->name())) {
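
The rewritten function body (next hunk) folds the old ensureNextRowVectorFits accounting into ensureTableFits: each duplicate-row chain is costed at sizeof(char*) + sizeof(NextRowVector), peers reserve their own duplicate-row bytes only under a parallel join build, and the last build operator reserves the rest. A simplified sketch of that accounting under those assumptions (hypothetical helper with assumed sizes, not the actual member function):

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Simplified mirror of the new accounting; kDupRowOverhead stands in for
// sizeof(char*) + sizeof(NextRowVector) (sizes assumed for illustration).
constexpr uint64_t kDupRowOverhead = 8 + 24;

// Returns {bytes the last build reserves, bytes each peer reserves itself}.
std::pair<uint64_t, std::vector<uint64_t>> planReservations(
    const std::vector<uint64_t>& peerRows,
    uint64_t ownRows,
    bool parallelJoin,
    bool allowDuplicateRows) {
  uint64_t lastBuildBytes = allowDuplicateRows ? ownRows * kDupRowOverhead : 0;
  std::vector<uint64_t> peerBytes(peerRows.size(), 0);
  for (size_t i = 0; i < peerRows.size(); ++i) {
    if (!allowDuplicateRows || peerRows[i] == 0) {
      continue;
    }
    const uint64_t bytes = peerRows[i] * kDupRowOverhead;
    if (parallelJoin) {
      peerBytes[i] = bytes; // each peer reserves for its own duplicate rows
    } else {
      lastBuildBytes += bytes; // last build allocates all next-row-vectors
    }
  }
  return {lastBuildBytes, peerBytes};
}
```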
@@ -784,58 +788,82 @@ void HashBuild::ensureTableFits(uint64_t numRows) {
     return;
   }
 
-  // NOTE: reserve a bit more memory to consider the extra memory used for
-  // parallel table build operation.
-  const uint64_t bytesToReserve = table_->estimateHashTableSize(numRows) * 1.1;
+  TestValue::adjust("facebook::velox::exec::HashBuild::ensureTableFits", this);
+
+  const auto dupRowOverheadBytes = sizeof(char*) + sizeof(NextRowVector);
+
+  uint64_t totalNumRows{0};
+  uint64_t lastBuildBytesToReserve{0};
+  bool allowDuplicateRows{false};
   {
-    Operator::ReclaimableSectionGuard guard(this);
-    TestValue::adjust(
-        "facebook::velox::exec::HashBuild::ensureTableFits", this);
-    if (pool()->maybeReserve(bytesToReserve)) {
-      return;
+    std::lock_guard<std::mutex> l(mutex_);
+    const auto numRows = table_->rows()->numRows();
+    totalNumRows += numRows;
+    allowDuplicateRows = table_->rows()->nextOffset() != 0;
+    if (allowDuplicateRows) {
+      lastBuildBytesToReserve += numRows * dupRowOverheadBytes;
     }
   }
 
-  LOG(WARNING) << "Failed to reserve " << succinctBytes(bytesToReserve)
-               << " for memory pool " << pool()->name()
-               << ", usage: " << succinctBytes(pool()->usedBytes())
-               << ", reservation: " << succinctBytes(pool()->reservedBytes());
-}
+  for (auto i = 0; i < otherTables.size(); i++) {
+    auto& otherTable = otherTables[i];
+    VELOX_CHECK_NOT_NULL(otherTable);
+    auto& otherBuild = otherBuilds[i];
+    const auto& rowContainer = otherTable->rows();
+    int64_t numRows{0};
+    {
+      std::lock_guard<std::mutex> l(otherBuild->mutex_);
+      numRows = rowContainer->numRows();
+    }
+    if (numRows == 0) {
+      continue;
+    }
 
-void HashBuild::ensureNextRowVectorFits(
-    uint64_t numRows,
-    const std::vector<HashBuild*>& otherBuilds) {
-  if (!spillEnabled()) {
-    return;
+    totalNumRows += numRows;
+    if (!allowDuplicateRows) {
+      continue;
+    }
+
+    const auto dupRowBytesToReserve = numRows * dupRowOverheadBytes;
+    if (!isParallelJoin) {
+      lastBuildBytesToReserve += dupRowBytesToReserve;
+      continue;
+    }
+
+    Operator::ReclaimableSectionGuard guard(otherBuild);
+    auto* otherPool = otherBuild->pool();
+
+    // Reserve memory for memory allocations for next-row-vectors in
+    // otherBuild operators if it is parallel join build. Otherwise all
+    // next-row-vectors shall be allocated from the last build operator.
+    if (!otherPool->maybeReserve(dupRowBytesToReserve)) {
+      LOG(WARNING)
+          << "Failed to reserve " << succinctBytes(dupRowBytesToReserve)
+          << " for duplicate row memory allocation from non-last memory pool "
+          << otherPool->name()
+          << ", usage: " << succinctBytes(otherPool->usedBytes())
+          << ", reservation: " << succinctBytes(otherPool->reservedBytes());
+    }
   }
 
-  TestValue::adjust(
-      "facebook::velox::exec::HashBuild::ensureNextRowVectorFits", this);
+  if (totalNumRows == 0) {
+    return;
+  }
 
-  // The memory allocation for next-row-vectors may stuck in
-  // 'SharedArbitrator::growCapacity' when memory arbitrating is also
-  // triggered. Reserve memory for next-row-vectors to prevent this issue.
-  auto bytesToReserve = numRows * (sizeof(char*) + sizeof(NextRowVector));
+  // NOTE: reserve a bit more memory to consider the extra memory used for
+  // parallel table build operation.
+  lastBuildBytesToReserve += table_->estimateHashTableSize(totalNumRows) * 1.1;
   {
     Operator::ReclaimableSectionGuard guard(this);
-    if (!pool()->maybeReserve(bytesToReserve)) {
-      LOG(WARNING) << "Failed to reserve " << succinctBytes(bytesToReserve)
-                   << " for memory pool " << pool()->name()
-                   << ", usage: " << succinctBytes(pool()->usedBytes())
-                   << ", reservation: "
-                   << succinctBytes(pool()->reservedBytes());
-    }
-  }
-  for (auto* build : otherBuilds) {
-    Operator::ReclaimableSectionGuard guard(build);
-    if (!build->pool()->maybeReserve(bytesToReserve)) {
-      LOG(WARNING) << "Failed to reserve " << succinctBytes(bytesToReserve)
-                   << " for memory pool " << build->pool()->name()
-                   << ", usage: " << succinctBytes(build->pool()->usedBytes())
-                   << ", reservation: "
-                   << succinctBytes(build->pool()->reservedBytes());
+    if (pool()->maybeReserve(lastBuildBytesToReserve)) {
+      return;
     }
   }
+
+  LOG(WARNING) << "Failed to reserve " << succinctBytes(lastBuildBytesToReserve)
+               << " for last build memory pool " << pool()->name()
+               << ", usage: " << succinctBytes(pool()->usedBytes())
+               << ", reservation: " << succinctBytes(pool()->reservedBytes());
 }
 
 void HashBuild::postHashBuildProcess() {
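
One detail worth noting above: maybeReserve failures are logged but not fatal, since the subsequent allocations can still obtain memory through arbitration. A minimal sketch of that reserve-or-warn shape, with Pool as a hypothetical stand-in for the pool API:

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical pool: maybeReserve() returns false rather than throwing when
// the extra reservation cannot be granted.
struct Pool {
  uint64_t capacity{1 << 20};
  uint64_t reserved{0};
  bool maybeReserve(uint64_t bytes) {
    if (reserved + bytes > capacity) {
      return false;
    }
    reserved += bytes;
    return true;
  }
};

void reserveForTable(Pool& pool, uint64_t bytesToReserve) {
  if (pool.maybeReserve(bytesToReserve)) {
    return;
  }
  // Survivable: later allocations may still succeed via memory arbitration,
  // so only warn, mirroring the LOG(WARNING) in the diff.
  std::cerr << "Failed to reserve " << bytesToReserve << " bytes\n";
}
```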

velox/exec/HashBuild.h

Lines changed: 6 additions & 11 deletions
@@ -141,17 +141,12 @@ class HashBuild final : public Operator {
   // enabled.
   void ensureInputFits(RowVectorPtr& input);
 
-  // Invoked to ensure there is sufficient memory to build the join table with
-  // the specified 'numRows' if spilling is enabled. The function throws to fail
-  // the query if the memory reservation fails.
-  void ensureTableFits(uint64_t numRows);
-
-  // Invoked to ensure there is sufficient memory to build the next-row-vectors
-  // with the specified 'numRows' if spilling is enabled. The function throws to
-  // fail the query if the memory reservation fails.
-  void ensureNextRowVectorFits(
-      uint64_t numRows,
-      const std::vector<HashBuild*>& otherBuilds);
+  // Invoked to ensure there is sufficient memory to build the join table. The
+  // function throws to fail the query if the memory reservation fails.
+  void ensureTableFits(
+      const std::vector<HashBuild*>& otherBuilds,
+      const std::vector<std::unique_ptr<BaseHashTable>>& otherTables,
+      bool isParallelJoin);
 
   // Invoked to compute spill partitions numbers for each row 'input' and spill
   // rows to spiller directly if the associated partition(s) is spilling. The

velox/exec/tests/HashJoinTest.cpp

Lines changed: 90 additions & 2 deletions
@@ -7077,6 +7077,91 @@ DEBUG_ONLY_TEST_F(HashJoinTest, reclaimDuringTableBuild) {
       .run();
 }
 
+DEBUG_ONLY_TEST_F(HashJoinTest, exceptionDuringFinishJoinBuild) {
+  // This test makes sure there is no memory leak when exceptions are thrown
+  // while preparing the join table in parallel.
+  auto memoryManager = memory::memoryManager();
+  const auto& arbitrator = memoryManager->arbitrator();
+  const uint64_t numDrivers = 2;
+  const auto expectedFreeCapacityBytes = arbitrator->stats().freeCapacityBytes;
+
+  const uint64_t numBuildSideRows = 500;
+  auto buildKeyVector = makeFlatVector<int64_t>(
+      numBuildSideRows,
+      [](vector_size_t row) { return folly::Random::rand64(); });
+  auto buildSideVector =
+      makeRowVector({"b0", "b1"}, {buildKeyVector, buildKeyVector});
+  std::vector<RowVectorPtr> buildSideVectors;
+  for (int i = 0; i < numDrivers; ++i) {
+    buildSideVectors.push_back(buildSideVector);
+  }
+  createDuckDbTable("build", buildSideVectors);
+
+  const uint64_t numProbeSideRows = 10;
+  auto probeKeyVector = makeFlatVector<int64_t>(
+      numProbeSideRows,
+      [&](vector_size_t row) { return buildKeyVector->valueAt(row); });
+  auto probeSideVector =
+      makeRowVector({"p0", "p1"}, {probeKeyVector, probeKeyVector});
+  std::vector<RowVectorPtr> probeSideVectors;
+  for (int i = 0; i < numDrivers; ++i) {
+    probeSideVectors.push_back(probeSideVector);
+  }
+  createDuckDbTable("probe", probeSideVectors);
+
+  ASSERT_EQ(arbitrator->stats().freeCapacityBytes, expectedFreeCapacityBytes);
+
+  // We set the task to fail right before we reserve memory for other
+  // operators. We rely on the driver suspension before parallel join build to
+  // throw exceptions (suspension on an already terminated task throws).
+  SCOPED_TESTVALUE_SET(
+      "facebook::velox::exec::HashBuild::ensureTableFits",
+      std::function<void(HashBuild*)>([&](HashBuild* buildOp) {
+        try {
+          VELOX_FAIL("Simulated failure");
+        } catch (VeloxException& e) {
+          buildOp->testingOperatorCtx()->task()->setError(
+              std::current_exception());
+        }
+      }));
+
+  std::vector<RowVectorPtr> probeInput = {probeSideVector};
+  std::vector<RowVectorPtr> buildInput = {buildSideVector};
+  auto planNodeIdGenerator = std::make_shared<core::PlanNodeIdGenerator>();
+  const auto spillDirectory = exec::test::TempDirectoryPath::create();
+
+  ASSERT_EQ(arbitrator->stats().freeCapacityBytes, expectedFreeCapacityBytes);
+  VELOX_ASSERT_THROW(
+      AssertQueryBuilder(duckDbQueryRunner_)
+          .spillDirectory(spillDirectory->getPath())
+          .config(core::QueryConfig::kSpillEnabled, true)
+          .config(core::QueryConfig::kJoinSpillEnabled, true)
+          .queryCtx(
+              newQueryCtx(memoryManager, executor_.get(), kMemoryCapacity))
+          .maxDrivers(numDrivers)
+          .plan(PlanBuilder(planNodeIdGenerator)
+                    .values(probeInput, true)
+                    .hashJoin(
+                        {"p0"},
+                        {"b0"},
+                        PlanBuilder(planNodeIdGenerator)
+                            .values(buildInput, true)
+                            .planNode(),
+                        "",
+                        {"p0", "p1", "b0", "b1"},
+                        core::JoinType::kInner)
+                    .planNode())
+          .assertResults(
+              "SELECT probe.p0, probe.p1, build.b0, build.b1 FROM probe "
+              "INNER JOIN build ON probe.p0 = build.b0"),
+      "Simulated failure");
+  // This test uses an on-demand created memory manager instead of the global
+  // one. We need to make sure any used memory gets cleaned up before exiting
+  // the scope.
+  waitForAllTasksToBeDeleted();
+  ASSERT_EQ(arbitrator->stats().freeCapacityBytes, expectedFreeCapacityBytes);
+}
+
 DEBUG_ONLY_TEST_F(HashJoinTest, arbitrationTriggeredDuringParallelJoinBuild) {
   std::unique_ptr<memory::MemoryManager> memoryManager = createMemoryManager();
   const auto& arbitrator = memoryManager->arbitrator();
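
The SCOPED_TESTVALUE_SET in the test above installs a callback that fires when execution reaches the matching TestValue::adjust call inside ensureTableFits, letting the test inject a failure at exactly that point. A simplified plain-C++ sketch of the hook pattern (not Velox's actual TestValue implementation):

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

// Simplified registry: production code calls adjust() at named points; a test
// installs a callback for a name to inject behavior at that exact point.
std::unordered_map<std::string, std::function<void(void*)>> testHooks;

void adjust(const std::string& point, void* op) {
  auto it = testHooks.find(point);
  if (it != testHooks.end()) {
    it->second(op); // e.g. simulate a failure, as the test above does
  }
}

int main() {
  testHooks["HashBuild::ensureTableFits"] = [](void*) {
    std::cout << "injected at ensureTableFits\n";
  };
  adjust("HashBuild::ensureTableFits", nullptr);
}
```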
@@ -7165,8 +7250,11 @@ DEBUG_ONLY_TEST_F(HashJoinTest, arbitrationTriggeredByEnsureJoinTableFit) {
       .injectSpill(false)
       .verifier([&](const std::shared_ptr<Task>& task, bool /*unused*/) {
         auto opStats = toOperatorStats(task->taskStats());
-        ASSERT_GT(opStats.at("HashProbe").spilledBytes, 0);
-        ASSERT_GT(opStats.at("HashBuild").spilledBytes, 0);
+        ASSERT_GT(
+            opStats.at("HashBuild")
+                .runtimeStats["memoryArbitrationWallNanos"]
+                .count,
+            0);
       })
       .run();
 }
