Skip to content

Commit 6006975

Browse files
committed
[HIP] Allow partial linking for -fgpu-rdc
`-fgpu-rdc` mode allows device functions call device functions in different TU. However, currently all device objects have to be linked together since only one fat binary is supported. This is time consuming for AMDGPU backend since it only supports LTO. There are use cases that objects can be divided into groups in which device functions are self-contained but host functions are not. It is desirable to link/optimize/codegen the device code and generate a fatbin for each group, whereas partially link the host code with `ld -r` or generate a static library by using the `-emit-static-lib` option of clang. This avoids linking all device code together, therefore decreases the linking time for `-fgpu-rdc`. Previously, clang emits an external symbol `__hip_fatbin` for all objects for `-fgpu-rdc`. With this patch, clang emits an unique external symbol `__hip_fatbin_{cuid}` for the fat binary for each object. When a group of objects are linked together to generate a fatbin, the symbols are merged by alias and point to the same fat binary. Each group has its own fat binary. One executable or shared library can have multiple fat binaries. Device linking is done for undefined fab binary symbols only to avoid repeated linking. `__hip_gpubin_handle` is also uniquefied and merged to avoid repeated registering. Symbol `__hip_cuid_{cuid}` is introduced to facilitate debugging and tooling. Fixes: #77018 Change-Id: Ia16ac3ddb05b66e6149288aacb0ba4a80120ad8c
1 parent 1c10821 commit 6006975

File tree

6 files changed

+278
-44
lines changed

6 files changed

+278
-44
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -760,10 +760,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
760760
// to contain the fat binary but will be populated somewhere else,
761761
// e.g. by lld through link script.
762762
FatBinStr = new llvm::GlobalVariable(
763-
CGM.getModule(), CGM.Int8Ty,
764-
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
765-
"__hip_fatbin", nullptr,
766-
llvm::GlobalVariable::NotThreadLocal);
763+
CGM.getModule(), CGM.Int8Ty,
764+
/*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
765+
"__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
766+
llvm::GlobalVariable::NotThreadLocal);
767767
cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
768768
}
769769

@@ -816,8 +816,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
816816
// thread safety of the loaded program. Therefore we can assume sequential
817817
// execution of constructor functions here.
818818
if (IsHIP) {
819-
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
820-
llvm::GlobalValue::LinkOnceAnyLinkage;
819+
auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
820+
: llvm::GlobalValue::ExternalLinkage;
821821
llvm::BasicBlock *IfBlock =
822822
llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
823823
llvm::BasicBlock *ExitBlock =
@@ -826,11 +826,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
826826
// of HIP ABI.
827827
GpuBinaryHandle = new llvm::GlobalVariable(
828828
TheModule, PtrTy, /*isConstant=*/false, Linkage,
829-
/*Initializer=*/llvm::ConstantPointerNull::get(PtrTy),
830-
"__hip_gpubin_handle");
831-
if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
832-
GpuBinaryHandle->setComdat(
833-
CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
829+
/*Initializer=*/
830+
CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
831+
CudaGpuBinary
832+
? "__hip_gpubin_handle"
833+
: "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
834834
GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
835835
// Prevent the weak symbol in different shared libraries being merged.
836836
if (Linkage != llvm::GlobalValue::InternalLinkage)

clang/lib/CodeGen/CodeGenModule.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -919,7 +919,15 @@ void CodeGenModule::Release() {
919919
llvm::ConstantArray::get(ATy, UsedArray), "__clang_gpu_used_external");
920920
addCompilerUsedGlobal(GV);
921921
}
922-
922+
if (LangOpts.HIP) {
923+
// Emit a unique ID so that host and device binaries from the same
924+
// compilation unit can be associated.
925+
auto *GV = new llvm::GlobalVariable(
926+
getModule(), Int8Ty, false, llvm::GlobalValue::ExternalLinkage,
927+
llvm::Constant::getNullValue(Int8Ty),
928+
"__hip_cuid_" + getContext().getCUIDHash());
929+
addCompilerUsedGlobal(GV);
930+
}
923931
emitLLVMUsed();
924932
if (SanStats)
925933
SanStats->finish();

clang/lib/Driver/ToolChains/HIPUtility.cpp

Lines changed: 224 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,24 @@
99
#include "HIPUtility.h"
1010
#include "CommonArgs.h"
1111
#include "clang/Driver/Compilation.h"
12+
#include "clang/Driver/Options.h"
13+
#include "llvm/ADT/StringExtras.h"
1214
#include "llvm/ADT/StringRef.h"
15+
#include "llvm/Object/Archive.h"
16+
#include "llvm/Object/ObjectFile.h"
17+
#include "llvm/Support/MD5.h"
18+
#include "llvm/Support/MemoryBuffer.h"
1319
#include "llvm/Support/Path.h"
20+
#include "llvm/Support/raw_ostream.h"
1421
#include "llvm/TargetParser/Triple.h"
22+
#include <deque>
23+
#include <set>
1524

25+
using namespace clang;
1626
using namespace clang::driver;
1727
using namespace clang::driver::tools;
1828
using namespace llvm::opt;
29+
using llvm::dyn_cast;
1930

2031
#if defined(_WIN32) || defined(_WIN64)
2132
#define NULL_FILE "nul"
@@ -36,6 +47,146 @@ static std::string normalizeForBundler(const llvm::Triple &T,
3647
: T.normalize();
3748
}
3849

50+
// Collect undefined __hip_fatbin* and __hip_gpubin_handle* symbols from all
51+
// input object or archive files.
52+
class HIPUndefinedFatBinSymbols {
53+
public:
54+
HIPUndefinedFatBinSymbols(const Compilation &C)
55+
: C(C), DiagID(C.getDriver().getDiags().getCustomDiagID(
56+
DiagnosticsEngine::Error,
57+
"Error collecting HIP undefined fatbin symbols: %0")),
58+
Quiet(C.getArgs().hasArg(options::OPT__HASH_HASH_HASH)),
59+
Verbose(C.getArgs().hasArg(options::OPT_v)) {
60+
populateSymbols();
61+
if (Verbose) {
62+
for (auto Name : FatBinSymbols)
63+
llvm::errs() << "Found undefined HIP fatbin symbol: " << Name << "\n";
64+
for (auto Name : GPUBinHandleSymbols)
65+
llvm::errs() << "Found undefined HIP gpubin handle symbol: " << Name
66+
<< "\n";
67+
}
68+
}
69+
70+
const std::set<std::string> &getFatBinSymbols() const {
71+
return FatBinSymbols;
72+
}
73+
74+
const std::set<std::string> &getGPUBinHandleSymbols() const {
75+
return GPUBinHandleSymbols;
76+
}
77+
78+
private:
79+
const Compilation &C;
80+
unsigned DiagID;
81+
bool Quiet;
82+
bool Verbose;
83+
std::set<std::string> FatBinSymbols;
84+
std::set<std::string> GPUBinHandleSymbols;
85+
const std::string FatBinPrefix = "__hip_fatbin";
86+
const std::string GPUBinHandlePrefix = "__hip_gpubin_handle";
87+
88+
void populateSymbols() {
89+
std::deque<const Action *> WorkList;
90+
std::set<const Action *> Visited;
91+
92+
for (const auto &Action : C.getActions()) {
93+
WorkList.push_back(Action);
94+
}
95+
96+
while (!WorkList.empty()) {
97+
const Action *CurrentAction = WorkList.front();
98+
WorkList.pop_front();
99+
100+
if (!CurrentAction || !Visited.insert(CurrentAction).second)
101+
continue;
102+
103+
if (const auto *IA = dyn_cast<InputAction>(CurrentAction)) {
104+
std::string ID = IA->getId().str();
105+
if (!ID.empty()) {
106+
ID = llvm::utohexstr(llvm::MD5Hash(ID), /*LowerCase=*/true);
107+
FatBinSymbols.insert(Twine(FatBinPrefix + "_" + ID).str());
108+
GPUBinHandleSymbols.insert(
109+
Twine(GPUBinHandlePrefix + "_" + ID).str());
110+
continue;
111+
}
112+
const char *Filename = IA->getInputArg().getValue();
113+
auto BufferOrErr = llvm::MemoryBuffer::getFile(Filename);
114+
// Input action could be options to linker, therefore ignore it
115+
// if cannot read it.
116+
if (!BufferOrErr)
117+
continue;
118+
119+
processInput(BufferOrErr.get()->getMemBufferRef());
120+
} else
121+
WorkList.insert(WorkList.end(), CurrentAction->getInputs().begin(),
122+
CurrentAction->getInputs().end());
123+
}
124+
}
125+
126+
void processInput(const llvm::MemoryBufferRef &Buffer) {
127+
// Try processing as object file first.
128+
auto ObjFileOrErr = llvm::object::ObjectFile::createObjectFile(Buffer);
129+
if (ObjFileOrErr) {
130+
processSymbols(**ObjFileOrErr);
131+
return;
132+
}
133+
134+
// Then try processing as archive files.
135+
llvm::consumeError(ObjFileOrErr.takeError());
136+
auto ArchiveOrErr = llvm::object::Archive::create(Buffer);
137+
if (ArchiveOrErr) {
138+
llvm::Error Err = llvm::Error::success();
139+
llvm::object::Archive &Archive = *ArchiveOrErr.get();
140+
for (auto &Child : Archive.children(Err)) {
141+
auto ChildBufOrErr = Child.getMemoryBufferRef();
142+
if (ChildBufOrErr)
143+
processInput(*ChildBufOrErr);
144+
else
145+
errorHandler(ChildBufOrErr.takeError());
146+
}
147+
148+
if (Err)
149+
errorHandler(std::move(Err));
150+
return;
151+
}
152+
153+
// Ignore other files.
154+
llvm::consumeError(ArchiveOrErr.takeError());
155+
}
156+
void processSymbols(const llvm::object::ObjectFile &Obj) {
157+
for (const auto &Symbol : Obj.symbols()) {
158+
auto FlagOrErr = Symbol.getFlags();
159+
if (!FlagOrErr) {
160+
errorHandler(FlagOrErr.takeError());
161+
continue;
162+
}
163+
164+
// Filter only undefined symbols
165+
if (!(FlagOrErr.get() & llvm::object::SymbolRef::SF_Undefined)) {
166+
continue;
167+
}
168+
169+
auto NameOrErr = Symbol.getName();
170+
if (!NameOrErr) {
171+
errorHandler(NameOrErr.takeError());
172+
continue;
173+
}
174+
llvm::StringRef Name = *NameOrErr;
175+
176+
if (Name.starts_with(FatBinPrefix))
177+
FatBinSymbols.insert(Name.str());
178+
else if (Name.starts_with(GPUBinHandlePrefix))
179+
GPUBinHandleSymbols.insert(Name.str());
180+
}
181+
}
182+
183+
void errorHandler(llvm::Error Err) {
184+
if (Quiet)
185+
return;
186+
C.getDriver().Diag(DiagID) << llvm::toString(std::move(Err));
187+
}
188+
};
189+
39190
// Construct a clang-offload-bundler command to bundle code objects for
40191
// different devices into a HIP fat binary.
41192
void HIP::constructHIPFatbinCommand(Compilation &C, const JobAction &JA,
@@ -130,26 +281,86 @@ void HIP::constructGenerateObjFileFromHIPFatBinary(
130281
auto HostTriple =
131282
C.getSingleOffloadToolChain<Action::OFK_Host>()->getTriple();
132283

284+
HIPUndefinedFatBinSymbols Symbols(C);
285+
286+
std::string PrimaryHipFatbinSymbol;
287+
std::string PrimaryGpuBinHandleSymbol;
288+
bool FoundPrimaryHipFatbinSymbol = false;
289+
bool FoundPrimaryGpuBinHandleSymbol = false;
290+
291+
std::vector<std::string> AliasHipFatbinSymbols;
292+
std::vector<std::string> AliasGpuBinHandleSymbols;
293+
294+
// Iterate through symbols to find the primary ones and collect others for
295+
// aliasing
296+
for (const auto &Symbol : Symbols.getFatBinSymbols()) {
297+
if (!FoundPrimaryHipFatbinSymbol) {
298+
PrimaryHipFatbinSymbol = Symbol;
299+
FoundPrimaryHipFatbinSymbol = true;
300+
} else {
301+
AliasHipFatbinSymbols.push_back(Symbol);
302+
}
303+
}
304+
305+
for (const auto &Symbol : Symbols.getGPUBinHandleSymbols()) {
306+
if (!FoundPrimaryGpuBinHandleSymbol) {
307+
PrimaryGpuBinHandleSymbol = Symbol;
308+
FoundPrimaryGpuBinHandleSymbol = true;
309+
} else {
310+
AliasGpuBinHandleSymbols.push_back(Symbol);
311+
}
312+
}
313+
133314
// Add MC directives to embed target binaries. We ensure that each
134315
// section and image is 16-byte aligned. This is not mandatory, but
135316
// increases the likelihood of data to be aligned with a cache block
136317
// in several main host machines.
137318
ObjStream << "# HIP Object Generator\n";
138319
ObjStream << "# *** Automatically generated by Clang ***\n";
139-
if (HostTriple.isWindowsMSVCEnvironment()) {
140-
ObjStream << " .section .hip_fatbin, \"dw\"\n";
141-
} else {
142-
ObjStream << " .protected __hip_fatbin\n";
143-
ObjStream << " .type __hip_fatbin,@object\n";
144-
ObjStream << " .section .hip_fatbin,\"a\",@progbits\n";
320+
if (FoundPrimaryGpuBinHandleSymbol) {
321+
// Define the first gpubin handle symbol
322+
if (HostTriple.isWindowsMSVCEnvironment()) {
323+
ObjStream << " .section .hip_gpubin_handle,\"dw\"\n";
324+
} else {
325+
ObjStream << " .protected " << PrimaryGpuBinHandleSymbol << "\n";
326+
ObjStream << " .type " << PrimaryGpuBinHandleSymbol << ",@object\n";
327+
ObjStream << " .section .hip_gpubin_handle,\"aw\"\n";
328+
}
329+
ObjStream << " .globl " << PrimaryGpuBinHandleSymbol << "\n";
330+
ObjStream << " .p2align 3\n"; // Align 8
331+
ObjStream << PrimaryGpuBinHandleSymbol << ":\n";
332+
ObjStream << " .zero 8\n"; // Size 8
333+
334+
// Generate alias directives for other gpubin handle symbols
335+
for (const auto &AliasSymbol : AliasGpuBinHandleSymbols) {
336+
ObjStream << " .globl " << AliasSymbol << "\n";
337+
ObjStream << " .set " << AliasSymbol << "," << PrimaryGpuBinHandleSymbol
338+
<< "\n";
339+
}
340+
}
341+
if (FoundPrimaryHipFatbinSymbol) {
342+
// Define the first fatbin symbol
343+
if (HostTriple.isWindowsMSVCEnvironment()) {
344+
ObjStream << " .section .hip_fatbin,\"dw\"\n";
345+
} else {
346+
ObjStream << " .protected " << PrimaryHipFatbinSymbol << "\n";
347+
ObjStream << " .type " << PrimaryHipFatbinSymbol << ",@object\n";
348+
ObjStream << " .section .hip_fatbin,\"a\",@progbits\n";
349+
}
350+
ObjStream << " .globl " << PrimaryHipFatbinSymbol << "\n";
351+
ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign))
352+
<< "\n";
353+
// Generate alias directives for other fatbin symbols
354+
for (const auto &AliasSymbol : AliasHipFatbinSymbols) {
355+
ObjStream << " .globl " << AliasSymbol << "\n";
356+
ObjStream << " .set " << AliasSymbol << "," << PrimaryHipFatbinSymbol
357+
<< "\n";
358+
}
359+
ObjStream << PrimaryHipFatbinSymbol << ":\n";
360+
ObjStream << " .incbin ";
361+
llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true);
362+
ObjStream << "\n";
145363
}
146-
ObjStream << " .globl __hip_fatbin\n";
147-
ObjStream << " .p2align " << llvm::Log2(llvm::Align(HIPCodeObjectAlign))
148-
<< "\n";
149-
ObjStream << "__hip_fatbin:\n";
150-
ObjStream << " .incbin ";
151-
llvm::sys::printArg(ObjStream, BundleFile, /*Quote=*/true);
152-
ObjStream << "\n";
153364
if (HostTriple.isOSLinux() && HostTriple.isOSBinFormatELF())
154365
ObjStream << " .section .note.GNU-stack, \"\", @progbits\n";
155366
ObjStream.flush();

clang/test/CodeGenCUDA/device-stub.cu

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,19 @@
5050
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
5151
// RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \
5252
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF
53-
// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
53+
// RUN: %clang_cc1 -cuid=123 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
5454
// RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF
5555

5656
// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \
5757
// RUN: -fcuda-include-gpubinary %t -o - -x hip\
5858
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN
5959

60-
// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \
60+
// RUN: %clang_cc1 -cuid=123 -triple x86_64-pc-windows-msvc -aux-triple amdgcn -emit-llvm %s \
6161
// RUN: -o - -x hip\
6262
// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,WIN,HIP,HIPNEF
6363

6464
#include "Inputs/cuda.h"
6565

66-
// HIPNEF: $__hip_gpubin_handle = comdat any
67-
6866
#ifndef NOGLOBALS
6967
// NORDC-DAG: @device_var = internal global i32
7068
// RDC-DAG: @device_var = global i32
@@ -161,7 +159,7 @@ __device__ void device_use() {
161159
// * constant unnamed string with GPU binary
162160
// CUDA: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.",
163161
// HIPEF: @[[FATBIN:.*]] = private constant{{.*}} c"GPU binary would be here.",{{.*}}align 4096
164-
// HIPNEF: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin"
162+
// HIPNEF: @[[FATBIN:__hip_fatbin_[0-9a-f]+]] = external constant i8, section ".hip_fatbin"
165163
// CUDANORDC-SAME: section ".nv_fatbin", align 8
166164
// CUDARDC-SAME: section "__nv_relfatbin", align 8
167165
// * constant struct that wraps GPU binary
@@ -177,7 +175,7 @@ __device__ void device_use() {
177175
// HIP-SAME: section ".hipFatBinSegment"
178176
// * variable to save GPU binary handle after initialization
179177
// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global ptr null
180-
// HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global ptr null
178+
// HIPNEF: @__[[PREFIX]]_gpubin_handle_{{[0-9a-f]+}} = external hidden global ptr, align 8
181179
// * constant unnamed string with NVModuleID
182180
// CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
183181
// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32

clang/test/CodeGenCUDA/host-used-device-var.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
// REQUIRES: amdgpu-registered-target
22
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \
33
// RUN: -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \
4-
// RUN: | FileCheck -check-prefix=DEV %s
4+
// RUN: -cuid=123 | FileCheck -check-prefix=DEV %s
55
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -x hip %s \
6-
// RUN: -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST %s
6+
// RUN: -std=c++17 -O3 -emit-llvm -o - -cuid=123 | FileCheck -check-prefix=HOST %s
77

88
// Negative tests.
99

@@ -187,6 +187,7 @@ public:
187187
// DEV-SAME: {{^[^@]*}} @_ZL2u3
188188
// DEV-SAME: {{^[^@]*}} @_ZZ4fun1vE11static_var1
189189
// DEV-SAME: {{^[^@]*}} @_ZZZN21TestStaticVarInLambda3funEvENKUlPcE_clES0_E4var2
190+
// DEV-SAME: {{^[^@]*}} @__hip_cuid_{{[0-9a-f]+}}
190191
// DEV-SAME: {{^[^@]*}} @constexpr_var2b
191192
// DEV-SAME: {{^[^@]*}} @inline_var
192193
// DEV-SAME: {{^[^@]*}} @u1

0 commit comments

Comments
 (0)