Skip to content

Commit 7f8ae6d

Browse files
committed
wip cuda13 fixes
1 parent 04b473f commit 7f8ae6d

18 files changed

Lines changed: 66 additions & 34 deletions

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Lines changed: 49 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -457,9 +457,8 @@ void printVersion(raw_ostream &OS) {
457457
}
458458

459459
namespace nvptx {
460-
Expected<StringRef>
461-
fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
462-
const ArgList &Args) {
460+
Expected<StringRef> fatbinary(ArrayRef<OffloadingImage> Images,
461+
const ArgList &Args) {
463462
llvm::TimeTraceScope TimeScope("NVPTX fatbinary");
464463
// NVPTX uses the fatbinary program to bundle the linked images.
465464
Expected<std::string> FatBinaryPath =
@@ -481,9 +480,26 @@ fatbinary(ArrayRef<std::pair<StringRef, StringRef>> InputFiles,
481480
CmdArgs.push_back(Triple.isArch64Bit() ? "-64" : "-32");
482481
CmdArgs.push_back("--create");
483482
CmdArgs.push_back(*TempFileOrErr);
484-
for (const auto &[File, Arch] : InputFiles)
485-
CmdArgs.push_back(
486-
Args.MakeArgString("--image=profile=" + Arch + ",file=" + File));
483+
for (const OffloadingImage &Image : Images) {
484+
StringRef File = Image.Image->getBufferIdentifier();
485+
StringRef Arch = Image.StringData.lookup("arch");
486+
487+
// Determine the kind based on image type
488+
const char *Kind = "elf";
489+
if (Image.TheImageKind == ImageKind::IMG_PTX)
490+
Kind = "ptx";
491+
492+
// Extract numeric SM value from arch
493+
// Arch can be "sm_75", "compute_75", or just "75"
494+
StringRef SMValue = Arch;
495+
if (Arch.starts_with("sm_"))
496+
SMValue = Arch.drop_front(3);
497+
else if (Arch.starts_with("compute_"))
498+
SMValue = Arch.drop_front(8);
499+
500+
CmdArgs.push_back(Args.MakeArgString("--image3=kind=" + Twine(Kind) +
501+
",sm=" + SMValue + ",file=" + File));
502+
}
487503

488504
if (Error Err = executeCommands(*FatBinaryPath, CmdArgs))
489505
return std::move(Err);
@@ -1992,12 +2008,7 @@ bundleSYCL(ArrayRef<OffloadingImage> Images) {
19922008

19932009
Expected<SmallVector<std::unique_ptr<MemoryBuffer>>>
19942010
bundleCuda(ArrayRef<OffloadingImage> Images, const ArgList &Args) {
1995-
SmallVector<std::pair<StringRef, StringRef>, 4> InputFiles;
1996-
for (const OffloadingImage &Image : Images)
1997-
InputFiles.emplace_back(std::make_pair(Image.Image->getBufferIdentifier(),
1998-
Image.StringData.lookup("arch")));
1999-
2000-
auto FileOrErr = nvptx::fatbinary(InputFiles, Args);
2011+
auto FileOrErr = nvptx::fatbinary(Images, Args);
20012012
if (!FileOrErr)
20022013
return FileOrErr.takeError();
20032014

@@ -2279,7 +2290,7 @@ linkAndWrapDeviceFiles(ArrayRef<SmallVector<OffloadFile>> LinkerInputFiles,
22792290
}
22802291
for (size_t I = 0, E = SplitModules.size(); I != E; ++I) {
22812292
SmallVector<StringRef> Files = {SplitModules[I].ModuleFilePath};
2282-
SmallVector<std::pair<StringRef, StringRef>, 4> BundlerInputFiles;
2293+
SmallVector<OffloadingImage, 4> BundlerImages;
22832294
auto ClangOutputOrErr =
22842295
linkDevice(Files, LinkerArgs, true /* IsSYCLKind */,
22852296
CompileLinkOptionsOrErr->first);
@@ -2292,14 +2303,35 @@ linkAndWrapDeviceFiles(ArrayRef<SmallVector<OffloadFile>> LinkerInputFiles,
22922303
nvptx::ptxas(*ClangOutputOrErr, LinkerArgs, Arch);
22932304
if (!PtxasOutputOrErr)
22942305
return PtxasOutputOrErr.takeError();
2295-
BundlerInputFiles.emplace_back(*ClangOutputOrErr, VirtualArch);
2296-
BundlerInputFiles.emplace_back(*PtxasOutputOrErr, Arch);
2297-
auto BundledFileOrErr =
2298-
nvptx::fatbinary(BundlerInputFiles, LinkerArgs);
2306+
2307+
// Create OffloadingImage for PTX output
2308+
OffloadingImage PtxImage;
2309+
PtxImage.TheImageKind = ImageKind::IMG_PTX;
2310+
PtxImage.TheOffloadKind = OffloadKind::OFK_Cuda;
2311+
PtxImage.StringData["arch"] = VirtualArch;
2312+
auto PtxBuffer = MemoryBuffer::getFile(*ClangOutputOrErr);
2313+
if (!PtxBuffer)
2314+
return createFileError(*ClangOutputOrErr, PtxBuffer.getError());
2315+
PtxImage.Image = std::move(*PtxBuffer);
2316+
BundlerImages.push_back(std::move(PtxImage));
2317+
2318+
// Create OffloadingImage for Cubin output
2319+
OffloadingImage CubinImage;
2320+
CubinImage.TheImageKind = ImageKind::IMG_Cubin;
2321+
CubinImage.TheOffloadKind = OffloadKind::OFK_Cuda;
2322+
CubinImage.StringData["arch"] = Arch;
2323+
auto CubinBuffer = MemoryBuffer::getFile(*PtxasOutputOrErr);
2324+
if (!CubinBuffer)
2325+
return createFileError(*PtxasOutputOrErr, CubinBuffer.getError());
2326+
CubinImage.Image = std::move(*CubinBuffer);
2327+
BundlerImages.push_back(std::move(CubinImage));
2328+
2329+
auto BundledFileOrErr = nvptx::fatbinary(BundlerImages, LinkerArgs);
22992330
if (!BundledFileOrErr)
23002331
return BundledFileOrErr.takeError();
23012332
SplitModules[I].ModuleFilePath = *BundledFileOrErr;
23022333
} else if (Triple.isAMDGCN()) {
2334+
SmallVector<std::pair<StringRef, StringRef>, 4> BundlerInputFiles;
23032335
BundlerInputFiles.emplace_back(*ClangOutputOrErr, Arch);
23042336
auto BundledFileOrErr =
23052337
amdgcn::fatbinary(BundlerInputFiles, LinkerArgs);

sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %{build} -O3 -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
1+
// RUN: %{build} -O3 -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
22
// RUN: %{run} %t.out
33

44
// NOTE: Tests fetch_add for acquire and release memory ordering.

sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %{build} -O3 -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
1+
// RUN: %{build} -O3 -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
22
// RUN: %{run} %t.out
33

44
#include "atomic_memory_order.h"

sycl/test-e2e/GroupAlgorithm/root_group.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// XFAIL: (opencl && !cpu)
33
// XFAIL-TRACKER: https://github.com/intel/llvm/issues/14641
44

5-
// RUN: %{build} -I . -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
5+
// RUN: %{build} -I . -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
66
// RUN: %{run} %t.out
77

88
// Disabled temporarily while investigation into the failure is ongoing.

sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
// REQUIRES: target-nvidia
10-
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %t.out
10+
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
1111
// RUN: %{run} %t.out
1212
//
1313
// This tests the unified matrix extension interfaces for the cuda backend.

sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
// REQUIRES: target-nvidia
10-
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_72 -o %t.out
10+
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out
1111
// RUN: %{run} %t.out
1212
//
1313
// This tests the unified matrix extension interfaces for the cuda backend.

sycl/test-e2e/Reduction/reduction_range_1d_dw.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %{build} -DENABLE_64_BIT=false -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
1+
// RUN: %{build} -DENABLE_64_BIT=false -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
22
// RUN: %{run} %t.out
33

44
#include "reduction_utils.hpp"
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %{build} -DENABLE_64_BIT=true -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
1+
// RUN: %{build} -DENABLE_64_BIT=true -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
22
// RUN: %{run} %t.out
33

44
#include "reduction_range_1d_dw.cpp"

sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
1+
// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
22
// RUN: %{run} %t.out
33

44
// This test performs basic checks of parallel_for(range<1>, reduction, func)

sycl/test-e2e/Reduction/reduction_range_1d_rw.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %}
1+
// RUN: %{build} -o %t.out %if target-nvidia %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 %}
22
// RUN: %{run} %t.out
33

44
// This test performs basic checks of parallel_for(range<1>, reduction, func)

0 commit comments

Comments (0)