Skip to content

Commit

Permalink
[NATIVECPU] Fix local scope module variables for native cpu
Browse files Browse the repository at this point in the history
Although local scope variables inside the kernel are less common in
SYCL, they can happen with hierarchical.

This fixes the problem by adding a pass to replace the local scope variables
which start life as globals with a struct which is allocated on the
stack.

Additionally, this required updating of the code which renames and removes
kernel based on wrappers and vecz success. To simplify this we run the
OCK utility pass TransferKernelMetadata which adds metadata to store
the original kernel name. This in turn simplifies this code significantly.
  • Loading branch information
coldav committed Sep 4, 2024
1 parent 0686208 commit ff92c90
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 37 deletions.
4 changes: 4 additions & 0 deletions llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
#include "compiler/utils/builtin_info.h"
#include "compiler/utils/define_mux_builtins_pass.h"
#include "compiler/utils/device_info.h"
#include "compiler/utils/encode_kernel_metadata_pass.h"
#include "compiler/utils/prepare_barriers_pass.h"
#include "compiler/utils/replace_local_module_scope_variables_pass.h"
#include "compiler/utils/sub_group_analysis.h"
#include "compiler/utils/work_item_loops_pass.h"
#include "vecz/pass.h"
Expand Down Expand Up @@ -60,6 +62,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
OptimizationLevel OptLevel) {
MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
#ifdef NATIVECPU_USE_OCK
MPM.addPass(compiler::utils::TransferKernelMetadataPass());
// Always enable vectorizer, unless explictly disabled or -O0 is set.
if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) {
MAM.registerPass([] { return vecz::TargetInfoAnalysis(); });
Expand Down Expand Up @@ -87,6 +90,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); });
MPM.addPass(compiler::utils::PrepareBarriersPass());
MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
MPM.addPass(compiler::utils::ReplaceLocalModuleScopeVariablesPass());
MPM.addPass(AlwaysInlinerPass());
#endif
MPM.addPass(PrepareSYCLNativeCPUPass());
Expand Down
48 changes: 11 additions & 37 deletions llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,47 +338,21 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
SmallSet<Function *, 5> RemovableFuncs;
SmallVector<Function *, 5> WrapperFuncs;

// Retrieve the wrapper functions created by the WorkItemLoop pass.
for (auto &OldF : OldKernels) {
std::optional<compiler::utils::LinkMetadataResult> VeczR =
compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
if (VeczR && VeczR.value().first) {
WrapperFuncs.push_back(OldF);
} else {
auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
if (Name != OldF->getName()) {
WrapperFuncs.push_back(OldF);
}
}
}

for (auto &OldF : WrapperFuncs) {
// If vectorization occurred, at this point we have a wrapper function
// that runs the vectorized kernel and peels using the scalar kernel. We
// make it so this wrapper steals the original kernel name.
std::optional<compiler::utils::LinkMetadataResult> VeczR =
compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
if (VeczR && VeczR.value().first) {
auto ScalarF = VeczR.value().first;
OldF->takeName(ScalarF);
if (ScalarF->use_empty())
RemovableFuncs.insert(ScalarF);
} else {
// The WorkItemLoops pass created a wrapper function for the original
// kernel. If we have a kernel named foo(), the wrapper will be called
// foo-wrapper(), and will have the original kernel name retrieved by
// getBaseFnNameOrFnName. We set the name of the wrapper function
// to the original kernel name and add the original kernel to the
// list of functions that can be removed from the module.
auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
Function *OrigF = M.getFunction(Name);
// that runs the vectorized kernel and peels using the scalar kernel.
// There may also be a wrapper for local variables replacement. We make it
// so this wrapper steals the original kernel name. Otherwise we will have
// a wrapper function from the work item loops. In this case we also steal
// the original kernel name.
auto Name = compiler::utils::getOrigFnName(*OldF);
Function *OrigF = M.getFunction(Name);
if (Name != OldF->getName()) {
if (OrigF != nullptr) {
// The original kernel is inlined by the WorkItemLoops
// pass if it contained barriers or group collectives, otherwise
// we don't want to (and can't) remove it.
if (OrigF->use_empty())
RemovableFuncs.insert(OrigF);
OldF->takeName(OrigF);
if (OrigF->use_empty()) {
RemovableFuncs.insert(OrigF);
}
} else {
OldF->setName(Name);
}
Expand Down
40 changes: 40 additions & 0 deletions sycl/test/check_device_code/native_cpu/local_module_scope.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#include "sycl.hpp"

// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s

// Check that local types is created and place on the stack
// We also check that the attribute mux-orig-fn is created as this is needed to
// find the original function after this pass is run

// CHECK: %localVarTypes = type { ptr addrspace(1) }
// CHECK: define void @_ZTS4TestILi1ELi4EiE.NativeCPUKernel{{.*}} #[[ATTR:[0-9]*]]
// CHECK: alloca %localVarTypes
// CHECK: attributes #[[ATTR]] = {{.*}} "mux-orig-fn"="_ZTS4TestILi1ELi4EiE"
template <int dims, int size, typename T = int>
struct Test;

int main() {
sycl::queue queue;

constexpr int dims = 1;
constexpr int size = 4;

std::array<int, size> data;

const auto range = sycl::range<dims>(size);
const auto range_wg = sycl::range<dims>(1);
{
sycl::buffer<int, dims> buf(data.data(), range);

queue.submit([&](sycl::handler& cgh) {
auto acc = sycl::accessor(buf, cgh, sycl::write_only);
cgh.parallel_for_work_group<Test<dims, size>>(
range, range_wg,
[=](auto group) {
acc[group.get_group_id()] = 42;
});
});
queue.wait_and_throw();
}
return 0;
}

0 comments on commit ff92c90

Please sign in to comment.