Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Do not internalize kernels when supporting dynamic linking #15307

Merged
merged 6 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,11 +667,13 @@ void ModuleDesc::restoreLinkageOfDirectInvokeSimdTargets() {
// the transformation safe.
static bool mustPreserveGV(const GlobalValue &GV) {
if (const Function *F = dyn_cast<Function>(&GV)) {
// When dynamic linking is supported, we internalize everything that can
// not be imported which also means that there is no point of having it
// When dynamic linking is supported, we internalize everything (except
// kernels which are the entry points from host code to device code) that
// cannot be imported which also means that there is no point of having it
// visible outside of the current module.
if (AllowDeviceImageDependencies)
return canBeImportedFunction(*F);
return F->getCallingConv() == CallingConv::SPIR_KERNEL ||
canBeImportedFunction(*F);

// Otherwise, we are being even more aggressive: SYCL modules are expected
// to be self-contained, meaning that they have no external dependencies.
Expand Down
9 changes: 6 additions & 3 deletions sycl/doc/design/SharedLibraries.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,9 @@ if the Function "can be imported". A `canBeImportedFunction` is:

1. Not an intrinsic
2. Name does not start with "__"
3. Demangled name does not start with "__"
4. Must be a `SYCL_EXTERNAL` function
3. Is not a SPIRV, SYCL, or ESIMD builtin function
LU-JOHN marked this conversation as resolved.
Show resolved Hide resolved
4. Demangled name does not start with "__"
5. Must be a `SYCL_EXTERNAL` function

More information about `SYCL_EXTERNAL` can be found in:
https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:syclexternal
Expand All @@ -173,7 +174,9 @@ the following modifications:
function. Instead the dependency is recorded in the imported symbols property list.
- An image that provides a `canBeImportedFunction` has the symbol recorded in the exported
symbols property list.
- All functions symbols that are not `canBeImportedFunction` are internalized
- All functions symbols that are not `canBeImportedFunction` and are not kernels are internalized.
Note that kernel functions should not be included in `canBeImportedFunction` since kernels
are only callable by host code, and thus would never need to be imported into a device image.



Expand Down
282 changes: 282 additions & 0 deletions sycl/test-e2e/DeviceDependencies/free_function_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
// Ensure -fsycl-allow-device-dependencies can work with free function kernels.

// REQUIRES: aspect-usm_shared_allocations
// RUN: %{build} -o %t.out -fsycl-allow-device-dependencies
// RUN: %{run} %t.out

// The name mangling for free function kernels currently does not work with PTX.
// UNSUPPORTED: cuda

#include <iostream>
#include <sycl/detail/core.hpp>
#include <sycl/ext/oneapi/free_function_queries.hpp>
#include <sycl/usm.hpp>

using namespace sycl;

void printUSM(int *usmPtr, int size) {
std::cout << "usmPtr[] = {";
for (int i = 0; i < size; i++) {
std::cout << usmPtr[i] << ", ";
}
std::cout << "}\n";
}

bool checkUSM(int *usmPtr, int size, int *Result) {
bool Pass = true;
for (int i = 0; i < size; i++) {
if (usmPtr[i] != Result[i]) {
Pass = false;
break;
}
}
if (Pass)
return true;

std::cout << "Expected = {";
for (int i = 0; i < size; i++) {
std::cout << Result[i] << ", ";
}
std::cout << "}\n";
std::cout << "Result = {";
for (int i = 0; i < size; i++) {
std::cout << usmPtr[i] << ", ";
}
std::cout << "}\n";
return false;
}

extern "C" SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::single_task_kernel)) void ff_0(int *ptr,
int start,
int end) {
for (int i = start; i <= end; i++)
ptr[i] = start + end;
}

bool test_0(queue Queue) {
constexpr int Range = 10;
int *usmPtr = malloc_shared<int>(Range, Queue);
int start = 3;
int end = 5;
int Result[Range] = {0, 0, 0, 8, 8, 8, 0, 0, 0, 0};
range<1> R1{Range};

memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.single_task([=]() {
for (int i = start; i <= end; i++)
usmPtr[i] = start + end;
});
});
Queue.wait();
bool PassA = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 0a: " << (PassA ? "PASS" : "FAIL") << std::endl;

bool PassB = false;
// TODO: Avoid using __SYCL_DEVICE_ONLY__ or give rationale with a comment
#ifndef __SYCL_DEVICE_ONLY__
LU-JOHN marked this conversation as resolved.
Show resolved Hide resolved
kernel_bundle Bundle =
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
kernel_id Kernel_id = ext::oneapi::experimental::get_kernel_id<ff_0>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.set_arg(0, usmPtr);
Handler.set_arg(1, start);
Handler.set_arg(2, end);
Handler.single_task(Kernel);
});
Queue.wait();
PassB = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 0b: " << (PassB ? "PASS" : "FAIL") << std::endl;

free(usmPtr, Queue);
#endif
return PassA && PassB;
}

// Overloaded free function definition.
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<1>))
void ff_1(int *ptr, int start, int end) {
nd_item<1> Item = ext::oneapi::this_work_item::get_nd_item<1>();
id<1> GId = Item.get_global_id();
ptr[GId.get(0)] = GId.get(0) + start + end;
}

bool test_1(queue Queue) {
constexpr int Range = 10;
int *usmPtr = malloc_shared<int>(Range, Queue);
int start = 3;
int Result[Range] = {13, 14, 15, 16, 17, 18, 19, 20, 21, 22};
nd_range<1> R1{{Range}, {1}};

memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.parallel_for(R1, [=](nd_item<1> Item) {
id<1> GId = Item.get_global_id();
usmPtr[GId.get(0)] = GId.get(0) + start + Range;
});
});
Queue.wait();
bool PassA = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 1a: " << (PassA ? "PASS" : "FAIL") << std::endl;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that our E2E tests should print anything in a positive scenario when they pass. The motivation for that is to reduce amount of I/O performed by tests and less data that has to be processed by lit framework. Logs for passed tests are hidden anyways, so they are of no use in our CI.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add TODO comment

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather immediately drop (almost) all std::cout lines, but I will leave that up to SYCL RT reviewers, who are code owners of the file


bool PassB = false;
// TODO: Avoid using __SYCL_DEVICE_ONLY__ or give rationale with a comment
#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle =
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
kernel_id Kernel_id = ext::oneapi::experimental::get_kernel_id<(
void (*)(int *, int, int))ff_1>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.set_arg(0, usmPtr);
Handler.set_arg(1, start);
Handler.set_arg(2, Range);
Handler.parallel_for(R1, Kernel);
});
Queue.wait();
PassB = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 1b: " << (PassB ? "PASS" : "FAIL") << std::endl;

free(usmPtr, Queue);
#endif
return PassA && PassB;
}

// Overloaded free function definition.
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<2>))
void ff_1(int *ptr, int start) {
int(&ptr2D)[4][4] = *reinterpret_cast<int(*)[4][4]>(ptr);
nd_item<2> Item = ext::oneapi::this_work_item::get_nd_item<2>();
id<2> GId = Item.get_global_id();
id<2> LId = Item.get_local_id();
ptr2D[GId.get(0)][GId.get(1)] = LId.get(0) + LId.get(1) + start;
}

bool test_2(queue Queue) {
constexpr int Range = 16;
int *usmPtr = malloc_shared<int>(Range, Queue);
int value = 55;
int Result[Range] = {55, 56, 55, 56, 56, 57, 56, 57,
55, 56, 55, 56, 56, 57, 56, 57};
nd_range<2> R2{range<2>{4, 4}, range<2>{2, 2}};

memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.parallel_for(R2, [=](nd_item<2> Item) {
int(&ptr2D)[4][4] = *reinterpret_cast<int(*)[4][4]>(usmPtr);
id<2> GId = Item.get_global_id();
id<2> LId = Item.get_local_id();
ptr2D[GId.get(0)][GId.get(1)] = LId.get(0) + LId.get(1) + value;
});
});
Queue.wait();
bool PassA = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 2a: " << (PassA ? "PASS" : "FAIL") << std::endl;

bool PassB = false;
// TODO: Avoid using __SYCL_DEVICE_ONLY__ or give rationale with a comment
#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle =
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
kernel_id Kernel_id =
ext::oneapi::experimental::get_kernel_id<(void (*)(int *, int))ff_1>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.set_arg(0, usmPtr);
Handler.set_arg(1, value);
Handler.parallel_for(R2, Kernel);
});
Queue.wait();
PassB = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 2b: " << (PassB ? "PASS" : "FAIL") << std::endl;

free(usmPtr, Queue);
#endif
return PassA && PassB;
}

// Templated free function definition.
template <typename T>
SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(
(ext::oneapi::experimental::nd_range_kernel<2>))
void ff_3(T *ptr, T start) {
int(&ptr2D)[4][4] = *reinterpret_cast<int(*)[4][4]>(ptr);
nd_item<2> Item = ext::oneapi::this_work_item::get_nd_item<2>();
id<2> GId = Item.get_global_id();
id<2> LId = Item.get_local_id();
ptr2D[GId.get(0)][GId.get(1)] = LId.get(0) + LId.get(1) + start;
}

// Explicit instantiation with "int*".
template void ff_3(int *ptr, int start);

bool test_3(queue Queue) {
constexpr int Range = 16;
int *usmPtr = malloc_shared<int>(Range, Queue);
int value = 55;
int Result[Range] = {55, 56, 55, 56, 56, 57, 56, 57,
55, 56, 55, 56, 56, 57, 56, 57};
nd_range<2> R2{range<2>{4, 4}, range<2>{2, 2}};

memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.parallel_for(R2, [=](nd_item<2> Item) {
int(&ptr2D)[4][4] = *reinterpret_cast<int(*)[4][4]>(usmPtr);
id<2> GId = Item.get_global_id();
id<2> LId = Item.get_local_id();
ptr2D[GId.get(0)][GId.get(1)] = LId.get(0) + LId.get(1) + value;
});
});
Queue.wait();
bool PassA = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 3a: " << (PassA ? "PASS" : "FAIL") << std::endl;

bool PassB = false;
// TODO: Avoid using __SYCL_DEVICE_ONLY__ or give rationale with a comment
#ifndef __SYCL_DEVICE_ONLY__
kernel_bundle Bundle =
get_kernel_bundle<bundle_state::executable>(Queue.get_context());
kernel_id Kernel_id = ext::oneapi::experimental::get_kernel_id<(
void (*)(int *, int))ff_3<int>>();
kernel Kernel = Bundle.get_kernel(Kernel_id);
memset(usmPtr, 0, Range * sizeof(int));
Queue.submit([&](handler &Handler) {
Handler.set_arg(0, usmPtr);
Handler.set_arg(1, value);
Handler.parallel_for(R2, Kernel);
});
Queue.wait();
PassB = checkUSM(usmPtr, Range, Result);
// TODO: Avoid printing anything if test passes to reduce I/O.
std::cout << "Test 3b: " << (PassB ? "PASS" : "FAIL") << std::endl;

free(usmPtr, Queue);
#endif
return PassA && PassB;
}

int main() {
queue Queue;

bool Pass = true;
Pass &= test_0(Queue);
Pass &= test_1(Queue);
Pass &= test_2(Queue);
Pass &= test_3(Queue);

return Pass ? 0 : 1;
}
Loading