3 changes: 3 additions & 0 deletions compiler/rustc_abi/src/lib.rs
@@ -1702,6 +1702,9 @@ pub struct AddressSpace(pub u32);
impl AddressSpace {
/// LLVM's `0` address space.
pub const ZERO: Self = AddressSpace(0);
/// The address space for workgroup memory on nvptx and amdgpu.
/// See e.g. the `gpu_launch_sized_workgroup_mem` intrinsic for details.
pub const GPU_WORKGROUP: Self = AddressSpace(3);
}

/// The way we represent values to the backend
23 changes: 23 additions & 0 deletions compiler/rustc_codegen_llvm/src/declare.rs
@@ -14,6 +14,7 @@
use std::borrow::Borrow;

use itertools::Itertools;
use rustc_abi::AddressSpace;
use rustc_codegen_ssa::traits::TypeMembershipCodegenMethods;
use rustc_data_structures::fx::FxIndexSet;
use rustc_middle::ty::{Instance, Ty};
@@ -97,6 +98,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> {
)
}
}

/// Declare a global value in a specific address space.
///
/// If a global with the same name is already declared, the function returns
/// its existing `Value` instead.
pub(crate) fn declare_global_in_addrspace(
&self,
name: &str,
ty: &'ll Type,
addr_space: AddressSpace,
) -> &'ll Value {
debug!("declare_global(name={name:?}, addrspace={addr_space:?})");
unsafe {
llvm::LLVMRustGetOrInsertGlobalInAddrspace(
(**self).borrow().llmod,
name.as_c_char_ptr(),
name.len(),
ty,
addr_space.0,
)
}
}
}

impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> {
43 changes: 41 additions & 2 deletions compiler/rustc_codegen_llvm/src/intrinsic.rs
@@ -3,7 +3,8 @@ use std::ffi::c_uint;
use std::ptr;

use rustc_abi::{
Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size, WrappingRange,
AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size,
WrappingRange,
};
use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh};
use rustc_codegen_ssa::codegen_attrs::autodiff_attrs;
@@ -24,7 +25,7 @@ use rustc_session::config::CrateType;
use rustc_span::{Span, Symbol, sym};
use rustc_symbol_mangling::{mangle_internal_symbol, symbol_name_for_instance_in_crate};
use rustc_target::callconv::PassMode;
use rustc_target::spec::Os;
use rustc_target::spec::{Arch, Os};
use tracing::debug;

use crate::abi::FnAbiLlvmExt;
@@ -575,6 +576,44 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
return Ok(());
}

sym::gpu_launch_sized_workgroup_mem => {
// Generate an anonymous global per call, with these properties:
// 1. The global is in the address space for workgroup memory
// 2. It is an `external` global
// 3. It is correctly aligned for the pointee `T`
// All instances of extern addrspace(gpu_workgroup) globals are merged in the LLVM backend.
// The name is irrelevant.
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared
// FIXME: Work around an nvptx backend issue where extern globals must have a name.
let name = if tcx.sess.target.arch == Arch::Nvptx64 {
"gpu_launch_sized_workgroup_mem"
} else {
""
};
let global = self.declare_global_in_addrspace(
name,
self.type_array(self.type_i8(), 0),
AddressSpace::GPU_WORKGROUP,
);
let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() };
// The alignment of the global is used to specify the *minimum* alignment that
// must be obeyed by the GPU runtime.
// When several such global variables are used by a kernel, the maximum of their alignments is taken.
// See https://github.com/llvm/llvm-project/blob/a271d07488a85ce677674bbe8101b10efff58c95/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp#L821
let alignment = self.align_of(*inner_ty).bytes() as u32;
unsafe {
// FIXME: Work around the above issue by taking the maximum alignment if the global already existed.
if tcx.sess.target.arch == Arch::Nvptx64 {
if alignment > llvm::LLVMGetAlignment(global) {
llvm::LLVMSetAlignment(global, alignment);
}
} else {
llvm::LLVMSetAlignment(global, alignment);
}
}
self.cx().const_pointercast(global, self.type_ptr())
}

sym::amdgpu_dispatch_ptr => {
let val = self.call_intrinsic("llvm.amdgcn.dispatch.ptr", &[], &[]);
// Relying on `LLVMBuildPointerCast` to produce an addrspacecast
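The codegen test at the end of this diff pins down the IR shape that the `gpu_launch_sized_workgroup_mem` lowering above produces. On amdgpu, the two calls in the test (with `T = i32` and `T = f64`) are expected to create two anonymous external globals whose alignments come from the pointee types, roughly:

@0 = external addrspace(3) global [0 x i8], align 4
@1 = external addrspace(3) global [0 x i8], align 8

(The names `@0`/`@1` are illustrative; the test binds the compiler-chosen names with FileCheck variables.) On nvptx, the FIXME workaround instead reuses the single named global and raises its alignment to the maximum requested so far.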
7 changes: 7 additions & 0 deletions compiler/rustc_codegen_llvm/src/llvm/ffi.rs
@@ -1989,6 +1989,13 @@ unsafe extern "C" {
NameLen: size_t,
T: &'a Type,
) -> &'a Value;
pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>(
M: &'a Module,
Name: *const c_char,
NameLen: size_t,
T: &'a Type,
AddressSpace: c_uint,
) -> &'a Value;
pub(crate) fn LLVMRustGetNamedValue(
M: &Module,
Name: *const c_char,
1 change: 1 addition & 0 deletions compiler/rustc_codegen_ssa/src/mir/intrinsic.rs
@@ -111,6 +111,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
sym::abort
| sym::unreachable
| sym::cold_path
| sym::gpu_launch_sized_workgroup_mem
| sym::breakpoint
| sym::amdgpu_dispatch_ptr
| sym::assert_zero_valid
2 changes: 2 additions & 0 deletions compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -133,6 +133,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi
| sym::forget
| sym::frem_algebraic
| sym::fsub_algebraic
| sym::gpu_launch_sized_workgroup_mem
| sym::is_val_statically_known
| sym::log2f16
| sym::log2f32
@@ -299,6 +300,7 @@ pub(crate) fn check_intrinsic_type(
sym::offset_of => (1, 0, vec![tcx.types.u32, tcx.types.u32], tcx.types.usize),
sym::rustc_peek => (1, 0, vec![param(0)], param(0)),
sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()),
sym::gpu_launch_sized_workgroup_mem => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))),
sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => {
(1, 0, vec![], tcx.types.unit)
}
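For context, each entry in `check_intrinsic_type` is a tuple of (number of type parameters, number of const parameters, argument types, return type), so the new line declares `gpu_launch_sized_workgroup_mem` as generic over one type `T`, taking no arguments and returning `*mut T`.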
26 changes: 21 additions & 5 deletions compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
@@ -298,10 +298,12 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M,
.getCallee());
}

extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
const char *Name,
size_t NameLen,
LLVMTypeRef Ty) {
Comment on lines +303 to +304
Member: If this function will always create a specifically-extern global, can we document that here? The name is whatever, to me.
Author: Yes, I added a comment for that (it already did that before my change; I only added the addrspace argument).

// Get the global variable with the given name if it exists or create a new
// external global.
extern "C" LLVMValueRef
LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name,
                                     size_t NameLen, LLVMTypeRef Ty,
                                     unsigned int AddressSpace) {
Module *Mod = unwrap(M);
auto NameRef = StringRef(Name, NameLen);

@@ -312,10 +314,24 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true);
if (!GV)
GV = new GlobalVariable(*Mod, unwrap(Ty), false,
GlobalValue::ExternalLinkage, nullptr, NameRef);
GlobalValue::ExternalLinkage, nullptr, NameRef,
nullptr, GlobalValue::NotThreadLocal, AddressSpace);
return wrap(GV);
}

// Get the global variable with the given name if it exists or create a new
// external global.
extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M,
const char *Name,
size_t NameLen,
LLVMTypeRef Ty) {
Module *Mod = unwrap(M);
unsigned int AddressSpace =
Mod->getDataLayout().getDefaultGlobalsAddressSpace();
return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty,
AddressSpace);
}
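The pre-existing `LLVMRustGetOrInsertGlobal` keeps its signature but now delegates to the addrspace-aware variant, passing the default globals address space from the module's data layout, so existing callers are unaffected.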

// Must match the layout of `rustc_codegen_llvm::llvm::ffi::AttributeKind`.
enum class LLVMRustAttributeKind {
AlwaysInline = 0,
1 change: 1 addition & 0 deletions compiler/rustc_span/src/symbol.rs
@@ -1191,6 +1191,7 @@ symbols! {
global_asm,
global_registration,
globs,
gpu_launch_sized_workgroup_mem,
gt,
guard_patterns,
half_open_range_patterns,
40 changes: 40 additions & 0 deletions library/core/src/intrinsics/gpu.rs
@@ -5,6 +5,46 @@

#![unstable(feature = "gpu_intrinsics", issue = "none")]

/// Returns the pointer to workgroup memory allocated at launch-time on GPUs.
///
/// Workgroup memory is a memory region that is shared between all threads in
/// the same workgroup. It is faster to access than other memory, but pointers to it
/// are not valid outside the workgroup in which they were obtained.
/// Workgroup memory can be allocated either statically, at compile time, or
/// dynamically, when launching a gpu-kernel. `gpu_launch_sized_workgroup_mem`
/// returns the pointer to the memory that is allocated at launch-time.
/// The size of this memory can differ between launches of a gpu-kernel, depending on
/// what is specified at launch-time.
/// However, the alignment is fixed by the kernel itself, at compile-time.
///
/// The returned pointer is the start of the workgroup memory region that is
/// allocated at launch-time.
/// All calls to `gpu_launch_sized_workgroup_mem` in a workgroup, regardless of the
/// generic type, return the same address and thus alias the same memory.
/// The returned pointer is aligned to at least the alignment of `T`.
///
/// # Safety
///
/// The pointer is safe to dereference from the start (the returned pointer) up to the
/// size of workgroup memory that was specified when launching the current gpu-kernel.
/// This allocated size is not related in any way to `T`.
///
/// The user must take care of synchronizing access to workgroup memory between
/// threads in a workgroup. The usual data race requirements apply.
///
/// # Other APIs
///
/// CUDA and HIP call this dynamic shared memory, shared between threads in a block.
/// OpenCL and SYCL call this local memory, shared between threads in a work-group.
/// GLSL calls this shared memory, shared between invocations in a work group.
/// DirectX calls this groupshared memory, shared between threads in a thread-group.
#[must_use = "returns a pointer that does nothing unless used"]
#[rustc_intrinsic]
#[rustc_nounwind]
#[unstable(feature = "gpu_launch_sized_workgroup_mem", issue = "135513")]
#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))]
pub fn gpu_launch_sized_workgroup_mem<T>() -> *mut T;

/// Returns a pointer to the HSA kernel dispatch packet.
///
/// A `gpu-kernel` on amdgpu is always launched through a kernel dispatch packet.
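A usage sketch for the new intrinsic (not part of this diff): a kernel can treat the launch-sized region as an `f32` scratch array shared by its workgroup. `WORKGROUP_THREADS`, `thread_id`, and `barrier` are hypothetical placeholders for the target's thread-id and barrier mechanisms, and the launch is assumed to have requested at least `WORKGROUP_THREADS * 4` bytes of workgroup memory.

#![feature(gpu_intrinsics, gpu_launch_sized_workgroup_mem)] // gates assumed from this diff
use core::intrinsics::gpu::gpu_launch_sized_workgroup_mem;

const WORKGROUP_THREADS: usize = 256; // hypothetical workgroup size

/// Each thread writes its id, waits at a barrier, then reads the value
/// written by its "mirror" thread from the same launch-sized buffer.
unsafe fn mirror_read(thread_id: usize, barrier: impl Fn()) -> f32 {
    // Every call in the workgroup returns the same base address,
    // aligned to at least align_of::<f32>().
    let scratch: *mut f32 = gpu_launch_sized_workgroup_mem::<f32>();
    unsafe { scratch.add(thread_id).write(thread_id as f32) };
    barrier(); // placeholder for a real workgroup barrier
    unsafe { scratch.add(WORKGROUP_THREADS - 1 - thread_id).read() }
}

Because all calls alias the same region, the final read observes the write performed by the mirror thread once the barrier has ordered the accesses.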
4 changes: 4 additions & 0 deletions src/tools/tidy/src/style.rs
@@ -222,6 +222,10 @@ fn should_ignore(line: &str) -> bool {
|| static_regex!(
"\\s*//@ \\!?(count|files|has|has-dir|hasraw|matches|matchesraw|snapshot)\\s.*"
).is_match(line)
// Matching for FileCheck checks
|| static_regex!(
"\\s*// [a-zA-Z0-9-_]*:\\s.*"
).is_match(line)
}

/// Returns `true` if `line` is allowed to be longer than the normal limit.
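The new pattern above exempts FileCheck directives, which routinely exceed the line-length limit, from the style lint. For example, this line from the new codegen test below now matches it:

// amdgpu: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }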
32 changes: 32 additions & 0 deletions tests/codegen-llvm/gpu-launch-sized-workgroup-memory.rs
Member: At first I kind of wanted there to be another test that does the cross-crate version of this. Then I remembered what was discussed elsewhere: that the targets in question are pure LLVM bitcode that gets mashed together anyways, so I am not sure it would actually benefit us, and it would probably involve a ton of tedium with run-make, having considered it in more detail. So, meh.

Basically only leaving this note here to remind myself that if this turns out to go awry in the future, I can update in the direction of following this kind of instinct more often. :^)
@@ -0,0 +1,32 @@
// Checks that the GPU intrinsic to get launch-sized workgroup memory works
// and correctly aligns the `external addrspace(...) global`s over multiple calls.

//@ revisions: amdgpu nvptx
//@ compile-flags: --crate-type=rlib -Copt-level=1
//
//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900
//@ [amdgpu] needs-llvm-components: amdgpu
//@ [nvptx] compile-flags: --target nvptx64-nvidia-cuda
//@ [nvptx] needs-llvm-components: nvptx
//@ add-minicore
#![feature(intrinsics, no_core, rustc_attrs)]
#![no_core]

extern crate minicore;

#[rustc_intrinsic]
#[rustc_nounwind]
fn gpu_launch_sized_workgroup_mem<T>() -> *mut T;

// amdgpu-DAG: @[[SMALL:[^ ]+]] = external addrspace(3) global [0 x i8], align 4
// amdgpu-DAG: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8
// amdgpu: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[SMALL]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }

// nvptx: @[[BIG:[^ ]+]] = external addrspace(3) global [0 x i8], align 8
// nvptx: ret { ptr, ptr } { ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr), ptr addrspacecast (ptr addrspace(3) @[[BIG]] to ptr) }
#[unsafe(no_mangle)]
pub fn fun() -> (*mut i32, *mut f64) {
let small = gpu_launch_sized_workgroup_mem::<i32>();
let big = gpu_launch_sized_workgroup_mem::<f64>(); // Increase alignment to 8
(small, big)
}
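Note how the revisions differ: on amdgpu the two calls produce two anonymous globals with alignments 4 and 8, while on nvptx the workaround funnels both calls into the single named global, whose alignment the second call raises to 8.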