Skip to content

atomic fences cause errors and atomic instructions are emitted without fences on nvptx64-nvidia-cuda #136480

Open
@usamoi

Description

@usamoi

I tried this code:

#![no_std]
#![allow(internal_features)]
#![feature(core_intrinsics)]

use core::sync::atomic::AtomicU32;

/// Panic handler for this `#![no_std]` crate.
///
/// Ignores the supplied panic information and never returns: it simply
/// spins in an empty loop.
#[panic_handler]
fn panic_handler(_: &core::panic::PanicInfo<'_>) -> ! {
    loop {}
}

/// Global counter, initialised to 0, that the exported `atomic_xadd_*`
/// functions in this file increment by 1.
static COUNTER: AtomicU32 = AtomicU32::new(0);

// Use the intrinsics directly here, since the inliner does not work for `AtomicU32::fetch_add`.

/// Atomically adds 1 to `COUNTER` through the `atomic_xadd_seqcst`
/// intrinsic (sequentially consistent ordering), discarding the value the
/// intrinsic returns.
#[no_mangle]
fn atomic_xadd_seqcst() {
    // Obtaining the raw pointer is safe; only the intrinsic call needs `unsafe`.
    let counter = COUNTER.as_ptr();
    // SAFETY: `counter` is derived from a `static` `AtomicU32`, so it is
    // non-null, properly aligned, and valid for the whole program.
    unsafe {
        core::intrinsics::atomic_xadd_seqcst(counter, 1);
    }
}

/// Atomically adds 1 to `COUNTER` through the `atomic_xadd_acqrel`
/// intrinsic (acquire-release ordering), discarding the value the
/// intrinsic returns.
#[no_mangle]
fn atomic_xadd_acqrel() {
    // Obtaining the raw pointer is safe; only the intrinsic call needs `unsafe`.
    let counter = COUNTER.as_ptr();
    // SAFETY: `counter` is derived from a `static` `AtomicU32`, so it is
    // non-null, properly aligned, and valid for the whole program.
    unsafe {
        core::intrinsics::atomic_xadd_acqrel(counter, 1);
    }
}

/// Atomically adds 1 to `COUNTER` through the `atomic_xadd_relaxed`
/// intrinsic (relaxed ordering), discarding the value the intrinsic
/// returns.
#[no_mangle]
fn atomic_xadd_relaxed() {
    // Obtaining the raw pointer is safe; only the intrinsic call needs `unsafe`.
    let counter = COUNTER.as_ptr();
    // SAFETY: `counter` is derived from a `static` `AtomicU32`, so it is
    // non-null, properly aligned, and valid for the whole program.
    unsafe {
        core::intrinsics::atomic_xadd_relaxed(counter, 1);
    }
}
# .cargo/config.toml
[unstable]
build-std = ["core"]

[build]
target = "nvptx64-nvidia-cuda"
rustflags = [
    "-Clinker=llvm-bitcode-linker",
    "-Clinker-flavor=llbc",
    "-Zunstable-options",
    "-Clink-arg=-O3",
    "-Ctarget-cpu=sm_90",
]

The same instructions are generated for all three functions.

//
// Generated by LLVM NVPTX Back-End
//

.version 7.8
.target sm_90
.address_size 64

	// .globl	atomic_xadd_seqcst      // -- Begin function atomic_xadd_seqcst
.global .align 4 .b8 _ZN2nv7COUNTER17h4c11a16573192bf7E[4];
                                        // @atomic_xadd_seqcst
.visible .func atomic_xadd_seqcst()
{
	.reg .b32 	%r<2>;
	.reg .b64 	%rd<2>;

// %bb.0:
	mov.u64 	%rd1, _ZN2nv7COUNTER17h4c11a16573192bf7E;
	atom.global.add.u32 	%r1, [%rd1], 1;
	ret;
                                        // -- End function
}
	// .globl	atomic_xadd_acqrel      // -- Begin function atomic_xadd_acqrel
.visible .func atomic_xadd_acqrel()     // @atomic_xadd_acqrel
{
	.reg .b32 	%r<2>;
	.reg .b64 	%rd<2>;

// %bb.0:
	mov.u64 	%rd1, _ZN2nv7COUNTER17h4c11a16573192bf7E;
	atom.global.add.u32 	%r1, [%rd1], 1;
	ret;
                                        // -- End function
}
	// .globl	atomic_xadd_relaxed     // -- Begin function atomic_xadd_relaxed
.visible .func atomic_xadd_relaxed()    // @atomic_xadd_relaxed
{
	.reg .b32 	%r<2>;
	.reg .b64 	%rd<2>;

// %bb.0:
	mov.u64 	%rd1, _ZN2nv7COUNTER17h4c11a16573192bf7E;
	atom.global.add.u32 	%r1, [%rd1], 1;
	ret;
                                        // -- End function
}

Different instructions should be emitted for each of the three memory orderings.

Link to the PTX documentation:

Parallel Synchronization and Communication Instructions: atom

Meta

rustc --version --verbose:

rustc 1.86.0-nightly (8239a37f9 2025-02-01)
binary: rustc
commit-hash: 8239a37f9c0951a037cfc51763ea52a20e71e6bd
commit-date: 2025-02-01
host: x86_64-unknown-linux-gnu
release: 1.86.0-nightly
LLVM version: 19.1.7

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-atomic — Area: Atomics, barriers, and sync primitives
    C-bug — Category: This is a bug.
    O-NVPTX — Target: the NVPTX LLVM backend for running rust on GPUs, https://llvm.org/docs/NVPTXUsage.html
    T-compiler — Relevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions