From 2378959977922d95e36ed727000c14ac866cd731 Mon Sep 17 00:00:00 2001 From: Flakebi Date: Thu, 4 Sep 2025 00:50:22 +0200 Subject: [PATCH] Add intrinsic for dynamic shared memory Shared memory is a memory region that is shared between all threads in a thread block/workgroup on GPUs. Dynamic shared memory is in that memory region, though the allocated size is specified late, when launching a kernel. Shared memory in amdgpu and nvptx lives in address space 3. Dynamic shared memory is implemented by creating an external global variable in address space 3. The global is declared with size 0, as the actual size is only known at runtime. It is defined behavior in LLVM to access an external global outside the defined size. As far as I know, there is no similar way to get the allocated size of dynamic shared memory on amdgpu and nvptx, so users have to pass this out-of-band or rely on target specific ways. --- compiler/rustc_abi/src/lib.rs | 2 ++ compiler/rustc_codegen_llvm/src/declare.rs | 23 ++++++++++++++++ compiler/rustc_codegen_llvm/src/intrinsic.rs | 20 +++++++++++++- compiler/rustc_codegen_llvm/src/llvm/ffi.rs | 7 +++++ .../rustc_codegen_ssa/src/mir/intrinsic.rs | 1 + .../rustc_hir_analysis/src/check/intrinsic.rs | 2 ++ .../rustc_llvm/llvm-wrapper/RustWrapper.cpp | 21 +++++++++++---- compiler/rustc_span/src/symbol.rs | 1 + library/core/src/intrinsics/mod.rs | 17 ++++++++++++ src/bootstrap/src/core/build_steps/compile.rs | 8 ++++++ src/build_helper/src/targets.rs | 1 + .../src/directives/directive_names.rs | 1 + tests/codegen-llvm/dynamic_shared_memory.rs | 27 +++++++++++++++++++ 13 files changed, 125 insertions(+), 6 deletions(-) create mode 100644 tests/codegen-llvm/dynamic_shared_memory.rs diff --git a/compiler/rustc_abi/src/lib.rs b/compiler/rustc_abi/src/lib.rs index 369874521e57e..0804310ac7078 100644 --- a/compiler/rustc_abi/src/lib.rs +++ b/compiler/rustc_abi/src/lib.rs @@ -1716,6 +1716,8 @@ pub struct AddressSpace(pub u32); impl AddressSpace { /// LLVM's `0`
address space. pub const ZERO: Self = AddressSpace(0); + /// The address space for shared memory on nvptx and amdgpu. + pub const SHARED: Self = AddressSpace(3); } /// The way we represent values to the backend diff --git a/compiler/rustc_codegen_llvm/src/declare.rs b/compiler/rustc_codegen_llvm/src/declare.rs index 960a895a2031c..96c09702eb1ce 100644 --- a/compiler/rustc_codegen_llvm/src/declare.rs +++ b/compiler/rustc_codegen_llvm/src/declare.rs @@ -14,6 +14,7 @@ use std::borrow::Borrow; use itertools::Itertools; +use rustc_abi::AddressSpace; use rustc_codegen_ssa::traits::TypeMembershipCodegenMethods; use rustc_data_structures::fx::FxIndexSet; use rustc_middle::ty::{Instance, Ty}; @@ -99,6 +100,28 @@ impl<'ll, CX: Borrow<SCx<'ll>>> GenericCx<'ll, CX> { ) } } + + /// Declare a global value in a specific address space. + /// + /// If there’s a value with the same name already declared, the function will + /// return its Value instead. + pub(crate) fn declare_global_in_addrspace( + &self, + name: &str, + ty: &'ll Type, + addr_space: AddressSpace, + ) -> &'ll Value { + debug!("declare_global(name={name:?}, addrspace={addr_space:?})"); + unsafe { + llvm::LLVMRustGetOrInsertGlobalInAddrspace( + (**self).borrow().llmod, + name.as_c_char_ptr(), + name.len(), + ty, + addr_space.0, + ) + } + } } impl<'ll, 'tcx> CodegenCx<'ll, 'tcx> { diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 49d3dedbeabdf..61e262c981068 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -1,7 +1,9 @@ use std::assert_matches::assert_matches; use std::cmp::Ordering; -use rustc_abi::{Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size}; +use rustc_abi::{ + AddressSpace, Align, BackendRepr, ExternAbi, Float, HasDataLayout, Primitive, Size, +}; use rustc_codegen_ssa::base::{compare_simd_types, wants_msvc_seh, wants_wasm_eh}; use
rustc_codegen_ssa::codegen_attrs::autodiff_attrs; use rustc_codegen_ssa::common::{IntPredicate, TypeKind}; @@ -532,6 +534,22 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> { return Ok(()); } + sym::dynamic_shared_memory => { + let global = self.declare_global_in_addrspace( + "dynamic_shared_memory", + self.type_array(self.type_i8(), 0), + AddressSpace::SHARED, + ); + let ty::RawPtr(inner_ty, _) = result.layout.ty.kind() else { unreachable!() }; + let alignment = self.align_of(*inner_ty).bytes() as u32; + unsafe { + if alignment > llvm::LLVMGetAlignment(global) { + llvm::LLVMSetAlignment(global, alignment); + } + } + self.cx().const_pointercast(global, self.type_ptr()) + } + _ if name.as_str().starts_with("simd_") => { // Unpack non-power-of-2 #[repr(packed, simd)] arguments. // This gives them the expected layout of a regular #[repr(simd)] vector. diff --git a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs index b66fc157b3cb2..2639b71caf523 100644 --- a/compiler/rustc_codegen_llvm/src/llvm/ffi.rs +++ b/compiler/rustc_codegen_llvm/src/llvm/ffi.rs @@ -1907,6 +1907,13 @@ unsafe extern "C" { NameLen: size_t, T: &'a Type, ) -> &'a Value; + pub(crate) fn LLVMRustGetOrInsertGlobalInAddrspace<'a>( + M: &'a Module, + Name: *const c_char, + NameLen: size_t, + T: &'a Type, + AddressSpace: c_uint, + ) -> &'a Value; pub(crate) fn LLVMRustInsertPrivateGlobal<'a>(M: &'a Module, T: &'a Type) -> &'a Value; pub(crate) fn LLVMRustGetNamedValue( M: &Module, diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs index 3c667b8e88203..18dc839901a39 100644 --- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs @@ -110,6 +110,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { sym::abort | sym::unreachable | sym::cold_path + | sym::dynamic_shared_memory | sym::breakpoint | 
sym::assert_zero_valid | sym::assert_mem_uninitialized_valid diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs index cfc6bc2f3a0a9..4af40d336172c 100644 --- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs +++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs @@ -74,6 +74,7 @@ fn intrinsic_operation_unsafety(tcx: TyCtxt<'_>, intrinsic_id: LocalDefId) -> hi | sym::align_of | sym::needs_drop | sym::caller_location + | sym::dynamic_shared_memory | sym::add_with_overflow | sym::sub_with_overflow | sym::mul_with_overflow @@ -213,6 +214,7 @@ pub(crate) fn check_intrinsic_type( } sym::rustc_peek => (1, 0, vec![param(0)], param(0)), sym::caller_location => (0, 0, vec![], tcx.caller_location_ty()), + sym::dynamic_shared_memory => (1, 0, vec![], Ty::new_mut_ptr(tcx, param(0))), sym::assert_inhabited | sym::assert_zero_valid | sym::assert_mem_uninitialized_valid => { (1, 0, vec![], tcx.types.unit) } diff --git a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp index 361a5f765510f..25c49b2ee8a6f 100644 --- a/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp +++ b/compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp @@ -209,10 +209,10 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertFunction(LLVMModuleRef M, .getCallee()); } -extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, - const char *Name, - size_t NameLen, - LLVMTypeRef Ty) { +extern "C" LLVMValueRef +LLVMRustGetOrInsertGlobalInAddrspace(LLVMModuleRef M, const char *Name, + size_t NameLen, LLVMTypeRef Ty, + unsigned AddressSpace) { Module *Mod = unwrap(M); auto NameRef = StringRef(Name, NameLen); @@ -223,10 +223,21 @@ extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, GlobalVariable *GV = Mod->getGlobalVariable(NameRef, true); if (!GV) GV = new GlobalVariable(*Mod, unwrap(Ty), false, - GlobalValue::ExternalLinkage, nullptr, NameRef); + GlobalValue::ExternalLinkage, nullptr, 
NameRef, + nullptr, GlobalValue::NotThreadLocal, AddressSpace); return wrap(GV); } +extern "C" LLVMValueRef LLVMRustGetOrInsertGlobal(LLVMModuleRef M, + const char *Name, + size_t NameLen, + LLVMTypeRef Ty) { + Module *Mod = unwrap(M); + unsigned AddressSpace = Mod->getDataLayout().getDefaultGlobalsAddressSpace(); + return LLVMRustGetOrInsertGlobalInAddrspace(M, Name, NameLen, Ty, + AddressSpace); +} + extern "C" LLVMValueRef LLVMRustInsertPrivateGlobal(LLVMModuleRef M, LLVMTypeRef Ty) { return wrap(new GlobalVariable(*unwrap(M), unwrap(Ty), false, diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 77260d07c9950..7980d541555e1 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -903,6 +903,7 @@ symbols! { dyn_star, dyn_trait, dynamic_no_pic: "dynamic-no-pic", + dynamic_shared_memory, e, edition_panic, effective_target_features, diff --git a/library/core/src/intrinsics/mod.rs b/library/core/src/intrinsics/mod.rs index 904aa52c7845b..3ffe38b758c96 100644 --- a/library/core/src/intrinsics/mod.rs +++ b/library/core/src/intrinsics/mod.rs @@ -3238,6 +3238,23 @@ pub(crate) const fn miri_promise_symbolic_alignment(ptr: *const (), align: usize ) } +/// Returns a pointer to dynamic shared memory. +/// +/// The returned pointer is the start of the dynamic shared memory region. +/// All pointers returned by `dynamic_shared_memory` point to the same address, +/// so they alias the same memory. +/// The returned pointer is aligned to at least the alignment of `T`. +/// +/// # Other APIs +/// +/// CUDA and HIP call this shared memory. +/// OpenCL and SYCL call this local memory. +#[rustc_intrinsic] +#[rustc_nounwind] +#[unstable(feature = "dynamic_shared_memory", issue = "135513")] +#[cfg(any(target_arch = "amdgpu", target_arch = "nvptx64"))] +pub fn dynamic_shared_memory<T>() -> *mut T; + /// Copies the current location of arglist `src` to the arglist `dst`.
/// /// FIXME: document safety requirements diff --git a/src/bootstrap/src/core/build_steps/compile.rs b/src/bootstrap/src/core/build_steps/compile.rs index 0b75e85772f86..893c48c7cb9db 100644 --- a/src/bootstrap/src/core/build_steps/compile.rs +++ b/src/bootstrap/src/core/build_steps/compile.rs @@ -668,6 +668,14 @@ pub fn std_cargo(builder: &Builder<'_>, target: TargetSelection, cargo: &mut Car cargo.rustflag("-Cforce-unwind-tables=yes"); } + // amdgcn must have a cpu specified, otherwise it refuses to compile. + // We want to be able to run tests for amdgcn that depend on core, therefore + // we need to be able to compile core. + // The cpu used here must match in tests that use the standard library. + if target.contains("amdgcn") && target.file.is_none() { + cargo.rustflag("-Ctarget-cpu=gfx900"); + } + // Enable frame pointers by default for the library. Note that they are still controlled by a // separate setting for the compiler. cargo.rustflag("-Zunstable-options"); diff --git a/src/build_helper/src/targets.rs b/src/build_helper/src/targets.rs index cccc413368bc9..c1b198f0ac864 100644 --- a/src/build_helper/src/targets.rs +++ b/src/build_helper/src/targets.rs @@ -6,6 +6,7 @@ // `compiletest`.
pub fn target_supports_std(target_tuple: &str) -> bool { !(target_tuple.contains("-none") + || target_tuple.contains("amdgcn") || target_tuple.contains("nvptx") || target_tuple.contains("switch")) } diff --git a/src/tools/compiletest/src/directives/directive_names.rs b/src/tools/compiletest/src/directives/directive_names.rs index 0ef84fb459493..cdb2da6af4c26 100644 --- a/src/tools/compiletest/src/directives/directive_names.rs +++ b/src/tools/compiletest/src/directives/directive_names.rs @@ -188,6 +188,7 @@ pub(crate) const KNOWN_DIRECTIVE_NAMES: &[&str] = &[ "only-aarch64", "only-aarch64-apple-darwin", "only-aarch64-unknown-linux-gnu", + "only-amdgpu", "only-apple", "only-arm", "only-avr", diff --git a/tests/codegen-llvm/dynamic_shared_memory.rs b/tests/codegen-llvm/dynamic_shared_memory.rs new file mode 100644 index 0000000000000..8828c30958f9d --- /dev/null +++ b/tests/codegen-llvm/dynamic_shared_memory.rs @@ -0,0 +1,27 @@ +// Checks that dynamic_shared_memory works. + +//@ revisions: amdgpu nvptx x86 +//@ compile-flags: --crate-type=rlib +// +//@ [amdgpu] compile-flags: --target amdgcn-amd-amdhsa -Ctarget-cpu=gfx900 +//@ [amdgpu] only-amdgpu +//@ [amdgpu] needs-llvm-components: amdgpu +//@ [nvptx] compile-flags: --target nvptx64-nvidia-cuda +//@ [nvptx] only-nvptx64 +//@ [nvptx] needs-llvm-components: nvptx +//@ [x86] compile-flags: --target x86_64-unknown-linux-gnu +//@ [x86] only-x86_64 +//@ [x86] needs-llvm-components: x86 +//@ [x86] should-fail +#![feature(core_intrinsics, dynamic_shared_memory)] +#![no_std] + +use core::intrinsics::dynamic_shared_memory; + +// CHECK: @dynamic_shared_memory = external addrspace(3) global [0 x i8], align 8 +// CHECK: ret ptr addrspacecast (ptr addrspace(3) @dynamic_shared_memory to ptr) +pub fn fun() -> *mut i32 { + let res = dynamic_shared_memory::<i32>(); + dynamic_shared_memory::<u64>(); // Increase alignment to 8 + res +}