From 3b9535b507ba15f7f16cbd9fba6e7d2135eb3c04 Mon Sep 17 00:00:00 2001
From: Alice Ryhl <alice@ryhl.io>
Date: Fri, 22 Dec 2023 18:40:31 +0100
Subject: [PATCH] runtime: only mitigate false sharing for multi-threaded
 runtimes

---
 .../runtime/scheduler/multi_thread/worker.rs  |  11 ++
 .../scheduler/multi_thread_alt/worker.rs      |  11 ++
 tokio/src/runtime/task/core.rs                |  96 +-------------
 tokio/src/runtime/task/harness.rs             |   4 +-
 tokio/src/runtime/task/mod.rs                 |  12 ++
 tokio/src/runtime/task/raw.rs                 |   7 +-
 tokio/src/runtime/task/task_box.rs            | 124 ++++++++++++++++++
 tokio/src/util/cacheline.rs                   |  27 +++-
 8 files changed, 194 insertions(+), 98 deletions(-)
 create mode 100644 tokio/src/runtime/task/task_box.rs
diff --git a/tokio/src/runtime/scheduler/multi_thread/worker.rs b/tokio/src/runtime/scheduler/multi_thread/worker.rs
index 313e2ea68f7..d652f5416ec 100644
--- a/tokio/src/runtime/scheduler/multi_thread/worker.rs
+++ b/tokio/src/runtime/scheduler/multi_thread/worker.rs
@@ -1014,6 +1014,17 @@ impl task::Schedule for Arc<Handle> {
     fn yield_now(&self, task: Notified) {
         self.schedule_task(task, true);
     }
+
+    fn min_align(&self) -> usize {
+        use crate::util::cacheline::CachePadded;
+
+        // One for single-threaded runtime, otherwise use a high value to avoid
+        // false sharing.
+        match self.shared.remotes.len() {
+            1 => 1,
+            _ => std::mem::align_of::<CachePadded<()>>(),
+        }
+    }
 }
 
 impl Handle {
diff --git a/tokio/src/runtime/scheduler/multi_thread_alt/worker.rs b/tokio/src/runtime/scheduler/multi_thread_alt/worker.rs
index 8d16418a80b..4af2d65f8be 100644
--- a/tokio/src/runtime/scheduler/multi_thread_alt/worker.rs
+++ b/tokio/src/runtime/scheduler/multi_thread_alt/worker.rs
@@ -1553,6 +1553,17 @@ impl task::Schedule for Arc<Handle> {
     fn yield_now(&self, task: Notified) {
         self.shared.schedule_task(task, true);
     }
+
+    fn min_align(&self) -> usize {
+        use crate::util::cacheline::CachePadded;
+
+        // One for single-threaded runtime, otherwise use a high value to avoid
+        // false sharing.
+        match self.shared.remotes.len() {
+            1 => 1,
+            _ => std::mem::align_of::<CachePadded<()>>(),
+        }
+    }
 }
 
 impl AsMut<Synced> for Synced {
diff --git a/tokio/src/runtime/task/core.rs b/tokio/src/runtime/task/core.rs
index 1903a01aa41..b9c3ab6d220 100644
--- a/tokio/src/runtime/task/core.rs
+++ b/tokio/src/runtime/task/core.rs
@@ -14,7 +14,7 @@ use crate::loom::cell::UnsafeCell;
 use crate::runtime::context;
 use crate::runtime::task::raw::{self, Vtable};
 use crate::runtime::task::state::State;
-use crate::runtime::task::{Id, Schedule};
+use crate::runtime::task::{Id, Schedule, TaskBox};
 use crate::util::linked_list;
 
 use std::num::NonZeroU64;
@@ -30,87 +30,6 @@ use std::task::{Context, Poll, Waker};
 /// Any changes to the layout of this struct _must_ also be reflected in the
 /// const fns in raw.rs.
 ///
-// # This struct should be cache padded to avoid false sharing. The cache padding rules are copied
-// from crossbeam-utils/src/cache_padded.rs
-//
-// Starting from Intel's Sandy Bridge, spatial prefetcher is now pulling pairs of 64-byte cache
-// lines at a time, so we have to align to 128 bytes rather than 64.
-//
-// Sources:
-// - https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
-// - https://github.com/facebook/folly/blob/1b5288e6eea6df074758f877c849b6e73bbb9fbb/folly/lang/Align.h#L107
-//
-// ARM's big.LITTLE architecture has asymmetric cores and "big" cores have 128-byte cache line size.
-//
-// Sources:
-// - https://www.mono-project.com/news/2016/09/12/arm64-icache/
-//
-// powerpc64 has 128-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_ppc64x.go#L9
-#[cfg_attr(
-    any(
-        target_arch = "x86_64",
-        target_arch = "aarch64",
-        target_arch = "powerpc64",
-    ),
-    repr(align(128))
-)]
-// arm, mips, mips64, sparc, and hexagon have 32-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_arm.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mipsle.go#L7
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips64x.go#L9
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L17
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/hexagon/include/asm/cache.h#L12
-#[cfg_attr(
-    any(
-        target_arch = "arm",
-        target_arch = "mips",
-        target_arch = "mips64",
-        target_arch = "sparc",
-        target_arch = "hexagon",
-    ),
-    repr(align(32))
-)]
-// m68k has 16-byte cache line size.
-//
-// Sources:
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/m68k/include/asm/cache.h#L9
-#[cfg_attr(target_arch = "m68k", repr(align(16)))]
-// s390x has 256-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_s390x.go#L7
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/s390/include/asm/cache.h#L13
-#[cfg_attr(target_arch = "s390x", repr(align(256)))]
-// x86, riscv, wasm, and sparc64 have 64-byte cache line size.
-//
-// Sources:
-// - https://github.com/golang/go/blob/dda2991c2ea0c5914714469c4defc2562a907230/src/internal/cpu/cpu_x86.go#L9
-// - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_wasm.go#L7
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L19
-// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/riscv/include/asm/cache.h#L10
-//
-// All others are assumed to have 64-byte cache line size.
-#[cfg_attr(
-    not(any(
-        target_arch = "x86_64",
-        target_arch = "aarch64",
-        target_arch = "powerpc64",
-        target_arch = "arm",
-        target_arch = "mips",
-        target_arch = "mips64",
-        target_arch = "sparc",
-        target_arch = "hexagon",
-        target_arch = "m68k",
-        target_arch = "s390x",
-    )),
-    repr(align(64))
-)]
 #[repr(C)]
 pub(super) struct Cell<T: Future, S> {
     /// Hot task state data
@@ -205,7 +124,7 @@ pub(super) enum Stage<T: Future> {
 impl<T: Future, S: Schedule> Cell<T, S> {
     /// Allocates a new task cell, containing the header, trailer, and core
     /// structures.
-    pub(super) fn new(future: T, scheduler: S, state: State, task_id: Id) -> Box<Cell<T, S>> {
+    pub(super) fn new(future: T, scheduler: S, state: State, task_id: Id) -> TaskBox<T, S> {
         // Separated into a non-generic function to reduce LLVM codegen
         fn new_header(
             state: State,
@@ -225,22 +144,21 @@ impl<T: Future, S: Schedule> Cell<T, S> {
         #[cfg(all(tokio_unstable, feature = "tracing"))]
         let tracing_id = future.id();
         let vtable = raw::vtable::<T, S>();
-        let result = Box::new(Cell {
-            header: new_header(
+        let result = TaskBox::new(
+            new_header(
                 state,
                 vtable,
                 #[cfg(all(tokio_unstable, feature = "tracing"))]
                 tracing_id,
             ),
-            core: Core {
+            Core {
                 scheduler,
                 stage: CoreStage {
                     stage: UnsafeCell::new(Stage::Running(future)),
                 },
                 task_id,
             },
-            trailer: Trailer::new(),
-        });
+        );
 
         #[cfg(debug_assertions)]
         {
@@ -459,7 +377,7 @@ impl Header {
 }
 
 impl Trailer {
-    fn new() -> Self {
+    pub(super) fn new() -> Self {
         Trailer {
             waker: UnsafeCell::new(None),
             owned: linked_list::Pointers::new(),
diff --git a/tokio/src/runtime/task/harness.rs b/tokio/src/runtime/task/harness.rs
index 8bfd57e6fbf..d029a45dd5d 100644
--- a/tokio/src/runtime/task/harness.rs
+++ b/tokio/src/runtime/task/harness.rs
@@ -2,7 +2,7 @@ use crate::future::Future;
 use crate::runtime::task::core::{Cell, Core, Header, Trailer};
 use crate::runtime::task::state::{Snapshot, State};
 use crate::runtime::task::waker::waker_ref;
-use crate::runtime::task::{Id, JoinError, Notified, RawTask, Schedule, Task};
+use crate::runtime::task::{Id, JoinError, Notified, RawTask, Schedule, Task, TaskBox};
 
 use std::any::Any;
 use std::mem;
@@ -269,7 +269,7 @@ where
         // are allowed to be dangling after their last use, even if the
         // reference has not yet gone out of scope.
         unsafe {
-            drop(Box::from_raw(self.cell.as_ptr()));
+            drop(TaskBox::from_raw(self.cell));
         }
     }
 
diff --git a/tokio/src/runtime/task/mod.rs b/tokio/src/runtime/task/mod.rs
index 0bd40cd875c..eeb589f75be 100644
--- a/tokio/src/runtime/task/mod.rs
+++ b/tokio/src/runtime/task/mod.rs
@@ -200,6 +200,9 @@ pub(crate) use self::raw::RawTask;
 mod state;
 use self::state::State;
 
+mod task_box;
+use self::task_box::TaskBox;
+
 mod waker;
 
 cfg_taskdump! {
@@ -272,6 +275,15 @@ pub(crate) trait Schedule: Sync + Sized + 'static {
         self.schedule(task);
     }
 
+    /// The minimum alignment for tasks spawned on this runtime.
+    ///
+    /// This is used by the multi-threaded runtime to avoid false sharing.
+    ///
+    /// The same scheduler must always return the same value.
+    fn min_align(&self) -> usize {
+        1
+    }
+
     /// Polling the task resulted in a panic. Should the runtime shutdown?
     fn unhandled_panic(&self) {
         // By default, do nothing. This maintains the 1.0 behavior.
diff --git a/tokio/src/runtime/task/raw.rs b/tokio/src/runtime/task/raw.rs
index 6699551f3ec..ea55bddddf4 100644
--- a/tokio/src/runtime/task/raw.rs
+++ b/tokio/src/runtime/task/raw.rs
@@ -1,6 +1,6 @@
 use crate::future::Future;
 use crate::runtime::task::core::{Core, Trailer};
-use crate::runtime::task::{Cell, Harness, Header, Id, Schedule, State};
+use crate::runtime::task::{Cell, Harness, Header, Id, Schedule, State, TaskBox};
 
 use std::ptr::NonNull;
 use std::task::{Poll, Waker};
@@ -162,10 +162,9 @@ impl RawTask {
         T: Future,
         S: Schedule,
     {
-        let ptr = Box::into_raw(Cell::<_, S>::new(task, scheduler, State::new(), id));
-        let ptr = unsafe { NonNull::new_unchecked(ptr.cast()) };
+        let ptr = TaskBox::into_raw(Cell::<_, S>::new(task, scheduler, State::new(), id));
 
-        RawTask { ptr }
+        RawTask { ptr: ptr.cast() }
     }
 
     pub(super) unsafe fn from_raw(ptr: NonNull<Header>) -> RawTask {
diff --git a/tokio/src/runtime/task/task_box.rs b/tokio/src/runtime/task/task_box.rs
new file mode 100644
index 00000000000..408e73d1795
--- /dev/null
+++ b/tokio/src/runtime/task/task_box.rs
@@ -0,0 +1,124 @@
+//! Helper module for allocating and deallocating tasks.
+
+use crate::runtime::task::core::{Cell, Core, Header, Trailer};
+use crate::runtime::task::Schedule;
+
+use std::alloc::{alloc, dealloc, handle_alloc_error, Layout};
+use std::future::Future;
+use std::marker::PhantomData;
+use std::mem::{align_of, size_of, ManuallyDrop};
+use std::ptr::{drop_in_place, NonNull};
+
+fn layout_of<T: Future, S: Schedule>(scheduler: &S) -> Layout {
+    let size = std::mem::size_of::<Cell<T, S>>();
+    let mut align = std::mem::align_of::<Cell<T, S>>();
+    let min_align = scheduler.min_align();
+    if align < min_align {
+        align = min_align;
+    }
+    match Layout::from_size_align(size, align) {
+        Ok(layout) => layout,
+        Err(_) => panic!("Failed to build layout of type."),
+    }
+}
+
+/// A `Box<Cell<T, S>>` with an alignment of at least `s.min_align()`.
+pub(super) struct TaskBox<T: Future, S: Schedule> {
+    ptr: NonNull<Cell<T, S>>,
+    _phantom: PhantomData<Cell<T, S>>,
+}
+
+impl<T: Future, S: Schedule> TaskBox<T, S> {
+    /// Creates a new task allocation.
+    pub(super) fn new(header: Header, core: Core<T, S>) -> Self {
+        let layout = layout_of::<T, S>(&core.scheduler);
+
+        assert_eq!(size_of::<Cell<T, S>>(), layout.size());
+        assert_ne!(size_of::<Cell<T, S>>(), 0);
+        assert!(align_of::<Cell<T, S>>() <= layout.align());
+
+        // SAFETY: The size of `layout` is non-zero as checked above.
+        let ptr = unsafe { alloc(layout) } as *mut Cell<T, S>;
+
+        let ptr = match NonNull::new(ptr) {
+            Some(ptr) => ptr,
+            None => handle_alloc_error(layout),
+        };
+
+        // SAFETY: We just allocated memory with the same size and a compatible
+        // alignment for `Cell<T, S>`.
+        unsafe {
+            ptr.as_ptr().write(Cell {
+                header,
+                core,
+                trailer: Trailer::new(),
+            });
+        };
+
+        Self {
+            ptr,
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Convert this allocation into a raw pointer.
+    pub(super) fn into_raw(self) -> NonNull<Cell<T, S>> {
+        let me = ManuallyDrop::new(self);
+        me.ptr
+    }
+
+    /// Convert this allocation back into a `TaskBox`.
+    ///
+    /// # Safety
+    ///
+    /// The provided pointer must originate from a previous call to `into_raw`,
+    /// and the raw pointer must not be used again after this call.
+    pub(super) unsafe fn from_raw(ptr: NonNull<Cell<T, S>>) -> Self {
+        Self {
+            ptr,
+            _phantom: PhantomData,
+        }
+    }
+}
+
+impl<T: Future, S: Schedule> std::ops::Deref for TaskBox<T, S> {
+    type Target = Cell<T, S>;
+
+    fn deref(&self) -> &Cell<T, S> {
+        // SAFETY: This box always points at a valid cell.
+        unsafe { &*self.ptr.as_ptr() }
+    }
+}
+
+impl<T: Future, S: Schedule> Drop for TaskBox<T, S> {
+    fn drop(&mut self) {
+        let ptr = self.ptr.as_ptr();
+
+        // SAFETY: The task is still valid, so we can dereference the pointer.
+        let layout = layout_of::<T, S>(unsafe { &(*ptr).core.scheduler });
+
+        // SAFETY: The pointer was allocated with this layout. (The return value
+        // of `min_align` doesn't change.)
+        let _drop_helper = DropHelper {
+            layout,
+            ptr: ptr as *mut u8,
+        };
+
+        // SAFETY: A task box contains a pointer to a valid cell, and we have
+        // not dropped the allocation yet.
+        unsafe { drop_in_place(self.ptr.as_ptr()) };
+    }
+}
+
+struct DropHelper {
+    ptr: *mut u8,
+    layout: Layout,
+}
+
+impl Drop for DropHelper {
+    #[inline]
+    fn drop(&mut self) {
+        // SAFETY: See `TaskBox::drop`.
+        unsafe { dealloc(self.ptr, self.layout) };
+    }
+}
diff --git a/tokio/src/util/cacheline.rs b/tokio/src/util/cacheline.rs
index b34004c1ac1..e29834a179c 100644
--- a/tokio/src/util/cacheline.rs
+++ b/tokio/src/util/cacheline.rs
@@ -3,6 +3,9 @@ use std::ops::{Deref, DerefMut};
 
 /// Pads and aligns a value to the length of a cache line.
 #[derive(Clone, Copy, Default, Hash, PartialEq, Eq)]
+// # This struct should be cache padded to avoid false sharing. The cache padding rules are copied
+// from crossbeam-utils/src/cache_padded.rs
+//
 // Starting from Intel's Sandy Bridge, spatial prefetcher is now pulling pairs of 64-byte cache
 // lines at a time, so we have to align to 128 bytes rather than 64.
 //
@@ -27,27 +30,42 @@ use std::ops::{Deref, DerefMut};
     ),
     repr(align(128))
 )]
-// arm, mips and mips64 have 32-byte cache line size.
+// arm, mips, mips64, sparc, and hexagon have 32-byte cache line size.
 //
 // Sources:
 // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_arm.go#L7
 // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips.go#L7
 // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mipsle.go#L7
 // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_mips64x.go#L9
+// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L17
+// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/hexagon/include/asm/cache.h#L12
 #[cfg_attr(
-    any(target_arch = "arm", target_arch = "mips", target_arch = "mips64",),
+    any(
+        target_arch = "arm",
+        target_arch = "mips",
+        target_arch = "mips64",
+        target_arch = "sparc",
+        target_arch = "hexagon",
+    ),
     repr(align(32))
 )]
+// m68k has 16-byte cache line size.
+//
+// Sources:
+// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/m68k/include/asm/cache.h#L9
+#[cfg_attr(target_arch = "m68k", repr(align(16)))]
 // s390x has 256-byte cache line size.
 //
 // Sources:
 // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_s390x.go#L7
+// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/s390/include/asm/cache.h#L13
 #[cfg_attr(target_arch = "s390x", repr(align(256)))]
-// x86, riscv and wasm have 64-byte cache line size.
+// x86, riscv, wasm, and sparc64 have 64-byte cache line size.
 //
 // Sources:
 // - https://github.com/golang/go/blob/dda2991c2ea0c5914714469c4defc2562a907230/src/internal/cpu/cpu_x86.go#L9
 // - https://github.com/golang/go/blob/3dd58676054223962cd915bb0934d1f9f489d4d2/src/internal/cpu/cpu_wasm.go#L7
+// - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/sparc/include/asm/cache.h#L19
 // - https://github.com/torvalds/linux/blob/3516bd729358a2a9b090c1905bd2a3fa926e24c6/arch/riscv/include/asm/cache.h#L10
 //
 // All others are assumed to have 64-byte cache line size.
@@ -59,6 +77,9 @@ use std::ops::{Deref, DerefMut};
         target_arch = "arm",
         target_arch = "mips",
         target_arch = "mips64",
+        target_arch = "sparc",
+        target_arch = "hexagon",
+        target_arch = "m68k",
         target_arch = "s390x",
     )),
     repr(align(64))