4 changes: 4 additions & 0 deletions rust/scx_arena/scx_arena/src/arenalib.rs
@@ -173,6 +173,10 @@ impl<'a> ArenaLib<'a> {
)?;
}

// Drop all_clusters first: clusters hold Arc references to the cores, and those
// references must be released before the cores below can be unwrapped.
drop(topo.all_clusters);

for (core_id, core) in topo.all_cores {
self.setup_topology_node(
Arc::<Core>::into_inner(core)
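
The drop above matters because the loop that follows unwraps each core with Arc::<Core>::into_inner(core), and into_inner only yields the inner value when the Arc it consumes is the last strong reference; the cluster maps introduced by this change hold extra Arc<Core> clones. A minimal illustration of that behaviour (standalone sketch, not scheduler code):

use std::sync::Arc;

fn main() {
    let core = Arc::new(String::from("core0"));
    let held_by_cluster = core.clone(); // extra reference, as a cluster map would hold

    // While another strong reference is alive, into_inner refuses and returns None.
    assert!(Arc::into_inner(core).is_none());

    // Once the remaining Arc is the only one left, it can be unwrapped.
    assert_eq!(Arc::into_inner(held_by_cluster).as_deref(), Some("core0"));
}
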
3 changes: 3 additions & 0 deletions rust/scx_arena/selftests/src/main.rs
@@ -145,6 +145,9 @@ fn setup_topology(skel: &mut BpfSkel<'_>) -> Result<()> {
)?;
}

// Drop all_clusters to release its Arc references to cores before the cores are processed
drop(topo.all_clusters);

for (_, core) in topo.all_cores {
setup_topology_node(
skel,
179 changes: 158 additions & 21 deletions rust/scx_utils/src/topology.rs
@@ -167,12 +167,31 @@ pub struct Core {
pub node_id: usize,
}

#[derive(Debug, Clone)]
pub struct Cluster {
/// Monotonically increasing unique id
pub id: usize,
/// The kernel id of the L2 cache or cluster
pub kernel_id: usize,
pub cores: BTreeMap<usize, Arc<Core>>,
/// Cpumask of all CPUs in this cluster.
pub span: Cpumask,

/// Ancestor IDs.
pub llc_id: usize,
pub node_id: usize,

/// Skip indices to access lower level members easily.
pub all_cpus: BTreeMap<usize, Arc<Cpu>>,
}

#[derive(Debug, Clone)]
pub struct Llc {
/// Monotonically increasing unique id
pub id: usize,
/// The kernel id of the llc
pub kernel_id: usize,
pub clusters: BTreeMap<usize, Arc<Cluster>>,
pub cores: BTreeMap<usize, Arc<Core>>,
/// Cpumask of all CPUs in this llc.
pub span: Cpumask,
@@ -181,6 +200,7 @@ pub struct Llc {
pub node_id: usize,

/// Skip indices to access lower level members easily.
pub all_clusters: BTreeMap<usize, Arc<Cluster>>,
pub all_cpus: BTreeMap<usize, Arc<Cpu>>,
}

@@ -210,6 +230,7 @@ pub struct Topology {

/// Skip indices to access lower level members easily.
pub all_llcs: BTreeMap<usize, Arc<Llc>>,
pub all_clusters: BTreeMap<usize, Arc<Cluster>>,
pub all_cores: BTreeMap<usize, Arc<Core>>,
pub all_cpus: BTreeMap<usize, Arc<Cpu>>,
}
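
The new all_clusters map mirrors the existing all_llcs/all_cores/all_cpus skip indices, so a scheduler can reach every cluster with one flat lookup instead of walking node → LLC → cluster. A hedged sketch of how a consumer might use it, assuming the crate's existing Topology::new() constructor and its anyhow-based Result:

use anyhow::Result;
use scx_utils::Topology;

fn main() -> Result<()> {
    let topo = Topology::new()?;

    // Walk every cluster directly via the new skip index.
    for (id, cluster) in &topo.all_clusters {
        println!(
            "cluster {id}: kernel_id={} llc={} node={} cpus={}",
            cluster.kernel_id,
            cluster.llc_id,
            cluster.node_id,
            cluster.all_cpus.len()
        );
    }
    Ok(())
}
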
@@ -220,6 +241,7 @@ impl Topology {
// objects can only be modified while there's only one reference,
// skip indices must be built from bottom to top.
let mut topo_llcs = BTreeMap::new();
let mut topo_clusters = BTreeMap::new();
let mut topo_cores = BTreeMap::new();
let mut topo_cpus = BTreeMap::new();

@@ -229,33 +251,84 @@ impl Topology {

for (&llc_id, llc) in node.llcs.iter_mut() {
let llc_mut = Arc::get_mut(llc).unwrap();
let mut llc_clusters = BTreeMap::new();
let mut llc_cpus = BTreeMap::new();

for (&core_id, core) in llc_mut.cores.iter_mut() {
let core_mut = Arc::get_mut(core).unwrap();
let smt_level = core_mut.cpus.len();

for (&cpu_id, cpu) in core_mut.cpus.iter_mut() {
let cpu_mut = Arc::get_mut(cpu).unwrap();
cpu_mut.smt_level = smt_level;

if topo_cpus
.insert(cpu_id, cpu.clone())
.or(node_cpus.insert(cpu_id, cpu.clone()))
.or(llc_cpus.insert(cpu_id, cpu.clone()))
.is_some()
{
bail!("Duplicate CPU ID {}", cpu_id);
for (&cluster_id, cluster) in llc_mut.clusters.iter_mut() {
let cluster_mut = Arc::get_mut(cluster).unwrap();
let mut cluster_cpus = BTreeMap::new();

for (&core_id, core) in cluster_mut.cores.iter_mut() {
let core_mut = Arc::get_mut(core).unwrap();
let smt_level = core_mut.cpus.len();

for (&cpu_id, cpu) in core_mut.cpus.iter_mut() {
let cpu_mut = Arc::get_mut(cpu).unwrap();
cpu_mut.smt_level = smt_level;

if topo_cpus
.insert(cpu_id, cpu.clone())
.or(node_cpus.insert(cpu_id, cpu.clone()))
.or(llc_cpus.insert(cpu_id, cpu.clone()))
.or(cluster_cpus.insert(cpu_id, cpu.clone()))
.is_some()
{
bail!("Duplicate CPU ID {}", cpu_id);
}
}

// Note that in some weird architectures, core ids can be
// duplicated in different LLC domains.
topo_cores
.insert(core_id, core.clone())
.or(node_cores.insert(core_id, core.clone()));
}

cluster_mut.all_cpus = cluster_cpus;

if topo_clusters.insert(cluster_id, cluster.clone()).is_some() {
bail!("Duplicate Cluster ID {}", cluster_id);
}
llc_clusters.insert(cluster_id, cluster.clone());
}

// Fallback: if LLC has no clusters (e.g., virtual LLCs), process cores directly
if llc_mut.clusters.is_empty() {
for (&core_id, core) in llc_mut.cores.iter_mut() {
let core_mut = Arc::get_mut(core).unwrap();
let smt_level = core_mut.cpus.len();

for (&cpu_id, cpu) in core_mut.cpus.iter_mut() {
let cpu_mut = Arc::get_mut(cpu).unwrap();
cpu_mut.smt_level = smt_level;

if topo_cpus
.insert(cpu_id, cpu.clone())
.or(node_cpus.insert(cpu_id, cpu.clone()))
.or(llc_cpus.insert(cpu_id, cpu.clone()))
.is_some()
{
bail!("Duplicate CPU ID {}", cpu_id);
}
}

// Note that in some weird architectures, core ids can be
// duplicated in different LLC domains.
topo_cores
.insert(core_id, core.clone())
.or(node_cores.insert(core_id, core.clone()));
}
}

// Note that in some weird architectures, core ids can be
// duplicated in different LLC domains.
topo_cores
.insert(core_id, core.clone())
.or(node_cores.insert(core_id, core.clone()));
// Populate llc.cores from cluster.cores before LLC is cloned
// This must be done while we still have exclusive access via llc_mut
for (_cluster_id, cluster) in llc_mut.clusters.iter() {
for (&core_id, core) in cluster.cores.iter() {
llc_mut.cores.insert(core_id, core.clone());
}
}

llc_mut.all_clusters = llc_clusters;
llc_mut.all_cpus = llc_cpus;

if topo_llcs.insert(llc_id, llc.clone()).is_some() {
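
The duplicate-ID guard above relies on Option::or being eager: every insert in the chain runs regardless of the earlier results, and the chain yields the first previously stored value, so .is_some() fires when the CPU was already present in any index. A standalone sketch of the idiom:

use std::collections::BTreeMap;

fn main() {
    let mut topo_cpus: BTreeMap<u32, &str> = BTreeMap::new();
    let mut llc_cpus: BTreeMap<u32, &str> = BTreeMap::new();

    // First sighting: both inserts execute, neither had a previous value.
    let dup = topo_cpus.insert(7, "cpu7").or(llc_cpus.insert(7, "cpu7")).is_some();
    assert!(!dup);

    // Second sighting: both inserts still execute (or() evaluates its argument
    // eagerly), and the first previous value makes the chain Some -> duplicate.
    let dup = topo_cpus.insert(7, "cpu7").or(llc_cpus.insert(7, "cpu7")).is_some();
    assert!(dup);
}
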
@@ -272,6 +345,7 @@ impl Topology {
span,
smt_enabled: is_smt_active().unwrap_or(false),
all_llcs: topo_llcs,
all_clusters: topo_clusters,
all_cores: topo_cores,
all_cpus: topo_cpus,
})
@@ -372,6 +446,8 @@ struct TopoCtx {
node_core_kernel_ids: BTreeMap<(usize, usize, usize), usize>,
/// Mapping of NUMA node LLC ids
node_llc_kernel_ids: BTreeMap<(usize, usize, usize), usize>,
/// Mapping of NUMA node LLC cluster ids (node_id, llc_id, cluster_kernel_id) -> cluster_id
node_llc_cluster_kernel_ids: BTreeMap<(usize, usize, usize), usize>,
/// Mapping of L2 ids
l2_ids: BTreeMap<String, usize>,
/// Mapping of L3 ids
@@ -382,11 +458,13 @@ impl TopoCtx {
fn new() -> TopoCtx {
let core_kernel_ids = BTreeMap::new();
let llc_kernel_ids = BTreeMap::new();
let cluster_kernel_ids = BTreeMap::new();
let l2_ids = BTreeMap::new();
let l3_ids = BTreeMap::new();
TopoCtx {
node_core_kernel_ids: core_kernel_ids,
node_llc_kernel_ids: llc_kernel_ids,
node_llc_cluster_kernel_ids: cluster_kernel_ids,
l2_ids,
l3_ids,
}
@@ -523,15 +601,52 @@ fn create_insert_cpu(

let llc = node.llcs.entry(*llc_id).or_insert(Arc::new(Llc {
id: *llc_id,
clusters: BTreeMap::new(),
cores: BTreeMap::new(),
span: Cpumask::new(),
all_clusters: BTreeMap::new(),
all_cpus: BTreeMap::new(),

node_id: node.id,
kernel_id: llc_kernel_id,
}));
let llc_mut = Arc::get_mut(llc).unwrap();

// Determine cluster kernel ID: use cluster_id if available (>= 0), else use L2 ID
// cluster_id is isize, with -1 indicating no cluster support
let cluster_kernel_id = if cluster_id >= 0 {
cluster_id as usize
} else if l2_id != usize::MAX {
l2_id
} else {
// No cluster information available, use LLC as cluster
llc_kernel_id
};

// Create unique cluster ID using (node.id, llc_id, cluster_kernel_id)
let num_clusters = topo_ctx.node_llc_cluster_kernel_ids.len();
let cluster_id_unique = topo_ctx
.node_llc_cluster_kernel_ids
.entry((node.id, *llc_id, cluster_kernel_id))
.or_insert(num_clusters);

// Create or get cluster
let cluster = llc_mut
.clusters
.entry(*cluster_id_unique)
.or_insert(Arc::new(Cluster {
id: *cluster_id_unique,
kernel_id: cluster_kernel_id,
cores: BTreeMap::new(),
span: Cpumask::new(),

llc_id: *llc_id,
node_id: node.id,

all_cpus: BTreeMap::new(),
}));
let cluster_mut = Arc::get_mut(cluster).unwrap();

let core_type = if cs.avg_rcap < cs.max_rcap && rcap == cs.max_rcap {
CoreType::Big { turbo: true }
} else if !cs.has_biglittle || rcap >= cs.avg_rcap {
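
The cluster kernel ID fallback chain and the entry()/or_insert(len) allocation above follow the same pattern already used for cores and LLCs: compute the map length first, then insert it only for unseen keys, so repeated (node, llc, kernel_id) triples map back to the same dense unique ID. A standalone sketch (cluster_kernel_id below is a hypothetical helper, not part of the patch):

use std::collections::BTreeMap;

// Mirrors the fallback chain: sysfs cluster_id if valid, else L2 id, else the LLC itself.
fn cluster_kernel_id(cluster_id: isize, l2_id: usize, llc_kernel_id: usize) -> usize {
    if cluster_id >= 0 {
        cluster_id as usize
    } else if l2_id != usize::MAX {
        l2_id
    } else {
        llc_kernel_id
    }
}

fn main() {
    let mut ids: BTreeMap<(usize, usize, usize), usize> = BTreeMap::new();
    let samples = [(0usize, 0usize, -1isize, 3usize), (0, 0, -1, 3), (0, 1, 5, usize::MAX)];

    for (node, llc, raw_cluster, l2) in samples {
        let kid = cluster_kernel_id(raw_cluster, l2, llc);
        let next = ids.len(); // taken before the insert, so unique ids stay dense
        let unique = *ids.entry((node, llc, kid)).or_insert(next);
        println!("node {node} llc {llc}: cluster kernel_id={kid} unique={unique}");
    }
}
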
@@ -546,7 +661,8 @@ fn create_insert_cpu(
.entry((node.id, package_id, core_kernel_id))
.or_insert(num_cores);

let core = llc_mut.cores.entry(*core_id).or_insert(Arc::new(Core {
// Insert core into cluster
let core = cluster_mut.cores.entry(*core_id).or_insert(Arc::new(Core {
id: *core_id,
cpus: BTreeMap::new(),
span: Cpumask::new(),
@@ -589,6 +705,7 @@ fn create_insert_cpu(

// Update all of the devices' spans to include this CPU.
core_mut.span.set_cpu(id)?;
cluster_mut.span.set_cpu(id)?;
llc_mut.span.set_cpu(id)?;
node.span.set_cpu(id)?;

@@ -776,9 +893,11 @@ fn replace_with_virt_llcs(
Arc::new(Llc {
id: vllc_id,
kernel_id,
clusters: BTreeMap::new(),
cores: BTreeMap::new(),
span: Cpumask::new(),
node_id: node.id,
all_clusters: BTreeMap::new(),
all_cpus: BTreeMap::new(),
}),
);
@@ -884,6 +1003,15 @@ fn create_default_node(
create_insert_cpu(*cpu_id, &mut node, online_mask, topo_ctx, &cs, flatten_llc)?;
}

// Clear clusters before creating virtual LLCs to avoid multiple Arc references to cores
// replace_with_virt_llcs() will create new LLCs without clusters anyway
if nr_cores_per_vllc.is_some() {
for (_llc_id, llc) in node.llcs.iter_mut() {
let llc_mut = Arc::get_mut(llc).unwrap();
llc_mut.clusters.clear();
}
}

if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc {
replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val, 0)?;
}
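
Clearing llc.clusters here avoids leaving a second Arc<Core> behind for each core once replace_with_virt_llcs() rebuilds the hierarchy; calls such as Arc::get_mut, used when the skip indices are built, only succeed while an Arc is the sole strong owner. A minimal illustration of that constraint (not scheduler code):

use std::sync::Arc;

fn main() {
    let mut core = Arc::new(0u32);

    let cluster_ref = core.clone(); // a second strong reference
    assert!(Arc::get_mut(&mut core).is_none()); // exclusive access refused
    drop(cluster_ref); // the equivalent of clusters.clear()

    // Unique again: mutation through the Arc is allowed.
    *Arc::get_mut(&mut core).unwrap() += 1;
    assert_eq!(*core, 1);
}
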
@@ -985,6 +1113,15 @@ fn create_numa_nodes(
create_insert_cpu(cpu_id, &mut node, online_mask, topo_ctx, &cs, false)?;
}

// Clear clusters before creating virtual LLCs to avoid multiple Arc references to cores
// replace_with_virt_llcs() will create new LLCs without clusters anyway
if nr_cores_per_vllc.is_some() {
for (_llc_id, llc) in node.llcs.iter_mut() {
let llc_mut = Arc::get_mut(llc).unwrap();
llc_mut.clusters.clear();
}
}

if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc {
next_virt_llc_id =
replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val, next_virt_llc_id)?;
51 changes: 26 additions & 25 deletions scheds/rust/scx_p2dq/src/bpf/intf.h
@@ -14,43 +14,44 @@
#endif

#ifndef __KERNEL__
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
#endif


enum consts {
MAX_CPUS = 512,
MAX_NUMA_NODES = 64,
MAX_LLCS = 64,
MAX_DSQS_PER_LLC = 8,
MAX_LLC_SHARDS = 32,
MAX_TASK_PRIO = 39,
MAX_TOPO_NODES = 1024,
MAX_CPUS = 512,
MAX_NUMA_NODES = 64,
MAX_LLCS = 64,
MAX_CLUSTERS = 128,
MAX_DSQS_PER_LLC = 8,
MAX_LLC_SHARDS = 32,
MAX_TASK_PRIO = 39,
MAX_TOPO_NODES = 1024,

NSEC_PER_USEC = 1000ULL,
NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
MSEC_PER_SEC = 1000ULL,
NSEC_PER_SEC = NSEC_PER_MSEC * MSEC_PER_SEC,
NSEC_PER_USEC = 1000ULL,
NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC),
MSEC_PER_SEC = 1000ULL,
NSEC_PER_SEC = NSEC_PER_MSEC * MSEC_PER_SEC,

MIN_SLICE_USEC = 10ULL,
MIN_SLICE_NSEC = (10ULL * NSEC_PER_USEC),
MIN_SLICE_USEC = 10ULL,
MIN_SLICE_NSEC = (10ULL * NSEC_PER_USEC),

LOAD_BALANCE_SLACK = 20ULL,
LOAD_BALANCE_SLACK = 20ULL,

P2DQ_MIG_DSQ = 1LLU << 60,
P2DQ_INTR_DSQ = 1LLU << 32,
P2DQ_MIG_DSQ = 1LLU << 60,
P2DQ_INTR_DSQ = 1LLU << 32,

// PELT (Per-Entity Load Tracking) constants
PELT_HALFLIFE_MS = 32, // 32ms half-life for exponential decay
PELT_PERIOD_MS = 1, // 1ms update period (simplified from kernel's 1024us)
PELT_MAX_UTIL = 1024, // Maximum utilization value
PELT_DECAY_SHIFT = 7, // Decay factor: (127/128) ≈ 0.98 per ms
PELT_SUM_MAX = 131072, // Maximum sum value (128 * 1024)
PELT_HALFLIFE_MS = 32, // 32ms half-life for exponential decay
PELT_PERIOD_MS =
1, // 1ms update period (simplified from kernel's 1024us)
PELT_MAX_UTIL = 1024, // Maximum utilization value
PELT_DECAY_SHIFT = 7, // Decay factor: (127/128) ≈ 0.98 per ms
PELT_SUM_MAX = 131072, // Maximum sum value (128 * 1024)

// kernel definitions
CLOCK_BOOTTIME = 7,
CLOCK_BOOTTIME = 7,
};

enum p2dq_timers_defs {