diff --git a/rust/scx_arena/scx_arena/src/arenalib.rs b/rust/scx_arena/scx_arena/src/arenalib.rs index 55a3f6a379..064fb3a4cd 100644 --- a/rust/scx_arena/scx_arena/src/arenalib.rs +++ b/rust/scx_arena/scx_arena/src/arenalib.rs @@ -173,6 +173,10 @@ impl<'a> ArenaLib<'a> { )?; } + // Drop all_clusters to release Arc references to cores before processing cores + // Clusters may hold Arc references to cores, so we need to drop them first + drop(topo.all_clusters); + for (core_id, core) in topo.all_cores { self.setup_topology_node( Arc::::into_inner(core) diff --git a/rust/scx_arena/selftests/src/main.rs b/rust/scx_arena/selftests/src/main.rs index d205ac7db3..c83941be1e 100644 --- a/rust/scx_arena/selftests/src/main.rs +++ b/rust/scx_arena/selftests/src/main.rs @@ -145,6 +145,9 @@ fn setup_topology(skel: &mut BpfSkel<'_>) -> Result<()> { )?; } + // Drop all_clusters to release Arc references to cores before processing cores + drop(topo.all_clusters); + for (_, core) in topo.all_cores { setup_topology_node( skel, diff --git a/rust/scx_utils/src/topology.rs b/rust/scx_utils/src/topology.rs index efa2e6d062..47734a6ebc 100644 --- a/rust/scx_utils/src/topology.rs +++ b/rust/scx_utils/src/topology.rs @@ -167,12 +167,31 @@ pub struct Core { pub node_id: usize, } +#[derive(Debug, Clone)] +pub struct Cluster { + /// Monotonically increasing unique id + pub id: usize, + /// The kernel id of the L2 cache or cluster + pub kernel_id: usize, + pub cores: BTreeMap>, + /// Cpumask of all CPUs in this cluster. + pub span: Cpumask, + + /// Ancestor IDs. + pub llc_id: usize, + pub node_id: usize, + + /// Skip indices to access lower level members easily. + pub all_cpus: BTreeMap>, +} + #[derive(Debug, Clone)] pub struct Llc { /// Monotonically increasing unique id pub id: usize, /// The kernel id of the llc pub kernel_id: usize, + pub clusters: BTreeMap>, pub cores: BTreeMap>, /// Cpumask of all CPUs in this llc. pub span: Cpumask, @@ -181,6 +200,7 @@ pub struct Llc { pub node_id: usize, /// Skip indices to access lower level members easily. + pub all_clusters: BTreeMap>, pub all_cpus: BTreeMap>, } @@ -210,6 +230,7 @@ pub struct Topology { /// Skip indices to access lower level members easily. pub all_llcs: BTreeMap>, + pub all_clusters: BTreeMap>, pub all_cores: BTreeMap>, pub all_cpus: BTreeMap>, } @@ -220,6 +241,7 @@ impl Topology { // objects can only be modified while there's only one reference, // skip indices must be built from bottom to top. 
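Reviewer note: the drop(topo.all_clusters) calls work because Arc::into_inner (and Arc::get_mut) only succeed while the strong count is exactly 1, and the new cluster skip indices hold extra Arc<Core> clones. A minimal standalone sketch of that behaviour, using plain std types rather than the scx_utils structs:

use std::collections::BTreeMap;
use std::sync::Arc;

fn main() {
    let core = Arc::new(String::from("core0"));
    let mut clusters: BTreeMap<usize, Arc<String>> = BTreeMap::new();
    clusters.insert(0, core.clone()); // second strong reference, like a cluster map entry

    // While the clone is alive, into_inner refuses to unwrap.
    assert!(Arc::into_inner(core.clone()).is_none());

    drop(clusters); // release the extra reference, mirroring drop(topo.all_clusters)
    assert!(Arc::into_inner(core).is_some());
}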
let mut topo_llcs = BTreeMap::new(); + let mut topo_clusters = BTreeMap::new(); let mut topo_cores = BTreeMap::new(); let mut topo_cpus = BTreeMap::new(); @@ -229,33 +251,84 @@ impl Topology { for (&llc_id, llc) in node.llcs.iter_mut() { let llc_mut = Arc::get_mut(llc).unwrap(); + let mut llc_clusters = BTreeMap::new(); let mut llc_cpus = BTreeMap::new(); - for (&core_id, core) in llc_mut.cores.iter_mut() { - let core_mut = Arc::get_mut(core).unwrap(); - let smt_level = core_mut.cpus.len(); - - for (&cpu_id, cpu) in core_mut.cpus.iter_mut() { - let cpu_mut = Arc::get_mut(cpu).unwrap(); - cpu_mut.smt_level = smt_level; - - if topo_cpus - .insert(cpu_id, cpu.clone()) - .or(node_cpus.insert(cpu_id, cpu.clone())) - .or(llc_cpus.insert(cpu_id, cpu.clone())) - .is_some() - { - bail!("Duplicate CPU ID {}", cpu_id); + for (&cluster_id, cluster) in llc_mut.clusters.iter_mut() { + let cluster_mut = Arc::get_mut(cluster).unwrap(); + let mut cluster_cpus = BTreeMap::new(); + + for (&core_id, core) in cluster_mut.cores.iter_mut() { + let core_mut = Arc::get_mut(core).unwrap(); + let smt_level = core_mut.cpus.len(); + + for (&cpu_id, cpu) in core_mut.cpus.iter_mut() { + let cpu_mut = Arc::get_mut(cpu).unwrap(); + cpu_mut.smt_level = smt_level; + + if topo_cpus + .insert(cpu_id, cpu.clone()) + .or(node_cpus.insert(cpu_id, cpu.clone())) + .or(llc_cpus.insert(cpu_id, cpu.clone())) + .or(cluster_cpus.insert(cpu_id, cpu.clone())) + .is_some() + { + bail!("Duplicate CPU ID {}", cpu_id); + } + } + + // Note that in some weird architectures, core ids can be + // duplicated in different LLC domains. + topo_cores + .insert(core_id, core.clone()) + .or(node_cores.insert(core_id, core.clone())); + } + + cluster_mut.all_cpus = cluster_cpus; + + if topo_clusters.insert(cluster_id, cluster.clone()).is_some() { + bail!("Duplicate Cluster ID {}", cluster_id); + } + llc_clusters.insert(cluster_id, cluster.clone()); + } + + // Fallback: if LLC has no clusters (e.g., virtual LLCs), process cores directly + if llc_mut.clusters.is_empty() { + for (&core_id, core) in llc_mut.cores.iter_mut() { + let core_mut = Arc::get_mut(core).unwrap(); + let smt_level = core_mut.cpus.len(); + + for (&cpu_id, cpu) in core_mut.cpus.iter_mut() { + let cpu_mut = Arc::get_mut(cpu).unwrap(); + cpu_mut.smt_level = smt_level; + + if topo_cpus + .insert(cpu_id, cpu.clone()) + .or(node_cpus.insert(cpu_id, cpu.clone())) + .or(llc_cpus.insert(cpu_id, cpu.clone())) + .is_some() + { + bail!("Duplicate CPU ID {}", cpu_id); + } } + + // Note that in some weird architectures, core ids can be + // duplicated in different LLC domains. + topo_cores + .insert(core_id, core.clone()) + .or(node_cores.insert(core_id, core.clone())); } + } - // Note that in some weird architectures, core ids can be - // duplicated in different LLC domains. 
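Reviewer note: the duplicate-CPU check leans on BTreeMap::insert returning the previous value and Option::or collapsing several inserts into one test. A small sketch of that idiom with plain integers standing in for the Arc<Cpu> values:

use std::collections::BTreeMap;

fn main() {
    let mut topo_cpus: BTreeMap<usize, u32> = BTreeMap::new();
    let mut node_cpus: BTreeMap<usize, u32> = BTreeMap::new();
    let mut llc_cpus: BTreeMap<usize, u32> = BTreeMap::new();

    for &cpu_id in &[0usize, 1, 1] {
        // Each insert returns Some(old) if the key already existed; Option::or
        // keeps the first Some, so the chain is Some as soon as any map had cpu_id.
        if topo_cpus
            .insert(cpu_id, 0)
            .or(node_cpus.insert(cpu_id, 0))
            .or(llc_cpus.insert(cpu_id, 0))
            .is_some()
        {
            eprintln!("Duplicate CPU ID {}", cpu_id);
        }
    }
}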
- topo_cores - .insert(core_id, core.clone()) - .or(node_cores.insert(core_id, core.clone())); + // Populate llc.cores from cluster.cores before LLC is cloned + // This must be done while we still have exclusive access via llc_mut + for (_cluster_id, cluster) in llc_mut.clusters.iter() { + for (&core_id, core) in cluster.cores.iter() { + llc_mut.cores.insert(core_id, core.clone()); + } } + llc_mut.all_clusters = llc_clusters; llc_mut.all_cpus = llc_cpus; if topo_llcs.insert(llc_id, llc.clone()).is_some() { @@ -272,6 +345,7 @@ impl Topology { span, smt_enabled: is_smt_active().unwrap_or(false), all_llcs: topo_llcs, + all_clusters: topo_clusters, all_cores: topo_cores, all_cpus: topo_cpus, }) @@ -372,6 +446,8 @@ struct TopoCtx { node_core_kernel_ids: BTreeMap<(usize, usize, usize), usize>, /// Mapping of NUMA node LLC ids node_llc_kernel_ids: BTreeMap<(usize, usize, usize), usize>, + /// Mapping of NUMA node LLC cluster ids (node_id, llc_id, cluster_kernel_id) -> cluster_id + node_llc_cluster_kernel_ids: BTreeMap<(usize, usize, usize), usize>, /// Mapping of L2 ids l2_ids: BTreeMap, /// Mapping of L3 ids @@ -382,11 +458,13 @@ impl TopoCtx { fn new() -> TopoCtx { let core_kernel_ids = BTreeMap::new(); let llc_kernel_ids = BTreeMap::new(); + let cluster_kernel_ids = BTreeMap::new(); let l2_ids = BTreeMap::new(); let l3_ids = BTreeMap::new(); TopoCtx { node_core_kernel_ids: core_kernel_ids, node_llc_kernel_ids: llc_kernel_ids, + node_llc_cluster_kernel_ids: cluster_kernel_ids, l2_ids, l3_ids, } @@ -523,8 +601,10 @@ fn create_insert_cpu( let llc = node.llcs.entry(*llc_id).or_insert(Arc::new(Llc { id: *llc_id, + clusters: BTreeMap::new(), cores: BTreeMap::new(), span: Cpumask::new(), + all_clusters: BTreeMap::new(), all_cpus: BTreeMap::new(), node_id: node.id, @@ -532,6 +612,41 @@ fn create_insert_cpu( })); let llc_mut = Arc::get_mut(llc).unwrap(); + // Determine cluster kernel ID: use cluster_id if available (>= 0), else use L2 ID + // cluster_id is isize, with -1 indicating no cluster support + let cluster_kernel_id = if cluster_id >= 0 { + cluster_id as usize + } else if l2_id != usize::MAX { + l2_id + } else { + // No cluster information available, use LLC as cluster + llc_kernel_id + }; + + // Create unique cluster ID using (node.id, llc_id, cluster_kernel_id) + let num_clusters = topo_ctx.node_llc_cluster_kernel_ids.len(); + let cluster_id_unique = topo_ctx + .node_llc_cluster_kernel_ids + .entry((node.id, *llc_id, cluster_kernel_id)) + .or_insert(num_clusters); + + // Create or get cluster + let cluster = llc_mut + .clusters + .entry(*cluster_id_unique) + .or_insert(Arc::new(Cluster { + id: *cluster_id_unique, + kernel_id: cluster_kernel_id, + cores: BTreeMap::new(), + span: Cpumask::new(), + + llc_id: *llc_id, + node_id: node.id, + + all_cpus: BTreeMap::new(), + })); + let cluster_mut = Arc::get_mut(cluster).unwrap(); + let core_type = if cs.avg_rcap < cs.max_rcap && rcap == cs.max_rcap { CoreType::Big { turbo: true } } else if !cs.has_biglittle || rcap >= cs.avg_rcap { @@ -546,7 +661,8 @@ fn create_insert_cpu( .entry((node.id, package_id, core_kernel_id)) .or_insert(num_cores); - let core = llc_mut.cores.entry(*core_id).or_insert(Arc::new(Core { + // Insert core into cluster + let core = cluster_mut.cores.entry(*core_id).or_insert(Arc::new(Core { id: *core_id, cpus: BTreeMap::new(), span: Cpumask::new(), @@ -589,6 +705,7 @@ fn create_insert_cpu( // Update all of the devices' spans to include this CPU. 
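Reviewer note: node_llc_cluster_kernel_ids uses the same "dense unique id" trick as the core and LLC maps: the map length is sampled before the entry lookup, so an unseen (node, llc, kernel_id) triple gets the next monotonically increasing id while a known one keeps its old id. Self-contained sketch:

use std::collections::BTreeMap;

fn unique_cluster_id(
    ids: &mut BTreeMap<(usize, usize, usize), usize>,
    node_id: usize,
    llc_id: usize,
    cluster_kernel_id: usize,
) -> usize {
    // Sample len() first; entry() takes the mutable borrow afterwards.
    let next = ids.len();
    *ids.entry((node_id, llc_id, cluster_kernel_id)).or_insert(next)
}

fn main() {
    let mut ids = BTreeMap::new();
    assert_eq!(unique_cluster_id(&mut ids, 0, 0, 5), 0);
    assert_eq!(unique_cluster_id(&mut ids, 0, 0, 6), 1);
    assert_eq!(unique_cluster_id(&mut ids, 0, 0, 5), 0); // already known, id reused
}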
core_mut.span.set_cpu(id)?; + cluster_mut.span.set_cpu(id)?; llc_mut.span.set_cpu(id)?; node.span.set_cpu(id)?; @@ -776,9 +893,11 @@ fn replace_with_virt_llcs( Arc::new(Llc { id: vllc_id, kernel_id, + clusters: BTreeMap::new(), cores: BTreeMap::new(), span: Cpumask::new(), node_id: node.id, + all_clusters: BTreeMap::new(), all_cpus: BTreeMap::new(), }), ); @@ -884,6 +1003,15 @@ fn create_default_node( create_insert_cpu(*cpu_id, &mut node, online_mask, topo_ctx, &cs, flatten_llc)?; } + // Clear clusters before creating virtual LLCs to avoid multiple Arc references to cores + // replace_with_virt_llcs() will create new LLCs without clusters anyway + if nr_cores_per_vllc.is_some() { + for (_llc_id, llc) in node.llcs.iter_mut() { + let llc_mut = Arc::get_mut(llc).unwrap(); + llc_mut.clusters.clear(); + } + } + if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc { replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val, 0)?; } @@ -985,6 +1113,15 @@ fn create_numa_nodes( create_insert_cpu(cpu_id, &mut node, online_mask, topo_ctx, &cs, false)?; } + // Clear clusters before creating virtual LLCs to avoid multiple Arc references to cores + // replace_with_virt_llcs() will create new LLCs without clusters anyway + if nr_cores_per_vllc.is_some() { + for (_llc_id, llc) in node.llcs.iter_mut() { + let llc_mut = Arc::get_mut(llc).unwrap(); + llc_mut.clusters.clear(); + } + } + if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc { next_virt_llc_id = replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val, next_virt_llc_id)?; diff --git a/scheds/rust/scx_p2dq/src/bpf/intf.h b/scheds/rust/scx_p2dq/src/bpf/intf.h index fbb7715c61..f42ded849f 100644 --- a/scheds/rust/scx_p2dq/src/bpf/intf.h +++ b/scheds/rust/scx_p2dq/src/bpf/intf.h @@ -14,43 +14,44 @@ #endif #ifndef __KERNEL__ -typedef unsigned char u8; -typedef unsigned int u32; +typedef unsigned char u8; +typedef unsigned int u32; typedef unsigned long long u64; #endif - enum consts { - MAX_CPUS = 512, - MAX_NUMA_NODES = 64, - MAX_LLCS = 64, - MAX_DSQS_PER_LLC = 8, - MAX_LLC_SHARDS = 32, - MAX_TASK_PRIO = 39, - MAX_TOPO_NODES = 1024, + MAX_CPUS = 512, + MAX_NUMA_NODES = 64, + MAX_LLCS = 64, + MAX_CLUSTERS = 128, + MAX_DSQS_PER_LLC = 8, + MAX_LLC_SHARDS = 32, + MAX_TASK_PRIO = 39, + MAX_TOPO_NODES = 1024, - NSEC_PER_USEC = 1000ULL, - NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC), - MSEC_PER_SEC = 1000ULL, - NSEC_PER_SEC = NSEC_PER_MSEC * MSEC_PER_SEC, + NSEC_PER_USEC = 1000ULL, + NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC), + MSEC_PER_SEC = 1000ULL, + NSEC_PER_SEC = NSEC_PER_MSEC * MSEC_PER_SEC, - MIN_SLICE_USEC = 10ULL, - MIN_SLICE_NSEC = (10ULL * NSEC_PER_USEC), + MIN_SLICE_USEC = 10ULL, + MIN_SLICE_NSEC = (10ULL * NSEC_PER_USEC), - LOAD_BALANCE_SLACK = 20ULL, + LOAD_BALANCE_SLACK = 20ULL, - P2DQ_MIG_DSQ = 1LLU << 60, - P2DQ_INTR_DSQ = 1LLU << 32, + P2DQ_MIG_DSQ = 1LLU << 60, + P2DQ_INTR_DSQ = 1LLU << 32, // PELT (Per-Entity Load Tracking) constants - PELT_HALFLIFE_MS = 32, // 32ms half-life for exponential decay - PELT_PERIOD_MS = 1, // 1ms update period (simplified from kernel's 1024us) - PELT_MAX_UTIL = 1024, // Maximum utilization value - PELT_DECAY_SHIFT = 7, // Decay factor: (127/128) ≈ 0.98 per ms - PELT_SUM_MAX = 131072, // Maximum sum value (128 * 1024) + PELT_HALFLIFE_MS = 32, // 32ms half-life for exponential decay + PELT_PERIOD_MS = + 1, // 1ms update period (simplified from kernel's 1024us) + PELT_MAX_UTIL = 1024, // Maximum utilization value + PELT_DECAY_SHIFT = 7, // Decay factor: (127/128) ≈ 0.98 per ms + 
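Reviewer note: clearing llc.clusters before replace_with_virt_llcs() matters for the same reason as the drop() calls above: Arc::get_mut returns None as soon as a second strong reference exists, and the skip-index pass later unwraps it. Sketch with std types only:

use std::sync::Arc;

fn main() {
    let mut core = Arc::new(42u32);
    let extra = core.clone(); // e.g. the Arc<Core> still held by a cluster

    assert!(Arc::get_mut(&mut core).is_none()); // shared, so no exclusive access

    drop(extra); // cluster map cleared
    *Arc::get_mut(&mut core).unwrap() += 1; // now safe to mutate in place
    assert_eq!(*core, 43);
}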
PELT_SUM_MAX = 131072, // Maximum sum value (128 * 1024) // kernel definitions - CLOCK_BOOTTIME = 7, + CLOCK_BOOTTIME = 7, }; enum p2dq_timers_defs { diff --git a/scheds/rust/scx_p2dq/src/bpf/main.bpf.c b/scheds/rust/scx_p2dq/src/bpf/main.bpf.c index 655580ffe2..cd9ac9161e 100644 --- a/scheds/rust/scx_p2dq/src/bpf/main.bpf.c +++ b/scheds/rust/scx_p2dq/src/bpf/main.bpf.c @@ -35,7 +35,6 @@ #include "intf.h" #include "types.h" - #include #include #include @@ -50,44 +49,56 @@ char _license[] SEC("license") = "GPL"; UEI_DEFINE(uei); -#define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0) -#define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) +#define dbg(fmt, args...) \ + do { \ + if (debug) \ + bpf_printk(fmt, ##args); \ + } while (0) +#define trace(fmt, args...) \ + do { \ + if (debug > 1) \ + bpf_printk(fmt, ##args); \ + } while (0) const volatile struct { - u32 nr_cpus; - u32 nr_llcs; - u32 nr_nodes; + u32 nr_cpus; + u32 nr_llcs; + u32 nr_clusters; + u32 nr_nodes; bool smt_enabled; bool has_little_cores; + bool has_clusters; } topo_config = { - .nr_cpus = 64, - .nr_llcs = 32, - .nr_nodes = 32, + .nr_cpus = 64, + .nr_llcs = 32, + .nr_clusters = 64, + .nr_nodes = 32, - .smt_enabled = true, + .smt_enabled = true, .has_little_cores = false, + .has_clusters = false, }; const volatile struct { - u64 min_slice_us; - u64 max_exec_ns; + u64 min_slice_us; + u64 max_exec_ns; bool autoslice; bool deadline; } timeline_config = { .min_slice_us = 100, - .max_exec_ns = 20 * NSEC_PER_MSEC, - .autoslice = true, - .deadline = true, + .max_exec_ns = 20 * NSEC_PER_MSEC, + .autoslice = true, + .deadline = true, }; const volatile struct { - u64 backoff_ns; - u64 dispatch_lb_busy; - u64 min_llc_runs_pick2; - u64 min_nr_queued_pick2; - u64 slack_factor; - u64 wakeup_lb_busy; + u64 backoff_ns; + u64 dispatch_lb_busy; + u64 min_llc_runs_pick2; + u64 min_nr_queued_pick2; + u64 slack_factor; + u64 wakeup_lb_busy; bool dispatch_lb_interactive; bool dispatch_pick2_disable; @@ -96,30 +107,30 @@ const volatile struct { bool wakeup_llc_migrations; bool single_llc_mode; } lb_config = { - .backoff_ns = 5LLU * NSEC_PER_MSEC, - .dispatch_lb_busy = 75, - .min_llc_runs_pick2 = 4, - .min_nr_queued_pick2 = 10, - .slack_factor = LOAD_BALANCE_SLACK, - .wakeup_lb_busy = 90, + .backoff_ns = 5LLU * NSEC_PER_MSEC, + .dispatch_lb_busy = 75, + .min_llc_runs_pick2 = 4, + .min_nr_queued_pick2 = 10, + .slack_factor = LOAD_BALANCE_SLACK, + .wakeup_lb_busy = 90, .dispatch_lb_interactive = false, - .dispatch_pick2_disable = false, - .eager_load_balance = true, - .max_dsq_pick2 = false, - .wakeup_llc_migrations = false, - .single_llc_mode = false, + .dispatch_pick2_disable = false, + .eager_load_balance = true, + .max_dsq_pick2 = false, + .wakeup_llc_migrations = false, + .single_llc_mode = false, }; const volatile struct { - u32 nr_dsqs_per_llc; - int init_dsq_index; - u64 dsq_shift; - u32 interactive_ratio; - u32 saturated_percent; - u32 sched_mode; - u32 llc_shards; - u64 dhq_max_imbalance; + u32 nr_dsqs_per_llc; + int init_dsq_index; + u64 dsq_shift; + u32 interactive_ratio; + u32 saturated_percent; + u32 sched_mode; + u32 llc_shards; + u64 dhq_max_imbalance; bool atq_enabled; bool dhq_enabled; @@ -131,43 +142,45 @@ const volatile struct { bool kthreads_local; bool pelt_enabled; } p2dq_config = { - .sched_mode = MODE_DEFAULT, - .nr_dsqs_per_llc = 3, - .init_dsq_index = 0, - .dsq_shift = 2, - .interactive_ratio = 10, - .saturated_percent = 5, - .llc_shards = 0, - .dhq_max_imbalance = 3, - 
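Reviewer note: the new nr_clusters/has_clusters fields in topo_config need to be filled by the userspace loader from the topology before the skeleton loads. A hedged sketch of that wiring; TopoConfig and fill_topo_config are hypothetical stand-ins, not the actual scx_p2dq loader code, and only fields added or shown in this patch are read from Topology:

// Hypothetical mirror of the BPF `topo_config` rodata struct.
struct TopoConfig {
    nr_cpus: u32,
    nr_llcs: u32,
    nr_clusters: u32,
    smt_enabled: bool,
    has_clusters: bool,
}

fn fill_topo_config(cfg: &mut TopoConfig, topo: &scx_utils::Topology) {
    // The skip indices below are exactly what this patch adds or extends.
    cfg.nr_cpus = topo.all_cpus.len() as u32;
    cfg.nr_llcs = topo.all_llcs.len() as u32;
    cfg.nr_clusters = topo.all_clusters.len() as u32;
    cfg.smt_enabled = topo.smt_enabled;
    cfg.has_clusters = !topo.all_clusters.is_empty();
}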
- .atq_enabled = false, - .dhq_enabled = false, - .cpu_priority = false, - .task_slice = true, - .freq_control = false, - .interactive_sticky = false, + .sched_mode = MODE_DEFAULT, + .nr_dsqs_per_llc = 3, + .init_dsq_index = 0, + .dsq_shift = 2, + .interactive_ratio = 10, + .saturated_percent = 5, + .llc_shards = 0, + .dhq_max_imbalance = 3, + + .atq_enabled = false, + .dhq_enabled = false, + .cpu_priority = false, + .task_slice = true, + .freq_control = false, + .interactive_sticky = false, .keep_running_enabled = true, - .kthreads_local = true, - .pelt_enabled = true, + .kthreads_local = true, + .pelt_enabled = true, }; -const volatile u32 debug = 2; -const u32 zero_u32 = 0; +const volatile u32 debug = 2; +const u32 zero_u32 = 0; extern const volatile u32 nr_cpu_ids; -const u64 lb_timer_intvl_ns = 250LLU * NSEC_PER_MSEC; +const u64 lb_timer_intvl_ns = 250LLU * NSEC_PER_MSEC; -static u32 llc_lb_offset = 1; -static u64 min_llc_runs_pick2 = 1; -static bool saturated = false; -static bool overloaded = false; +static u32 llc_lb_offset = 1; +static u64 min_llc_runs_pick2 = 1; +static bool saturated = false; +static bool overloaded = false; -u64 llc_ids[MAX_LLCS]; -u32 cpu_core_ids[MAX_CPUS]; -u64 cpu_llc_ids[MAX_CPUS]; -u64 cpu_node_ids[MAX_CPUS]; -u64 big_core_ids[MAX_CPUS]; -u64 dsq_time_slices[MAX_DSQS_PER_LLC]; +u64 llc_ids[MAX_LLCS]; +u64 cluster_ids[MAX_CLUSTERS]; +u32 cpu_cluster_ids[MAX_CPUS]; +u32 cpu_core_ids[MAX_CPUS]; +u64 cpu_llc_ids[MAX_CPUS]; +u64 cpu_node_ids[MAX_CPUS]; +u64 big_core_ids[MAX_CPUS]; +u64 dsq_time_slices[MAX_DSQS_PER_LLC]; /* DHQ per LLC pair for migration (MAX_LLCS / 2 DHQs) */ scx_dhq_t *llc_pair_dhqs[MAX_LLCS / 2]; @@ -176,7 +189,7 @@ u32 llcs_per_node[MAX_NUMA_NODES]; /* Global DHQ counter for unique indexing */ u32 global_dhq_count = 0; -u64 min_slice_ns = 500; +u64 min_slice_ns = 500; private(A) struct bpf_cpumask __kptr *all_cpumask; private(A) struct bpf_cpumask __kptr *big_cpumask; @@ -217,13 +230,13 @@ static __always_inline u64 min_dsq_time_slice(void) static __always_inline u64 clamp_slice(u64 slice_ns) { - return min(max(min_dsq_time_slice(), slice_ns), - max_dsq_time_slice()); + return min(max(min_dsq_time_slice(), slice_ns), max_dsq_time_slice()); } static __always_inline u64 shard_dsq_id(u32 llc_id, u32 shard_id) { - return ((MAX_DSQS_PER_LLC * MAX_LLCS) << 3) + (llc_id * MAX_DSQS_PER_LLC) + shard_id; + return ((MAX_DSQS_PER_LLC * MAX_LLCS) << 3) + + (llc_id * MAX_DSQS_PER_LLC) + shard_id; } static __always_inline u64 cpu_dsq_id(s32 cpu) @@ -267,7 +280,7 @@ static int init_cpumask(struct bpf_cpumask **mask_p) static s32 pref_idle_cpu(struct llc_ctx *llcx) { struct scx_minheap_elem helem; - int ret; + int ret; if ((ret = arena_spin_lock((void __arena *)&llcx->idle_lock))) return ret; @@ -305,7 +318,8 @@ static __always_inline u32 pelt_decay(u32 val, u32 periods) u32 i; /* Bound iterations for BPF verifier (max 256 periods = 256ms) */ - bpf_for(i, 0, periods) { + bpf_for(i, 0, periods) + { if (i >= 256) break; val = (val * 127) >> 7; @@ -322,7 +336,8 @@ static __always_inline u32 pelt_decay(u32 val, u32 periods) * @now: Current timestamp in ns * @delta_ns: Runtime delta (0 for decay-only update) */ -static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta_ns) +static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, + u64 delta_ns) { u64 elapsed_ns, elapsed_ms; u32 periods, delta_ms; @@ -333,9 +348,9 @@ static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta if 
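Reviewer note: pelt_decay applies the 127/128-per-millisecond factor implied by PELT_DECAY_SHIFT once per elapsed period, capped at 256 iterations for the verifier. The same arithmetic in plain Rust:

fn pelt_decay(mut val: u32, periods: u32) -> u32 {
    // One period is 1ms; each period multiplies by 127/128 (shift by PELT_DECAY_SHIFT = 7).
    for _ in 0..periods.min(256) { // same 256-period cap as the bounded BPF loop
        val = (val * 127) >> 7;
        if val == 0 {
            break;
        }
    }
    val
}

fn main() {
    println!("{}", pelt_decay(1024, 32));  // PELT_MAX_UTIL decayed for 32ms
    println!("{}", pelt_decay(1024, 300)); // capped at 256 periods of decay
}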
(!taskc->pelt_last_update_time) { /* First update - initialize */ taskc->pelt_last_update_time = now; - taskc->util_sum = 0; - taskc->util_avg = 0; - taskc->period_contrib = 0; + taskc->util_sum = 0; + taskc->util_avg = 0; + taskc->period_contrib = 0; return; } @@ -354,7 +369,7 @@ static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta periods = (u32)elapsed_ms; if (periods > 256) - periods = 256; /* Cap for verifier */ + periods = 256; /* Cap for verifier */ if (taskc->util_sum > 0) { taskc->util_sum = pelt_decay(taskc->util_sum, periods); @@ -390,9 +405,9 @@ static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta * @is_affinitized: Whether task is affinitized to this LLC */ static __always_inline void aggregate_pelt_to_llc(struct llc_ctx *llcx, - task_ctx *taskc, - bool is_interactive, - bool is_affinitized) + task_ctx *taskc, + bool is_interactive, + bool is_affinitized) { if (!p2dq_config.pelt_enabled) return; @@ -406,7 +421,6 @@ static __always_inline void aggregate_pelt_to_llc(struct llc_ctx *llcx, __sync_fetch_and_add(&llcx->affn_util_avg, taskc->util_avg); } - static u32 idle_cpu_percent(const struct cpumask *idle_cpumask) { return (100 * nr_idle_cpus(idle_cpumask)) / topo_config.nr_cpus; @@ -460,8 +474,8 @@ static int llc_create_atqs(struct llc_ctx *llcx) return 0; if (topo_config.nr_llcs > 1) { - llcx->mig_atq = (scx_atq_t *)scx_atq_create_size(false, - topo_config.nr_cpus); + llcx->mig_atq = (scx_atq_t *)scx_atq_create_size( + false, topo_config.nr_cpus); if (!llcx->mig_atq) { scx_bpf_error("ATQ failed to create ATQ for LLC %u", llcx->id); @@ -502,13 +516,15 @@ static int llc_create_dhqs(struct llc_ctx *llcx) node_llc_count = llcs_per_node[node_id]; /* Strand: A for first LLC in pair, B for second */ - strand = (node_llc_count % 2 == 0) ? SCX_DHQ_STRAND_A : SCX_DHQ_STRAND_B; + strand = (node_llc_count % 2 == 0) ? SCX_DHQ_STRAND_A : + SCX_DHQ_STRAND_B; /* First LLC in a pair: create a new DHQ */ if (strand == SCX_DHQ_STRAND_A) { dhq_index = global_dhq_count; if (dhq_index >= (MAX_LLCS / 2)) { - scx_bpf_error("DHQ: dhq_index %u >= MAX_LLCS/2", dhq_index); + scx_bpf_error("DHQ: dhq_index %u >= MAX_LLCS/2", + dhq_index); return -EINVAL; } @@ -517,12 +533,13 @@ static int llc_create_dhqs(struct llc_ctx *llcx) * for queued tasks under load without excessive memory usage. * Max imbalance controls strand balance for cross-LLC load balancing. 
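Reviewer note: update_task_pelt follows the usual shape of a simplified PELT update: count whole 1ms periods since the last update, decay the running sum, fold in the new runtime, derive the average. Not all of the accumulation is visible in this hunk, so the sketch below is an assumed simplification rather than a transcription:

const PELT_SUM_MAX: u32 = 131_072; // 128 * 1024
const PELT_MAX_UTIL: u32 = 1_024;

struct TaskPelt {
    last_update_ns: u64,
    util_sum: u32,
    util_avg: u32,
}

impl TaskPelt {
    fn update(&mut self, now: u64, delta_ns: u64) {
        if self.last_update_ns == 0 {
            self.last_update_ns = now; // first update just initializes
            return;
        }
        // Whole 1ms periods since the last update, capped like the BPF code.
        let periods = ((now - self.last_update_ns) / 1_000_000).min(256) as u32;
        if periods > 0 {
            for _ in 0..periods {
                self.util_sum = (self.util_sum * 127) >> 7; // geometric decay
            }
            self.last_update_ns = now;
        }
        // Assumed contribution scaling: roughly 1024 for a fully busy millisecond.
        let contrib = ((delta_ns / 1_000) as u32).min(PELT_MAX_UTIL);
        self.util_sum = (self.util_sum + contrib).min(PELT_SUM_MAX);
        self.util_avg = self.util_sum >> 7; // keep the average in the 0..=1024 range
    }
}

fn main() {
    let mut p = TaskPelt { last_update_ns: 0, util_sum: 0, util_avg: 0 };
    p.update(1_000_000, 0);         // initialize at t = 1ms
    p.update(3_000_000, 2_000_000); // ran for the whole 2ms window
    println!("util_avg = {}", p.util_avg);
}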
*/ - u64 dhq_capacity = topo_config.nr_cpus * 4; + u64 dhq_capacity = topo_config.nr_cpus * 4; llc_pair_dhqs[dhq_index] = (scx_dhq_t *)scx_dhq_create_balanced( - false, /* vtime mode */ - dhq_capacity, /* fixed capacity */ - SCX_DHQ_MODE_PRIORITY, /* lowest vtime wins */ - p2dq_config.dhq_max_imbalance /* max_imbalance from config */ + false, /* vtime mode */ + dhq_capacity, /* fixed capacity */ + SCX_DHQ_MODE_PRIORITY, /* lowest vtime wins */ + p2dq_config + .dhq_max_imbalance /* max_imbalance from config */ ); if (!llc_pair_dhqs[dhq_index]) { scx_bpf_error("DHQ failed to create DHQ %u for node %u", @@ -533,22 +550,23 @@ static int llc_create_dhqs(struct llc_ctx *llcx) dhq_index, node_id, llcx->id, dhq_capacity); /* Assign DHQ and strand to this LLC */ - llcx->mig_dhq = llc_pair_dhqs[dhq_index]; + llcx->mig_dhq = llc_pair_dhqs[dhq_index]; llcx->dhq_strand = strand; global_dhq_count++; } else { /* Second LLC in pair: use the most recently created DHQ */ dhq_index = global_dhq_count - 1; if (dhq_index >= (MAX_LLCS / 2) || !llc_pair_dhqs[dhq_index]) { - scx_bpf_error("DHQ: DHQ %u not available for second LLC %u", - dhq_index, llcx->id); + scx_bpf_error( + "DHQ: DHQ %u not available for second LLC %u", + dhq_index, llcx->id); return -EINVAL; } trace("DHQ %u assigned to LLC %u (node %u, strand B)", dhq_index, llcx->id, node_id); /* Assign DHQ and strand to this LLC */ - llcx->mig_dhq = llc_pair_dhqs[dhq_index]; + llcx->mig_dhq = llc_pair_dhqs[dhq_index]; llcx->dhq_strand = strand; } @@ -557,15 +575,13 @@ static int llc_create_dhqs(struct llc_ctx *llcx) return 0; } - struct p2dq_timer p2dq_timers[MAX_TIMERS] = { - {lb_timer_intvl_ns, - CLOCK_BOOTTIME, 0}, + { lb_timer_intvl_ns, CLOCK_BOOTTIME, 0 }, }; struct timer_wrapper { struct bpf_timer timer; - int key; + int key; }; struct { @@ -575,13 +591,12 @@ struct { __type(value, struct timer_wrapper); } timer_data SEC(".maps"); - struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __type(key, u32); __type(value, struct cpu_ctx); __uint(max_entries, 1); -} cpu_ctxs SEC(".maps"); +} cpu_ctxs SEC(".maps"); static struct cpu_ctx *lookup_cpu_ctx(int cpu) { @@ -590,8 +605,7 @@ static struct cpu_ctx *lookup_cpu_ctx(int cpu) if (cpu < 0) { cpuc = bpf_map_lookup_elem(&cpu_ctxs, &zero_u32); } else { - cpuc = bpf_map_lookup_percpu_elem(&cpu_ctxs, - &zero_u32, cpu); + cpuc = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero_u32, cpu); } if (!cpuc) { @@ -609,6 +623,13 @@ struct { __uint(max_entries, MAX_LLCS); } llc_ctxs SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cluster_ctx); + __uint(max_entries, MAX_CLUSTERS); +} cluster_ctxs SEC(".maps"); + static struct llc_ctx *lookup_llc_ctx(u32 llc_id) { struct llc_ctx *llcx; @@ -622,6 +643,19 @@ static struct llc_ctx *lookup_llc_ctx(u32 llc_id) return llcx; } +static struct cluster_ctx *lookup_cluster_ctx(u32 cluster_id) +{ + struct cluster_ctx *clusterx; + + clusterx = bpf_map_lookup_elem(&cluster_ctxs, &cluster_id); + if (!clusterx) { + scx_bpf_error("no cluster_ctx for cluster %u", cluster_id); + return NULL; + } + + return clusterx; +} + static struct llc_ctx *lookup_cpu_llc_ctx(s32 cpu) { if (cpu >= topo_config.nr_cpus || cpu < 0) { @@ -638,7 +672,7 @@ struct { __type(value, struct node_ctx); __uint(max_entries, MAX_NUMA_NODES); __uint(map_flags, 0); -} node_ctxs SEC(".maps"); +} node_ctxs SEC(".maps"); static struct node_ctx *lookup_node_ctx(u32 node_id) { @@ -662,7 +696,7 @@ struct { __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); __type(value, struct 
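Reviewer note: llc_create_dhqs pairs LLCs on a node: the first LLC of a pair allocates the shared dual-headed queue and takes strand A, the second reuses the queue just created and takes strand B. The pairing logic, abstracted away from the BPF maps:

#[derive(Clone, Copy, Debug)]
enum Strand { A, B }

fn main() {
    let mut dhq_count = 0usize;      // index of the next queue to create
    let mut assignment = Vec::new(); // (llc ordinal on node, dhq index, strand)

    for llc_ord in 0..5 {
        let strand = if llc_ord % 2 == 0 { Strand::A } else { Strand::B };
        let dhq_index = match strand {
            Strand::A => { let idx = dhq_count; dhq_count += 1; idx } // create new DHQ
            Strand::B => dhq_count - 1,                               // reuse the last one
        };
        assignment.push((llc_ord, dhq_index, strand));
    }
    // LLCs 0,1 share queue 0; 2,3 share queue 1; 4 gets queue 2 on strand A.
    println!("{:?}", assignment);
}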
mask_wrapper); -} task_masks SEC(".maps"); +} task_masks SEC(".maps"); static task_ctx *lookup_task_ctx(struct task_struct *p) { @@ -679,11 +713,11 @@ struct { __type(key, u32); __type(value, u64); __uint(max_entries, P2DQ_NR_STATS); -} stats SEC(".maps"); +} stats SEC(".maps"); static inline void stat_add(enum stat_idx idx, u64 amount) { - u32 idx_v = idx; + u32 idx_v = idx; u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); if (cnt_p) (*cnt_p) += amount; @@ -713,7 +747,8 @@ static bool can_migrate(task_ctx *taskc, struct llc_ctx *llcx) if (topo_config.nr_llcs < 2 || !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS) || - (!lb_config.dispatch_lb_interactive && task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE))) + (!lb_config.dispatch_lb_interactive && + task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE))) return false; if (lb_config.max_dsq_pick2 && @@ -736,11 +771,11 @@ static void set_deadline_slice(struct task_struct *p, task_ctx *taskc, struct llc_ctx *llcx) { u64 nr_idle; - u64 max_ns = scale_by_task_weight(p, max_dsq_time_slice()); + u64 max_ns = scale_by_task_weight(p, max_dsq_time_slice()); u64 nr_queued = llc_nr_queued(llcx); const struct cpumask *idle_cpumask = scx_bpf_get_idle_cpumask(); - nr_idle = bpf_cpumask_weight(idle_cpumask); + nr_idle = bpf_cpumask_weight(idle_cpumask); scx_bpf_put_cpumask(idle_cpumask); if (nr_idle == 0) @@ -794,8 +829,7 @@ static bool keep_running(struct cpu_ctx *cpuc, struct llc_ctx *llcx, struct task_struct *p) { // Only tasks in the most interactive DSQs can keep running. - if (!p2dq_config.keep_running_enabled || - !llcx || !cpuc || + if (!p2dq_config.keep_running_enabled || !llcx || !cpuc || cpuc->dsq_index == p2dq_config.nr_dsqs_per_llc - 1 || p->scx.flags & SCX_TASK_QUEUED || cpuc->ran_for >= timeline_config.max_exec_ns) @@ -817,10 +851,10 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, s32 prev_cpu, bool *is_idle) { const struct cpumask *idle_smtmask, *idle_cpumask; - struct mask_wrapper *wrapper; - struct bpf_cpumask *mask; - struct llc_ctx *llcx; - s32 cpu = prev_cpu; + struct mask_wrapper *wrapper; + struct bpf_cpumask *mask; + struct llc_ctx *llcx; + s32 cpu = prev_cpu; // Migration-disabled tasks must stay on their current CPU if (is_migration_disabled(p)) { @@ -831,8 +865,7 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, idle_cpumask = scx_bpf_get_idle_cpumask(); idle_smtmask = scx_bpf_get_idle_smtmask(); - if (!(llcx = lookup_llc_ctx(taskc->llc_id)) || - !llcx->cpumask) + if (!(llcx = lookup_llc_ctx(taskc->llc_id)) || !llcx->cpumask) goto found_cpu; // First try last CPU @@ -851,8 +884,7 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, goto found_cpu; if (llcx->cpumask) - bpf_cpumask_and(mask, cast_mask(llcx->cpumask), - p->cpus_ptr); + bpf_cpumask_and(mask, cast_mask(llcx->cpumask), p->cpus_ptr); // First try to find an idle SMT in the LLC if (topo_config.smt_enabled) { @@ -872,8 +904,7 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, // Next try to find an idle CPU in the node if (llcx->node_cpumask && mask) { - bpf_cpumask_and(mask, - cast_mask(llcx->node_cpumask), + bpf_cpumask_and(mask, cast_mask(llcx->node_cpumask), p->cpus_ptr); cpu = __pick_idle_cpu(mask, 0); @@ -893,16 +924,37 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, return cpu; } -static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, - s32 prev_cpu, u64 wake_flags, bool *is_idle) +/* + * Pick an idle CPU 
within a cluster, intersecting with task's allowed CPUs. + * Returns idle CPU >= 0 on success, -1 if no idle CPU available in cluster. + */ +static __always_inline s32 pick_idle_cpu_in_cluster(struct task_struct *p, + struct cpu_ctx *cpuc, + s32 prev_cpu, int flags) +{ + struct cluster_ctx *clusterx; + + if (!topo_config.has_clusters || !cpuc) + return -1; + + clusterx = lookup_cluster_ctx(cpuc->cluster_id); + if (!clusterx || !clusterx->cpumask) + return -1; + + // scx_bpf_pick_idle_cpu already respects task affinity (p->cpus_ptr) + return __pick_idle_cpu(clusterx->cpumask, flags); +} + +static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, s32 prev_cpu, + u64 wake_flags, bool *is_idle) { const struct cpumask *idle_smtmask, *idle_cpumask; - struct llc_ctx *llcx; - s32 pref_cpu, cpu = prev_cpu; - bool migratable = false; + struct llc_ctx *llcx; + s32 pref_cpu, cpu = prev_cpu; + bool migratable = false; - idle_cpumask = scx_bpf_get_idle_cpumask(); - idle_smtmask = scx_bpf_get_idle_smtmask(); + idle_cpumask = scx_bpf_get_idle_cpumask(); + idle_smtmask = scx_bpf_get_idle_smtmask(); if (!idle_cpumask || !idle_smtmask) goto found_cpu; @@ -913,7 +965,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, goto found_cpu; } - if (p2dq_config.interactive_sticky && task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE)) { + if (p2dq_config.interactive_sticky && + task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE)) { *is_idle = scx_bpf_test_and_clear_cpu_idle(prev_cpu); goto found_cpu; } @@ -921,13 +974,13 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, if (idle_cpumask && bpf_cpumask_empty(idle_cpumask)) goto found_cpu; - if (!(llcx = lookup_llc_ctx(taskc->llc_id)) || - !llcx->cpumask) + if (!(llcx = lookup_llc_ctx(taskc->llc_id)) || !llcx->cpumask) goto found_cpu; migratable = can_migrate(taskc, llcx); if (topo_config.nr_llcs > 1 && - (llc_ctx_test_flag(llcx, LLC_CTX_F_SATURATED) || saturated || overloaded) && + (llc_ctx_test_flag(llcx, LLC_CTX_F_SATURATED) || saturated || + overloaded) && !migratable) { cpu = prev_cpu; goto found_cpu; @@ -946,10 +999,24 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, // Interactive tasks aren't worth migrating across LLCs. if (task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE) || (topo_config.nr_llcs == 2 && topo_config.nr_nodes == 2)) { + // Try cluster-level idle CPU first for interactive tasks + if (topo_config.has_clusters) { + struct cpu_ctx *prev_cpuc = + lookup_cpu_ctx(prev_cpu); + if (prev_cpuc) { + cpu = pick_idle_cpu_in_cluster( + p, prev_cpuc, prev_cpu, 0); + if (cpu >= 0) { + stat_inc(P2DQ_STAT_WAKE_LLC); + *is_idle = true; + goto found_cpu; + } + } + } + // Try an idle CPU in the LLC. 
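Reviewer note: pick_idle_cpu_in_cluster narrows the idle search to the CPUs sharing prev_cpu's cluster before the caller widens out to the whole LLC. The hierarchy can be sketched with plain bitmasks, where cluster_mask and llc_mask stand in for the kptr cpumasks:

/// First set bit of `mask & allowed`, i.e. an "idle" CPU the task may use.
/// Stand-in for the scx_bpf_pick_idle_cpu() behaviour assumed here.
fn pick_idle(idle: u64, allowed: u64) -> Option<u32> {
    let hit = idle & allowed;
    if hit == 0 { None } else { Some(hit.trailing_zeros()) }
}

fn pick_idle_cpu(idle: u64, allowed: u64, cluster_mask: u64, llc_mask: u64) -> Option<u32> {
    // Cluster first (shared L2 / cache cluster), then the wider LLC.
    pick_idle(idle & cluster_mask, allowed)
        .or_else(|| pick_idle(idle & llc_mask, allowed))
}

fn main() {
    let idle = 0b1111_0000u64;    // CPUs 4-7 idle
    let allowed = u64::MAX;       // task has full affinity
    let cluster = 0b0000_1111u64; // prev_cpu's cluster: CPUs 0-3, all busy
    let llc = 0b1111_1111u64;     // LLC: CPUs 0-7
    assert_eq!(pick_idle_cpu(idle, allowed, cluster, llc), Some(4));
}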
if (llcx->cpumask && - (cpu = __pick_idle_cpu(llcx->cpumask, 0) - ) >= 0) { + (cpu = __pick_idle_cpu(llcx->cpumask, 0)) >= 0) { stat_inc(P2DQ_STAT_WAKE_LLC); *is_idle = true; goto found_cpu; @@ -961,7 +1028,7 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } struct task_struct *waker = (void *)bpf_get_current_task_btf(); - task_ctx *waker_taskc = scx_task_data(waker); + task_ctx *waker_taskc = scx_task_data(waker); // Shouldn't happen, but makes code easier to follow if (!waker_taskc) { stat_inc(P2DQ_STAT_WAKE_PREV); @@ -970,21 +1037,47 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, if (waker_taskc->llc_id == llcx->id || !lb_config.wakeup_llc_migrations) { + // If clusters enabled, check if waker and wakee in same cluster + if (topo_config.has_clusters) { + struct cpu_ctx *waker_cpuc = + lookup_cpu_ctx(scx_bpf_task_cpu(waker)); + struct cpu_ctx *prev_cpuc = + lookup_cpu_ctx(prev_cpu); + + if (waker_cpuc && prev_cpuc && + waker_cpuc->cluster_id == + prev_cpuc->cluster_id) { + // Try idle core in same cluster first + if (topo_config.smt_enabled) { + cpu = pick_idle_cpu_in_cluster( + p, prev_cpuc, prev_cpu, + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + *is_idle = true; + goto found_cpu; + } + } + // Try any idle CPU in same cluster + cpu = pick_idle_cpu_in_cluster( + p, prev_cpuc, prev_cpu, 0); + if (cpu >= 0) { + *is_idle = true; + goto found_cpu; + } + } + } + // Try an idle smt core in the LLC. - if (topo_config.smt_enabled && - llcx->cpumask && + if (topo_config.smt_enabled && llcx->cpumask && (cpu = __pick_idle_cpu(llcx->cpumask, - SCX_PICK_IDLE_CORE) - ) >= 0) { + SCX_PICK_IDLE_CORE)) >= 0) { stat_inc(P2DQ_STAT_WAKE_LLC); *is_idle = true; goto found_cpu; } // Try an idle cpu in the LLC. if (llcx->cpumask && - (cpu = __pick_idle_cpu(llcx->cpumask, - 0) - ) >= 0) { + (cpu = __pick_idle_cpu(llcx->cpumask, 0)) >= 0) { stat_inc(P2DQ_STAT_WAKE_LLC); *is_idle = true; goto found_cpu; @@ -996,7 +1089,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } // If wakeup LLC are allowed then migrate to the waker llc. 
- struct llc_ctx *waker_llcx = lookup_llc_ctx(waker_taskc->llc_id); + struct llc_ctx *waker_llcx = + lookup_llc_ctx(waker_taskc->llc_id); if (!waker_llcx) { stat_inc(P2DQ_STAT_WAKE_PREV); cpu = prev_cpu; @@ -1005,8 +1099,7 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, if (waker_llcx->cpumask && (cpu = __pick_idle_cpu(waker_llcx->cpumask, - SCX_PICK_IDLE_CORE) - ) >= 0) { + SCX_PICK_IDLE_CORE)) >= 0) { stat_inc(P2DQ_STAT_WAKE_MIG); *is_idle = true; goto found_cpu; @@ -1014,9 +1107,7 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, // Couldn't find an idle core so just migrate to the CPU if (waker_llcx->cpumask && - (cpu = __pick_idle_cpu(waker_llcx->cpumask, - 0) - ) >= 0) { + (cpu = __pick_idle_cpu(waker_llcx->cpumask, 0)) >= 0) { stat_inc(P2DQ_STAT_WAKE_MIG); *is_idle = true; goto found_cpu; @@ -1029,10 +1120,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } if (p2dq_config.sched_mode == MODE_PERF && - topo_config.has_little_cores && - llcx->big_cpumask) { - cpu = __pick_idle_cpu(llcx->big_cpumask, - SCX_PICK_IDLE_CORE); + topo_config.has_little_cores && llcx->big_cpumask) { + cpu = __pick_idle_cpu(llcx->big_cpumask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { *is_idle = true; goto found_cpu; @@ -1047,8 +1136,7 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } if (p2dq_config.sched_mode == MODE_EFFICIENCY && - topo_config.has_little_cores && - llcx->little_cpumask) { + topo_config.has_little_cores && llcx->little_cpumask) { cpu = __pick_idle_cpu(llcx->little_cpumask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { *is_idle = true; @@ -1063,21 +1151,18 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } } - - if (llcx->lb_llc_id < MAX_LLCS && - taskc->llc_runs == 0) { + if (llcx->lb_llc_id < MAX_LLCS && taskc->llc_runs == 0) { u32 target_llc_id = llcx->lb_llc_id; - llcx->lb_llc_id = MAX_LLCS; + llcx->lb_llc_id = MAX_LLCS; if (!(llcx = lookup_llc_ctx(target_llc_id))) goto found_cpu; stat_inc(P2DQ_STAT_SELECT_PICK2); } - if (topo_config.has_little_cores && - llcx->little_cpumask && llcx->big_cpumask) { + if (topo_config.has_little_cores && llcx->little_cpumask && + llcx->big_cpumask) { if (task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE)) { - cpu = __pick_idle_cpu(llcx->little_cpumask, - 0); + cpu = __pick_idle_cpu(llcx->little_cpumask, 0); if (cpu >= 0) { *is_idle = true; goto found_cpu; @@ -1097,31 +1182,59 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, if (llcx->cpumask && pref_cpu >= 0 && scx_bpf_test_and_clear_cpu_idle(pref_cpu)) { *is_idle = true; - cpu = pref_cpu; + cpu = pref_cpu; trace("PREF idle %s->%d", p->comm, pref_cpu); goto found_cpu; } } + /* + * Try cluster-level idle CPU search before LLC-wide search, but only if the LLC + * is under pressure. In low-contention scenarios, cluster selection adds overhead + * without providing cache locality benefits since the LLC search will succeed anyway. + * + * We use LLC load as a proxy for pressure: only use cluster search when load + * indicates more than 1ms of work per CPU (llc->nr_cpus * NSEC_PER_MSEC). 
+ */ + if (topo_config.has_clusters && + llcx->load > (llcx->nr_cpus * NSEC_PER_MSEC)) { + struct cpu_ctx *prev_cpuc = lookup_cpu_ctx(prev_cpu); + if (prev_cpuc) { + // First try idle core within prev_cpu's cluster + cpu = pick_idle_cpu_in_cluster(p, prev_cpuc, prev_cpu, + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + *is_idle = true; + goto found_cpu; + } + + // Then try any idle CPU within prev_cpu's cluster + cpu = pick_idle_cpu_in_cluster(p, prev_cpuc, prev_cpu, + 0); + if (cpu >= 0) { + *is_idle = true; + goto found_cpu; + } + } + } + // Next try in the local LLC (usually succeeds) if (likely(llcx->cpumask && - (cpu = __pick_idle_cpu(llcx->cpumask, - SCX_PICK_IDLE_CORE) - ) >= 0)) { + (cpu = __pick_idle_cpu(llcx->cpumask, SCX_PICK_IDLE_CORE)) >= + 0)) { *is_idle = true; goto found_cpu; } // Try a idle CPU in the llc (also likely to succeed) if (likely(llcx->cpumask && - (cpu = __pick_idle_cpu(llcx->cpumask, 0)) >= 0)) { + (cpu = __pick_idle_cpu(llcx->cpumask, 0)) >= 0)) { *is_idle = true; goto found_cpu; } if (topo_config.nr_llcs > 1 && - llc_ctx_test_flag(llcx, LLC_CTX_F_SATURATED) && - migratable && + llc_ctx_test_flag(llcx, LLC_CTX_F_SATURATED) && migratable && llcx->node_cpumask) { cpu = scx_bpf_pick_idle_cpu(cast_mask(llcx->node_cpumask), SCX_PICK_IDLE_CORE); @@ -1130,7 +1243,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, goto found_cpu; } if (llcx->node_cpumask) { - cpu = scx_bpf_pick_idle_cpu(cast_mask(llcx->node_cpumask), 0); + cpu = scx_bpf_pick_idle_cpu( + cast_mask(llcx->node_cpumask), 0); if (cpu >= 0) { *is_idle = true; goto found_cpu; @@ -1144,7 +1258,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, goto found_cpu; } if (all_cpumask) { - cpu = scx_bpf_pick_idle_cpu(cast_mask(all_cpumask), 0); + cpu = scx_bpf_pick_idle_cpu( + cast_mask(all_cpumask), 0); if (cpu >= 0) { *is_idle = true; goto found_cpu; @@ -1162,12 +1277,12 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, return cpu; } - -static s32 p2dq_select_cpu_impl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) +static s32 p2dq_select_cpu_impl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags) { task_ctx *taskc; - bool is_idle = false; - s32 cpu; + bool is_idle = false; + s32 cpu; if (!(taskc = lookup_task_ctx(p))) return prev_cpu; @@ -1182,16 +1297,16 @@ static s32 p2dq_select_cpu_impl(struct task_struct *p, s32 prev_cpu, u64 wake_fl // Only direct dispatch non-affinitized tasks // Affinitized tasks will be queued by enqueue to prevent livelock if (task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS)) { - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, taskc->slice_ns, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, taskc->slice_ns, + 0); } } - trace("SELECT [%d][%s] %i->%i idle %i", - p->pid, p->comm, prev_cpu, cpu, is_idle); + trace("SELECT [%d][%s] %i->%i idle %i", p->pid, p->comm, prev_cpu, cpu, + is_idle); return cpu; } - /* * Perform the enqueue logic for `p` but don't enqueue it where possible. This * is primarily used so that scx_chaos can decide to enqueue a task either @@ -1210,8 +1325,8 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, { struct cpu_ctx *cpuc; struct llc_ctx *llcx; - task_ctx *taskc; - s32 cpu = scx_bpf_task_cpu(p); + task_ctx *taskc; + s32 cpu = scx_bpf_task_cpu(p); // Default to 0 and set to failed. __builtin_memset(ret, 0, sizeof(*ret)); @@ -1221,13 +1336,10 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, * Per-cpu kthreads are considered interactive and dispatched directly * into the local DSQ. 
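Reviewer note: the cluster-first search is gated on the LLC carrying more than roughly one millisecond of tracked work per CPU; below that, the LLC-wide search succeeds anyway and the extra cluster lookup is pure overhead. The gate in isolation:

const NSEC_PER_MSEC: u64 = 1_000_000;

/// Mirror of the `llcx->load > llcx->nr_cpus * NSEC_PER_MSEC` check: treat the
/// LLC as under pressure once its load exceeds ~1ms of work per CPU.
fn llc_under_pressure(load_ns: u64, nr_cpus: u64) -> bool {
    load_ns > nr_cpus * NSEC_PER_MSEC
}

fn main() {
    assert!(!llc_under_pressure(3 * NSEC_PER_MSEC, 8));  // 3ms of load across 8 CPUs
    assert!(llc_under_pressure(12 * NSEC_PER_MSEC, 8));  // 12ms of load across 8 CPUs
}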
*/ - if (unlikely(p2dq_config.kthreads_local && - (p->flags & PF_KTHREAD) && - p->nr_cpus_allowed == 1)) { + if (unlikely(p2dq_config.kthreads_local && (p->flags & PF_KTHREAD) && + p->nr_cpus_allowed == 1)) { stat_inc(P2DQ_STAT_DIRECT); - scx_bpf_dsq_insert(p, - SCX_DSQ_LOCAL, - min_dsq_time_slice(), + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, min_dsq_time_slice(), enq_flags); if (scx_bpf_test_and_clear_cpu_idle(cpu)) scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); @@ -1235,7 +1347,7 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, return; } - if(!(taskc = lookup_task_ctx(p))) { + if (!(taskc = lookup_task_ctx(p))) { scx_bpf_error("invalid lookup"); return; } @@ -1248,17 +1360,17 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, bool has_cleared_idle = false; if (!__COMPAT_is_enq_cpu_selected(enq_flags) || !bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) - cpu = pick_idle_affinitized_cpu(p, - taskc, - cpu, + cpu = pick_idle_affinitized_cpu(p, taskc, cpu, &has_cleared_idle); else has_cleared_idle = scx_bpf_test_and_clear_cpu_idle(cpu); if (has_cleared_idle) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_set_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); else - enqueue_promise_clear_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_clear_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); ret->cpu = cpu; if (!(cpuc = lookup_cpu_ctx(cpu)) || @@ -1288,26 +1400,36 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, struct llc_ctx *prev_llcx; if (prev_cpu >= 0 && prev_cpu < NR_CPUS && (prev_cpuc = lookup_cpu_ctx(prev_cpu)) && - (prev_llcx = lookup_llc_ctx(prev_cpuc->llc_id)) && + (prev_llcx = lookup_llc_ctx( + prev_cpuc->llc_id)) && prev_llcx->cpumask) { // Check if any CPU in prev LLC matches affinity - s32 llc_cpu = scx_bpf_pick_idle_cpu(cast_mask(prev_llcx->cpumask), 0); - if (llc_cpu >= 0 && bpf_cpumask_test_cpu(llc_cpu, p->cpus_ptr)) { + s32 llc_cpu = scx_bpf_pick_idle_cpu( + cast_mask(prev_llcx->cpumask), + 0); + if (llc_cpu >= 0 && + bpf_cpumask_test_cpu(llc_cpu, + p->cpus_ptr)) { target_cpu = llc_cpu; } else { // Fallback to random CPU in affinity mask - target_cpu = bpf_cpumask_any_distribute(p->cpus_ptr); + target_cpu = + bpf_cpumask_any_distribute( + p->cpus_ptr); } } else { // Fallback to random CPU in affinity mask - target_cpu = bpf_cpumask_any_distribute(p->cpus_ptr); + target_cpu = bpf_cpumask_any_distribute( + p->cpus_ptr); } } // Update cpuc and llcx to match target_cpu if (!(cpuc = lookup_cpu_ctx(target_cpu)) || !(llcx = lookup_llc_ctx(cpuc->llc_id))) { - scx_bpf_error("invalid lookup for target_cpu %d", target_cpu); + scx_bpf_error( + "invalid lookup for target_cpu %d", + target_cpu); return; } ret->cpu = target_cpu; @@ -1327,9 +1449,11 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, u64 old_slice = taskc->slice_ns; // Scale slice inversely with queue depth // Add 1 to account for the task we're about to enqueue - taskc->slice_ns = clamp_slice(taskc->slice_ns / (nr_queued + 1)); + taskc->slice_ns = clamp_slice(taskc->slice_ns / + (nr_queued + 1)); trace("PENALIZE [%d][%s] cpu=%d nr_queued=%llu old_slice=%llu new_slice=%llu", - p->pid, p->comm, target_cpu, nr_queued, old_slice, taskc->slice_ns); + p->pid, p->comm, target_cpu, nr_queued, + old_slice, taskc->slice_ns); } } @@ -1338,21 +1462,23 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, // Always queue affinitized tasks to affn_dsq (no direct dispatch) // This prevents tight wakeup loops and allows proper idle state - u64 
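Reviewer note: the affinitized-task penalty divides the slice by the queue depth plus the task being added and clamps it back into range, so a deep backlog on a constrained CPU shortens every newcomer's slice instead of letting one task monopolize the CPU. Arithmetic only; the max below is illustrative rather than the per-DSQ table value:

const MIN_SLICE_NS: u64 = 10_000;     // MIN_SLICE_NSEC
const MAX_SLICE_NS: u64 = 20_000_000; // illustrative stand-in for max_dsq_time_slice()

fn clamp_slice(slice_ns: u64) -> u64 {
    slice_ns.clamp(MIN_SLICE_NS, MAX_SLICE_NS)
}

fn penalized_slice(slice_ns: u64, nr_queued: u64) -> u64 {
    // +1 accounts for the task that is about to be enqueued.
    clamp_slice(slice_ns / (nr_queued + 1))
}

fn main() {
    assert_eq!(penalized_slice(4_000_000, 0), 4_000_000);        // empty queue: unchanged
    assert_eq!(penalized_slice(4_000_000, 3), 1_000_000);        // 3 queued: quartered
    assert_eq!(penalized_slice(4_000_000, 1_000), MIN_SLICE_NS); // clamped at the floor
}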
task_vtime_affn = p->scx.dsq_vtime; + u64 task_vtime_affn = p->scx.dsq_vtime; - ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; - ret->vtime.dsq_id = taskc->dsq_id; - ret->vtime.slice_ns = taskc->slice_ns; + ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; + ret->vtime.dsq_id = taskc->dsq_id; + ret->vtime.slice_ns = taskc->slice_ns; ret->vtime.enq_flags = enq_flags; - ret->vtime.vtime = task_vtime_affn; + ret->vtime.vtime = task_vtime_affn; // Kick target CPU if we cleared idle state - if (enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_KICK_IDLE); + if (enqueue_promise_test_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) + enqueue_promise_set_flag(ret, + ENQUEUE_PROMISE_F_KICK_IDLE); trace("ENQUEUE %s weight %d slice %llu vtime %llu llc vtime %llu affn_dsq", - p->comm, p->scx.weight, taskc->slice_ns, - task_vtime_affn, llcx->vtime); + p->comm, p->scx.weight, taskc->slice_ns, task_vtime_affn, + llcx->vtime); return; } @@ -1360,24 +1486,22 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, // If an idle CPU hasn't been found in select_cpu find one now if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { bool has_cleared_idle = false; - cpu = pick_idle_cpu(p, - taskc, - cpu, - 0, - &has_cleared_idle); + cpu = pick_idle_cpu(p, taskc, cpu, 0, &has_cleared_idle); if (has_cleared_idle) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_set_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); else - enqueue_promise_clear_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_clear_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); if (!(cpuc = lookup_cpu_ctx(cpu)) || - !(llcx = lookup_llc_ctx(cpuc->llc_id))) { + !(llcx = lookup_llc_ctx(cpuc->llc_id))) { scx_bpf_error("invalid lookup"); return; } s32 task_cpu = scx_bpf_task_cpu(p); - ret->cpu = cpu; + ret->cpu = cpu; update_vtime(p, cpuc, taskc, llcx); if (timeline_config.deadline) set_deadline_slice(p, taskc, llcx); @@ -1385,28 +1509,31 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, if (cpu_ctx_test_flag(cpuc, CPU_CTX_F_NICE_TASK)) enq_flags |= SCX_ENQ_PREEMPT; - if ((enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE) || + if ((enqueue_promise_test_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE) || cpu_ctx_test_flag(cpuc, CPU_CTX_F_NICE_TASK))) { ret->kind = P2DQ_ENQUEUE_PROMISE_FIFO; // For migration-disabled tasks, use SCX_DSQ_LOCAL to dispatch // to the task's current CPU, not SCX_DSQ_LOCAL_ON|cpu if (cpu != task_cpu && !is_migration_disabled(p)) { - ret->fifo.dsq_id = SCX_DSQ_LOCAL_ON|cpu; + ret->fifo.dsq_id = SCX_DSQ_LOCAL_ON | cpu; } else { ret->fifo.dsq_id = SCX_DSQ_LOCAL; } - ret->fifo.slice_ns = taskc->slice_ns; + ret->fifo.slice_ns = taskc->slice_ns; ret->fifo.enq_flags = enq_flags; - if (enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_KICK_IDLE); + if (enqueue_promise_test_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) + enqueue_promise_set_flag( + ret, ENQUEUE_PROMISE_F_KICK_IDLE); return; } // Only allow tasks with full CPU affinity into migration DSQs // Affinitized tasks stay in LLC DSQ to prevent cross-LLC livelock bool migrate = likely(!lb_config.single_llc_mode) && - can_migrate(taskc, llcx) && - task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); + can_migrate(taskc, llcx) && + task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); u64 task_vtime_early = p->scx.dsq_vtime; @@ -1415,40 +1542,40 @@ static void async_p2dq_enqueue(struct 
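Reviewer note: async_p2dq_enqueue never inserts directly; it records what the insertion should look like in an enqueue_promise that complete_p2dq_enqueue (or scx_chaos) resolves later. A Rust-flavoured sketch of the promise shapes used in this file; field names are abridged and the queue handles are stand-ins, not the real struct layout:

#[allow(dead_code)]
enum EnqueuePromise {
    Complete,                                             // nothing left to do
    Fifo  { dsq_id: u64, slice_ns: u64, enq_flags: u64 },
    Vtime { dsq_id: u64, slice_ns: u64, vtime: u64, enq_flags: u64 },
    // The ATQ/DHQ variants also carry the LLC DSQ id so the completion path
    // can fall back to a plain DSQ insert if the queue refuses the task.
    AtqVtime { fallback_dsq: u64, slice_ns: u64, vtime: u64 },
    DhqVtime { fallback_dsq: u64, slice_ns: u64, vtime: u64, enq_flags: u64 },
}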
enqueue_promise *ret, if (p2dq_config.dhq_enabled) { taskc->enq_flags = enq_flags; ret->kind = P2DQ_ENQUEUE_PROMISE_DHQ_VTIME; - ret->dhq.dsq_id = cpuc->llc_dsq; - ret->dhq.dhq = llcx->mig_dhq; - ret->dhq.strand = llcx->dhq_strand; - ret->dhq.slice_ns = taskc->slice_ns; - ret->dhq.vtime = task_vtime_early; + ret->dhq.dsq_id = cpuc->llc_dsq; + ret->dhq.dhq = llcx->mig_dhq; + ret->dhq.strand = llcx->dhq_strand; + ret->dhq.slice_ns = taskc->slice_ns; + ret->dhq.vtime = task_vtime_early; ret->dhq.enq_flags = enq_flags; } else if (p2dq_config.atq_enabled) { taskc->enq_flags = enq_flags; ret->kind = P2DQ_ENQUEUE_PROMISE_ATQ_VTIME; - ret->vtime.dsq_id = cpuc->llc_dsq; - ret->vtime.atq = llcx->mig_atq; + ret->vtime.dsq_id = cpuc->llc_dsq; + ret->vtime.atq = llcx->mig_atq; ret->vtime.slice_ns = taskc->slice_ns; - ret->vtime.vtime = task_vtime_early; + ret->vtime.vtime = task_vtime_early; } else { - ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; + ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; ret->vtime.dsq_id = taskc->dsq_id; - ret->vtime.slice_ns = taskc->slice_ns; + ret->vtime.slice_ns = taskc->slice_ns; ret->vtime.enq_flags = enq_flags; - ret->vtime.vtime = task_vtime_early; + ret->vtime.vtime = task_vtime_early; } stat_inc(P2DQ_STAT_ENQ_MIG); } else { - taskc->dsq_id = cpuc->llc_dsq; - ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; - ret->vtime.dsq_id = taskc->dsq_id; - ret->vtime.slice_ns = taskc->slice_ns; + taskc->dsq_id = cpuc->llc_dsq; + ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; + ret->vtime.dsq_id = taskc->dsq_id; + ret->vtime.slice_ns = taskc->slice_ns; ret->vtime.enq_flags = enq_flags; - ret->vtime.vtime = task_vtime_early; + ret->vtime.vtime = task_vtime_early; stat_inc(P2DQ_STAT_ENQ_LLC); } trace("ENQUEUE %s weight %d slice %llu vtime %llu llc vtime %llu", - p->comm, p->scx.weight, taskc->slice_ns, - task_vtime_early, llcx->vtime); + p->comm, p->scx.weight, taskc->slice_ns, task_vtime_early, + llcx->vtime); return; } @@ -1469,27 +1596,32 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, bool has_cleared_idle = scx_bpf_test_and_clear_cpu_idle(cpu); if (has_cleared_idle) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_set_flag(ret, + ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); else - enqueue_promise_clear_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_clear_flag(ret, + ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); - if ((enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE) || + if ((enqueue_promise_test_flag(ret, + ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE) || cpu_ctx_test_flag(cpuc, CPU_CTX_F_NICE_TASK)) && bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { - ret->kind = P2DQ_ENQUEUE_PROMISE_FIFO; - ret->fifo.dsq_id = SCX_DSQ_LOCAL; - ret->fifo.slice_ns = taskc->slice_ns; + ret->kind = P2DQ_ENQUEUE_PROMISE_FIFO; + ret->fifo.dsq_id = SCX_DSQ_LOCAL; + ret->fifo.slice_ns = taskc->slice_ns; ret->fifo.enq_flags = enq_flags; - if (enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_KICK_IDLE); + if (enqueue_promise_test_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) + enqueue_promise_set_flag(ret, + ENQUEUE_PROMISE_F_KICK_IDLE); return; } // Only allow tasks with full CPU affinity into migration DSQs // Affinitized tasks stay in LLC DSQ to prevent cross-LLC livelock bool migrate = likely(!lb_config.single_llc_mode) && - can_migrate(taskc, llcx) && - task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); + can_migrate(taskc, llcx) && + task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); if (migrate) { 
taskc->dsq_id = llcx->mig_dsq; stat_inc(P2DQ_STAT_ENQ_MIG); @@ -1497,23 +1629,23 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, u64 task_vtime_mig = p->scx.dsq_vtime; if (p2dq_config.dhq_enabled) { - taskc->enq_flags = enq_flags; - ret->kind = P2DQ_ENQUEUE_PROMISE_DHQ_VTIME; - ret->dhq.dsq_id = cpuc->llc_dsq; - ret->dhq.dhq = llcx->mig_dhq; - ret->dhq.strand = llcx->dhq_strand; - ret->dhq.slice_ns = taskc->slice_ns; - ret->dhq.vtime = task_vtime_mig; + taskc->enq_flags = enq_flags; + ret->kind = P2DQ_ENQUEUE_PROMISE_DHQ_VTIME; + ret->dhq.dsq_id = cpuc->llc_dsq; + ret->dhq.dhq = llcx->mig_dhq; + ret->dhq.strand = llcx->dhq_strand; + ret->dhq.slice_ns = taskc->slice_ns; + ret->dhq.vtime = task_vtime_mig; ret->dhq.enq_flags = enq_flags; return; } else if (p2dq_config.atq_enabled) { - taskc->enq_flags = enq_flags; - ret->kind = P2DQ_ENQUEUE_PROMISE_ATQ_VTIME; - ret->vtime.dsq_id = cpuc->llc_dsq; - ret->vtime.atq = llcx->mig_atq; + taskc->enq_flags = enq_flags; + ret->kind = P2DQ_ENQUEUE_PROMISE_ATQ_VTIME; + ret->vtime.dsq_id = cpuc->llc_dsq; + ret->vtime.atq = llcx->mig_atq; ret->vtime.slice_ns = taskc->slice_ns; - ret->vtime.vtime = task_vtime_mig; + ret->vtime.vtime = task_vtime_mig; return; } @@ -1525,35 +1657,31 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, u64 task_vtime = p->scx.dsq_vtime; trace("ENQUEUE %s weight %d slice %llu vtime %llu llc vtime %llu", - p->comm, p->scx.weight, taskc->slice_ns, - task_vtime, llcx->vtime); + p->comm, p->scx.weight, taskc->slice_ns, task_vtime, llcx->vtime); - ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; - ret->vtime.dsq_id = taskc->dsq_id; + ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; + ret->vtime.dsq_id = taskc->dsq_id; ret->vtime.enq_flags = enq_flags; - ret->vtime.slice_ns = taskc->slice_ns; - ret->vtime.vtime = task_vtime; + ret->vtime.slice_ns = taskc->slice_ns; + ret->vtime.vtime = task_vtime; } -static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struct *p) +static void complete_p2dq_enqueue(struct enqueue_promise *pro, + struct task_struct *p) { task_ctx *taskc; - int ret; + int ret; switch (pro->kind) { case P2DQ_ENQUEUE_PROMISE_COMPLETE: break; case P2DQ_ENQUEUE_PROMISE_FIFO: - scx_bpf_dsq_insert(p, - pro->fifo.dsq_id, - pro->fifo.slice_ns, + scx_bpf_dsq_insert(p, pro->fifo.dsq_id, pro->fifo.slice_ns, pro->fifo.enq_flags); break; case P2DQ_ENQUEUE_PROMISE_VTIME: - scx_bpf_dsq_insert_vtime(p, - pro->vtime.dsq_id, - pro->vtime.slice_ns, - pro->vtime.vtime, + scx_bpf_dsq_insert_vtime(p, pro->vtime.dsq_id, + pro->vtime.slice_ns, pro->vtime.vtime, pro->vtime.enq_flags); break; case P2DQ_ENQUEUE_PROMISE_ATQ_FIFO: @@ -1563,7 +1691,7 @@ static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struc } taskc = lookup_task_ctx(p); - ret = scx_atq_insert(pro->fifo.atq, &taskc->common); + ret = scx_atq_insert(pro->fifo.atq, &taskc->common); if (ret) { scx_bpf_error("error %d on scx_atq_insert", ret); break; @@ -1579,9 +1707,8 @@ static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struc } taskc = lookup_task_ctx(p); - ret = scx_atq_insert_vtime(pro->vtime.atq, - &taskc->common, - pro->vtime.vtime); + ret = scx_atq_insert_vtime(pro->vtime.atq, &taskc->common, + pro->vtime.vtime); if (ret) { scx_bpf_error("error %d on scx_atq_insert", ret); break; @@ -1592,15 +1719,12 @@ static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struc scx_bpf_error("invalid DHQ"); break; } - ret = scx_dhq_insert_vtime(pro->dhq.dhq, - (u64)p->pid, - pro->dhq.vtime, - 
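Reviewer note: the DHQ completion path is try-then-fallback: if the dual-headed queue insert fails (EAGAIN when strands are imbalanced, ENOSPC when full), the task is inserted into the ordinary DSQ instead so it is never lost. Shape of that control flow with hypothetical stand-ins for the queue operations:

// Hypothetical stand-ins; only the control flow mirrors complete_p2dq_enqueue().
fn dhq_insert_vtime(_pid: u64, _vtime: u64) -> Result<(), i32> {
    Err(-11) // pretend EAGAIN: strands are imbalanced
}

fn dsq_insert_vtime(pid: u64, dsq_id: u64, vtime: u64) {
    println!("fallback: task {} queued on DSQ {} at vtime {}", pid, dsq_id, vtime);
}

fn main() {
    let (pid, vtime, fallback_dsq) = (1234, 42_000, 7);
    if dhq_insert_vtime(pid, vtime).is_err() {
        // EAGAIN if imbalanced, ENOSPC if full: either way fall back to the DSQ.
        dsq_insert_vtime(pid, fallback_dsq, vtime);
    }
}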
pro->dhq.strand); + ret = scx_dhq_insert_vtime(pro->dhq.dhq, (u64)p->pid, + pro->dhq.vtime, pro->dhq.strand); if (ret) { // The DHQ insert failed (EAGAIN if imbalanced, ENOSPC if full) // Fallback to the DSQ - scx_bpf_dsq_insert_vtime(p, - pro->dhq.dsq_id, + scx_bpf_dsq_insert_vtime(p, pro->dhq.dsq_id, pro->dhq.slice_ns, pro->dhq.vtime, pro->dhq.enq_flags); @@ -1626,10 +1750,10 @@ static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struc static int p2dq_running_impl(struct task_struct *p) { - task_ctx *taskc; + task_ctx *taskc; struct cpu_ctx *cpuc; struct llc_ctx *llcx; - s32 task_cpu = scx_bpf_task_cpu(p); + s32 task_cpu = scx_bpf_task_cpu(p); if (!(taskc = lookup_task_ctx(p)) || !(cpuc = lookup_cpu_ctx(task_cpu)) || @@ -1639,9 +1763,8 @@ static int p2dq_running_impl(struct task_struct *p) if (taskc->llc_id != cpuc->llc_id) { task_refresh_llc_runs(taskc); stat_inc(P2DQ_STAT_LLC_MIGRATION); - trace("RUNNING %d cpu %d->%d llc %d->%d", - p->pid, cpuc->id, task_cpu, - taskc->llc_id, llcx->id); + trace("RUNNING %d cpu %d->%d llc %d->%d", p->pid, cpuc->id, + task_cpu, taskc->llc_id, llcx->id); } else { if (taskc->llc_runs == 0) task_refresh_llc_runs(taskc); @@ -1652,7 +1775,7 @@ static int p2dq_running_impl(struct task_struct *p) stat_inc(P2DQ_STAT_NODE_MIGRATION); } - taskc->llc_id = llcx->id; + taskc->llc_id = llcx->id; taskc->node_id = llcx->node_id; if (p->scx.weight < 100) task_ctx_set_flag(taskc, TASK_CTX_F_WAS_NICE); @@ -1672,12 +1795,12 @@ static int p2dq_running_impl(struct task_struct *p) cpu_ctx_clear_flag(cpuc, CPU_CTX_F_NICE_TASK); cpuc->slice_ns = taskc->slice_ns; - cpuc->ran_for = 0; + cpuc->ran_for = 0; // racy, but don't care if (p->scx.dsq_vtime > llcx->vtime && p->scx.dsq_vtime < llcx->vtime + max_dsq_time_slice()) { - __sync_val_compare_and_swap(&llcx->vtime, - llcx->vtime, p->scx.dsq_vtime); + __sync_val_compare_and_swap(&llcx->vtime, llcx->vtime, + p->scx.dsq_vtime); } // If the task is running in the least interactive DSQ, bump the @@ -1702,14 +1825,14 @@ static int p2dq_running_impl(struct task_struct *p) void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) { - task_ctx *taskc; + task_ctx *taskc; struct llc_ctx *llcx; struct cpu_ctx *cpuc; - u64 used, scaled_used, last_dsq_slice_ns; - u64 now = bpf_ktime_get_ns(); + u64 used, scaled_used, last_dsq_slice_ns; + u64 now = bpf_ktime_get_ns(); if (unlikely(!(taskc = lookup_task_ctx(p)) || - !(llcx = lookup_llc_ctx(taskc->llc_id)))) + !(llcx = lookup_llc_ctx(taskc->llc_id)))) return; // can't happen, appease the verifier @@ -1728,13 +1851,13 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) task_ctx_clear_flag(taskc, TASK_CTX_F_WAS_NICE); } - taskc->last_dsq_id = taskc->dsq_id; + taskc->last_dsq_id = taskc->dsq_id; taskc->last_dsq_index = taskc->dsq_index; - taskc->used = 0; + taskc->used = 0; - last_dsq_slice_ns = taskc->slice_ns; - used = now - taskc->last_run_at; - scaled_used = scale_by_task_weight_inverse(p, used); + last_dsq_slice_ns = taskc->slice_ns; + used = now - taskc->last_run_at; + scaled_used = scale_by_task_weight_inverse(p, used); p->scx.dsq_vtime += scaled_used; __sync_fetch_and_add(&llcx->vtime, used); @@ -1742,16 +1865,19 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) /* Update PELT metrics if enabled */ if (p2dq_config.pelt_enabled) { update_task_pelt(taskc, now, used); - aggregate_pelt_to_llc(llcx, taskc, - task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE), - !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS)); 
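Reviewer note: p2dq_stopping charges the task scale_by_task_weight_inverse(used), i.e. the wall-clock runtime scaled down for high-weight tasks, so heavier tasks accrue virtual time more slowly. Assuming the common helper's usual `used * 100 / weight` definition (weight 100 is the default), the accounting looks like:

/// Assumed semantics of scale_by_task_weight_inverse(); 100 is the default weight.
fn scale_by_weight_inverse(used_ns: u64, weight: u64) -> u64 {
    used_ns * 100 / weight
}

fn main() {
    let used_ns = 2_000_000; // task ran for 2ms
    let mut vtime_default = 0u64;
    let mut vtime_heavy = 0u64;

    vtime_default += scale_by_weight_inverse(used_ns, 100); // +2ms of vtime
    vtime_heavy += scale_by_weight_inverse(used_ns, 200);   // +1ms: advances half as fast

    assert_eq!(vtime_default, 2_000_000);
    assert_eq!(vtime_heavy, 1_000_000);
}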
+ aggregate_pelt_to_llc( + llcx, taskc, + task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE), + !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS)); } /* Legacy load tracking (when PELT disabled) */ if (!p2dq_config.pelt_enabled) { __sync_fetch_and_add(&llcx->load, used); - if (taskc->dsq_index >= 0 && taskc->dsq_index < MAX_DSQS_PER_LLC) - __sync_fetch_and_add(&llcx->dsq_load[taskc->dsq_index], used); + if (taskc->dsq_index >= 0 && + taskc->dsq_index < MAX_DSQS_PER_LLC) + __sync_fetch_and_add(&llcx->dsq_load[taskc->dsq_index], + used); if (task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE)) __sync_fetch_and_add(&llcx->intr_load, used); @@ -1761,25 +1887,30 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) __sync_fetch_and_add(&llcx->affn_load, used); } - trace("STOPPING %s weight %d slice %llu used %llu scaled %llu", - p->comm, p->scx.weight, last_dsq_slice_ns, used, scaled_used); + trace("STOPPING %s weight %d slice %llu used %llu scaled %llu", p->comm, + p->scx.weight, last_dsq_slice_ns, used, scaled_used); if (!runnable) { used = now - taskc->last_run_started; // Affinitized tasks need stricter thresholds to prevent monopolization - bool is_affinitized = !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); + bool is_affinitized = + !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); u64 inc_threshold = is_affinitized ? - ((19 * last_dsq_slice_ns) / 20) : // 95% for affinitized - ((9 * last_dsq_slice_ns) / 10); // 90% for normal - u64 dec_threshold = is_affinitized ? - (last_dsq_slice_ns / 4) : // 25% for affinitized - (last_dsq_slice_ns / 2); // 50% for normal + ((19 * last_dsq_slice_ns) / + 20) : // 95% for affinitized + ((9 * last_dsq_slice_ns) / + 10); // 90% for normal + u64 dec_threshold = + is_affinitized ? + (last_dsq_slice_ns / 4) : // 25% for affinitized + (last_dsq_slice_ns / 2); // 50% for normal // On stopping determine if the task can move to a longer DSQ by // comparing the used time to the scaled DSQ slice. if (used >= inc_threshold) { - if (taskc->dsq_index < p2dq_config.nr_dsqs_per_llc - 1 && + if (taskc->dsq_index < + p2dq_config.nr_dsqs_per_llc - 1 && p->scx.weight >= 100) { taskc->dsq_index += 1; stat_inc(P2DQ_STAT_DSQ_CHANGE); @@ -1788,13 +1919,12 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) } else { stat_inc(P2DQ_STAT_DSQ_SAME); } - // If under threshold, move the task back down. + // If under threshold, move the task back down. 
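/*
 * A minimal plain-C sketch (not part of the patch) of the promotion /
 * demotion hysteresis used in p2dq_stopping(): 95% / 25% of the slice
 * for affinitized tasks, 90% / 50% otherwise. The demotion branch
 * continues right after this aside. dsq_index_delta() is an
 * illustrative name; the real code additionally checks the DSQ index
 * bounds and the task weight before moving a task.
 */
#include <stdint.h>

/* Returns +1 to move to a longer DSQ, -1 to move to a shorter one, 0 to stay. */
int dsq_index_delta(uint64_t used_ns, uint64_t slice_ns, int affinitized)
{
	/* Affinitized tasks get stricter bounds to prevent monopolization. */
	uint64_t inc = affinitized ? (19 * slice_ns) / 20 : (9 * slice_ns) / 10;
	uint64_t dec = affinitized ? slice_ns / 4 : slice_ns / 2;

	if (used_ns >= inc)
		return 1;
	if (used_ns < dec)
		return -1;
	return 0;
}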
} else if (used < dec_threshold) { if (taskc->dsq_index > 0) { taskc->dsq_index -= 1; stat_inc(P2DQ_STAT_DSQ_CHANGE); - trace("%s[%p]: DSQ dec %llu -> %u", - p->comm, p, + trace("%s[%p]: DSQ dec %llu -> %u", p->comm, p, taskc->last_dsq_index, taskc->dsq_index); } else { stat_inc(P2DQ_STAT_DSQ_SAME); @@ -1810,12 +1940,15 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) if (p2dq_config.task_slice) { if (used >= ((7 * last_dsq_slice_ns) / 8)) { - taskc->slice_ns = clamp_slice((5 * taskc->slice_ns) >> 2); + taskc->slice_ns = + clamp_slice((5 * taskc->slice_ns) >> 2); } else if (used < last_dsq_slice_ns / 2) { - taskc->slice_ns = clamp_slice((7 * taskc->slice_ns) >> 3); + taskc->slice_ns = + clamp_slice((7 * taskc->slice_ns) >> 3); } } else { - taskc->slice_ns = task_dsq_slice_ns(p, taskc->dsq_index); + taskc->slice_ns = + task_dsq_slice_ns(p, taskc->dsq_index); } taskc->last_run_started = 0; if (is_interactive(taskc)) @@ -1828,10 +1961,10 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) static bool consume_llc(struct llc_ctx *llcx) { struct task_struct *p; - task_ctx *taskc; - struct cpu_ctx *cpuc; - s32 cpu; - u64 pid; + task_ctx *taskc; + struct cpu_ctx *cpuc; + s32 cpu; + u64 pid; if (!llcx) return false; @@ -1840,8 +1973,7 @@ static bool consume_llc(struct llc_ctx *llcx) if (!(cpuc = lookup_cpu_ctx(cpu))) return false; - if (p2dq_config.dhq_enabled && - scx_dhq_nr_queued(llcx->mig_dhq) > 0) { + if (p2dq_config.dhq_enabled && scx_dhq_nr_queued(llcx->mig_dhq) > 0) { pid = scx_dhq_pop_strand(llcx->mig_dhq, llcx->dhq_strand); if (!pid) { trace("DHQ pop returned NULL"); @@ -1860,13 +1992,10 @@ static bool consume_llc(struct llc_ctx *llcx) } /* Insert to LLC DSQ and let move_to_local handle affinity atomically */ - trace("DHQ %llu insert %s[%d] to LLC DSQ", - llcx->mig_dhq, p->comm, p->pid); - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, - taskc->slice_ns, - p->scx.dsq_vtime, - taskc->enq_flags); + trace("DHQ %llu insert %s[%d] to LLC DSQ", llcx->mig_dhq, + p->comm, p->pid); + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, + p->scx.dsq_vtime, taskc->enq_flags); bpf_task_release(p); /* Try to dispatch from LLC DSQ (handles affinity check atomically) */ @@ -1875,22 +2004,19 @@ static bool consume_llc(struct llc_ctx *llcx) goto try_dsq; } else if (p2dq_config.atq_enabled && - scx_atq_nr_queued(llcx->mig_atq) > 0) { + scx_atq_nr_queued(llcx->mig_atq) > 0) { taskc = (task_ctx *)scx_atq_pop(llcx->mig_atq); - p = bpf_task_from_pid((s32)taskc->pid); + p = bpf_task_from_pid((s32)taskc->pid); if (!p) { trace("ATQ failed to get pid %llu", taskc->pid); return false; } -/* Insert to LLC DSQ and let move_to_local handle affinity atomically */ - trace("ATQ %llu insert %s[%d] to LLC DSQ", - llcx->mig_atq, p->comm, p->pid); - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, - taskc->slice_ns, - p->scx.dsq_vtime, - taskc->enq_flags); + /* Insert to LLC DSQ and let move_to_local handle affinity atomically */ + trace("ATQ %llu insert %s[%d] to LLC DSQ", llcx->mig_atq, + p->comm, p->pid); + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, + p->scx.dsq_vtime, taskc->enq_flags); bpf_task_release(p); /* Try to dispatch from LLC DSQ (handles affinity check atomically) */ @@ -1905,11 +2031,12 @@ static bool consume_llc(struct llc_ctx *llcx) return false; } -static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, struct cpu_ctx *cpuc) +static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, + struct 
cpu_ctx *cpuc) { struct llc_ctx *first, *second, *left, *right; - int i; - u64 cur_load; + int i; + u64 cur_load; // Single-LLC fast path: skip pick-2 entirely if (unlikely(lb_config.single_llc_mode)) @@ -1920,11 +2047,10 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, // If on a single LLC there isn't anything left to try. if (unlikely(topo_config.nr_llcs == 1 || - lb_config.dispatch_pick2_disable || - topo_config.nr_llcs >= MAX_LLCS)) + lb_config.dispatch_pick2_disable || + topo_config.nr_llcs >= MAX_LLCS)) return -EINVAL; - if (lb_config.min_nr_queued_pick2 > 0) { u64 nr_queued = llc_nr_queued(cur_llcx); if (nr_queued < lb_config.min_nr_queued_pick2) @@ -1944,8 +2070,10 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, * from. This yields better work conservation on machines with a large * number of LLCs. */ - left = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[0]) : rand_llc_ctx(); - right = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[1]) : rand_llc_ctx(); + left = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[0]) : + rand_llc_ctx(); + right = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[1]) : + rand_llc_ctx(); if (!left || !right) return -EINVAL; @@ -1959,33 +2087,31 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, return -EINVAL; } - if (llc_get_load(right) > llc_get_load(left)) { - first = right; + first = right; second = left; } else { - first = left; + first = left; second = right; } // Handle the edge case where there are two LLCs and the current has // more load. Since it's already been checked start with the other LLC. if (topo_config.nr_llcs == 2 && first->id == cur_llcx->id) { - first = second; + first = second; second = cur_llcx; } - trace("PICK2 cpu[%d] first[%d] %llu second[%d] %llu", - cpu, first->id, llc_get_load(first), second->id, llc_get_load(second)); + trace("PICK2 cpu[%d] first[%d] %llu second[%d] %llu", cpu, first->id, + llc_get_load(first), second->id, llc_get_load(second)); - cur_load = llc_get_load(cur_llcx) + ((llc_get_load(cur_llcx) * lb_config.slack_factor) / 100); + cur_load = llc_get_load(cur_llcx) + + ((llc_get_load(cur_llcx) * lb_config.slack_factor) / 100); - if (llc_get_load(first) >= cur_load && - consume_llc(first)) + if (llc_get_load(first) >= cur_load && consume_llc(first)) return 0; - if (llc_get_load(second) >= cur_load && - consume_llc(second)) + if (llc_get_load(second) >= cur_load && consume_llc(second)) return 0; if (saturated) { @@ -1996,8 +2122,7 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, return 0; // If the system is saturated then be aggressive in trying to load balance. 
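/*
 * A minimal plain-C sketch (not part of the patch) of the pick-two
 * steal test used above: a candidate LLC is only worth stealing from
 * when its load exceeds the current LLC's load plus a slack margin,
 * i.e. cur_load + (cur_load * slack_factor) / 100. The saturated
 * fallback mentioned by the comment above continues right below.
 * pick2_should_steal() is an illustrative name.
 */
#include <stdint.h>

int pick2_should_steal(uint64_t cur_load, uint64_t cand_load, uint64_t slack_pct)
{
	uint64_t threshold = cur_load + (cur_load * slack_pct) / 100;

	/* Steal only from a clearly busier LLC; ties and small deltas stay put. */
	return cand_load >= threshold;
}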
- if (topo_config.nr_llcs > 2 && - (first = rand_llc_ctx()) && + if (topo_config.nr_llcs > 2 && (first = rand_llc_ctx()) && consume_llc(first)) return 0; } @@ -2005,18 +2130,17 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, return 0; } - static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) { struct task_struct *p; - task_ctx *taskc; - struct cpu_ctx *cpuc; - struct llc_ctx *llcx; - u64 pid, dsq_id = 0; - scx_atq_t *min_atq = NULL; - scx_dhq_t *min_dhq = NULL; - - cpuc = lookup_cpu_ctx(cpu); + task_ctx *taskc; + struct cpu_ctx *cpuc; + struct llc_ctx *llcx; + u64 pid, dsq_id = 0; + scx_atq_t *min_atq = NULL; + scx_dhq_t *min_dhq = NULL; + + cpuc = lookup_cpu_ctx(cpu); if (unlikely(!cpuc)) { scx_bpf_error("no valid CPU contexts in dispatch"); return; @@ -2031,38 +2155,52 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (p) { if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { min_vtime = p->scx.dsq_vtime; - dsq_id = cpuc->affn_dsq; + dsq_id = cpuc->affn_dsq; } else { // Task at head of affn_dsq can't run here - move it to correct affn_dsq // This prevents livelock where mismatched tasks block the queue - s32 target_cpu = bpf_cpumask_any_distribute(p->cpus_ptr); + s32 target_cpu = + bpf_cpumask_any_distribute(p->cpus_ptr); if (target_cpu >= 0 && target_cpu < NR_CPUS) { - struct cpu_ctx *target_cpuc = lookup_cpu_ctx(target_cpu); + struct cpu_ctx *target_cpuc = + lookup_cpu_ctx(target_cpu); if (target_cpuc) { - bpf_for_each(scx_dsq, p, cpuc->affn_dsq, 0) { - if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + bpf_for_each(scx_dsq, p, cpuc->affn_dsq, + 0) { + if (bpf_cpumask_test_cpu( + cpu, p->cpus_ptr)) { // Found a task that belongs here, stop cleanup break; } // Move mismatched task to its target CPU's affn_dsq - target_cpu = bpf_cpumask_any_distribute(p->cpus_ptr); - if (target_cpu >= 0 && target_cpu < NR_CPUS) { - target_cpuc = lookup_cpu_ctx(target_cpu); + target_cpu = + bpf_cpumask_any_distribute( + p->cpus_ptr); + if (target_cpu >= 0 && + target_cpu < NR_CPUS) { + target_cpuc = lookup_cpu_ctx( + target_cpu); if (target_cpuc) { - __COMPAT_scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, - p, - target_cpuc->affn_dsq, - 0); + __COMPAT_scx_bpf_dsq_move_vtime( + BPF_FOR_EACH_ITER, + p, + target_cpuc + ->affn_dsq, + 0); trace("DISPATCH cpu[%d] moved affn task %d to cpu[%d] affn_dsq", - cpu, p->pid, target_cpu); + cpu, + p->pid, + target_cpu); } } } // Re-peek after cleanup - p = __COMPAT_scx_bpf_dsq_peek(cpuc->affn_dsq); - if (p && bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + p = __COMPAT_scx_bpf_dsq_peek( + cpuc->affn_dsq); + if (p && bpf_cpumask_test_cpu( + cpu, p->cpus_ptr)) { min_vtime = p->scx.dsq_vtime; - dsq_id = cpuc->affn_dsq; + dsq_id = cpuc->affn_dsq; } } } @@ -2077,7 +2215,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (llcx && llcx->cpumask) { s32 other_cpu; - bpf_for(other_cpu, 0, topo_config.nr_cpus) { + bpf_for(other_cpu, 0, topo_config.nr_cpus) + { struct bpf_cpumask *llc_cpumask; if (other_cpu == cpu) @@ -2087,7 +2226,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (!llc_cpumask) continue; - if (!bpf_cpumask_test_cpu(other_cpu, cast_mask(llc_cpumask))) + if (!bpf_cpumask_test_cpu(other_cpu, + cast_mask(llc_cpumask))) continue; struct cpu_ctx *other_cpuc = lookup_cpu_ctx(other_cpu); @@ -2099,7 +2239,7 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (p && bpf_cpumask_test_cpu(cpu, p->cpus_ptr) && (p->scx.dsq_vtime < min_vtime || min_vtime == 0)) { 
min_vtime = p->scx.dsq_vtime; - dsq_id = other_cpuc->affn_dsq; + dsq_id = other_cpuc->affn_dsq; } } } @@ -2110,28 +2250,33 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (p && (p->scx.dsq_vtime < min_vtime || min_vtime == 0) && bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { min_vtime = p->scx.dsq_vtime; - dsq_id = cpuc->llc_dsq; + dsq_id = cpuc->llc_dsq; } // Migration eligible vtime if (topo_config.nr_llcs > 1) { if (p2dq_config.dhq_enabled) { - pid = scx_dhq_peek_strand(cpuc->mig_dhq, cpuc->dhq_strand); + pid = scx_dhq_peek_strand(cpuc->mig_dhq, + cpuc->dhq_strand); if (pid && (p = bpf_task_from_pid((s32)pid))) { - if (likely(bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) && - (p->scx.dsq_vtime < min_vtime || min_vtime == 0)) { + if (likely(bpf_cpumask_test_cpu(cpu, + p->cpus_ptr)) && + (p->scx.dsq_vtime < min_vtime || + min_vtime == 0)) { min_vtime = p->scx.dsq_vtime; - min_dhq = cpuc->mig_dhq; + min_dhq = cpuc->mig_dhq; } bpf_task_release(p); } } else if (p2dq_config.atq_enabled) { pid = scx_atq_peek(cpuc->mig_atq); if ((p = bpf_task_from_pid((s32)pid))) { - if (likely(bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) && - (p->scx.dsq_vtime < min_vtime || min_vtime == 0)) { + if (likely(bpf_cpumask_test_cpu(cpu, + p->cpus_ptr)) && + (p->scx.dsq_vtime < min_vtime || + min_vtime == 0)) { min_vtime = p->scx.dsq_vtime; - min_atq = cpuc->mig_atq; + min_atq = cpuc->mig_atq; /* * With ATQs we can peek and pop to check that * the popped task is the same as the peeked task. @@ -2145,10 +2290,11 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) } else { // Peek migration DSQ - only consider tasks that can run here p = __COMPAT_scx_bpf_dsq_peek(cpuc->mig_dsq); - if (p && likely(bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) && + if (p && + likely(bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) && (p->scx.dsq_vtime < min_vtime || min_vtime == 0)) { min_vtime = p->scx.dsq_vtime; - dsq_id = cpuc->mig_dsq; + dsq_id = cpuc->mig_dsq; } } } @@ -2161,7 +2307,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) // First try the DHQ/ATQ with the lowest vtime for fairness. 
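/*
 * A minimal plain-C sketch (not part of the patch) of the "lowest
 * vtime wins" scan the dispatch path above performs over the per-CPU
 * affn_dsq, the LLC DSQ and the migration DHQ/ATQ/DSQ. min_vtime == 0
 * doubles as "no candidate yet", matching the
 * (vtime < min_vtime || min_vtime == 0) tests in the hunks above.
 * The names below are illustrative; the DHQ pop path follows.
 */
#include <stdint.h>

struct vtime_pick {
	uint64_t min_vtime;	/* 0 means nothing has been selected yet */
	uint64_t source_id;	/* DSQ (or queue) to dispatch from */
};

void consider_source(struct vtime_pick *pick, uint64_t vtime, uint64_t source_id)
{
	if (vtime < pick->min_vtime || pick->min_vtime == 0) {
		pick->min_vtime = vtime;
		pick->source_id = source_id;
	}
}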
if (unlikely(min_dhq)) { - trace("DHQ dispatching %llu with min vtime %llu", min_dhq, min_vtime); + trace("DHQ dispatching %llu with min vtime %llu", min_dhq, + min_vtime); pid = scx_dhq_pop_strand(min_dhq, cpuc->dhq_strand); if (likely(pid && (p = bpf_task_from_pid((s32)pid)))) { if (unlikely(!(taskc = lookup_task_ctx(p)))) { @@ -2173,8 +2320,7 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) /* Check if task can still run on current CPU */ /* Insert to LLC DSQ for atomic affinity handling */ - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, p->scx.dsq_vtime, taskc->enq_flags); @@ -2185,7 +2331,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) return; } } else if (unlikely(min_atq)) { - trace("ATQ dispatching %llu with min vtime %llu", min_atq, min_vtime); + trace("ATQ dispatching %llu with min vtime %llu", min_atq, + min_vtime); pid = scx_atq_pop(min_atq); if (likely((p = bpf_task_from_pid((s32)pid)))) { /* @@ -2198,10 +2345,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) return; } - /* Insert to LLC DSQ for atomic affinity handling */ - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, p->scx.dsq_vtime, taskc->enq_flags); @@ -2212,7 +2357,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) return; } } else { - if (likely(valid_dsq(dsq_id) && scx_bpf_dsq_move_to_local(dsq_id))) + if (likely(valid_dsq(dsq_id) && + scx_bpf_dsq_move_to_local(dsq_id))) return; } @@ -2224,17 +2370,24 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) scx_bpf_dsq_move_to_local(cpuc->llc_dsq)) return; - if ((llcx = lookup_llc_ctx(cpuc->llc_id)) && llcx->nr_shards > 1) { + if ((llcx = lookup_llc_ctx(cpuc->llc_id)) && + llcx->nr_shards > 1) { // Then try other shards in the LLC for work stealing u32 shard_idx; - bpf_for(shard_idx, 0, llcx->nr_shards) { + bpf_for(shard_idx, 0, llcx->nr_shards) + { u32 offset = cpuc->id % llcx->nr_shards; - shard_idx = wrap_index(offset + shard_idx, 0, llcx->nr_shards); + shard_idx = wrap_index(offset + shard_idx, 0, + llcx->nr_shards); // TODO: should probably take min vtime to be fair - if (shard_idx < MAX_LLC_SHARDS && shard_idx < llcx->nr_shards) { - u64 shard_dsq = *MEMBER_VPTR(llcx->shard_dsqs, [shard_idx]); - if (shard_dsq != cpuc->llc_dsq && shard_dsq != dsq_id && - scx_bpf_dsq_move_to_local(shard_dsq)) + if (shard_idx < MAX_LLC_SHARDS && + shard_idx < llcx->nr_shards) { + u64 shard_dsq = *MEMBER_VPTR( + llcx->shard_dsqs, [shard_idx]); + if (shard_dsq != cpuc->llc_dsq && + shard_dsq != dsq_id && + scx_bpf_dsq_move_to_local( + shard_dsq)) return; } } @@ -2257,8 +2410,7 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) /* Check if task can still run on current CPU */ /* Insert to LLC DSQ for atomic affinity handling */ - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, p->scx.dsq_vtime, taskc->enq_flags); @@ -2279,8 +2431,7 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) /* Check if task can still run on current CPU */ /* Insert to LLC DSQ for atomic affinity handling */ - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, p->scx.dsq_vtime, taskc->enq_flags); @@ -2292,13 +2443,13 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) } } else { if (likely(cpuc && dsq_id != cpuc->mig_dsq && - 
scx_bpf_dsq_move_to_local(cpuc->mig_dsq))) + scx_bpf_dsq_move_to_local(cpuc->mig_dsq))) return; } // Lookup LLC ctx (should never fail at this point) if (unlikely(p2dq_config.llc_shards <= 1 && - !(llcx = lookup_llc_ctx(cpuc->llc_id)))) { + !(llcx = lookup_llc_ctx(cpuc->llc_id)))) { scx_bpf_error("invalid llc id %u", cpuc->llc_id); return; } @@ -2313,10 +2464,10 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) void BPF_STRUCT_OPS(p2dq_set_cpumask, struct task_struct *p, const struct cpumask *cpumask) { - task_ctx *taskc; + task_ctx *taskc; struct cpu_ctx *cpuc; struct llc_ctx *llcx; - bool was_all_cpus, is_all_cpus; + bool was_all_cpus, is_all_cpus; if (!(taskc = lookup_task_ctx(p))) return; @@ -2334,8 +2485,7 @@ void BPF_STRUCT_OPS(p2dq_set_cpumask, struct task_struct *p, // If affinity narrowed from all CPUs to restricted, and task is in // migration DSQ, move it to LLC DSQ to prevent cross-LLC livelock - if (was_all_cpus && !is_all_cpus && - valid_dsq(taskc->dsq_id) && + if (was_all_cpus && !is_all_cpus && valid_dsq(taskc->dsq_id) && (taskc->dsq_id & P2DQ_MIG_DSQ)) { s32 cpu = scx_bpf_task_cpu(p); if (cpu < 0 || cpu >= topo_config.nr_cpus) @@ -2359,21 +2509,22 @@ void BPF_STRUCT_OPS(p2dq_set_cpumask, struct task_struct *p, void BPF_STRUCT_OPS(p2dq_update_idle, s32 cpu, bool idle) { const struct cpumask *idle_cpumask; - struct llc_ctx *llcx; - u64 idle_score; - int ret, priority; - u32 percent_idle; + struct llc_ctx *llcx; + u64 idle_score; + int ret, priority; + u32 percent_idle; idle_cpumask = scx_bpf_get_idle_cpumask(); percent_idle = idle_cpu_percent(idle_cpumask); - saturated = percent_idle < p2dq_config.saturated_percent; + saturated = percent_idle < p2dq_config.saturated_percent; if (saturated) { min_llc_runs_pick2 = min(2, lb_config.min_llc_runs_pick2); } else { - u32 llc_scaler = log2_u32(topo_config.nr_llcs); - min_llc_runs_pick2 = min(log2_u32(percent_idle) + llc_scaler, lb_config.min_llc_runs_pick2); + u32 llc_scaler = log2_u32(topo_config.nr_llcs); + min_llc_runs_pick2 = min(log2_u32(percent_idle) + llc_scaler, + lb_config.min_llc_runs_pick2); } if (!(llcx = lookup_cpu_llc_ctx(cpu))) { @@ -2386,9 +2537,9 @@ void BPF_STRUCT_OPS(p2dq_update_idle, s32 cpu, bool idle) if (idle) { llc_ctx_clear_flag(llcx, LLC_CTX_F_SATURATED); overloaded = false; - } else if (!idle && llcx->cpumask && idle_cpumask && llcx->tmp_cpumask) { - bpf_cpumask_and(llcx->tmp_cpumask, - cast_mask(llcx->cpumask), + } else if (!idle && llcx->cpumask && idle_cpumask && + llcx->tmp_cpumask) { + bpf_cpumask_and(llcx->tmp_cpumask, cast_mask(llcx->cpumask), idle_cpumask); if (llcx->tmp_cpumask && bpf_cpumask_weight(cast_mask(llcx->tmp_cpumask)) == 0) @@ -2410,7 +2561,7 @@ void BPF_STRUCT_OPS(p2dq_update_idle, s32 cpu, bool idle) priority = 1; // Since we use a minheap convert the highest prio to lowest score. 
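/*
 * A minimal plain-C sketch (not part of the patch) of the min-heap
 * score conversion described by the comment above: higher priority
 * must pop first from a min-heap, so the priority, scaled by 1 << 7
 * as in the line below, is subtracted from the current timestamp.
 * idle_cpu_score() is an illustrative name.
 */
#include <stdint.h>

uint64_t idle_cpu_score(uint64_t now_ns, uint64_t priority)
{
	/* Larger priority => smaller score => popped earlier. */
	return now_ns - ((1 << 7) * priority);
}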
- idle_score = scx_bpf_now() - ((1<<7) * (u64)priority); + idle_score = scx_bpf_now() - ((1 << 7) * (u64)priority); if ((ret = arena_spin_lock((void __arena *)&llcx->idle_lock))) return; @@ -2421,18 +2572,19 @@ void BPF_STRUCT_OPS(p2dq_update_idle, s32 cpu, bool idle) return; } -static s32 p2dq_init_task_impl(struct task_struct *p, struct scx_init_task_args *args) +static s32 p2dq_init_task_impl(struct task_struct *p, + struct scx_init_task_args *args) { struct mask_wrapper *wrapper; - struct bpf_cpumask *cpumask; - task_ctx *taskc; - struct cpu_ctx *cpuc; - struct llc_ctx *llcx; - u64 slice_ns; + struct bpf_cpumask *cpumask; + task_ctx *taskc; + struct cpu_ctx *cpuc; + struct llc_ctx *llcx; + u64 slice_ns; - s32 task_cpu = scx_bpf_task_cpu(p); + s32 task_cpu = scx_bpf_task_cpu(p); - taskc = scx_task_alloc(p); + taskc = scx_task_alloc(p); if (!taskc) { scx_bpf_error("task_ctx allocation failure"); return -ENOMEM; @@ -2461,10 +2613,10 @@ static s32 p2dq_init_task_impl(struct task_struct *p, struct scx_init_task_args return -EINVAL; } - slice_ns = scale_by_task_weight(p, - dsq_time_slice(p2dq_config.init_dsq_index)); + slice_ns = scale_by_task_weight( + p, dsq_time_slice(p2dq_config.init_dsq_index)); - taskc->llc_id = cpuc->llc_id; + taskc->llc_id = cpuc->llc_id; taskc->node_id = cpuc->node_id; // Adjust starting index based on niceness @@ -2476,8 +2628,8 @@ static s32 p2dq_init_task_impl(struct task_struct *p, struct scx_init_task_args taskc->dsq_index = p2dq_config.nr_dsqs_per_llc - 1; } taskc->last_dsq_index = taskc->dsq_index; - taskc->slice_ns = slice_ns; - taskc->enq_flags = 0; + taskc->slice_ns = slice_ns; + taskc->enq_flags = 0; if (p->cpus_ptr == &p->cpus_mask && p->nr_cpus_allowed == topo_config.nr_cpus) @@ -2511,11 +2663,67 @@ void BPF_STRUCT_OPS(p2dq_exit_task, struct task_struct *p, scx_task_free(p); } +static int init_cluster(u32 cluster_index) +{ + struct cluster_ctx *clusterx; + u32 cluster_id = cluster_ids[cluster_index]; + int ret; + + clusterx = bpf_map_lookup_elem(&cluster_ctxs, &cluster_id); + if (!clusterx) { + scx_bpf_error("No cluster %u", cluster_id); + return -ENOENT; + } + + clusterx->id = *MEMBER_VPTR(cluster_ids, [cluster_index]); + clusterx->nr_cpus = 0; + clusterx->vtime = 0; + clusterx->load = 0; + clusterx->affn_load = 0; + clusterx->state_flags = 0; + + // Create cluster-local DSQ + clusterx->dsq = clusterx->id | (MAX_CLUSTERS << 8); + ret = scx_bpf_create_dsq(clusterx->dsq, clusterx->node_id); + if (ret) { + scx_bpf_error("failed to create cluster DSQ %llu", + clusterx->dsq); + return -EINVAL; + } + + // Initialize cluster cpumasks + ret = init_cpumask(&clusterx->cpumask); + if (ret) { + scx_bpf_error("failed to create cluster cpumask"); + return ret; + } + + ret = init_cpumask(&clusterx->tmp_cpumask); + if (ret) { + scx_bpf_error("failed to create cluster tmp_cpumask"); + return ret; + } + + ret = init_cpumask(&clusterx->big_cpumask); + if (ret) { + scx_bpf_error("failed to create cluster big cpumask"); + return ret; + } + + ret = init_cpumask(&clusterx->little_cpumask); + if (ret) { + scx_bpf_error("failed to create cluster little cpumask"); + return ret; + } + + return 0; +} + static int init_llc(u32 llc_index) { struct llc_ctx *llcx; - u32 llc_id = llc_ids[llc_index]; - int i, ret; + u32 llc_id = llc_ids[llc_index]; + int i, ret; llcx = bpf_map_lookup_elem(&llc_ctxs, &llc_id); if (!llcx) { @@ -2523,13 +2731,13 @@ static int init_llc(u32 llc_index) return -ENOENT; } - llcx->vtime = 0; - llcx->id = *MEMBER_VPTR(llc_ids, [llc_index]); - llcx->index = 
llc_index; + llcx->vtime = 0; + llcx->id = *MEMBER_VPTR(llc_ids, [llc_index]); + llcx->index = llc_index; llcx->nr_cpus = 0; - llcx->vtime = 0; + llcx->vtime = 0; - ret = llc_create_atqs(llcx); + ret = llc_create_atqs(llcx); if (ret) { return ret; } @@ -2540,14 +2748,14 @@ static int init_llc(u32 llc_index) } llcx->dsq = llcx->id | MAX_LLCS; - ret = scx_bpf_create_dsq(llcx->dsq, llcx->node_id); + ret = scx_bpf_create_dsq(llcx->dsq, llcx->node_id); if (ret) { scx_bpf_error("failed to create DSQ %llu", llcx->dsq); return -EINVAL; } llcx->mig_dsq = llcx->id | P2DQ_MIG_DSQ; - ret = scx_bpf_create_dsq(llcx->mig_dsq, llcx->node_id); + ret = scx_bpf_create_dsq(llcx->mig_dsq, llcx->node_id); if (ret) { scx_bpf_error("failed to create DSQ %llu", llcx->mig_dsq); return -EINVAL; @@ -2588,17 +2796,21 @@ static int init_llc(u32 llc_index) llcx->nr_shards = p2dq_config.llc_shards; if (p2dq_config.llc_shards > 1) { - llcx->nr_shards = min(min(p2dq_config.llc_shards, llcx->nr_cpus), MAX_LLC_SHARDS); + llcx->nr_shards = + min(min(p2dq_config.llc_shards, llcx->nr_cpus), + MAX_LLC_SHARDS); - bpf_for(i, 0, llcx->nr_shards) { + bpf_for(i, 0, llcx->nr_shards) + { u64 shard_dsq = shard_dsq_id(llc_id, i); if (i < MAX_LLC_SHARDS) // verifier llcx->shard_dsqs[i] = shard_dsq; ret = scx_bpf_create_dsq(shard_dsq, llcx->node_id); if (ret) { - scx_bpf_error("failed to create shard DSQ %llu for LLC %u shard %u", - shard_dsq, llc_id, i); + scx_bpf_error( + "failed to create shard DSQ %llu for LLC %u shard %u", + shard_dsq, llc_id, i); return ret; } } @@ -2610,7 +2822,7 @@ static int init_llc(u32 llc_index) static int init_node(u32 node_id) { struct node_ctx *nodec; - int ret; + int ret; nodec = bpf_map_lookup_elem(&node_ctxs, &node_id); if (!nodec) { @@ -2620,7 +2832,7 @@ static int init_node(u32 node_id) nodec->id = node_id; - ret = init_cpumask(&nodec->cpumask); + ret = init_cpumask(&nodec->cpumask); if (ret) { scx_bpf_error("failed to create node cpumask"); return ret; @@ -2641,16 +2853,19 @@ static int init_node(u32 node_id) // Initializes per CPU data structures. static s32 init_cpu(int cpu) { - struct node_ctx *nodec; - struct llc_ctx *llcx; - struct cpu_ctx *cpuc; + struct node_ctx *nodec; + struct llc_ctx *llcx; + struct cluster_ctx *clusterx = NULL; + struct cpu_ctx *cpuc; if (!(cpuc = lookup_cpu_ctx(cpu))) return -ENOENT; - cpuc->id = cpu; - cpuc->llc_id = cpu_llc_ids[cpu]; - cpuc->node_id = cpu_node_ids[cpu]; + cpuc->id = cpu; + cpuc->llc_id = cpu_llc_ids[cpu]; + cpuc->cluster_id = cpu_cluster_ids[cpu]; + cpuc->node_id = cpu_node_ids[cpu]; + // cluster_id will be populated from Rust userspace via cpu_cluster_ids[] if (big_core_ids[cpu] == 1) cpu_ctx_set_flag(cpuc, CPU_CTX_F_IS_BIG); else @@ -2663,13 +2878,30 @@ static s32 init_cpu(int cpu) return -ENOENT; } + // Lookup cluster context if clusters are enabled + if (topo_config.has_clusters && + cpuc->cluster_id < topo_config.nr_clusters) { + clusterx = lookup_cluster_ctx(cpuc->cluster_id); + if (!clusterx) { + scx_bpf_error( + "failed to get cluster ctx for cpu %u cluster %u", + cpu, cpuc->cluster_id); + return -ENOENT; + } + clusterx->nr_cpus += 1; + // Copy for each CPU in cluster (gets overwritten, doesn't matter) + clusterx->llc_id = cpuc->llc_id; + clusterx->node_id = cpuc->node_id; + cpuc->cluster_dsq = clusterx->dsq; + } + // copy for each cpu, doesn't matter if it gets overwritten. 
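/*
 * A minimal plain-C sketch (not part of the patch) of the cluster
 * wiring that init_cpu() gains above: userspace (the Rust side of
 * this patch) fills cpu_cluster_ids[], and the BPF side only touches
 * a cluster context when clusters are enabled and the reported id is
 * in range. The types and names below are reduced stand-ins.
 */
#include <stdint.h>
#include <stddef.h>

struct cluster_stub {
	uint32_t id;
	uint32_t nr_cpus;
	uint64_t dsq;
};

struct cluster_stub *cluster_for_cpu(struct cluster_stub *table,
				     uint32_t nr_clusters, int has_clusters,
				     uint32_t cluster_id)
{
	/* Mirror the guard above: clusters may be absent, and an out-of-range
	 * id coming from userspace must never be dereferenced. */
	if (!has_clusters || cluster_id >= nr_clusters)
		return NULL;
	return &table[cluster_id];
}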
llcx->nr_cpus += 1; - llcx->id = cpu_llc_ids[cpu]; - llcx->node_id = cpu_node_ids[cpu]; - nodec->id = cpu_node_ids[cpu]; - cpuc->mig_atq = llcx->mig_atq; - cpuc->mig_dhq = llcx->mig_dhq; + llcx->id = cpu_llc_ids[cpu]; + llcx->node_id = cpu_node_ids[cpu]; + nodec->id = cpu_node_ids[cpu]; + cpuc->mig_atq = llcx->mig_atq; + cpuc->mig_dhq = llcx->mig_dhq; cpuc->dhq_strand = llcx->dhq_strand; if (cpu_ctx_test_flag(cpuc, CPU_CTX_F_IS_BIG)) { @@ -2681,11 +2913,15 @@ static s32 init_cpu(int cpu) bpf_cpumask_set_cpu(cpu, nodec->big_cpumask); if (llcx->big_cpumask) bpf_cpumask_set_cpu(cpu, llcx->big_cpumask); + if (clusterx && clusterx->big_cpumask) + bpf_cpumask_set_cpu(cpu, clusterx->big_cpumask); bpf_rcu_read_unlock(); } else { bpf_rcu_read_lock(); if (llcx->little_cpumask) bpf_cpumask_set_cpu(cpu, llcx->little_cpumask); + if (clusterx && clusterx->little_cpumask) + bpf_cpumask_set_cpu(cpu, clusterx->little_cpumask); bpf_rcu_read_unlock(); } @@ -2696,10 +2932,16 @@ static s32 init_cpu(int cpu) bpf_cpumask_set_cpu(cpu, nodec->cpumask); if (llcx->cpumask) bpf_cpumask_set_cpu(cpu, llcx->cpumask); + if (clusterx && clusterx->cpumask) + bpf_cpumask_set_cpu(cpu, clusterx->cpumask); bpf_rcu_read_unlock(); - trace("CFG CPU[%d]NODE[%d]LLC[%d] initialized", - cpu, cpuc->node_id, cpuc->llc_id); + if (topo_config.has_clusters) + trace("CFG CPU[%d]NODE[%d]LLC[%d]CLUSTER[%d] initialized", cpu, + cpuc->node_id, cpuc->llc_id, cpuc->cluster_id); + else + trace("CFG CPU[%d]NODE[%d]LLC[%d] initialized", cpu, + cpuc->node_id, cpuc->llc_id); return 0; } @@ -2707,11 +2949,12 @@ static s32 init_cpu(int cpu) static bool load_balance_timer(void) { struct llc_ctx *llcx, *lb_llcx; - int j; - u64 ideal_sum, load_sum = 0, interactive_sum = 0; - u32 llc_id, llc_index, lb_llc_index, lb_llc_id; + int j; + u64 ideal_sum, load_sum = 0, interactive_sum = 0; + u32 llc_id, llc_index, lb_llc_index, lb_llc_id; - bpf_for(llc_index, 0, topo_config.nr_llcs) { + bpf_for(llc_index, 0, topo_config.nr_llcs) + { // verifier if (llc_index >= MAX_LLCS) break; @@ -2722,7 +2965,8 @@ static bool load_balance_timer(void) return false; } - lb_llc_index = (llc_index + llc_lb_offset) % topo_config.nr_llcs; + lb_llc_index = + (llc_index + llc_lb_offset) % topo_config.nr_llcs; if (lb_llc_index < 0 || lb_llc_index >= MAX_LLCS) { scx_bpf_error("failed to lookup lb_llc"); return false; @@ -2735,63 +2979,84 @@ static bool load_balance_timer(void) } /* Use PELT metrics if enabled, otherwise use simple counters */ - u64 llc_load = p2dq_config.pelt_enabled ? llcx->util_avg : llcx->load; - u64 lb_llc_load = p2dq_config.pelt_enabled ? lb_llcx->util_avg : lb_llcx->load; - u64 llc_intr_load = p2dq_config.pelt_enabled ? llcx->intr_util_avg : llcx->intr_load; + u64 llc_load = p2dq_config.pelt_enabled ? llcx->util_avg : + llcx->load; + u64 lb_llc_load = p2dq_config.pelt_enabled ? lb_llcx->util_avg : + lb_llcx->load; + u64 llc_intr_load = p2dq_config.pelt_enabled ? + llcx->intr_util_avg : + llcx->intr_load; load_sum += llc_load; interactive_sum += llc_intr_load; s64 load_imbalance = 0; - if(llc_load > lb_llc_load) - load_imbalance = (100 * (llc_load - lb_llc_load)) / llc_load; + if (llc_load > lb_llc_load) + load_imbalance = + (100 * (llc_load - lb_llc_load)) / llc_load; u32 lb_slack = (lb_config.slack_factor > 0 ? 
- lb_config.slack_factor : LOAD_BALANCE_SLACK); + lb_config.slack_factor : + LOAD_BALANCE_SLACK); if (load_imbalance > lb_slack) llcx->lb_llc_id = lb_llc_id; else llcx->lb_llc_id = MAX_LLCS; - dbg("LB llcx[%u] %llu lb_llcx[%u] %llu imbalance %lli", - llc_id, llc_load, lb_llc_id, lb_llc_load, load_imbalance); + dbg("LB llcx[%u] %llu lb_llcx[%u] %llu imbalance %lli", llc_id, + llc_load, lb_llc_id, lb_llc_load, load_imbalance); } - dbg("LB Total load %llu, Total interactive %llu", - load_sum, interactive_sum); + dbg("LB Total load %llu, Total interactive %llu", load_sum, + interactive_sum); - llc_lb_offset = (llc_lb_offset % (topo_config.nr_llcs - 1)) + 1; + // Only rotate offset if we have more than 2 LLCs + // For 2 LLCs, offset 1 is the only valid value and doesn't need to change + // For 1 LLC, no load balancing between LLCs is needed + if (topo_config.nr_llcs > 2) + llc_lb_offset = (llc_lb_offset % (topo_config.nr_llcs - 1)) + 1; - if (!timeline_config.autoslice || load_sum == 0 || load_sum < interactive_sum) + if (!timeline_config.autoslice || load_sum == 0 || + load_sum < interactive_sum) goto reset_load; if (interactive_sum == 0) { dsq_time_slices[0] = (11 * dsq_time_slices[0]) / 10; - bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) { - dsq_time_slices[j] = dsq_time_slices[0] << j << p2dq_config.dsq_shift; + bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) + { + dsq_time_slices[j] = dsq_time_slices[0] + << j << p2dq_config.dsq_shift; } } else { ideal_sum = (load_sum * p2dq_config.interactive_ratio) / 100; - dbg("LB autoslice ideal/sum %llu/%llu", ideal_sum, interactive_sum); + dbg("LB autoslice ideal/sum %llu/%llu", ideal_sum, + interactive_sum); if (interactive_sum < ideal_sum) { dsq_time_slices[0] = (11 * dsq_time_slices[0]) / 10; - bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) { - dsq_time_slices[j] = dsq_time_slices[0] << j << p2dq_config.dsq_shift; + bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) + { + dsq_time_slices[j] = dsq_time_slices[0] + << j + << p2dq_config.dsq_shift; } } else { - dsq_time_slices[0] = max((10 * dsq_time_slices[0]) / 11, min_slice_ns); - bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) { - dsq_time_slices[j] = dsq_time_slices[0] << j << p2dq_config.dsq_shift; + dsq_time_slices[0] = max((10 * dsq_time_slices[0]) / 11, + min_slice_ns); + bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) + { + dsq_time_slices[j] = dsq_time_slices[0] + << j + << p2dq_config.dsq_shift; } } } - reset_load: - bpf_for(llc_index, 0, topo_config.nr_llcs) { + bpf_for(llc_index, 0, topo_config.nr_llcs) + { llc_id = *MEMBER_VPTR(llc_ids, [llc_index]); if (!(llcx = lookup_llc_ctx(llc_id))) return false; @@ -2801,7 +3066,7 @@ static bool load_balance_timer(void) * weighting. We only reset simple counters for legacy mode. 
*/ if (!p2dq_config.pelt_enabled) { - llcx->load = 0; + llcx->load = 0; llcx->intr_load = 0; llcx->affn_load = 0; } @@ -2809,23 +3074,34 @@ static bool load_balance_timer(void) llcx->last_period_ns = scx_bpf_now(); if (!p2dq_config.pelt_enabled) { - bpf_for(j, 0, p2dq_config.nr_dsqs_per_llc) { + bpf_for(j, 0, p2dq_config.nr_dsqs_per_llc) + { llcx->dsq_load[j] = 0; if (llc_id == 0 && timeline_config.autoslice) { - if (j > 0 && dsq_time_slices[j] < dsq_time_slices[j-1]) { - dsq_time_slices[j] = dsq_time_slices[j-1] << p2dq_config.dsq_shift; + if (j > 0 && + dsq_time_slices[j] < + dsq_time_slices[j - 1]) { + dsq_time_slices[j] = + dsq_time_slices[j - 1] + << p2dq_config.dsq_shift; } - dbg("LB autoslice interactive slice %llu", dsq_time_slices[j]); + dbg("LB autoslice interactive slice %llu", + dsq_time_slices[j]); } } } else { /* Even with PELT, still validate autoslice timings */ if (llc_id == 0 && timeline_config.autoslice) { - bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) { - if (dsq_time_slices[j] < dsq_time_slices[j-1]) { - dsq_time_slices[j] = dsq_time_slices[j-1] << p2dq_config.dsq_shift; + bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) + { + if (dsq_time_slices[j] < + dsq_time_slices[j - 1]) { + dsq_time_slices[j] = + dsq_time_slices[j - 1] + << p2dq_config.dsq_shift; } - dbg("LB autoslice interactive slice %llu", dsq_time_slices[j]); + dbg("LB autoslice interactive slice %llu", + dsq_time_slices[j]); } } } @@ -2844,7 +3120,6 @@ static bool run_timer_cb(int key) } } - static int timer_cb(void *map, int key, struct timer_wrapper *timerw) { if (timerw->key < 0 || timerw->key > MAX_TIMERS) { @@ -2852,27 +3127,26 @@ static int timer_cb(void *map, int key, struct timer_wrapper *timerw) } struct p2dq_timer *cb_timer = &p2dq_timers[timerw->key]; - bool resched = run_timer_cb(timerw->key); + bool resched = run_timer_cb(timerw->key); if (!resched || !cb_timer || cb_timer->interval_ns == 0) { trace("TIMER timer %d stopped", timerw->key); return 0; } - bpf_timer_start(&timerw->timer, - cb_timer->interval_ns, + bpf_timer_start(&timerw->timer, cb_timer->interval_ns, cb_timer->start_flags); return 0; } - s32 static start_timers(void) { struct timer_wrapper *timerw; - int timer_id, err; + int timer_id, err; - bpf_for(timer_id, 0, MAX_TIMERS) { + bpf_for(timer_id, 0, MAX_TIMERS) + { timerw = bpf_map_lookup_elem(&timer_data, &timer_id); if (!timerw || timer_id < 0 || timer_id > MAX_TIMERS) { scx_bpf_error("Failed to lookup timer"); @@ -2886,7 +3160,8 @@ s32 static start_timers(void) } timerw->key = timer_id; - err = bpf_timer_init(&timerw->timer, &timer_data, new_timer->init_flags); + err = bpf_timer_init(&timerw->timer, &timer_data, + new_timer->init_flags); if (err < 0) { scx_bpf_error("can't happen"); return -ENOENT; @@ -2898,8 +3173,7 @@ s32 static start_timers(void) return -ENOENT; } - err = bpf_timer_start(&timerw->timer, - new_timer->interval_ns, + err = bpf_timer_start(&timerw->timer, new_timer->interval_ns, new_timer->start_flags); if (err < 0) { scx_bpf_error("can't happen"); @@ -2914,8 +3188,8 @@ static s32 p2dq_init_impl() { struct llc_ctx *llcx; struct cpu_ctx *cpuc; - int i, ret; - u64 dsq_id; + int i, ret; + u64 dsq_id; ret = init_cpumask(&all_cpumask); if (ret) { @@ -2934,36 +3208,50 @@ static s32 p2dq_init_impl() } // First we initialize LLCs because DSQs are created at the LLC level. 
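/*
 * A minimal plain-C sketch (not part of the patch) of the bottom-up
 * initialization order that p2dq_init_impl() follows below: LLCs first
 * (DSQs are created at the LLC level), then clusters when the topology
 * reports any, then nodes, then CPUs, which attach to all of the
 * parent contexts. Note, as read from the hunks above, that
 * init_cluster() creates the cluster DSQ before init_cpu() copies a
 * node id into the cluster context, so the DSQ's node argument is
 * whatever the map element held at that point. All names below are
 * illustrative placeholders.
 */
#include <stdint.h>

typedef int (*init_fn)(uint32_t idx);

static int init_level(init_fn fn, uint32_t nr)
{
	for (uint32_t i = 0; i < nr; i++) {
		int ret = fn(i);
		if (ret)
			return ret;
	}
	return 0;
}

int topology_init(init_fn llc, init_fn cluster, init_fn node, init_fn cpu,
		  uint32_t nr_llcs, uint32_t nr_clusters, uint32_t nr_nodes,
		  uint32_t nr_cpus, int has_clusters)
{
	int ret;

	if ((ret = init_level(llc, nr_llcs)))
		return ret;
	if (has_clusters && (ret = init_level(cluster, nr_clusters)))
		return ret;
	if ((ret = init_level(node, nr_nodes)))
		return ret;
	return init_level(cpu, nr_cpus);
}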
- bpf_for(i, 0, topo_config.nr_llcs) { + bpf_for(i, 0, topo_config.nr_llcs) + { ret = init_llc(i); if (ret) return ret; } - bpf_for(i, 0, topo_config.nr_nodes) { + // Initialize clusters if cluster awareness is enabled + if (topo_config.has_clusters) { + bpf_for(i, 0, topo_config.nr_clusters) + { + ret = init_cluster(i); + if (ret) + return ret; + } + } + + bpf_for(i, 0, topo_config.nr_nodes) + { ret = init_node(i); if (ret) return ret; } - bpf_for(i, 0, topo_config.nr_cpus) { + bpf_for(i, 0, topo_config.nr_cpus) + { ret = init_cpu(i); if (ret) return ret; } // Create DSQs for the LLCs - bpf_for(i, 0, topo_config.nr_cpus) { + bpf_for(i, 0, topo_config.nr_cpus) + { if (!(cpuc = lookup_cpu_ctx(i)) || !(llcx = lookup_llc_ctx(cpuc->llc_id))) return -EINVAL; - if (cpuc && - llcx->node_cpumask && + if (cpuc && llcx->node_cpumask && llcx->node_id == cpuc->node_id) { bpf_rcu_read_lock(); if (llcx->node_cpumask) - bpf_cpumask_set_cpu(cpuc->id, llcx->node_cpumask); + bpf_cpumask_set_cpu(cpuc->id, + llcx->node_cpumask); bpf_rcu_read_unlock(); } @@ -2973,10 +3261,10 @@ static s32 p2dq_init_impl() if (p2dq_config.llc_shards > 1 && llcx->nr_shards > 1) { int shard_id = cpuc->core_id % llcx->nr_shards; - if (shard_id >= 0 && - shard_id < MAX_LLC_SHARDS && + if (shard_id >= 0 && shard_id < MAX_LLC_SHARDS && shard_id < llcx->nr_shards) - cpuc->llc_dsq = *MEMBER_VPTR(llcx->shard_dsqs, [shard_id]); + cpuc->llc_dsq = *MEMBER_VPTR( + llcx->shard_dsqs, [shard_id]); } dsq_id = cpu_dsq_id(i); @@ -2987,11 +3275,12 @@ static s32 p2dq_init_impl() return ret; } cpuc->affn_dsq = dsq_id; - cpuc->mig_dsq = llcx->mig_dsq; + cpuc->mig_dsq = llcx->mig_dsq; } if (p2dq_config.cpu_priority) { - bpf_for(i, 0, topo_config.nr_llcs) { + bpf_for(i, 0, topo_config.nr_llcs) + { if (!(llcx = lookup_llc_ctx(i))) return -EINVAL; llcx->idle_cpu_heap = scx_minheap_alloc(llcx->nr_cpus); @@ -3022,17 +3311,19 @@ void BPF_STRUCT_OPS(p2dq_running, struct task_struct *p) p2dq_running_impl(p); } -void BPF_STRUCT_OPS(p2dq_enqueue, struct task_struct *p __arg_trusted, u64 enq_flags) +void BPF_STRUCT_OPS(p2dq_enqueue, struct task_struct *p __arg_trusted, + u64 enq_flags) { struct enqueue_promise pro; async_p2dq_enqueue(&pro, p, enq_flags); complete_p2dq_enqueue(&pro, p); } -void BPF_STRUCT_OPS(p2dq_dequeue, struct task_struct *p __arg_trusted, u64 deq_flags) +void BPF_STRUCT_OPS(p2dq_dequeue, struct task_struct *p __arg_trusted, + u64 deq_flags) { task_ctx *taskc = lookup_task_ctx(p); - int ret; + int ret; ret = scx_atq_cancel(&taskc->common); if (ret) @@ -3046,7 +3337,8 @@ void BPF_STRUCT_OPS(p2dq_dispatch, s32 cpu, struct task_struct *prev) return p2dq_dispatch_impl(cpu, prev); } -s32 BPF_STRUCT_OPS(p2dq_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +s32 BPF_STRUCT_OPS(p2dq_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) { return p2dq_select_cpu_impl(p, prev_cpu, wake_flags); } @@ -3057,19 +3349,14 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(p2dq_init_task, struct task_struct *p, return p2dq_init_task_impl(p, args); } -SCX_OPS_DEFINE(p2dq, - .select_cpu = (void *)p2dq_select_cpu, - .enqueue = (void *)p2dq_enqueue, - .dequeue = (void *)p2dq_dequeue, - .dispatch = (void *)p2dq_dispatch, - .running = (void *)p2dq_running, - .stopping = (void *)p2dq_stopping, - .set_cpumask = (void *)p2dq_set_cpumask, - .update_idle = (void *)p2dq_update_idle, - .init_task = (void *)p2dq_init_task, - .exit_task = (void *)p2dq_exit_task, - .init = (void *)p2dq_init, - .exit = (void *)p2dq_exit, - .timeout_ms = 25000, - .name = "p2dq"); 
+SCX_OPS_DEFINE(p2dq, .select_cpu = (void *)p2dq_select_cpu, + .enqueue = (void *)p2dq_enqueue, .dequeue = (void *)p2dq_dequeue, + .dispatch = (void *)p2dq_dispatch, + .running = (void *)p2dq_running, + .stopping = (void *)p2dq_stopping, + .set_cpumask = (void *)p2dq_set_cpumask, + .update_idle = (void *)p2dq_update_idle, + .init_task = (void *)p2dq_init_task, + .exit_task = (void *)p2dq_exit_task, .init = (void *)p2dq_init, + .exit = (void *)p2dq_exit, .timeout_ms = 25000, .name = "p2dq"); #endif diff --git a/scheds/rust/scx_p2dq/src/bpf/types.h b/scheds/rust/scx_p2dq/src/bpf/types.h index 6ccc1ea7d3..6368971349 100644 --- a/scheds/rust/scx_p2dq/src/bpf/types.h +++ b/scheds/rust/scx_p2dq/src/bpf/types.h @@ -29,156 +29,213 @@ struct p2dq_timer { }; /* cpu_ctx flag bits */ -#define CPU_CTX_F_INTERACTIVE (1 << 0) -#define CPU_CTX_F_IS_BIG (1 << 1) -#define CPU_CTX_F_NICE_TASK (1 << 2) -#define CPU_CTX_F_CLEAN_AFFN_DSQ (1 << 3) +#define CPU_CTX_F_INTERACTIVE (1 << 0) +#define CPU_CTX_F_IS_BIG (1 << 1) +#define CPU_CTX_F_NICE_TASK (1 << 2) +#define CPU_CTX_F_CLEAN_AFFN_DSQ (1 << 3) /* Helper macros for cpu_ctx flags */ -#define cpu_ctx_set_flag(cpuc, flag) ((cpuc)->flags |= (flag)) -#define cpu_ctx_clear_flag(cpuc, flag) ((cpuc)->flags &= ~(flag)) -#define cpu_ctx_test_flag(cpuc, flag) ((cpuc)->flags & (flag)) +#define cpu_ctx_set_flag(cpuc, flag) ((cpuc)->flags |= (flag)) +#define cpu_ctx_clear_flag(cpuc, flag) ((cpuc)->flags &= ~(flag)) +#define cpu_ctx_test_flag(cpuc, flag) ((cpuc)->flags & (flag)) struct cpu_ctx { - int id; - u32 llc_id; - u64 affn_dsq; - u64 slice_ns; - u32 core_id; - u32 dsq_index; - u32 perf; - u32 flags; /* Bitmask for interactive, is_big, nice_task */ - u64 ran_for; - u32 node_id; - u64 mig_dsq; - u64 llc_dsq; - u64 max_load_dsq; - - scx_atq_t *mig_atq; - scx_dhq_t *mig_dhq; - u64 dhq_strand; /* Which DHQ strand (A or B) for this CPU's LLC */ + int id; + u32 llc_id; + u32 cluster_id; + u64 affn_dsq; + u64 cluster_dsq; + u64 slice_ns; + u32 core_id; + u32 dsq_index; + u32 perf; + u32 flags; /* Bitmask for interactive, is_big, nice_task */ + u64 ran_for; + u32 node_id; + u64 mig_dsq; + u64 llc_dsq; + u64 max_load_dsq; + + scx_atq_t *mig_atq; + scx_dhq_t *mig_dhq; + u64 dhq_strand; /* Which DHQ strand (A or B) for this CPU's LLC */ +}; + +/* cluster_ctx state flag bits */ +#define CLUSTER_CTX_F_SATURATED (1 << 0) + +/* Helper macros for cluster_ctx state flags */ +#define cluster_ctx_set_flag(clusterx, flag) ((clusterx)->state_flags |= (flag)) +#define cluster_ctx_clear_flag(clusterx, flag) \ + ((clusterx)->state_flags &= ~(flag)) +#define cluster_ctx_test_flag(clusterx, flag) ((clusterx)->state_flags & (flag)) + +struct cluster_ctx { + /* Read-mostly fields - grouped together */ + u32 id; + u32 kernel_id; + u32 llc_id; + u32 node_id; + u32 nr_cpus; + u64 dsq; + u64 last_period_ns; + + /* + * Hot atomic field #1: vtime - frequently updated + * Padded to separate cache line from read-mostly fields above + */ + char __pad1[CACHE_LINE_SIZE]; + u64 vtime; + + /* + * Hot atomic fields #2: load counters - frequently updated + * Keep these together on same cache line since they're updated atomically together + */ + char __pad2[CACHE_LINE_SIZE - sizeof(u64)]; + u64 load; + u64 affn_load; + u32 state_flags; /* Bitmask for saturated and other state */ + + /* + * Hot atomic field #3: idle lock - frequently contended in idle CPU selection + * Separate cache line from load counters above + */ + char __pad3[CACHE_LINE_SIZE - 2 * sizeof(u64) - sizeof(u32)]; + arena_spinlock_t 
idle_lock; + + /* + * Read-mostly pointers - grouped together + * Accessed during CPU selection but not updated frequently + */ + char __pad4[CACHE_LINE_SIZE - sizeof(arena_spinlock_t)]; + struct bpf_cpumask __kptr *cpumask; + struct bpf_cpumask __kptr *big_cpumask; + struct bpf_cpumask __kptr *little_cpumask; + struct bpf_cpumask __kptr *tmp_cpumask; + + scx_minheap_t *idle_cpu_heap; }; /* llc_ctx state flag bits */ -#define LLC_CTX_F_SATURATED (1 << 0) +#define LLC_CTX_F_SATURATED (1 << 0) /* Helper macros for llc_ctx state flags */ -#define llc_ctx_set_flag(llcx, flag) ((llcx)->state_flags |= (flag)) -#define llc_ctx_clear_flag(llcx, flag) ((llcx)->state_flags &= ~(flag)) -#define llc_ctx_test_flag(llcx, flag) ((llcx)->state_flags & (flag)) +#define llc_ctx_set_flag(llcx, flag) ((llcx)->state_flags |= (flag)) +#define llc_ctx_clear_flag(llcx, flag) ((llcx)->state_flags &= ~(flag)) +#define llc_ctx_test_flag(llcx, flag) ((llcx)->state_flags & (flag)) struct llc_ctx { /* Read-mostly fields - grouped together */ - u32 id; - u32 nr_cpus; - u32 node_id; - u32 lb_llc_id; - u32 index; - u64 dsq; - u64 mig_dsq; - u64 last_period_ns; - u64 dsq_load[MAX_DSQS_PER_LLC]; + u32 id; + u32 nr_cpus; + u32 node_id; + u32 lb_llc_id; + u32 index; + u64 dsq; + u64 mig_dsq; + u64 last_period_ns; + u64 dsq_load[MAX_DSQS_PER_LLC]; /* CPU sharding related fields */ - u32 nr_shards; - u64 shard_dsqs[MAX_LLC_SHARDS]; + u32 nr_shards; + u64 shard_dsqs[MAX_LLC_SHARDS]; /* * Hot atomic field #1: vtime - frequently updated in p2dq_stopping() * Padded to separate cache line from read-mostly fields above */ - char __pad1[CACHE_LINE_SIZE]; - u64 vtime; + char __pad1[CACHE_LINE_SIZE]; + u64 vtime; /* * Hot atomic fields #2: load counters - frequently updated in p2dq_stopping() * Keep these together on same cache line since they're updated atomically together * Pad to separate from vtime above */ - char __pad2[CACHE_LINE_SIZE - sizeof(u64)]; - u64 load; - u64 affn_load; - u64 intr_load; - u32 state_flags; /* Bitmask for saturated and other state */ + char __pad2[CACHE_LINE_SIZE - sizeof(u64)]; + u64 load; + u64 affn_load; + u64 intr_load; + u32 state_flags; /* Bitmask for saturated and other state */ /* PELT (Per-Entity Load Tracking) aggregate fields */ - u64 util_avg; /* Aggregate utilization average */ - u64 load_avg; /* Aggregate load average */ - u64 intr_util_avg; /* Interactive task utilization average */ - u64 affn_util_avg; /* Affinitized task utilization average */ + u64 util_avg; /* Aggregate utilization average */ + u64 load_avg; /* Aggregate load average */ + u64 intr_util_avg; /* Interactive task utilization average */ + u64 affn_util_avg; /* Affinitized task utilization average */ /* * Hot atomic field #3: idle lock - frequently contended in idle CPU selection * Separate cache line from load counters above */ - char __pad3[CACHE_LINE_SIZE - 7*sizeof(u64) - sizeof(u32)]; - arena_spinlock_t idle_lock; + char __pad3[CACHE_LINE_SIZE - 7 * sizeof(u64) - sizeof(u32)]; + arena_spinlock_t idle_lock; /* * Read-mostly pointers - grouped together * Accessed during CPU selection but not updated frequently */ - char __pad4[CACHE_LINE_SIZE - sizeof(arena_spinlock_t)]; - struct bpf_cpumask __kptr *cpumask; - struct bpf_cpumask __kptr *big_cpumask; - struct bpf_cpumask __kptr *little_cpumask; - struct bpf_cpumask __kptr *node_cpumask; - struct bpf_cpumask __kptr *tmp_cpumask; - - scx_atq_t *mig_atq; - scx_dhq_t *mig_dhq; - u64 dhq_strand; /* Which DHQ strand (A or B) for this LLC */ - scx_minheap_t *idle_cpu_heap; + char 
__pad4[CACHE_LINE_SIZE - sizeof(arena_spinlock_t)]; + struct bpf_cpumask __kptr *cpumask; + struct bpf_cpumask __kptr *big_cpumask; + struct bpf_cpumask __kptr *little_cpumask; + struct bpf_cpumask __kptr *node_cpumask; + struct bpf_cpumask __kptr *tmp_cpumask; + + scx_atq_t *mig_atq; + scx_dhq_t *mig_dhq; + u64 dhq_strand; /* Which DHQ strand (A or B) for this LLC */ + scx_minheap_t *idle_cpu_heap; }; struct node_ctx { - u32 id; - struct bpf_cpumask __kptr *cpumask; - struct bpf_cpumask __kptr *big_cpumask; + u32 id; + struct bpf_cpumask __kptr *cpumask; + struct bpf_cpumask __kptr *big_cpumask; }; /* task_ctx flag bits */ -#define TASK_CTX_F_INTERACTIVE (1 << 0) -#define TASK_CTX_F_WAS_NICE (1 << 1) -#define TASK_CTX_F_IS_KWORKER (1 << 2) -#define TASK_CTX_F_ALL_CPUS (1 << 3) +#define TASK_CTX_F_INTERACTIVE (1 << 0) +#define TASK_CTX_F_WAS_NICE (1 << 1) +#define TASK_CTX_F_IS_KWORKER (1 << 2) +#define TASK_CTX_F_ALL_CPUS (1 << 3) /* Helper macros for task_ctx flags */ -#define task_ctx_set_flag(taskc, flag) ((taskc)->flags |= (flag)) -#define task_ctx_clear_flag(taskc, flag) ((taskc)->flags &= ~(flag)) -#define task_ctx_test_flag(taskc, flag) ((taskc)->flags & (flag)) +#define task_ctx_set_flag(taskc, flag) ((taskc)->flags |= (flag)) +#define task_ctx_clear_flag(taskc, flag) ((taskc)->flags &= ~(flag)) +#define task_ctx_test_flag(taskc, flag) ((taskc)->flags & (flag)) struct task_p2dq { /* * Do NOT change the position of common. It should be at the beginning * of the task_ctx. */ - struct scx_task_common common; - s32 pid; + struct scx_task_common common; + s32 pid; /* * PELT (Per-Entity Load Tracking) fields. * Placed early in the structure (low offset) to help BPF verifier * track arena pointer through complex control flow. */ - u64 pelt_last_update_time; - u32 util_sum; - u32 util_avg; - u32 period_contrib; - - u64 dsq_id; - u64 slice_ns; - int dsq_index; - u32 llc_id; - u32 node_id; - u64 used; - u64 last_dsq_id; - u64 last_run_started; - u64 last_run_at; - u64 llc_runs; /* how many runs on the current LLC */ - u64 enq_flags; - int last_dsq_index; - u32 flags; /* Bitmask for interactive, was_nice, is_kworker, all_cpus */ + u64 pelt_last_update_time; + u32 util_sum; + u32 util_avg; + u32 period_contrib; + + u64 dsq_id; + u64 slice_ns; + int dsq_index; + u32 llc_id; + u32 node_id; + u64 used; + u64 last_dsq_id; + u64 last_run_started; + u64 last_run_at; + u64 llc_runs; /* how many runs on the current LLC */ + u64 enq_flags; + int last_dsq_index; + u32 flags; /* Bitmask for interactive, was_nice, is_kworker, all_cpus */ }; typedef struct task_p2dq __arena task_ctx; @@ -194,52 +251,52 @@ enum enqueue_promise_kind { }; struct enqueue_promise_vtime { - u64 dsq_id; - u64 enq_flags; - u64 slice_ns; - u64 vtime; + u64 dsq_id; + u64 enq_flags; + u64 slice_ns; + u64 vtime; - scx_atq_t *atq; + scx_atq_t *atq; }; struct enqueue_promise_fifo { - u64 dsq_id; - u64 enq_flags; - u64 slice_ns; + u64 dsq_id; + u64 enq_flags; + u64 slice_ns; - scx_atq_t *atq; + scx_atq_t *atq; }; struct enqueue_promise_dhq { - u64 dsq_id; - u64 enq_flags; - u64 slice_ns; - u64 vtime; - u64 strand; + u64 dsq_id; + u64 enq_flags; + u64 slice_ns; + u64 vtime; + u64 strand; - scx_dhq_t *dhq; + scx_dhq_t *dhq; }; /* enqueue_promise flag bits */ -#define ENQUEUE_PROMISE_F_KICK_IDLE (1 << 0) -#define ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE (1 << 1) +#define ENQUEUE_PROMISE_F_KICK_IDLE (1 << 0) +#define ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE (1 << 1) /* Helper macros for enqueue_promise flags */ -#define enqueue_promise_set_flag(pro, 
flag) ((pro)->flags |= (flag)) -#define enqueue_promise_clear_flag(pro, flag) ((pro)->flags &= ~(flag)) -#define enqueue_promise_test_flag(pro, flag) ((pro)->flags & (flag)) +#define enqueue_promise_set_flag(pro, flag) ((pro)->flags |= (flag)) +#define enqueue_promise_clear_flag(pro, flag) ((pro)->flags &= ~(flag)) +#define enqueue_promise_test_flag(pro, flag) ((pro)->flags & (flag)) // This struct is zeroed at the beginning of `async_p2dq_enqueue` and only // relevant fields are set, so assume 0 as default when adding fields. struct enqueue_promise { - enum enqueue_promise_kind kind; + enum enqueue_promise_kind kind; - s32 cpu; - u32 flags; /* Bitmask for kick_idle, has_cleared_idle */ + s32 cpu; + u32 flags; /* Bitmask for kick_idle, has_cleared_idle */ union { - struct enqueue_promise_vtime vtime; - struct enqueue_promise_fifo fifo; - struct enqueue_promise_dhq dhq; + struct enqueue_promise_vtime vtime; + struct enqueue_promise_fifo fifo; + struct enqueue_promise_dhq dhq; }; }; diff --git a/scheds/rust/scx_p2dq/src/lib.rs b/scheds/rust/scx_p2dq/src/lib.rs index d03bb56e93..bc25dd7698 100644 --- a/scheds/rust/scx_p2dq/src/lib.rs +++ b/scheds/rust/scx_p2dq/src/lib.rs @@ -388,9 +388,11 @@ macro_rules! init_open_skel { let rodata = skel.maps.rodata_data.as_mut().unwrap(); rodata.topo_config.nr_cpus = *$crate::NR_CPU_IDS as u32; rodata.topo_config.nr_llcs = $topo.all_llcs.clone().keys().len() as u32; + rodata.topo_config.nr_clusters = $topo.all_clusters.clone().keys().len() as u32; rodata.topo_config.nr_nodes = $topo.nodes.clone().keys().len() as u32; rodata.topo_config.smt_enabled = MaybeUninit::new($topo.smt_enabled); rodata.topo_config.has_little_cores = MaybeUninit::new($topo.has_little_cores()); + rodata.topo_config.has_clusters = MaybeUninit::new(!$topo.all_clusters.is_empty()); // timeline config rodata.timeline_config.min_slice_us = opts.min_slice_us; @@ -460,6 +462,12 @@ macro_rules! init_open_skel { #[macro_export] macro_rules! init_skel { ($skel: expr, $topo: expr) => { + // Populate cluster IDs + for cluster in $topo.all_clusters.values() { + $skel.maps.bss_data.as_mut().unwrap().cluster_ids[cluster.id] = cluster.id as u64; + } + + // Populate CPU data including cluster_id for cpu in $topo.all_cpus.values() { $skel.maps.bss_data.as_mut().unwrap().big_core_ids[cpu.id] = if cpu.core_type == ($crate::CoreType::Big { turbo: true }) { @@ -470,7 +478,18 @@ macro_rules! init_skel { $skel.maps.bss_data.as_mut().unwrap().cpu_core_ids[cpu.id] = cpu.core_id as u32; $skel.maps.bss_data.as_mut().unwrap().cpu_llc_ids[cpu.id] = cpu.llc_id as u64; $skel.maps.bss_data.as_mut().unwrap().cpu_node_ids[cpu.id] = cpu.node_id as u64; + + // Find cluster_id for this CPU by searching through topology + let mut cluster_id = 0u32; + for cluster in $topo.all_clusters.values() { + if cluster.all_cpus.contains_key(&cpu.id) { + cluster_id = cluster.id as u32; + break; + } + } + $skel.maps.bss_data.as_mut().unwrap().cpu_cluster_ids[cpu.id] = cluster_id; } + for llc in $topo.all_llcs.values() { $skel.maps.bss_data.as_mut().unwrap().llc_ids[llc.id] = llc.id as u64; }
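/*
 * A minimal plain-C sketch (not part of the patch) of the padding idea
 * behind cluster_ctx and llc_ctx in types.h above: keep read-mostly
 * fields and each hot, atomically updated field on separate cache
 * lines so cross-CPU updates to vtime/load do not bounce the
 * read-mostly line. The struct below is a reduced stand-in, not the
 * real layout, and 64 is an assumed cache line size.
 */
#include <stdint.h>
#include <stddef.h>

#define SKETCH_CACHE_LINE 64

struct padded_ctx {
	/* Read-mostly fields. */
	uint32_t id;
	uint32_t nr_cpus;
	uint64_t dsq;

	char __pad1[SKETCH_CACHE_LINE];		/* push vtime onto a new line */
	uint64_t vtime;				/* hot: atomically updated */

	char __pad2[SKETCH_CACHE_LINE - sizeof(uint64_t)];
	uint64_t load;				/* hot: atomically updated */
};

/* The pads only help if the hot fields really land on distinct lines. */
_Static_assert(offsetof(struct padded_ctx, vtime) / SKETCH_CACHE_LINE !=
	       offsetof(struct padded_ctx, load) / SKETCH_CACHE_LINE,
	       "vtime and load must not share a cache line");
_Static_assert(offsetof(struct padded_ctx, vtime) / SKETCH_CACHE_LINE !=
	       offsetof(struct padded_ctx, id) / SKETCH_CACHE_LINE,
	       "hot fields must not share a line with read-mostly fields");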