diff --git a/rust/scx_arena/scx_arena/src/arenalib.rs b/rust/scx_arena/scx_arena/src/arenalib.rs index 55a3f6a379..064fb3a4cd 100644 --- a/rust/scx_arena/scx_arena/src/arenalib.rs +++ b/rust/scx_arena/scx_arena/src/arenalib.rs @@ -173,6 +173,10 @@ impl<'a> ArenaLib<'a> { )?; } + // Drop all_clusters to release Arc references to cores before processing cores + // Clusters may hold Arc references to cores, so we need to drop them first + drop(topo.all_clusters); + for (core_id, core) in topo.all_cores { self.setup_topology_node( Arc::::into_inner(core) diff --git a/rust/scx_arena/selftests/src/main.rs b/rust/scx_arena/selftests/src/main.rs index d205ac7db3..c83941be1e 100644 --- a/rust/scx_arena/selftests/src/main.rs +++ b/rust/scx_arena/selftests/src/main.rs @@ -145,6 +145,9 @@ fn setup_topology(skel: &mut BpfSkel<'_>) -> Result<()> { )?; } + // Drop all_clusters to release Arc references to cores before processing cores + drop(topo.all_clusters); + for (_, core) in topo.all_cores { setup_topology_node( skel, diff --git a/rust/scx_utils/src/topology.rs b/rust/scx_utils/src/topology.rs index efa2e6d062..47734a6ebc 100644 --- a/rust/scx_utils/src/topology.rs +++ b/rust/scx_utils/src/topology.rs @@ -167,12 +167,31 @@ pub struct Core { pub node_id: usize, } +#[derive(Debug, Clone)] +pub struct Cluster { + /// Monotonically increasing unique id + pub id: usize, + /// The kernel id of the L2 cache or cluster + pub kernel_id: usize, + pub cores: BTreeMap>, + /// Cpumask of all CPUs in this cluster. + pub span: Cpumask, + + /// Ancestor IDs. + pub llc_id: usize, + pub node_id: usize, + + /// Skip indices to access lower level members easily. + pub all_cpus: BTreeMap>, +} + #[derive(Debug, Clone)] pub struct Llc { /// Monotonically increasing unique id pub id: usize, /// The kernel id of the llc pub kernel_id: usize, + pub clusters: BTreeMap>, pub cores: BTreeMap>, /// Cpumask of all CPUs in this llc. pub span: Cpumask, @@ -181,6 +200,7 @@ pub struct Llc { pub node_id: usize, /// Skip indices to access lower level members easily. + pub all_clusters: BTreeMap>, pub all_cpus: BTreeMap>, } @@ -210,6 +230,7 @@ pub struct Topology { /// Skip indices to access lower level members easily. pub all_llcs: BTreeMap>, + pub all_clusters: BTreeMap>, pub all_cores: BTreeMap>, pub all_cpus: BTreeMap>, } @@ -220,6 +241,7 @@ impl Topology { // objects can only be modified while there's only one reference, // skip indices must be built from bottom to top. 
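Reviewer note: the drop(topo.all_clusters) calls work because Arc::into_inner (and Arc::get_mut) only succeed while the strong count is exactly 1, and the new cluster skip indices hold extra Arc<Core> clones. A minimal standalone sketch of that behaviour, using plain std types rather than the scx_utils structs:

use std::collections::BTreeMap;
use std::sync::Arc;

fn main() {
    let core = Arc::new(String::from("core0"));
    let mut clusters: BTreeMap<usize, Arc<String>> = BTreeMap::new();
    clusters.insert(0, core.clone()); // second strong reference, like a cluster map entry

    // While the clone is alive, into_inner refuses to unwrap.
    assert!(Arc::into_inner(core.clone()).is_none());

    drop(clusters); // release the extra reference, mirroring drop(topo.all_clusters)
    assert!(Arc::into_inner(core).is_some());
}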
let mut topo_llcs = BTreeMap::new(); + let mut topo_clusters = BTreeMap::new(); let mut topo_cores = BTreeMap::new(); let mut topo_cpus = BTreeMap::new(); @@ -229,33 +251,84 @@ impl Topology { for (&llc_id, llc) in node.llcs.iter_mut() { let llc_mut = Arc::get_mut(llc).unwrap(); + let mut llc_clusters = BTreeMap::new(); let mut llc_cpus = BTreeMap::new(); - for (&core_id, core) in llc_mut.cores.iter_mut() { - let core_mut = Arc::get_mut(core).unwrap(); - let smt_level = core_mut.cpus.len(); - - for (&cpu_id, cpu) in core_mut.cpus.iter_mut() { - let cpu_mut = Arc::get_mut(cpu).unwrap(); - cpu_mut.smt_level = smt_level; - - if topo_cpus - .insert(cpu_id, cpu.clone()) - .or(node_cpus.insert(cpu_id, cpu.clone())) - .or(llc_cpus.insert(cpu_id, cpu.clone())) - .is_some() - { - bail!("Duplicate CPU ID {}", cpu_id); + for (&cluster_id, cluster) in llc_mut.clusters.iter_mut() { + let cluster_mut = Arc::get_mut(cluster).unwrap(); + let mut cluster_cpus = BTreeMap::new(); + + for (&core_id, core) in cluster_mut.cores.iter_mut() { + let core_mut = Arc::get_mut(core).unwrap(); + let smt_level = core_mut.cpus.len(); + + for (&cpu_id, cpu) in core_mut.cpus.iter_mut() { + let cpu_mut = Arc::get_mut(cpu).unwrap(); + cpu_mut.smt_level = smt_level; + + if topo_cpus + .insert(cpu_id, cpu.clone()) + .or(node_cpus.insert(cpu_id, cpu.clone())) + .or(llc_cpus.insert(cpu_id, cpu.clone())) + .or(cluster_cpus.insert(cpu_id, cpu.clone())) + .is_some() + { + bail!("Duplicate CPU ID {}", cpu_id); + } + } + + // Note that in some weird architectures, core ids can be + // duplicated in different LLC domains. + topo_cores + .insert(core_id, core.clone()) + .or(node_cores.insert(core_id, core.clone())); + } + + cluster_mut.all_cpus = cluster_cpus; + + if topo_clusters.insert(cluster_id, cluster.clone()).is_some() { + bail!("Duplicate Cluster ID {}", cluster_id); + } + llc_clusters.insert(cluster_id, cluster.clone()); + } + + // Fallback: if LLC has no clusters (e.g., virtual LLCs), process cores directly + if llc_mut.clusters.is_empty() { + for (&core_id, core) in llc_mut.cores.iter_mut() { + let core_mut = Arc::get_mut(core).unwrap(); + let smt_level = core_mut.cpus.len(); + + for (&cpu_id, cpu) in core_mut.cpus.iter_mut() { + let cpu_mut = Arc::get_mut(cpu).unwrap(); + cpu_mut.smt_level = smt_level; + + if topo_cpus + .insert(cpu_id, cpu.clone()) + .or(node_cpus.insert(cpu_id, cpu.clone())) + .or(llc_cpus.insert(cpu_id, cpu.clone())) + .is_some() + { + bail!("Duplicate CPU ID {}", cpu_id); + } } + + // Note that in some weird architectures, core ids can be + // duplicated in different LLC domains. + topo_cores + .insert(core_id, core.clone()) + .or(node_cores.insert(core_id, core.clone())); } + } - // Note that in some weird architectures, core ids can be - // duplicated in different LLC domains. 
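Reviewer note: the duplicate-CPU check leans on BTreeMap::insert returning the previous value and Option::or collapsing several inserts into one test. A small sketch of that idiom with plain integers standing in for the Arc<Cpu> values:

use std::collections::BTreeMap;

fn main() {
    let mut topo_cpus: BTreeMap<usize, u32> = BTreeMap::new();
    let mut node_cpus: BTreeMap<usize, u32> = BTreeMap::new();
    let mut llc_cpus: BTreeMap<usize, u32> = BTreeMap::new();

    for &cpu_id in &[0usize, 1, 1] {
        // Each insert returns Some(old) if the key already existed; Option::or
        // keeps the first Some, so the chain is Some as soon as any map had cpu_id.
        if topo_cpus
            .insert(cpu_id, 0)
            .or(node_cpus.insert(cpu_id, 0))
            .or(llc_cpus.insert(cpu_id, 0))
            .is_some()
        {
            eprintln!("Duplicate CPU ID {}", cpu_id);
        }
    }
}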
- topo_cores - .insert(core_id, core.clone()) - .or(node_cores.insert(core_id, core.clone())); + // Populate llc.cores from cluster.cores before LLC is cloned + // This must be done while we still have exclusive access via llc_mut + for (_cluster_id, cluster) in llc_mut.clusters.iter() { + for (&core_id, core) in cluster.cores.iter() { + llc_mut.cores.insert(core_id, core.clone()); + } } + llc_mut.all_clusters = llc_clusters; llc_mut.all_cpus = llc_cpus; if topo_llcs.insert(llc_id, llc.clone()).is_some() { @@ -272,6 +345,7 @@ impl Topology { span, smt_enabled: is_smt_active().unwrap_or(false), all_llcs: topo_llcs, + all_clusters: topo_clusters, all_cores: topo_cores, all_cpus: topo_cpus, }) @@ -372,6 +446,8 @@ struct TopoCtx { node_core_kernel_ids: BTreeMap<(usize, usize, usize), usize>, /// Mapping of NUMA node LLC ids node_llc_kernel_ids: BTreeMap<(usize, usize, usize), usize>, + /// Mapping of NUMA node LLC cluster ids (node_id, llc_id, cluster_kernel_id) -> cluster_id + node_llc_cluster_kernel_ids: BTreeMap<(usize, usize, usize), usize>, /// Mapping of L2 ids l2_ids: BTreeMap, /// Mapping of L3 ids @@ -382,11 +458,13 @@ impl TopoCtx { fn new() -> TopoCtx { let core_kernel_ids = BTreeMap::new(); let llc_kernel_ids = BTreeMap::new(); + let cluster_kernel_ids = BTreeMap::new(); let l2_ids = BTreeMap::new(); let l3_ids = BTreeMap::new(); TopoCtx { node_core_kernel_ids: core_kernel_ids, node_llc_kernel_ids: llc_kernel_ids, + node_llc_cluster_kernel_ids: cluster_kernel_ids, l2_ids, l3_ids, } @@ -523,8 +601,10 @@ fn create_insert_cpu( let llc = node.llcs.entry(*llc_id).or_insert(Arc::new(Llc { id: *llc_id, + clusters: BTreeMap::new(), cores: BTreeMap::new(), span: Cpumask::new(), + all_clusters: BTreeMap::new(), all_cpus: BTreeMap::new(), node_id: node.id, @@ -532,6 +612,41 @@ fn create_insert_cpu( })); let llc_mut = Arc::get_mut(llc).unwrap(); + // Determine cluster kernel ID: use cluster_id if available (>= 0), else use L2 ID + // cluster_id is isize, with -1 indicating no cluster support + let cluster_kernel_id = if cluster_id >= 0 { + cluster_id as usize + } else if l2_id != usize::MAX { + l2_id + } else { + // No cluster information available, use LLC as cluster + llc_kernel_id + }; + + // Create unique cluster ID using (node.id, llc_id, cluster_kernel_id) + let num_clusters = topo_ctx.node_llc_cluster_kernel_ids.len(); + let cluster_id_unique = topo_ctx + .node_llc_cluster_kernel_ids + .entry((node.id, *llc_id, cluster_kernel_id)) + .or_insert(num_clusters); + + // Create or get cluster + let cluster = llc_mut + .clusters + .entry(*cluster_id_unique) + .or_insert(Arc::new(Cluster { + id: *cluster_id_unique, + kernel_id: cluster_kernel_id, + cores: BTreeMap::new(), + span: Cpumask::new(), + + llc_id: *llc_id, + node_id: node.id, + + all_cpus: BTreeMap::new(), + })); + let cluster_mut = Arc::get_mut(cluster).unwrap(); + let core_type = if cs.avg_rcap < cs.max_rcap && rcap == cs.max_rcap { CoreType::Big { turbo: true } } else if !cs.has_biglittle || rcap >= cs.avg_rcap { @@ -546,7 +661,8 @@ fn create_insert_cpu( .entry((node.id, package_id, core_kernel_id)) .or_insert(num_cores); - let core = llc_mut.cores.entry(*core_id).or_insert(Arc::new(Core { + // Insert core into cluster + let core = cluster_mut.cores.entry(*core_id).or_insert(Arc::new(Core { id: *core_id, cpus: BTreeMap::new(), span: Cpumask::new(), @@ -589,6 +705,7 @@ fn create_insert_cpu( // Update all of the devices' spans to include this CPU. 
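Reviewer note: node_llc_cluster_kernel_ids uses the same "dense unique id" trick as the core and LLC maps: the map length is sampled before the entry lookup, so an unseen (node, llc, kernel_id) triple gets the next monotonically increasing id while a known one keeps its old id. Self-contained sketch:

use std::collections::BTreeMap;

fn unique_cluster_id(
    ids: &mut BTreeMap<(usize, usize, usize), usize>,
    node_id: usize,
    llc_id: usize,
    cluster_kernel_id: usize,
) -> usize {
    // Sample len() first; entry() takes the mutable borrow afterwards.
    let next = ids.len();
    *ids.entry((node_id, llc_id, cluster_kernel_id)).or_insert(next)
}

fn main() {
    let mut ids = BTreeMap::new();
    assert_eq!(unique_cluster_id(&mut ids, 0, 0, 5), 0);
    assert_eq!(unique_cluster_id(&mut ids, 0, 0, 6), 1);
    assert_eq!(unique_cluster_id(&mut ids, 0, 0, 5), 0); // already known, id reused
}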
core_mut.span.set_cpu(id)?; + cluster_mut.span.set_cpu(id)?; llc_mut.span.set_cpu(id)?; node.span.set_cpu(id)?; @@ -776,9 +893,11 @@ fn replace_with_virt_llcs( Arc::new(Llc { id: vllc_id, kernel_id, + clusters: BTreeMap::new(), cores: BTreeMap::new(), span: Cpumask::new(), node_id: node.id, + all_clusters: BTreeMap::new(), all_cpus: BTreeMap::new(), }), ); @@ -884,6 +1003,15 @@ fn create_default_node( create_insert_cpu(*cpu_id, &mut node, online_mask, topo_ctx, &cs, flatten_llc)?; } + // Clear clusters before creating virtual LLCs to avoid multiple Arc references to cores + // replace_with_virt_llcs() will create new LLCs without clusters anyway + if nr_cores_per_vllc.is_some() { + for (_llc_id, llc) in node.llcs.iter_mut() { + let llc_mut = Arc::get_mut(llc).unwrap(); + llc_mut.clusters.clear(); + } + } + if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc { replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val, 0)?; } @@ -985,6 +1113,15 @@ fn create_numa_nodes( create_insert_cpu(cpu_id, &mut node, online_mask, topo_ctx, &cs, false)?; } + // Clear clusters before creating virtual LLCs to avoid multiple Arc references to cores + // replace_with_virt_llcs() will create new LLCs without clusters anyway + if nr_cores_per_vllc.is_some() { + for (_llc_id, llc) in node.llcs.iter_mut() { + let llc_mut = Arc::get_mut(llc).unwrap(); + llc_mut.clusters.clear(); + } + } + if let Some((min_cores_val, max_cores_val)) = nr_cores_per_vllc { next_virt_llc_id = replace_with_virt_llcs(&mut node, min_cores_val, max_cores_val, next_virt_llc_id)?; diff --git a/scheds/rust/scx_p2dq/src/bpf/intf.h b/scheds/rust/scx_p2dq/src/bpf/intf.h index fbb7715c61..f42ded849f 100644 --- a/scheds/rust/scx_p2dq/src/bpf/intf.h +++ b/scheds/rust/scx_p2dq/src/bpf/intf.h @@ -14,43 +14,44 @@ #endif #ifndef __KERNEL__ -typedef unsigned char u8; -typedef unsigned int u32; +typedef unsigned char u8; +typedef unsigned int u32; typedef unsigned long long u64; #endif - enum consts { - MAX_CPUS = 512, - MAX_NUMA_NODES = 64, - MAX_LLCS = 64, - MAX_DSQS_PER_LLC = 8, - MAX_LLC_SHARDS = 32, - MAX_TASK_PRIO = 39, - MAX_TOPO_NODES = 1024, + MAX_CPUS = 512, + MAX_NUMA_NODES = 64, + MAX_LLCS = 64, + MAX_CLUSTERS = 128, + MAX_DSQS_PER_LLC = 8, + MAX_LLC_SHARDS = 32, + MAX_TASK_PRIO = 39, + MAX_TOPO_NODES = 1024, - NSEC_PER_USEC = 1000ULL, - NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC), - MSEC_PER_SEC = 1000ULL, - NSEC_PER_SEC = NSEC_PER_MSEC * MSEC_PER_SEC, + NSEC_PER_USEC = 1000ULL, + NSEC_PER_MSEC = (1000ULL * NSEC_PER_USEC), + MSEC_PER_SEC = 1000ULL, + NSEC_PER_SEC = NSEC_PER_MSEC * MSEC_PER_SEC, - MIN_SLICE_USEC = 10ULL, - MIN_SLICE_NSEC = (10ULL * NSEC_PER_USEC), + MIN_SLICE_USEC = 10ULL, + MIN_SLICE_NSEC = (10ULL * NSEC_PER_USEC), - LOAD_BALANCE_SLACK = 20ULL, + LOAD_BALANCE_SLACK = 20ULL, - P2DQ_MIG_DSQ = 1LLU << 60, - P2DQ_INTR_DSQ = 1LLU << 32, + P2DQ_MIG_DSQ = 1LLU << 60, + P2DQ_INTR_DSQ = 1LLU << 32, // PELT (Per-Entity Load Tracking) constants - PELT_HALFLIFE_MS = 32, // 32ms half-life for exponential decay - PELT_PERIOD_MS = 1, // 1ms update period (simplified from kernel's 1024us) - PELT_MAX_UTIL = 1024, // Maximum utilization value - PELT_DECAY_SHIFT = 7, // Decay factor: (127/128) ≈ 0.98 per ms - PELT_SUM_MAX = 131072, // Maximum sum value (128 * 1024) + PELT_HALFLIFE_MS = 32, // 32ms half-life for exponential decay + PELT_PERIOD_MS = + 1, // 1ms update period (simplified from kernel's 1024us) + PELT_MAX_UTIL = 1024, // Maximum utilization value + PELT_DECAY_SHIFT = 7, // Decay factor: (127/128) ≈ 0.98 per ms + 
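Reviewer note: clearing llc.clusters before replace_with_virt_llcs() matters for the same reason as the drop() calls above: Arc::get_mut returns None as soon as a second strong reference exists, and the skip-index pass later unwraps it. Sketch with std types only:

use std::sync::Arc;

fn main() {
    let mut core = Arc::new(42u32);
    let extra = core.clone(); // e.g. the Arc<Core> still held by a cluster

    assert!(Arc::get_mut(&mut core).is_none()); // shared, so no exclusive access

    drop(extra); // cluster map cleared
    *Arc::get_mut(&mut core).unwrap() += 1; // now safe to mutate in place
    assert_eq!(*core, 43);
}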
PELT_SUM_MAX = 131072, // Maximum sum value (128 * 1024) // kernel definitions - CLOCK_BOOTTIME = 7, + CLOCK_BOOTTIME = 7, }; enum p2dq_timers_defs { diff --git a/scheds/rust/scx_p2dq/src/bpf/main.bpf.c b/scheds/rust/scx_p2dq/src/bpf/main.bpf.c index 655580ffe2..cd9ac9161e 100644 --- a/scheds/rust/scx_p2dq/src/bpf/main.bpf.c +++ b/scheds/rust/scx_p2dq/src/bpf/main.bpf.c @@ -35,7 +35,6 @@ #include "intf.h" #include "types.h" - #include #include #include @@ -50,44 +49,56 @@ char _license[] SEC("license") = "GPL"; UEI_DEFINE(uei); -#define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0) -#define trace(fmt, args...) do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) +#define dbg(fmt, args...) \ + do { \ + if (debug) \ + bpf_printk(fmt, ##args); \ + } while (0) +#define trace(fmt, args...) \ + do { \ + if (debug > 1) \ + bpf_printk(fmt, ##args); \ + } while (0) const volatile struct { - u32 nr_cpus; - u32 nr_llcs; - u32 nr_nodes; + u32 nr_cpus; + u32 nr_llcs; + u32 nr_clusters; + u32 nr_nodes; bool smt_enabled; bool has_little_cores; + bool has_clusters; } topo_config = { - .nr_cpus = 64, - .nr_llcs = 32, - .nr_nodes = 32, + .nr_cpus = 64, + .nr_llcs = 32, + .nr_clusters = 64, + .nr_nodes = 32, - .smt_enabled = true, + .smt_enabled = true, .has_little_cores = false, + .has_clusters = false, }; const volatile struct { - u64 min_slice_us; - u64 max_exec_ns; + u64 min_slice_us; + u64 max_exec_ns; bool autoslice; bool deadline; } timeline_config = { .min_slice_us = 100, - .max_exec_ns = 20 * NSEC_PER_MSEC, - .autoslice = true, - .deadline = true, + .max_exec_ns = 20 * NSEC_PER_MSEC, + .autoslice = true, + .deadline = true, }; const volatile struct { - u64 backoff_ns; - u64 dispatch_lb_busy; - u64 min_llc_runs_pick2; - u64 min_nr_queued_pick2; - u64 slack_factor; - u64 wakeup_lb_busy; + u64 backoff_ns; + u64 dispatch_lb_busy; + u64 min_llc_runs_pick2; + u64 min_nr_queued_pick2; + u64 slack_factor; + u64 wakeup_lb_busy; bool dispatch_lb_interactive; bool dispatch_pick2_disable; @@ -96,30 +107,30 @@ const volatile struct { bool wakeup_llc_migrations; bool single_llc_mode; } lb_config = { - .backoff_ns = 5LLU * NSEC_PER_MSEC, - .dispatch_lb_busy = 75, - .min_llc_runs_pick2 = 4, - .min_nr_queued_pick2 = 10, - .slack_factor = LOAD_BALANCE_SLACK, - .wakeup_lb_busy = 90, + .backoff_ns = 5LLU * NSEC_PER_MSEC, + .dispatch_lb_busy = 75, + .min_llc_runs_pick2 = 4, + .min_nr_queued_pick2 = 10, + .slack_factor = LOAD_BALANCE_SLACK, + .wakeup_lb_busy = 90, .dispatch_lb_interactive = false, - .dispatch_pick2_disable = false, - .eager_load_balance = true, - .max_dsq_pick2 = false, - .wakeup_llc_migrations = false, - .single_llc_mode = false, + .dispatch_pick2_disable = false, + .eager_load_balance = true, + .max_dsq_pick2 = false, + .wakeup_llc_migrations = false, + .single_llc_mode = false, }; const volatile struct { - u32 nr_dsqs_per_llc; - int init_dsq_index; - u64 dsq_shift; - u32 interactive_ratio; - u32 saturated_percent; - u32 sched_mode; - u32 llc_shards; - u64 dhq_max_imbalance; + u32 nr_dsqs_per_llc; + int init_dsq_index; + u64 dsq_shift; + u32 interactive_ratio; + u32 saturated_percent; + u32 sched_mode; + u32 llc_shards; + u64 dhq_max_imbalance; bool atq_enabled; bool dhq_enabled; @@ -131,43 +142,45 @@ const volatile struct { bool kthreads_local; bool pelt_enabled; } p2dq_config = { - .sched_mode = MODE_DEFAULT, - .nr_dsqs_per_llc = 3, - .init_dsq_index = 0, - .dsq_shift = 2, - .interactive_ratio = 10, - .saturated_percent = 5, - .llc_shards = 0, - .dhq_max_imbalance = 3, - 
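Reviewer note: the new nr_clusters/has_clusters fields in topo_config need to be filled by the userspace loader from the topology before the skeleton loads. A hedged sketch of that wiring; TopoConfig and fill_topo_config are hypothetical stand-ins, not the actual scx_p2dq loader code, and only fields added or shown in this patch are read from Topology:

// Hypothetical mirror of the BPF `topo_config` rodata struct.
struct TopoConfig {
    nr_cpus: u32,
    nr_llcs: u32,
    nr_clusters: u32,
    smt_enabled: bool,
    has_clusters: bool,
}

fn fill_topo_config(cfg: &mut TopoConfig, topo: &scx_utils::Topology) {
    // The skip indices below are exactly what this patch adds or extends.
    cfg.nr_cpus = topo.all_cpus.len() as u32;
    cfg.nr_llcs = topo.all_llcs.len() as u32;
    cfg.nr_clusters = topo.all_clusters.len() as u32;
    cfg.smt_enabled = topo.smt_enabled;
    cfg.has_clusters = !topo.all_clusters.is_empty();
}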
- .atq_enabled = false, - .dhq_enabled = false, - .cpu_priority = false, - .task_slice = true, - .freq_control = false, - .interactive_sticky = false, + .sched_mode = MODE_DEFAULT, + .nr_dsqs_per_llc = 3, + .init_dsq_index = 0, + .dsq_shift = 2, + .interactive_ratio = 10, + .saturated_percent = 5, + .llc_shards = 0, + .dhq_max_imbalance = 3, + + .atq_enabled = false, + .dhq_enabled = false, + .cpu_priority = false, + .task_slice = true, + .freq_control = false, + .interactive_sticky = false, .keep_running_enabled = true, - .kthreads_local = true, - .pelt_enabled = true, + .kthreads_local = true, + .pelt_enabled = true, }; -const volatile u32 debug = 2; -const u32 zero_u32 = 0; +const volatile u32 debug = 2; +const u32 zero_u32 = 0; extern const volatile u32 nr_cpu_ids; -const u64 lb_timer_intvl_ns = 250LLU * NSEC_PER_MSEC; +const u64 lb_timer_intvl_ns = 250LLU * NSEC_PER_MSEC; -static u32 llc_lb_offset = 1; -static u64 min_llc_runs_pick2 = 1; -static bool saturated = false; -static bool overloaded = false; +static u32 llc_lb_offset = 1; +static u64 min_llc_runs_pick2 = 1; +static bool saturated = false; +static bool overloaded = false; -u64 llc_ids[MAX_LLCS]; -u32 cpu_core_ids[MAX_CPUS]; -u64 cpu_llc_ids[MAX_CPUS]; -u64 cpu_node_ids[MAX_CPUS]; -u64 big_core_ids[MAX_CPUS]; -u64 dsq_time_slices[MAX_DSQS_PER_LLC]; +u64 llc_ids[MAX_LLCS]; +u64 cluster_ids[MAX_CLUSTERS]; +u32 cpu_cluster_ids[MAX_CPUS]; +u32 cpu_core_ids[MAX_CPUS]; +u64 cpu_llc_ids[MAX_CPUS]; +u64 cpu_node_ids[MAX_CPUS]; +u64 big_core_ids[MAX_CPUS]; +u64 dsq_time_slices[MAX_DSQS_PER_LLC]; /* DHQ per LLC pair for migration (MAX_LLCS / 2 DHQs) */ scx_dhq_t *llc_pair_dhqs[MAX_LLCS / 2]; @@ -176,7 +189,7 @@ u32 llcs_per_node[MAX_NUMA_NODES]; /* Global DHQ counter for unique indexing */ u32 global_dhq_count = 0; -u64 min_slice_ns = 500; +u64 min_slice_ns = 500; private(A) struct bpf_cpumask __kptr *all_cpumask; private(A) struct bpf_cpumask __kptr *big_cpumask; @@ -217,13 +230,13 @@ static __always_inline u64 min_dsq_time_slice(void) static __always_inline u64 clamp_slice(u64 slice_ns) { - return min(max(min_dsq_time_slice(), slice_ns), - max_dsq_time_slice()); + return min(max(min_dsq_time_slice(), slice_ns), max_dsq_time_slice()); } static __always_inline u64 shard_dsq_id(u32 llc_id, u32 shard_id) { - return ((MAX_DSQS_PER_LLC * MAX_LLCS) << 3) + (llc_id * MAX_DSQS_PER_LLC) + shard_id; + return ((MAX_DSQS_PER_LLC * MAX_LLCS) << 3) + + (llc_id * MAX_DSQS_PER_LLC) + shard_id; } static __always_inline u64 cpu_dsq_id(s32 cpu) @@ -267,7 +280,7 @@ static int init_cpumask(struct bpf_cpumask **mask_p) static s32 pref_idle_cpu(struct llc_ctx *llcx) { struct scx_minheap_elem helem; - int ret; + int ret; if ((ret = arena_spin_lock((void __arena *)&llcx->idle_lock))) return ret; @@ -305,7 +318,8 @@ static __always_inline u32 pelt_decay(u32 val, u32 periods) u32 i; /* Bound iterations for BPF verifier (max 256 periods = 256ms) */ - bpf_for(i, 0, periods) { + bpf_for(i, 0, periods) + { if (i >= 256) break; val = (val * 127) >> 7; @@ -322,7 +336,8 @@ static __always_inline u32 pelt_decay(u32 val, u32 periods) * @now: Current timestamp in ns * @delta_ns: Runtime delta (0 for decay-only update) */ -static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta_ns) +static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, + u64 delta_ns) { u64 elapsed_ns, elapsed_ms; u32 periods, delta_ms; @@ -333,9 +348,9 @@ static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta if 
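Reviewer note: pelt_decay applies the 127/128-per-millisecond factor implied by PELT_DECAY_SHIFT once per elapsed period, capped at 256 iterations for the verifier. The same arithmetic in plain Rust:

fn pelt_decay(mut val: u32, periods: u32) -> u32 {
    // One period is 1ms; each period multiplies by 127/128 (shift by PELT_DECAY_SHIFT = 7).
    for _ in 0..periods.min(256) { // same 256-period cap as the bounded BPF loop
        val = (val * 127) >> 7;
        if val == 0 {
            break;
        }
    }
    val
}

fn main() {
    println!("{}", pelt_decay(1024, 32));  // PELT_MAX_UTIL decayed for 32ms
    println!("{}", pelt_decay(1024, 300)); // capped at 256 periods of decay
}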
(!taskc->pelt_last_update_time) { /* First update - initialize */ taskc->pelt_last_update_time = now; - taskc->util_sum = 0; - taskc->util_avg = 0; - taskc->period_contrib = 0; + taskc->util_sum = 0; + taskc->util_avg = 0; + taskc->period_contrib = 0; return; } @@ -354,7 +369,7 @@ static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta periods = (u32)elapsed_ms; if (periods > 256) - periods = 256; /* Cap for verifier */ + periods = 256; /* Cap for verifier */ if (taskc->util_sum > 0) { taskc->util_sum = pelt_decay(taskc->util_sum, periods); @@ -390,9 +405,9 @@ static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta * @is_affinitized: Whether task is affinitized to this LLC */ static __always_inline void aggregate_pelt_to_llc(struct llc_ctx *llcx, - task_ctx *taskc, - bool is_interactive, - bool is_affinitized) + task_ctx *taskc, + bool is_interactive, + bool is_affinitized) { if (!p2dq_config.pelt_enabled) return; @@ -406,7 +421,6 @@ static __always_inline void aggregate_pelt_to_llc(struct llc_ctx *llcx, __sync_fetch_and_add(&llcx->affn_util_avg, taskc->util_avg); } - static u32 idle_cpu_percent(const struct cpumask *idle_cpumask) { return (100 * nr_idle_cpus(idle_cpumask)) / topo_config.nr_cpus; @@ -460,8 +474,8 @@ static int llc_create_atqs(struct llc_ctx *llcx) return 0; if (topo_config.nr_llcs > 1) { - llcx->mig_atq = (scx_atq_t *)scx_atq_create_size(false, - topo_config.nr_cpus); + llcx->mig_atq = (scx_atq_t *)scx_atq_create_size( + false, topo_config.nr_cpus); if (!llcx->mig_atq) { scx_bpf_error("ATQ failed to create ATQ for LLC %u", llcx->id); @@ -502,13 +516,15 @@ static int llc_create_dhqs(struct llc_ctx *llcx) node_llc_count = llcs_per_node[node_id]; /* Strand: A for first LLC in pair, B for second */ - strand = (node_llc_count % 2 == 0) ? SCX_DHQ_STRAND_A : SCX_DHQ_STRAND_B; + strand = (node_llc_count % 2 == 0) ? SCX_DHQ_STRAND_A : + SCX_DHQ_STRAND_B; /* First LLC in a pair: create a new DHQ */ if (strand == SCX_DHQ_STRAND_A) { dhq_index = global_dhq_count; if (dhq_index >= (MAX_LLCS / 2)) { - scx_bpf_error("DHQ: dhq_index %u >= MAX_LLCS/2", dhq_index); + scx_bpf_error("DHQ: dhq_index %u >= MAX_LLCS/2", + dhq_index); return -EINVAL; } @@ -517,12 +533,13 @@ static int llc_create_dhqs(struct llc_ctx *llcx) * for queued tasks under load without excessive memory usage. * Max imbalance controls strand balance for cross-LLC load balancing. 
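Reviewer note: update_task_pelt follows the usual shape of a simplified PELT update: count whole 1ms periods since the last update, decay the running sum, fold in the new runtime, derive the average. Not all of the accumulation is visible in this hunk, so the sketch below is an assumed simplification rather than a transcription:

const PELT_SUM_MAX: u32 = 131_072; // 128 * 1024
const PELT_MAX_UTIL: u32 = 1_024;

struct TaskPelt {
    last_update_ns: u64,
    util_sum: u32,
    util_avg: u32,
}

impl TaskPelt {
    fn update(&mut self, now: u64, delta_ns: u64) {
        if self.last_update_ns == 0 {
            self.last_update_ns = now; // first update just initializes
            return;
        }
        // Whole 1ms periods since the last update, capped like the BPF code.
        let periods = ((now - self.last_update_ns) / 1_000_000).min(256) as u32;
        if periods > 0 {
            for _ in 0..periods {
                self.util_sum = (self.util_sum * 127) >> 7; // geometric decay
            }
            self.last_update_ns = now;
        }
        // Assumed contribution scaling: roughly 1024 for a fully busy millisecond.
        let contrib = ((delta_ns / 1_000) as u32).min(PELT_MAX_UTIL);
        self.util_sum = (self.util_sum + contrib).min(PELT_SUM_MAX);
        self.util_avg = self.util_sum >> 7; // keep the average in the 0..=1024 range
    }
}

fn main() {
    let mut p = TaskPelt { last_update_ns: 0, util_sum: 0, util_avg: 0 };
    p.update(1_000_000, 0);         // initialize at t = 1ms
    p.update(3_000_000, 2_000_000); // ran for the whole 2ms window
    println!("util_avg = {}", p.util_avg);
}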
*/ - u64 dhq_capacity = topo_config.nr_cpus * 4; + u64 dhq_capacity = topo_config.nr_cpus * 4; llc_pair_dhqs[dhq_index] = (scx_dhq_t *)scx_dhq_create_balanced( - false, /* vtime mode */ - dhq_capacity, /* fixed capacity */ - SCX_DHQ_MODE_PRIORITY, /* lowest vtime wins */ - p2dq_config.dhq_max_imbalance /* max_imbalance from config */ + false, /* vtime mode */ + dhq_capacity, /* fixed capacity */ + SCX_DHQ_MODE_PRIORITY, /* lowest vtime wins */ + p2dq_config + .dhq_max_imbalance /* max_imbalance from config */ ); if (!llc_pair_dhqs[dhq_index]) { scx_bpf_error("DHQ failed to create DHQ %u for node %u", @@ -533,22 +550,23 @@ static int llc_create_dhqs(struct llc_ctx *llcx) dhq_index, node_id, llcx->id, dhq_capacity); /* Assign DHQ and strand to this LLC */ - llcx->mig_dhq = llc_pair_dhqs[dhq_index]; + llcx->mig_dhq = llc_pair_dhqs[dhq_index]; llcx->dhq_strand = strand; global_dhq_count++; } else { /* Second LLC in pair: use the most recently created DHQ */ dhq_index = global_dhq_count - 1; if (dhq_index >= (MAX_LLCS / 2) || !llc_pair_dhqs[dhq_index]) { - scx_bpf_error("DHQ: DHQ %u not available for second LLC %u", - dhq_index, llcx->id); + scx_bpf_error( + "DHQ: DHQ %u not available for second LLC %u", + dhq_index, llcx->id); return -EINVAL; } trace("DHQ %u assigned to LLC %u (node %u, strand B)", dhq_index, llcx->id, node_id); /* Assign DHQ and strand to this LLC */ - llcx->mig_dhq = llc_pair_dhqs[dhq_index]; + llcx->mig_dhq = llc_pair_dhqs[dhq_index]; llcx->dhq_strand = strand; } @@ -557,15 +575,13 @@ static int llc_create_dhqs(struct llc_ctx *llcx) return 0; } - struct p2dq_timer p2dq_timers[MAX_TIMERS] = { - {lb_timer_intvl_ns, - CLOCK_BOOTTIME, 0}, + { lb_timer_intvl_ns, CLOCK_BOOTTIME, 0 }, }; struct timer_wrapper { struct bpf_timer timer; - int key; + int key; }; struct { @@ -575,13 +591,12 @@ struct { __type(value, struct timer_wrapper); } timer_data SEC(".maps"); - struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __type(key, u32); __type(value, struct cpu_ctx); __uint(max_entries, 1); -} cpu_ctxs SEC(".maps"); +} cpu_ctxs SEC(".maps"); static struct cpu_ctx *lookup_cpu_ctx(int cpu) { @@ -590,8 +605,7 @@ static struct cpu_ctx *lookup_cpu_ctx(int cpu) if (cpu < 0) { cpuc = bpf_map_lookup_elem(&cpu_ctxs, &zero_u32); } else { - cpuc = bpf_map_lookup_percpu_elem(&cpu_ctxs, - &zero_u32, cpu); + cpuc = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero_u32, cpu); } if (!cpuc) { @@ -609,6 +623,13 @@ struct { __uint(max_entries, MAX_LLCS); } llc_ctxs SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, u32); + __type(value, struct cluster_ctx); + __uint(max_entries, MAX_CLUSTERS); +} cluster_ctxs SEC(".maps"); + static struct llc_ctx *lookup_llc_ctx(u32 llc_id) { struct llc_ctx *llcx; @@ -622,6 +643,19 @@ static struct llc_ctx *lookup_llc_ctx(u32 llc_id) return llcx; } +static struct cluster_ctx *lookup_cluster_ctx(u32 cluster_id) +{ + struct cluster_ctx *clusterx; + + clusterx = bpf_map_lookup_elem(&cluster_ctxs, &cluster_id); + if (!clusterx) { + scx_bpf_error("no cluster_ctx for cluster %u", cluster_id); + return NULL; + } + + return clusterx; +} + static struct llc_ctx *lookup_cpu_llc_ctx(s32 cpu) { if (cpu >= topo_config.nr_cpus || cpu < 0) { @@ -638,7 +672,7 @@ struct { __type(value, struct node_ctx); __uint(max_entries, MAX_NUMA_NODES); __uint(map_flags, 0); -} node_ctxs SEC(".maps"); +} node_ctxs SEC(".maps"); static struct node_ctx *lookup_node_ctx(u32 node_id) { @@ -662,7 +696,7 @@ struct { __uint(map_flags, BPF_F_NO_PREALLOC); __type(key, int); __type(value, struct 
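Reviewer note: llc_create_dhqs pairs LLCs on a node: the first LLC of a pair allocates the shared dual-headed queue and takes strand A, the second reuses the queue just created and takes strand B. The pairing logic, abstracted away from the BPF maps:

#[derive(Clone, Copy, Debug)]
enum Strand { A, B }

fn main() {
    let mut dhq_count = 0usize;      // index of the next queue to create
    let mut assignment = Vec::new(); // (llc ordinal on node, dhq index, strand)

    for llc_ord in 0..5 {
        let strand = if llc_ord % 2 == 0 { Strand::A } else { Strand::B };
        let dhq_index = match strand {
            Strand::A => { let idx = dhq_count; dhq_count += 1; idx } // create new DHQ
            Strand::B => dhq_count - 1,                               // reuse the last one
        };
        assignment.push((llc_ord, dhq_index, strand));
    }
    // LLCs 0,1 share queue 0; 2,3 share queue 1; 4 gets queue 2 on strand A.
    println!("{:?}", assignment);
}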
mask_wrapper); -} task_masks SEC(".maps"); +} task_masks SEC(".maps"); static task_ctx *lookup_task_ctx(struct task_struct *p) { @@ -679,11 +713,11 @@ struct { __type(key, u32); __type(value, u64); __uint(max_entries, P2DQ_NR_STATS); -} stats SEC(".maps"); +} stats SEC(".maps"); static inline void stat_add(enum stat_idx idx, u64 amount) { - u32 idx_v = idx; + u32 idx_v = idx; u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); if (cnt_p) (*cnt_p) += amount; @@ -713,7 +747,8 @@ static bool can_migrate(task_ctx *taskc, struct llc_ctx *llcx) if (topo_config.nr_llcs < 2 || !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS) || - (!lb_config.dispatch_lb_interactive && task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE))) + (!lb_config.dispatch_lb_interactive && + task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE))) return false; if (lb_config.max_dsq_pick2 && @@ -736,11 +771,11 @@ static void set_deadline_slice(struct task_struct *p, task_ctx *taskc, struct llc_ctx *llcx) { u64 nr_idle; - u64 max_ns = scale_by_task_weight(p, max_dsq_time_slice()); + u64 max_ns = scale_by_task_weight(p, max_dsq_time_slice()); u64 nr_queued = llc_nr_queued(llcx); const struct cpumask *idle_cpumask = scx_bpf_get_idle_cpumask(); - nr_idle = bpf_cpumask_weight(idle_cpumask); + nr_idle = bpf_cpumask_weight(idle_cpumask); scx_bpf_put_cpumask(idle_cpumask); if (nr_idle == 0) @@ -794,8 +829,7 @@ static bool keep_running(struct cpu_ctx *cpuc, struct llc_ctx *llcx, struct task_struct *p) { // Only tasks in the most interactive DSQs can keep running. - if (!p2dq_config.keep_running_enabled || - !llcx || !cpuc || + if (!p2dq_config.keep_running_enabled || !llcx || !cpuc || cpuc->dsq_index == p2dq_config.nr_dsqs_per_llc - 1 || p->scx.flags & SCX_TASK_QUEUED || cpuc->ran_for >= timeline_config.max_exec_ns) @@ -817,10 +851,10 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, s32 prev_cpu, bool *is_idle) { const struct cpumask *idle_smtmask, *idle_cpumask; - struct mask_wrapper *wrapper; - struct bpf_cpumask *mask; - struct llc_ctx *llcx; - s32 cpu = prev_cpu; + struct mask_wrapper *wrapper; + struct bpf_cpumask *mask; + struct llc_ctx *llcx; + s32 cpu = prev_cpu; // Migration-disabled tasks must stay on their current CPU if (is_migration_disabled(p)) { @@ -831,8 +865,7 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, idle_cpumask = scx_bpf_get_idle_cpumask(); idle_smtmask = scx_bpf_get_idle_smtmask(); - if (!(llcx = lookup_llc_ctx(taskc->llc_id)) || - !llcx->cpumask) + if (!(llcx = lookup_llc_ctx(taskc->llc_id)) || !llcx->cpumask) goto found_cpu; // First try last CPU @@ -851,8 +884,7 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, goto found_cpu; if (llcx->cpumask) - bpf_cpumask_and(mask, cast_mask(llcx->cpumask), - p->cpus_ptr); + bpf_cpumask_and(mask, cast_mask(llcx->cpumask), p->cpus_ptr); // First try to find an idle SMT in the LLC if (topo_config.smt_enabled) { @@ -872,8 +904,7 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, // Next try to find an idle CPU in the node if (llcx->node_cpumask && mask) { - bpf_cpumask_and(mask, - cast_mask(llcx->node_cpumask), + bpf_cpumask_and(mask, cast_mask(llcx->node_cpumask), p->cpus_ptr); cpu = __pick_idle_cpu(mask, 0); @@ -893,16 +924,37 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, return cpu; } -static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, - s32 prev_cpu, u64 wake_flags, bool *is_idle) +/* + * Pick an idle CPU 
within a cluster, intersecting with task's allowed CPUs. + * Returns idle CPU >= 0 on success, -1 if no idle CPU available in cluster. + */ +static __always_inline s32 pick_idle_cpu_in_cluster(struct task_struct *p, + struct cpu_ctx *cpuc, + s32 prev_cpu, int flags) +{ + struct cluster_ctx *clusterx; + + if (!topo_config.has_clusters || !cpuc) + return -1; + + clusterx = lookup_cluster_ctx(cpuc->cluster_id); + if (!clusterx || !clusterx->cpumask) + return -1; + + // scx_bpf_pick_idle_cpu already respects task affinity (p->cpus_ptr) + return __pick_idle_cpu(clusterx->cpumask, flags); +} + +static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, s32 prev_cpu, + u64 wake_flags, bool *is_idle) { const struct cpumask *idle_smtmask, *idle_cpumask; - struct llc_ctx *llcx; - s32 pref_cpu, cpu = prev_cpu; - bool migratable = false; + struct llc_ctx *llcx; + s32 pref_cpu, cpu = prev_cpu; + bool migratable = false; - idle_cpumask = scx_bpf_get_idle_cpumask(); - idle_smtmask = scx_bpf_get_idle_smtmask(); + idle_cpumask = scx_bpf_get_idle_cpumask(); + idle_smtmask = scx_bpf_get_idle_smtmask(); if (!idle_cpumask || !idle_smtmask) goto found_cpu; @@ -913,7 +965,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, goto found_cpu; } - if (p2dq_config.interactive_sticky && task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE)) { + if (p2dq_config.interactive_sticky && + task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE)) { *is_idle = scx_bpf_test_and_clear_cpu_idle(prev_cpu); goto found_cpu; } @@ -921,13 +974,13 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, if (idle_cpumask && bpf_cpumask_empty(idle_cpumask)) goto found_cpu; - if (!(llcx = lookup_llc_ctx(taskc->llc_id)) || - !llcx->cpumask) + if (!(llcx = lookup_llc_ctx(taskc->llc_id)) || !llcx->cpumask) goto found_cpu; migratable = can_migrate(taskc, llcx); if (topo_config.nr_llcs > 1 && - (llc_ctx_test_flag(llcx, LLC_CTX_F_SATURATED) || saturated || overloaded) && + (llc_ctx_test_flag(llcx, LLC_CTX_F_SATURATED) || saturated || + overloaded) && !migratable) { cpu = prev_cpu; goto found_cpu; @@ -946,10 +999,24 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, // Interactive tasks aren't worth migrating across LLCs. if (task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE) || (topo_config.nr_llcs == 2 && topo_config.nr_nodes == 2)) { + // Try cluster-level idle CPU first for interactive tasks + if (topo_config.has_clusters) { + struct cpu_ctx *prev_cpuc = + lookup_cpu_ctx(prev_cpu); + if (prev_cpuc) { + cpu = pick_idle_cpu_in_cluster( + p, prev_cpuc, prev_cpu, 0); + if (cpu >= 0) { + stat_inc(P2DQ_STAT_WAKE_LLC); + *is_idle = true; + goto found_cpu; + } + } + } + // Try an idle CPU in the LLC. 
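Reviewer note: pick_idle_cpu_in_cluster narrows the idle search to the CPUs sharing prev_cpu's cluster before the caller widens out to the whole LLC. The hierarchy can be sketched with plain bitmasks, where cluster_mask and llc_mask stand in for the kptr cpumasks:

/// First set bit of `mask & allowed`, i.e. an "idle" CPU the task may use.
/// Stand-in for the scx_bpf_pick_idle_cpu() behaviour assumed here.
fn pick_idle(idle: u64, allowed: u64) -> Option<u32> {
    let hit = idle & allowed;
    if hit == 0 { None } else { Some(hit.trailing_zeros()) }
}

fn pick_idle_cpu(idle: u64, allowed: u64, cluster_mask: u64, llc_mask: u64) -> Option<u32> {
    // Cluster first (shared L2 / cache cluster), then the wider LLC.
    pick_idle(idle & cluster_mask, allowed)
        .or_else(|| pick_idle(idle & llc_mask, allowed))
}

fn main() {
    let idle = 0b1111_0000u64;    // CPUs 4-7 idle
    let allowed = u64::MAX;       // task has full affinity
    let cluster = 0b0000_1111u64; // prev_cpu's cluster: CPUs 0-3, all busy
    let llc = 0b1111_1111u64;     // LLC: CPUs 0-7
    assert_eq!(pick_idle_cpu(idle, allowed, cluster, llc), Some(4));
}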
if (llcx->cpumask && - (cpu = __pick_idle_cpu(llcx->cpumask, 0) - ) >= 0) { + (cpu = __pick_idle_cpu(llcx->cpumask, 0)) >= 0) { stat_inc(P2DQ_STAT_WAKE_LLC); *is_idle = true; goto found_cpu; @@ -961,7 +1028,7 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } struct task_struct *waker = (void *)bpf_get_current_task_btf(); - task_ctx *waker_taskc = scx_task_data(waker); + task_ctx *waker_taskc = scx_task_data(waker); // Shouldn't happen, but makes code easier to follow if (!waker_taskc) { stat_inc(P2DQ_STAT_WAKE_PREV); @@ -970,21 +1037,47 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, if (waker_taskc->llc_id == llcx->id || !lb_config.wakeup_llc_migrations) { + // If clusters enabled, check if waker and wakee in same cluster + if (topo_config.has_clusters) { + struct cpu_ctx *waker_cpuc = + lookup_cpu_ctx(scx_bpf_task_cpu(waker)); + struct cpu_ctx *prev_cpuc = + lookup_cpu_ctx(prev_cpu); + + if (waker_cpuc && prev_cpuc && + waker_cpuc->cluster_id == + prev_cpuc->cluster_id) { + // Try idle core in same cluster first + if (topo_config.smt_enabled) { + cpu = pick_idle_cpu_in_cluster( + p, prev_cpuc, prev_cpu, + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + *is_idle = true; + goto found_cpu; + } + } + // Try any idle CPU in same cluster + cpu = pick_idle_cpu_in_cluster( + p, prev_cpuc, prev_cpu, 0); + if (cpu >= 0) { + *is_idle = true; + goto found_cpu; + } + } + } + // Try an idle smt core in the LLC. - if (topo_config.smt_enabled && - llcx->cpumask && + if (topo_config.smt_enabled && llcx->cpumask && (cpu = __pick_idle_cpu(llcx->cpumask, - SCX_PICK_IDLE_CORE) - ) >= 0) { + SCX_PICK_IDLE_CORE)) >= 0) { stat_inc(P2DQ_STAT_WAKE_LLC); *is_idle = true; goto found_cpu; } // Try an idle cpu in the LLC. if (llcx->cpumask && - (cpu = __pick_idle_cpu(llcx->cpumask, - 0) - ) >= 0) { + (cpu = __pick_idle_cpu(llcx->cpumask, 0)) >= 0) { stat_inc(P2DQ_STAT_WAKE_LLC); *is_idle = true; goto found_cpu; @@ -996,7 +1089,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } // If wakeup LLC are allowed then migrate to the waker llc. 
- struct llc_ctx *waker_llcx = lookup_llc_ctx(waker_taskc->llc_id); + struct llc_ctx *waker_llcx = + lookup_llc_ctx(waker_taskc->llc_id); if (!waker_llcx) { stat_inc(P2DQ_STAT_WAKE_PREV); cpu = prev_cpu; @@ -1005,8 +1099,7 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, if (waker_llcx->cpumask && (cpu = __pick_idle_cpu(waker_llcx->cpumask, - SCX_PICK_IDLE_CORE) - ) >= 0) { + SCX_PICK_IDLE_CORE)) >= 0) { stat_inc(P2DQ_STAT_WAKE_MIG); *is_idle = true; goto found_cpu; @@ -1014,9 +1107,7 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, // Couldn't find an idle core so just migrate to the CPU if (waker_llcx->cpumask && - (cpu = __pick_idle_cpu(waker_llcx->cpumask, - 0) - ) >= 0) { + (cpu = __pick_idle_cpu(waker_llcx->cpumask, 0)) >= 0) { stat_inc(P2DQ_STAT_WAKE_MIG); *is_idle = true; goto found_cpu; @@ -1029,10 +1120,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } if (p2dq_config.sched_mode == MODE_PERF && - topo_config.has_little_cores && - llcx->big_cpumask) { - cpu = __pick_idle_cpu(llcx->big_cpumask, - SCX_PICK_IDLE_CORE); + topo_config.has_little_cores && llcx->big_cpumask) { + cpu = __pick_idle_cpu(llcx->big_cpumask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { *is_idle = true; goto found_cpu; @@ -1047,8 +1136,7 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } if (p2dq_config.sched_mode == MODE_EFFICIENCY && - topo_config.has_little_cores && - llcx->little_cpumask) { + topo_config.has_little_cores && llcx->little_cpumask) { cpu = __pick_idle_cpu(llcx->little_cpumask, SCX_PICK_IDLE_CORE); if (cpu >= 0) { *is_idle = true; @@ -1063,21 +1151,18 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, } } - - if (llcx->lb_llc_id < MAX_LLCS && - taskc->llc_runs == 0) { + if (llcx->lb_llc_id < MAX_LLCS && taskc->llc_runs == 0) { u32 target_llc_id = llcx->lb_llc_id; - llcx->lb_llc_id = MAX_LLCS; + llcx->lb_llc_id = MAX_LLCS; if (!(llcx = lookup_llc_ctx(target_llc_id))) goto found_cpu; stat_inc(P2DQ_STAT_SELECT_PICK2); } - if (topo_config.has_little_cores && - llcx->little_cpumask && llcx->big_cpumask) { + if (topo_config.has_little_cores && llcx->little_cpumask && + llcx->big_cpumask) { if (task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE)) { - cpu = __pick_idle_cpu(llcx->little_cpumask, - 0); + cpu = __pick_idle_cpu(llcx->little_cpumask, 0); if (cpu >= 0) { *is_idle = true; goto found_cpu; @@ -1097,31 +1182,59 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, if (llcx->cpumask && pref_cpu >= 0 && scx_bpf_test_and_clear_cpu_idle(pref_cpu)) { *is_idle = true; - cpu = pref_cpu; + cpu = pref_cpu; trace("PREF idle %s->%d", p->comm, pref_cpu); goto found_cpu; } } + /* + * Try cluster-level idle CPU search before LLC-wide search, but only if the LLC + * is under pressure. In low-contention scenarios, cluster selection adds overhead + * without providing cache locality benefits since the LLC search will succeed anyway. + * + * We use LLC load as a proxy for pressure: only use cluster search when load + * indicates more than 1ms of work per CPU (llc->nr_cpus * NSEC_PER_MSEC). 
+ */ + if (topo_config.has_clusters && + llcx->load > (llcx->nr_cpus * NSEC_PER_MSEC)) { + struct cpu_ctx *prev_cpuc = lookup_cpu_ctx(prev_cpu); + if (prev_cpuc) { + // First try idle core within prev_cpu's cluster + cpu = pick_idle_cpu_in_cluster(p, prev_cpuc, prev_cpu, + SCX_PICK_IDLE_CORE); + if (cpu >= 0) { + *is_idle = true; + goto found_cpu; + } + + // Then try any idle CPU within prev_cpu's cluster + cpu = pick_idle_cpu_in_cluster(p, prev_cpuc, prev_cpu, + 0); + if (cpu >= 0) { + *is_idle = true; + goto found_cpu; + } + } + } + // Next try in the local LLC (usually succeeds) if (likely(llcx->cpumask && - (cpu = __pick_idle_cpu(llcx->cpumask, - SCX_PICK_IDLE_CORE) - ) >= 0)) { + (cpu = __pick_idle_cpu(llcx->cpumask, SCX_PICK_IDLE_CORE)) >= + 0)) { *is_idle = true; goto found_cpu; } // Try a idle CPU in the llc (also likely to succeed) if (likely(llcx->cpumask && - (cpu = __pick_idle_cpu(llcx->cpumask, 0)) >= 0)) { + (cpu = __pick_idle_cpu(llcx->cpumask, 0)) >= 0)) { *is_idle = true; goto found_cpu; } if (topo_config.nr_llcs > 1 && - llc_ctx_test_flag(llcx, LLC_CTX_F_SATURATED) && - migratable && + llc_ctx_test_flag(llcx, LLC_CTX_F_SATURATED) && migratable && llcx->node_cpumask) { cpu = scx_bpf_pick_idle_cpu(cast_mask(llcx->node_cpumask), SCX_PICK_IDLE_CORE); @@ -1130,7 +1243,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, goto found_cpu; } if (llcx->node_cpumask) { - cpu = scx_bpf_pick_idle_cpu(cast_mask(llcx->node_cpumask), 0); + cpu = scx_bpf_pick_idle_cpu( + cast_mask(llcx->node_cpumask), 0); if (cpu >= 0) { *is_idle = true; goto found_cpu; @@ -1144,7 +1258,8 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, goto found_cpu; } if (all_cpumask) { - cpu = scx_bpf_pick_idle_cpu(cast_mask(all_cpumask), 0); + cpu = scx_bpf_pick_idle_cpu( + cast_mask(all_cpumask), 0); if (cpu >= 0) { *is_idle = true; goto found_cpu; @@ -1162,12 +1277,12 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc, return cpu; } - -static s32 p2dq_select_cpu_impl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) +static s32 p2dq_select_cpu_impl(struct task_struct *p, s32 prev_cpu, + u64 wake_flags) { task_ctx *taskc; - bool is_idle = false; - s32 cpu; + bool is_idle = false; + s32 cpu; if (!(taskc = lookup_task_ctx(p))) return prev_cpu; @@ -1182,16 +1297,16 @@ static s32 p2dq_select_cpu_impl(struct task_struct *p, s32 prev_cpu, u64 wake_fl // Only direct dispatch non-affinitized tasks // Affinitized tasks will be queued by enqueue to prevent livelock if (task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS)) { - scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, taskc->slice_ns, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, taskc->slice_ns, + 0); } } - trace("SELECT [%d][%s] %i->%i idle %i", - p->pid, p->comm, prev_cpu, cpu, is_idle); + trace("SELECT [%d][%s] %i->%i idle %i", p->pid, p->comm, prev_cpu, cpu, + is_idle); return cpu; } - /* * Perform the enqueue logic for `p` but don't enqueue it where possible. This * is primarily used so that scx_chaos can decide to enqueue a task either @@ -1210,8 +1325,8 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, { struct cpu_ctx *cpuc; struct llc_ctx *llcx; - task_ctx *taskc; - s32 cpu = scx_bpf_task_cpu(p); + task_ctx *taskc; + s32 cpu = scx_bpf_task_cpu(p); // Default to 0 and set to failed. __builtin_memset(ret, 0, sizeof(*ret)); @@ -1221,13 +1336,10 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, * Per-cpu kthreads are considered interactive and dispatched directly * into the local DSQ. 
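Reviewer note: the cluster-first search is gated on the LLC carrying more than roughly one millisecond of tracked work per CPU; below that, the LLC-wide search succeeds anyway and the extra cluster lookup is pure overhead. The gate in isolation:

const NSEC_PER_MSEC: u64 = 1_000_000;

/// Mirror of the `llcx->load > llcx->nr_cpus * NSEC_PER_MSEC` check: treat the
/// LLC as under pressure once its load exceeds ~1ms of work per CPU.
fn llc_under_pressure(load_ns: u64, nr_cpus: u64) -> bool {
    load_ns > nr_cpus * NSEC_PER_MSEC
}

fn main() {
    assert!(!llc_under_pressure(3 * NSEC_PER_MSEC, 8));  // 3ms of load across 8 CPUs
    assert!(llc_under_pressure(12 * NSEC_PER_MSEC, 8));  // 12ms of load across 8 CPUs
}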
*/ - if (unlikely(p2dq_config.kthreads_local && - (p->flags & PF_KTHREAD) && - p->nr_cpus_allowed == 1)) { + if (unlikely(p2dq_config.kthreads_local && (p->flags & PF_KTHREAD) && + p->nr_cpus_allowed == 1)) { stat_inc(P2DQ_STAT_DIRECT); - scx_bpf_dsq_insert(p, - SCX_DSQ_LOCAL, - min_dsq_time_slice(), + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, min_dsq_time_slice(), enq_flags); if (scx_bpf_test_and_clear_cpu_idle(cpu)) scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); @@ -1235,7 +1347,7 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, return; } - if(!(taskc = lookup_task_ctx(p))) { + if (!(taskc = lookup_task_ctx(p))) { scx_bpf_error("invalid lookup"); return; } @@ -1248,17 +1360,17 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, bool has_cleared_idle = false; if (!__COMPAT_is_enq_cpu_selected(enq_flags) || !bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) - cpu = pick_idle_affinitized_cpu(p, - taskc, - cpu, + cpu = pick_idle_affinitized_cpu(p, taskc, cpu, &has_cleared_idle); else has_cleared_idle = scx_bpf_test_and_clear_cpu_idle(cpu); if (has_cleared_idle) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_set_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); else - enqueue_promise_clear_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_clear_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); ret->cpu = cpu; if (!(cpuc = lookup_cpu_ctx(cpu)) || @@ -1288,26 +1400,36 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, struct llc_ctx *prev_llcx; if (prev_cpu >= 0 && prev_cpu < NR_CPUS && (prev_cpuc = lookup_cpu_ctx(prev_cpu)) && - (prev_llcx = lookup_llc_ctx(prev_cpuc->llc_id)) && + (prev_llcx = lookup_llc_ctx( + prev_cpuc->llc_id)) && prev_llcx->cpumask) { // Check if any CPU in prev LLC matches affinity - s32 llc_cpu = scx_bpf_pick_idle_cpu(cast_mask(prev_llcx->cpumask), 0); - if (llc_cpu >= 0 && bpf_cpumask_test_cpu(llc_cpu, p->cpus_ptr)) { + s32 llc_cpu = scx_bpf_pick_idle_cpu( + cast_mask(prev_llcx->cpumask), + 0); + if (llc_cpu >= 0 && + bpf_cpumask_test_cpu(llc_cpu, + p->cpus_ptr)) { target_cpu = llc_cpu; } else { // Fallback to random CPU in affinity mask - target_cpu = bpf_cpumask_any_distribute(p->cpus_ptr); + target_cpu = + bpf_cpumask_any_distribute( + p->cpus_ptr); } } else { // Fallback to random CPU in affinity mask - target_cpu = bpf_cpumask_any_distribute(p->cpus_ptr); + target_cpu = bpf_cpumask_any_distribute( + p->cpus_ptr); } } // Update cpuc and llcx to match target_cpu if (!(cpuc = lookup_cpu_ctx(target_cpu)) || !(llcx = lookup_llc_ctx(cpuc->llc_id))) { - scx_bpf_error("invalid lookup for target_cpu %d", target_cpu); + scx_bpf_error( + "invalid lookup for target_cpu %d", + target_cpu); return; } ret->cpu = target_cpu; @@ -1327,9 +1449,11 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, u64 old_slice = taskc->slice_ns; // Scale slice inversely with queue depth // Add 1 to account for the task we're about to enqueue - taskc->slice_ns = clamp_slice(taskc->slice_ns / (nr_queued + 1)); + taskc->slice_ns = clamp_slice(taskc->slice_ns / + (nr_queued + 1)); trace("PENALIZE [%d][%s] cpu=%d nr_queued=%llu old_slice=%llu new_slice=%llu", - p->pid, p->comm, target_cpu, nr_queued, old_slice, taskc->slice_ns); + p->pid, p->comm, target_cpu, nr_queued, + old_slice, taskc->slice_ns); } } @@ -1338,21 +1462,23 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, // Always queue affinitized tasks to affn_dsq (no direct dispatch) // This prevents tight wakeup loops and allows proper idle state - u64 
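Reviewer note: the affinitized-task penalty divides the slice by the queue depth plus the task being added and clamps it back into range, so a deep backlog on a constrained CPU shortens every newcomer's slice instead of letting one task monopolize the CPU. Arithmetic only; the max below is illustrative rather than the per-DSQ table value:

const MIN_SLICE_NS: u64 = 10_000;     // MIN_SLICE_NSEC
const MAX_SLICE_NS: u64 = 20_000_000; // illustrative stand-in for max_dsq_time_slice()

fn clamp_slice(slice_ns: u64) -> u64 {
    slice_ns.clamp(MIN_SLICE_NS, MAX_SLICE_NS)
}

fn penalized_slice(slice_ns: u64, nr_queued: u64) -> u64 {
    // +1 accounts for the task that is about to be enqueued.
    clamp_slice(slice_ns / (nr_queued + 1))
}

fn main() {
    assert_eq!(penalized_slice(4_000_000, 0), 4_000_000);        // empty queue: unchanged
    assert_eq!(penalized_slice(4_000_000, 3), 1_000_000);        // 3 queued: quartered
    assert_eq!(penalized_slice(4_000_000, 1_000), MIN_SLICE_NS); // clamped at the floor
}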
task_vtime_affn = p->scx.dsq_vtime; + u64 task_vtime_affn = p->scx.dsq_vtime; - ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; - ret->vtime.dsq_id = taskc->dsq_id; - ret->vtime.slice_ns = taskc->slice_ns; + ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; + ret->vtime.dsq_id = taskc->dsq_id; + ret->vtime.slice_ns = taskc->slice_ns; ret->vtime.enq_flags = enq_flags; - ret->vtime.vtime = task_vtime_affn; + ret->vtime.vtime = task_vtime_affn; // Kick target CPU if we cleared idle state - if (enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_KICK_IDLE); + if (enqueue_promise_test_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) + enqueue_promise_set_flag(ret, + ENQUEUE_PROMISE_F_KICK_IDLE); trace("ENQUEUE %s weight %d slice %llu vtime %llu llc vtime %llu affn_dsq", - p->comm, p->scx.weight, taskc->slice_ns, - task_vtime_affn, llcx->vtime); + p->comm, p->scx.weight, taskc->slice_ns, task_vtime_affn, + llcx->vtime); return; } @@ -1360,24 +1486,22 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, // If an idle CPU hasn't been found in select_cpu find one now if (!__COMPAT_is_enq_cpu_selected(enq_flags)) { bool has_cleared_idle = false; - cpu = pick_idle_cpu(p, - taskc, - cpu, - 0, - &has_cleared_idle); + cpu = pick_idle_cpu(p, taskc, cpu, 0, &has_cleared_idle); if (has_cleared_idle) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_set_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); else - enqueue_promise_clear_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_clear_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); if (!(cpuc = lookup_cpu_ctx(cpu)) || - !(llcx = lookup_llc_ctx(cpuc->llc_id))) { + !(llcx = lookup_llc_ctx(cpuc->llc_id))) { scx_bpf_error("invalid lookup"); return; } s32 task_cpu = scx_bpf_task_cpu(p); - ret->cpu = cpu; + ret->cpu = cpu; update_vtime(p, cpuc, taskc, llcx); if (timeline_config.deadline) set_deadline_slice(p, taskc, llcx); @@ -1385,28 +1509,31 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, if (cpu_ctx_test_flag(cpuc, CPU_CTX_F_NICE_TASK)) enq_flags |= SCX_ENQ_PREEMPT; - if ((enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE) || + if ((enqueue_promise_test_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE) || cpu_ctx_test_flag(cpuc, CPU_CTX_F_NICE_TASK))) { ret->kind = P2DQ_ENQUEUE_PROMISE_FIFO; // For migration-disabled tasks, use SCX_DSQ_LOCAL to dispatch // to the task's current CPU, not SCX_DSQ_LOCAL_ON|cpu if (cpu != task_cpu && !is_migration_disabled(p)) { - ret->fifo.dsq_id = SCX_DSQ_LOCAL_ON|cpu; + ret->fifo.dsq_id = SCX_DSQ_LOCAL_ON | cpu; } else { ret->fifo.dsq_id = SCX_DSQ_LOCAL; } - ret->fifo.slice_ns = taskc->slice_ns; + ret->fifo.slice_ns = taskc->slice_ns; ret->fifo.enq_flags = enq_flags; - if (enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_KICK_IDLE); + if (enqueue_promise_test_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) + enqueue_promise_set_flag( + ret, ENQUEUE_PROMISE_F_KICK_IDLE); return; } // Only allow tasks with full CPU affinity into migration DSQs // Affinitized tasks stay in LLC DSQ to prevent cross-LLC livelock bool migrate = likely(!lb_config.single_llc_mode) && - can_migrate(taskc, llcx) && - task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); + can_migrate(taskc, llcx) && + task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); u64 task_vtime_early = p->scx.dsq_vtime; @@ -1415,40 +1542,40 @@ static void async_p2dq_enqueue(struct 
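Reviewer note: async_p2dq_enqueue never inserts directly; it records what the insertion should look like in an enqueue_promise that complete_p2dq_enqueue (or scx_chaos) resolves later. A Rust-flavoured sketch of the promise shapes used in this file; field names are abridged and the queue handles are stand-ins, not the real struct layout:

#[allow(dead_code)]
enum EnqueuePromise {
    Complete,                                             // nothing left to do
    Fifo  { dsq_id: u64, slice_ns: u64, enq_flags: u64 },
    Vtime { dsq_id: u64, slice_ns: u64, vtime: u64, enq_flags: u64 },
    // The ATQ/DHQ variants also carry the LLC DSQ id so the completion path
    // can fall back to a plain DSQ insert if the queue refuses the task.
    AtqVtime { fallback_dsq: u64, slice_ns: u64, vtime: u64 },
    DhqVtime { fallback_dsq: u64, slice_ns: u64, vtime: u64, enq_flags: u64 },
}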
enqueue_promise *ret, if (p2dq_config.dhq_enabled) { taskc->enq_flags = enq_flags; ret->kind = P2DQ_ENQUEUE_PROMISE_DHQ_VTIME; - ret->dhq.dsq_id = cpuc->llc_dsq; - ret->dhq.dhq = llcx->mig_dhq; - ret->dhq.strand = llcx->dhq_strand; - ret->dhq.slice_ns = taskc->slice_ns; - ret->dhq.vtime = task_vtime_early; + ret->dhq.dsq_id = cpuc->llc_dsq; + ret->dhq.dhq = llcx->mig_dhq; + ret->dhq.strand = llcx->dhq_strand; + ret->dhq.slice_ns = taskc->slice_ns; + ret->dhq.vtime = task_vtime_early; ret->dhq.enq_flags = enq_flags; } else if (p2dq_config.atq_enabled) { taskc->enq_flags = enq_flags; ret->kind = P2DQ_ENQUEUE_PROMISE_ATQ_VTIME; - ret->vtime.dsq_id = cpuc->llc_dsq; - ret->vtime.atq = llcx->mig_atq; + ret->vtime.dsq_id = cpuc->llc_dsq; + ret->vtime.atq = llcx->mig_atq; ret->vtime.slice_ns = taskc->slice_ns; - ret->vtime.vtime = task_vtime_early; + ret->vtime.vtime = task_vtime_early; } else { - ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; + ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; ret->vtime.dsq_id = taskc->dsq_id; - ret->vtime.slice_ns = taskc->slice_ns; + ret->vtime.slice_ns = taskc->slice_ns; ret->vtime.enq_flags = enq_flags; - ret->vtime.vtime = task_vtime_early; + ret->vtime.vtime = task_vtime_early; } stat_inc(P2DQ_STAT_ENQ_MIG); } else { - taskc->dsq_id = cpuc->llc_dsq; - ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; - ret->vtime.dsq_id = taskc->dsq_id; - ret->vtime.slice_ns = taskc->slice_ns; + taskc->dsq_id = cpuc->llc_dsq; + ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; + ret->vtime.dsq_id = taskc->dsq_id; + ret->vtime.slice_ns = taskc->slice_ns; ret->vtime.enq_flags = enq_flags; - ret->vtime.vtime = task_vtime_early; + ret->vtime.vtime = task_vtime_early; stat_inc(P2DQ_STAT_ENQ_LLC); } trace("ENQUEUE %s weight %d slice %llu vtime %llu llc vtime %llu", - p->comm, p->scx.weight, taskc->slice_ns, - task_vtime_early, llcx->vtime); + p->comm, p->scx.weight, taskc->slice_ns, task_vtime_early, + llcx->vtime); return; } @@ -1469,27 +1596,32 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, bool has_cleared_idle = scx_bpf_test_and_clear_cpu_idle(cpu); if (has_cleared_idle) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_set_flag(ret, + ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); else - enqueue_promise_clear_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); + enqueue_promise_clear_flag(ret, + ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE); - if ((enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE) || + if ((enqueue_promise_test_flag(ret, + ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE) || cpu_ctx_test_flag(cpuc, CPU_CTX_F_NICE_TASK)) && bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { - ret->kind = P2DQ_ENQUEUE_PROMISE_FIFO; - ret->fifo.dsq_id = SCX_DSQ_LOCAL; - ret->fifo.slice_ns = taskc->slice_ns; + ret->kind = P2DQ_ENQUEUE_PROMISE_FIFO; + ret->fifo.dsq_id = SCX_DSQ_LOCAL; + ret->fifo.slice_ns = taskc->slice_ns; ret->fifo.enq_flags = enq_flags; - if (enqueue_promise_test_flag(ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) - enqueue_promise_set_flag(ret, ENQUEUE_PROMISE_F_KICK_IDLE); + if (enqueue_promise_test_flag( + ret, ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE)) + enqueue_promise_set_flag(ret, + ENQUEUE_PROMISE_F_KICK_IDLE); return; } // Only allow tasks with full CPU affinity into migration DSQs // Affinitized tasks stay in LLC DSQ to prevent cross-LLC livelock bool migrate = likely(!lb_config.single_llc_mode) && - can_migrate(taskc, llcx) && - task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); + can_migrate(taskc, llcx) && + task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); if (migrate) { 
taskc->dsq_id = llcx->mig_dsq; stat_inc(P2DQ_STAT_ENQ_MIG); @@ -1497,23 +1629,23 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, u64 task_vtime_mig = p->scx.dsq_vtime; if (p2dq_config.dhq_enabled) { - taskc->enq_flags = enq_flags; - ret->kind = P2DQ_ENQUEUE_PROMISE_DHQ_VTIME; - ret->dhq.dsq_id = cpuc->llc_dsq; - ret->dhq.dhq = llcx->mig_dhq; - ret->dhq.strand = llcx->dhq_strand; - ret->dhq.slice_ns = taskc->slice_ns; - ret->dhq.vtime = task_vtime_mig; + taskc->enq_flags = enq_flags; + ret->kind = P2DQ_ENQUEUE_PROMISE_DHQ_VTIME; + ret->dhq.dsq_id = cpuc->llc_dsq; + ret->dhq.dhq = llcx->mig_dhq; + ret->dhq.strand = llcx->dhq_strand; + ret->dhq.slice_ns = taskc->slice_ns; + ret->dhq.vtime = task_vtime_mig; ret->dhq.enq_flags = enq_flags; return; } else if (p2dq_config.atq_enabled) { - taskc->enq_flags = enq_flags; - ret->kind = P2DQ_ENQUEUE_PROMISE_ATQ_VTIME; - ret->vtime.dsq_id = cpuc->llc_dsq; - ret->vtime.atq = llcx->mig_atq; + taskc->enq_flags = enq_flags; + ret->kind = P2DQ_ENQUEUE_PROMISE_ATQ_VTIME; + ret->vtime.dsq_id = cpuc->llc_dsq; + ret->vtime.atq = llcx->mig_atq; ret->vtime.slice_ns = taskc->slice_ns; - ret->vtime.vtime = task_vtime_mig; + ret->vtime.vtime = task_vtime_mig; return; } @@ -1525,35 +1657,31 @@ static void async_p2dq_enqueue(struct enqueue_promise *ret, u64 task_vtime = p->scx.dsq_vtime; trace("ENQUEUE %s weight %d slice %llu vtime %llu llc vtime %llu", - p->comm, p->scx.weight, taskc->slice_ns, - task_vtime, llcx->vtime); + p->comm, p->scx.weight, taskc->slice_ns, task_vtime, llcx->vtime); - ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; - ret->vtime.dsq_id = taskc->dsq_id; + ret->kind = P2DQ_ENQUEUE_PROMISE_VTIME; + ret->vtime.dsq_id = taskc->dsq_id; ret->vtime.enq_flags = enq_flags; - ret->vtime.slice_ns = taskc->slice_ns; - ret->vtime.vtime = task_vtime; + ret->vtime.slice_ns = taskc->slice_ns; + ret->vtime.vtime = task_vtime; } -static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struct *p) +static void complete_p2dq_enqueue(struct enqueue_promise *pro, + struct task_struct *p) { task_ctx *taskc; - int ret; + int ret; switch (pro->kind) { case P2DQ_ENQUEUE_PROMISE_COMPLETE: break; case P2DQ_ENQUEUE_PROMISE_FIFO: - scx_bpf_dsq_insert(p, - pro->fifo.dsq_id, - pro->fifo.slice_ns, + scx_bpf_dsq_insert(p, pro->fifo.dsq_id, pro->fifo.slice_ns, pro->fifo.enq_flags); break; case P2DQ_ENQUEUE_PROMISE_VTIME: - scx_bpf_dsq_insert_vtime(p, - pro->vtime.dsq_id, - pro->vtime.slice_ns, - pro->vtime.vtime, + scx_bpf_dsq_insert_vtime(p, pro->vtime.dsq_id, + pro->vtime.slice_ns, pro->vtime.vtime, pro->vtime.enq_flags); break; case P2DQ_ENQUEUE_PROMISE_ATQ_FIFO: @@ -1563,7 +1691,7 @@ static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struc } taskc = lookup_task_ctx(p); - ret = scx_atq_insert(pro->fifo.atq, &taskc->common); + ret = scx_atq_insert(pro->fifo.atq, &taskc->common); if (ret) { scx_bpf_error("error %d on scx_atq_insert", ret); break; @@ -1579,9 +1707,8 @@ static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struc } taskc = lookup_task_ctx(p); - ret = scx_atq_insert_vtime(pro->vtime.atq, - &taskc->common, - pro->vtime.vtime); + ret = scx_atq_insert_vtime(pro->vtime.atq, &taskc->common, + pro->vtime.vtime); if (ret) { scx_bpf_error("error %d on scx_atq_insert", ret); break; @@ -1592,15 +1719,12 @@ static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struc scx_bpf_error("invalid DHQ"); break; } - ret = scx_dhq_insert_vtime(pro->dhq.dhq, - (u64)p->pid, - pro->dhq.vtime, - 
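Reviewer note: the DHQ completion path is try-then-fallback: if the dual-headed queue insert fails (EAGAIN when strands are imbalanced, ENOSPC when full), the task is inserted into the ordinary DSQ instead so it is never lost. Shape of that control flow with hypothetical stand-ins for the queue operations:

// Hypothetical stand-ins; only the control flow mirrors complete_p2dq_enqueue().
fn dhq_insert_vtime(_pid: u64, _vtime: u64) -> Result<(), i32> {
    Err(-11) // pretend EAGAIN: strands are imbalanced
}

fn dsq_insert_vtime(pid: u64, dsq_id: u64, vtime: u64) {
    println!("fallback: task {} queued on DSQ {} at vtime {}", pid, dsq_id, vtime);
}

fn main() {
    let (pid, vtime, fallback_dsq) = (1234, 42_000, 7);
    if dhq_insert_vtime(pid, vtime).is_err() {
        // EAGAIN if imbalanced, ENOSPC if full: either way fall back to the DSQ.
        dsq_insert_vtime(pid, fallback_dsq, vtime);
    }
}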
pro->dhq.strand); + ret = scx_dhq_insert_vtime(pro->dhq.dhq, (u64)p->pid, + pro->dhq.vtime, pro->dhq.strand); if (ret) { // The DHQ insert failed (EAGAIN if imbalanced, ENOSPC if full) // Fallback to the DSQ - scx_bpf_dsq_insert_vtime(p, - pro->dhq.dsq_id, + scx_bpf_dsq_insert_vtime(p, pro->dhq.dsq_id, pro->dhq.slice_ns, pro->dhq.vtime, pro->dhq.enq_flags); @@ -1626,10 +1750,10 @@ static void complete_p2dq_enqueue(struct enqueue_promise *pro, struct task_struc static int p2dq_running_impl(struct task_struct *p) { - task_ctx *taskc; + task_ctx *taskc; struct cpu_ctx *cpuc; struct llc_ctx *llcx; - s32 task_cpu = scx_bpf_task_cpu(p); + s32 task_cpu = scx_bpf_task_cpu(p); if (!(taskc = lookup_task_ctx(p)) || !(cpuc = lookup_cpu_ctx(task_cpu)) || @@ -1639,9 +1763,8 @@ static int p2dq_running_impl(struct task_struct *p) if (taskc->llc_id != cpuc->llc_id) { task_refresh_llc_runs(taskc); stat_inc(P2DQ_STAT_LLC_MIGRATION); - trace("RUNNING %d cpu %d->%d llc %d->%d", - p->pid, cpuc->id, task_cpu, - taskc->llc_id, llcx->id); + trace("RUNNING %d cpu %d->%d llc %d->%d", p->pid, cpuc->id, + task_cpu, taskc->llc_id, llcx->id); } else { if (taskc->llc_runs == 0) task_refresh_llc_runs(taskc); @@ -1652,7 +1775,7 @@ static int p2dq_running_impl(struct task_struct *p) stat_inc(P2DQ_STAT_NODE_MIGRATION); } - taskc->llc_id = llcx->id; + taskc->llc_id = llcx->id; taskc->node_id = llcx->node_id; if (p->scx.weight < 100) task_ctx_set_flag(taskc, TASK_CTX_F_WAS_NICE); @@ -1672,12 +1795,12 @@ static int p2dq_running_impl(struct task_struct *p) cpu_ctx_clear_flag(cpuc, CPU_CTX_F_NICE_TASK); cpuc->slice_ns = taskc->slice_ns; - cpuc->ran_for = 0; + cpuc->ran_for = 0; // racy, but don't care if (p->scx.dsq_vtime > llcx->vtime && p->scx.dsq_vtime < llcx->vtime + max_dsq_time_slice()) { - __sync_val_compare_and_swap(&llcx->vtime, - llcx->vtime, p->scx.dsq_vtime); + __sync_val_compare_and_swap(&llcx->vtime, llcx->vtime, + p->scx.dsq_vtime); } // If the task is running in the least interactive DSQ, bump the @@ -1702,14 +1825,14 @@ static int p2dq_running_impl(struct task_struct *p) void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) { - task_ctx *taskc; + task_ctx *taskc; struct llc_ctx *llcx; struct cpu_ctx *cpuc; - u64 used, scaled_used, last_dsq_slice_ns; - u64 now = bpf_ktime_get_ns(); + u64 used, scaled_used, last_dsq_slice_ns; + u64 now = bpf_ktime_get_ns(); if (unlikely(!(taskc = lookup_task_ctx(p)) || - !(llcx = lookup_llc_ctx(taskc->llc_id)))) + !(llcx = lookup_llc_ctx(taskc->llc_id)))) return; // can't happen, appease the verifier @@ -1728,13 +1851,13 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) task_ctx_clear_flag(taskc, TASK_CTX_F_WAS_NICE); } - taskc->last_dsq_id = taskc->dsq_id; + taskc->last_dsq_id = taskc->dsq_id; taskc->last_dsq_index = taskc->dsq_index; - taskc->used = 0; + taskc->used = 0; - last_dsq_slice_ns = taskc->slice_ns; - used = now - taskc->last_run_at; - scaled_used = scale_by_task_weight_inverse(p, used); + last_dsq_slice_ns = taskc->slice_ns; + used = now - taskc->last_run_at; + scaled_used = scale_by_task_weight_inverse(p, used); p->scx.dsq_vtime += scaled_used; __sync_fetch_and_add(&llcx->vtime, used); @@ -1742,16 +1865,19 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) /* Update PELT metrics if enabled */ if (p2dq_config.pelt_enabled) { update_task_pelt(taskc, now, used); - aggregate_pelt_to_llc(llcx, taskc, - task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE), - !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS)); 
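Reviewer note: p2dq_stopping charges the task scale_by_task_weight_inverse(used), i.e. the wall-clock runtime scaled down for high-weight tasks, so heavier tasks accrue virtual time more slowly. Assuming the common helper's usual `used * 100 / weight` definition (weight 100 is the default), the accounting looks like:

/// Assumed semantics of scale_by_task_weight_inverse(); 100 is the default weight.
fn scale_by_weight_inverse(used_ns: u64, weight: u64) -> u64 {
    used_ns * 100 / weight
}

fn main() {
    let used_ns = 2_000_000; // task ran for 2ms
    let mut vtime_default = 0u64;
    let mut vtime_heavy = 0u64;

    vtime_default += scale_by_weight_inverse(used_ns, 100); // +2ms of vtime
    vtime_heavy += scale_by_weight_inverse(used_ns, 200);   // +1ms: advances half as fast

    assert_eq!(vtime_default, 2_000_000);
    assert_eq!(vtime_heavy, 1_000_000);
}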
+ aggregate_pelt_to_llc( + llcx, taskc, + task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE), + !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS)); } /* Legacy load tracking (when PELT disabled) */ if (!p2dq_config.pelt_enabled) { __sync_fetch_and_add(&llcx->load, used); - if (taskc->dsq_index >= 0 && taskc->dsq_index < MAX_DSQS_PER_LLC) - __sync_fetch_and_add(&llcx->dsq_load[taskc->dsq_index], used); + if (taskc->dsq_index >= 0 && + taskc->dsq_index < MAX_DSQS_PER_LLC) + __sync_fetch_and_add(&llcx->dsq_load[taskc->dsq_index], + used); if (task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE)) __sync_fetch_and_add(&llcx->intr_load, used); @@ -1761,25 +1887,30 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) __sync_fetch_and_add(&llcx->affn_load, used); } - trace("STOPPING %s weight %d slice %llu used %llu scaled %llu", - p->comm, p->scx.weight, last_dsq_slice_ns, used, scaled_used); + trace("STOPPING %s weight %d slice %llu used %llu scaled %llu", p->comm, + p->scx.weight, last_dsq_slice_ns, used, scaled_used); if (!runnable) { used = now - taskc->last_run_started; // Affinitized tasks need stricter thresholds to prevent monopolization - bool is_affinitized = !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); + bool is_affinitized = + !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS); u64 inc_threshold = is_affinitized ? - ((19 * last_dsq_slice_ns) / 20) : // 95% for affinitized - ((9 * last_dsq_slice_ns) / 10); // 90% for normal - u64 dec_threshold = is_affinitized ? - (last_dsq_slice_ns / 4) : // 25% for affinitized - (last_dsq_slice_ns / 2); // 50% for normal + ((19 * last_dsq_slice_ns) / + 20) : // 95% for affinitized + ((9 * last_dsq_slice_ns) / + 10); // 90% for normal + u64 dec_threshold = + is_affinitized ? + (last_dsq_slice_ns / 4) : // 25% for affinitized + (last_dsq_slice_ns / 2); // 50% for normal // On stopping determine if the task can move to a longer DSQ by // comparing the used time to the scaled DSQ slice. if (used >= inc_threshold) { - if (taskc->dsq_index < p2dq_config.nr_dsqs_per_llc - 1 && + if (taskc->dsq_index < + p2dq_config.nr_dsqs_per_llc - 1 && p->scx.weight >= 100) { taskc->dsq_index += 1; stat_inc(P2DQ_STAT_DSQ_CHANGE); @@ -1788,13 +1919,12 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) } else { stat_inc(P2DQ_STAT_DSQ_SAME); } - // If under threshold, move the task back down. + // If under threshold, move the task back down. 
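/*
 * A minimal plain-C sketch (not part of the patch) of the promotion /
 * demotion hysteresis used in p2dq_stopping(): 95% / 25% of the slice
 * for affinitized tasks, 90% / 50% otherwise. The demotion branch
 * continues right after this aside. dsq_index_delta() is an
 * illustrative name; the real code additionally checks the DSQ index
 * bounds and the task weight before moving a task.
 */
#include <stdint.h>

/* Returns +1 to move to a longer DSQ, -1 to move to a shorter one, 0 to stay. */
int dsq_index_delta(uint64_t used_ns, uint64_t slice_ns, int affinitized)
{
	/* Affinitized tasks get stricter bounds to prevent monopolization. */
	uint64_t inc = affinitized ? (19 * slice_ns) / 20 : (9 * slice_ns) / 10;
	uint64_t dec = affinitized ? slice_ns / 4 : slice_ns / 2;

	if (used_ns >= inc)
		return 1;
	if (used_ns < dec)
		return -1;
	return 0;
}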
} else if (used < dec_threshold) { if (taskc->dsq_index > 0) { taskc->dsq_index -= 1; stat_inc(P2DQ_STAT_DSQ_CHANGE); - trace("%s[%p]: DSQ dec %llu -> %u", - p->comm, p, + trace("%s[%p]: DSQ dec %llu -> %u", p->comm, p, taskc->last_dsq_index, taskc->dsq_index); } else { stat_inc(P2DQ_STAT_DSQ_SAME); @@ -1810,12 +1940,15 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) if (p2dq_config.task_slice) { if (used >= ((7 * last_dsq_slice_ns) / 8)) { - taskc->slice_ns = clamp_slice((5 * taskc->slice_ns) >> 2); + taskc->slice_ns = + clamp_slice((5 * taskc->slice_ns) >> 2); } else if (used < last_dsq_slice_ns / 2) { - taskc->slice_ns = clamp_slice((7 * taskc->slice_ns) >> 3); + taskc->slice_ns = + clamp_slice((7 * taskc->slice_ns) >> 3); } } else { - taskc->slice_ns = task_dsq_slice_ns(p, taskc->dsq_index); + taskc->slice_ns = + task_dsq_slice_ns(p, taskc->dsq_index); } taskc->last_run_started = 0; if (is_interactive(taskc)) @@ -1828,10 +1961,10 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable) static bool consume_llc(struct llc_ctx *llcx) { struct task_struct *p; - task_ctx *taskc; - struct cpu_ctx *cpuc; - s32 cpu; - u64 pid; + task_ctx *taskc; + struct cpu_ctx *cpuc; + s32 cpu; + u64 pid; if (!llcx) return false; @@ -1840,8 +1973,7 @@ static bool consume_llc(struct llc_ctx *llcx) if (!(cpuc = lookup_cpu_ctx(cpu))) return false; - if (p2dq_config.dhq_enabled && - scx_dhq_nr_queued(llcx->mig_dhq) > 0) { + if (p2dq_config.dhq_enabled && scx_dhq_nr_queued(llcx->mig_dhq) > 0) { pid = scx_dhq_pop_strand(llcx->mig_dhq, llcx->dhq_strand); if (!pid) { trace("DHQ pop returned NULL"); @@ -1860,13 +1992,10 @@ static bool consume_llc(struct llc_ctx *llcx) } /* Insert to LLC DSQ and let move_to_local handle affinity atomically */ - trace("DHQ %llu insert %s[%d] to LLC DSQ", - llcx->mig_dhq, p->comm, p->pid); - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, - taskc->slice_ns, - p->scx.dsq_vtime, - taskc->enq_flags); + trace("DHQ %llu insert %s[%d] to LLC DSQ", llcx->mig_dhq, + p->comm, p->pid); + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, + p->scx.dsq_vtime, taskc->enq_flags); bpf_task_release(p); /* Try to dispatch from LLC DSQ (handles affinity check atomically) */ @@ -1875,22 +2004,19 @@ static bool consume_llc(struct llc_ctx *llcx) goto try_dsq; } else if (p2dq_config.atq_enabled && - scx_atq_nr_queued(llcx->mig_atq) > 0) { + scx_atq_nr_queued(llcx->mig_atq) > 0) { taskc = (task_ctx *)scx_atq_pop(llcx->mig_atq); - p = bpf_task_from_pid((s32)taskc->pid); + p = bpf_task_from_pid((s32)taskc->pid); if (!p) { trace("ATQ failed to get pid %llu", taskc->pid); return false; } -/* Insert to LLC DSQ and let move_to_local handle affinity atomically */ - trace("ATQ %llu insert %s[%d] to LLC DSQ", - llcx->mig_atq, p->comm, p->pid); - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, - taskc->slice_ns, - p->scx.dsq_vtime, - taskc->enq_flags); + /* Insert to LLC DSQ and let move_to_local handle affinity atomically */ + trace("ATQ %llu insert %s[%d] to LLC DSQ", llcx->mig_atq, + p->comm, p->pid); + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, + p->scx.dsq_vtime, taskc->enq_flags); bpf_task_release(p); /* Try to dispatch from LLC DSQ (handles affinity check atomically) */ @@ -1905,11 +2031,12 @@ static bool consume_llc(struct llc_ctx *llcx) return false; } -static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, struct cpu_ctx *cpuc) +static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, + struct 
cpu_ctx *cpuc) { struct llc_ctx *first, *second, *left, *right; - int i; - u64 cur_load; + int i; + u64 cur_load; // Single-LLC fast path: skip pick-2 entirely if (unlikely(lb_config.single_llc_mode)) @@ -1920,11 +2047,10 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, // If on a single LLC there isn't anything left to try. if (unlikely(topo_config.nr_llcs == 1 || - lb_config.dispatch_pick2_disable || - topo_config.nr_llcs >= MAX_LLCS)) + lb_config.dispatch_pick2_disable || + topo_config.nr_llcs >= MAX_LLCS)) return -EINVAL; - if (lb_config.min_nr_queued_pick2 > 0) { u64 nr_queued = llc_nr_queued(cur_llcx); if (nr_queued < lb_config.min_nr_queued_pick2) @@ -1944,8 +2070,10 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, * from. This yields better work conservation on machines with a large * number of LLCs. */ - left = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[0]) : rand_llc_ctx(); - right = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[1]) : rand_llc_ctx(); + left = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[0]) : + rand_llc_ctx(); + right = topo_config.nr_llcs == 2 ? lookup_llc_ctx(llc_ids[1]) : + rand_llc_ctx(); if (!left || !right) return -EINVAL; @@ -1959,33 +2087,31 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, return -EINVAL; } - if (llc_get_load(right) > llc_get_load(left)) { - first = right; + first = right; second = left; } else { - first = left; + first = left; second = right; } // Handle the edge case where there are two LLCs and the current has // more load. Since it's already been checked start with the other LLC. if (topo_config.nr_llcs == 2 && first->id == cur_llcx->id) { - first = second; + first = second; second = cur_llcx; } - trace("PICK2 cpu[%d] first[%d] %llu second[%d] %llu", - cpu, first->id, llc_get_load(first), second->id, llc_get_load(second)); + trace("PICK2 cpu[%d] first[%d] %llu second[%d] %llu", cpu, first->id, + llc_get_load(first), second->id, llc_get_load(second)); - cur_load = llc_get_load(cur_llcx) + ((llc_get_load(cur_llcx) * lb_config.slack_factor) / 100); + cur_load = llc_get_load(cur_llcx) + + ((llc_get_load(cur_llcx) * lb_config.slack_factor) / 100); - if (llc_get_load(first) >= cur_load && - consume_llc(first)) + if (llc_get_load(first) >= cur_load && consume_llc(first)) return 0; - if (llc_get_load(second) >= cur_load && - consume_llc(second)) + if (llc_get_load(second) >= cur_load && consume_llc(second)) return 0; if (saturated) { @@ -1996,8 +2122,7 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, return 0; // If the system is saturated then be aggressive in trying to load balance. 
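/*
 * A minimal plain-C sketch (not part of the patch) of the pick-two
 * steal test used above: a candidate LLC is only worth stealing from
 * when its load exceeds the current LLC's load plus a slack margin,
 * i.e. cur_load + (cur_load * slack_factor) / 100. The saturated
 * fallback mentioned by the comment above continues right below.
 * pick2_should_steal() is an illustrative name.
 */
#include <stdint.h>

int pick2_should_steal(uint64_t cur_load, uint64_t cand_load, uint64_t slack_pct)
{
	uint64_t threshold = cur_load + (cur_load * slack_pct) / 100;

	/* Steal only from a clearly busier LLC; ties and small deltas stay put. */
	return cand_load >= threshold;
}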
- if (topo_config.nr_llcs > 2 && - (first = rand_llc_ctx()) && + if (topo_config.nr_llcs > 2 && (first = rand_llc_ctx()) && consume_llc(first)) return 0; } @@ -2005,18 +2130,17 @@ static __always_inline int dispatch_pick_two(s32 cpu, struct llc_ctx *cur_llcx, return 0; } - static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) { struct task_struct *p; - task_ctx *taskc; - struct cpu_ctx *cpuc; - struct llc_ctx *llcx; - u64 pid, dsq_id = 0; - scx_atq_t *min_atq = NULL; - scx_dhq_t *min_dhq = NULL; - - cpuc = lookup_cpu_ctx(cpu); + task_ctx *taskc; + struct cpu_ctx *cpuc; + struct llc_ctx *llcx; + u64 pid, dsq_id = 0; + scx_atq_t *min_atq = NULL; + scx_dhq_t *min_dhq = NULL; + + cpuc = lookup_cpu_ctx(cpu); if (unlikely(!cpuc)) { scx_bpf_error("no valid CPU contexts in dispatch"); return; @@ -2031,38 +2155,52 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (p) { if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { min_vtime = p->scx.dsq_vtime; - dsq_id = cpuc->affn_dsq; + dsq_id = cpuc->affn_dsq; } else { // Task at head of affn_dsq can't run here - move it to correct affn_dsq // This prevents livelock where mismatched tasks block the queue - s32 target_cpu = bpf_cpumask_any_distribute(p->cpus_ptr); + s32 target_cpu = + bpf_cpumask_any_distribute(p->cpus_ptr); if (target_cpu >= 0 && target_cpu < NR_CPUS) { - struct cpu_ctx *target_cpuc = lookup_cpu_ctx(target_cpu); + struct cpu_ctx *target_cpuc = + lookup_cpu_ctx(target_cpu); if (target_cpuc) { - bpf_for_each(scx_dsq, p, cpuc->affn_dsq, 0) { - if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + bpf_for_each(scx_dsq, p, cpuc->affn_dsq, + 0) { + if (bpf_cpumask_test_cpu( + cpu, p->cpus_ptr)) { // Found a task that belongs here, stop cleanup break; } // Move mismatched task to its target CPU's affn_dsq - target_cpu = bpf_cpumask_any_distribute(p->cpus_ptr); - if (target_cpu >= 0 && target_cpu < NR_CPUS) { - target_cpuc = lookup_cpu_ctx(target_cpu); + target_cpu = + bpf_cpumask_any_distribute( + p->cpus_ptr); + if (target_cpu >= 0 && + target_cpu < NR_CPUS) { + target_cpuc = lookup_cpu_ctx( + target_cpu); if (target_cpuc) { - __COMPAT_scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, - p, - target_cpuc->affn_dsq, - 0); + __COMPAT_scx_bpf_dsq_move_vtime( + BPF_FOR_EACH_ITER, + p, + target_cpuc + ->affn_dsq, + 0); trace("DISPATCH cpu[%d] moved affn task %d to cpu[%d] affn_dsq", - cpu, p->pid, target_cpu); + cpu, + p->pid, + target_cpu); } } } // Re-peek after cleanup - p = __COMPAT_scx_bpf_dsq_peek(cpuc->affn_dsq); - if (p && bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { + p = __COMPAT_scx_bpf_dsq_peek( + cpuc->affn_dsq); + if (p && bpf_cpumask_test_cpu( + cpu, p->cpus_ptr)) { min_vtime = p->scx.dsq_vtime; - dsq_id = cpuc->affn_dsq; + dsq_id = cpuc->affn_dsq; } } } @@ -2077,7 +2215,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (llcx && llcx->cpumask) { s32 other_cpu; - bpf_for(other_cpu, 0, topo_config.nr_cpus) { + bpf_for(other_cpu, 0, topo_config.nr_cpus) + { struct bpf_cpumask *llc_cpumask; if (other_cpu == cpu) @@ -2087,7 +2226,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (!llc_cpumask) continue; - if (!bpf_cpumask_test_cpu(other_cpu, cast_mask(llc_cpumask))) + if (!bpf_cpumask_test_cpu(other_cpu, + cast_mask(llc_cpumask))) continue; struct cpu_ctx *other_cpuc = lookup_cpu_ctx(other_cpu); @@ -2099,7 +2239,7 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (p && bpf_cpumask_test_cpu(cpu, p->cpus_ptr) && (p->scx.dsq_vtime < min_vtime || min_vtime == 0)) { 
min_vtime = p->scx.dsq_vtime; - dsq_id = other_cpuc->affn_dsq; + dsq_id = other_cpuc->affn_dsq; } } } @@ -2110,28 +2250,33 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) if (p && (p->scx.dsq_vtime < min_vtime || min_vtime == 0) && bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { min_vtime = p->scx.dsq_vtime; - dsq_id = cpuc->llc_dsq; + dsq_id = cpuc->llc_dsq; } // Migration eligible vtime if (topo_config.nr_llcs > 1) { if (p2dq_config.dhq_enabled) { - pid = scx_dhq_peek_strand(cpuc->mig_dhq, cpuc->dhq_strand); + pid = scx_dhq_peek_strand(cpuc->mig_dhq, + cpuc->dhq_strand); if (pid && (p = bpf_task_from_pid((s32)pid))) { - if (likely(bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) && - (p->scx.dsq_vtime < min_vtime || min_vtime == 0)) { + if (likely(bpf_cpumask_test_cpu(cpu, + p->cpus_ptr)) && + (p->scx.dsq_vtime < min_vtime || + min_vtime == 0)) { min_vtime = p->scx.dsq_vtime; - min_dhq = cpuc->mig_dhq; + min_dhq = cpuc->mig_dhq; } bpf_task_release(p); } } else if (p2dq_config.atq_enabled) { pid = scx_atq_peek(cpuc->mig_atq); if ((p = bpf_task_from_pid((s32)pid))) { - if (likely(bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) && - (p->scx.dsq_vtime < min_vtime || min_vtime == 0)) { + if (likely(bpf_cpumask_test_cpu(cpu, + p->cpus_ptr)) && + (p->scx.dsq_vtime < min_vtime || + min_vtime == 0)) { min_vtime = p->scx.dsq_vtime; - min_atq = cpuc->mig_atq; + min_atq = cpuc->mig_atq; /* * With ATQs we can peek and pop to check that * the popped task is the same as the peeked task. @@ -2145,10 +2290,11 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) } else { // Peek migration DSQ - only consider tasks that can run here p = __COMPAT_scx_bpf_dsq_peek(cpuc->mig_dsq); - if (p && likely(bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) && + if (p && + likely(bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) && (p->scx.dsq_vtime < min_vtime || min_vtime == 0)) { min_vtime = p->scx.dsq_vtime; - dsq_id = cpuc->mig_dsq; + dsq_id = cpuc->mig_dsq; } } } @@ -2161,7 +2307,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) // First try the DHQ/ATQ with the lowest vtime for fairness. 
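/*
 * A minimal plain-C sketch (not part of the patch) of the "lowest
 * vtime wins" scan the dispatch path above performs over the per-CPU
 * affn_dsq, the LLC DSQ and the migration DHQ/ATQ/DSQ. min_vtime == 0
 * doubles as "no candidate yet", matching the
 * (vtime < min_vtime || min_vtime == 0) tests in the hunks above.
 * The names below are illustrative; the DHQ pop path follows.
 */
#include <stdint.h>

struct vtime_pick {
	uint64_t min_vtime;	/* 0 means nothing has been selected yet */
	uint64_t source_id;	/* DSQ (or queue) to dispatch from */
};

void consider_source(struct vtime_pick *pick, uint64_t vtime, uint64_t source_id)
{
	if (vtime < pick->min_vtime || pick->min_vtime == 0) {
		pick->min_vtime = vtime;
		pick->source_id = source_id;
	}
}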
if (unlikely(min_dhq)) { - trace("DHQ dispatching %llu with min vtime %llu", min_dhq, min_vtime); + trace("DHQ dispatching %llu with min vtime %llu", min_dhq, + min_vtime); pid = scx_dhq_pop_strand(min_dhq, cpuc->dhq_strand); if (likely(pid && (p = bpf_task_from_pid((s32)pid)))) { if (unlikely(!(taskc = lookup_task_ctx(p)))) { @@ -2173,8 +2320,7 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) /* Check if task can still run on current CPU */ /* Insert to LLC DSQ for atomic affinity handling */ - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, p->scx.dsq_vtime, taskc->enq_flags); @@ -2185,7 +2331,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) return; } } else if (unlikely(min_atq)) { - trace("ATQ dispatching %llu with min vtime %llu", min_atq, min_vtime); + trace("ATQ dispatching %llu with min vtime %llu", min_atq, + min_vtime); pid = scx_atq_pop(min_atq); if (likely((p = bpf_task_from_pid((s32)pid)))) { /* @@ -2198,10 +2345,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) return; } - /* Insert to LLC DSQ for atomic affinity handling */ - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, p->scx.dsq_vtime, taskc->enq_flags); @@ -2212,7 +2357,8 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) return; } } else { - if (likely(valid_dsq(dsq_id) && scx_bpf_dsq_move_to_local(dsq_id))) + if (likely(valid_dsq(dsq_id) && + scx_bpf_dsq_move_to_local(dsq_id))) return; } @@ -2224,17 +2370,24 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) scx_bpf_dsq_move_to_local(cpuc->llc_dsq)) return; - if ((llcx = lookup_llc_ctx(cpuc->llc_id)) && llcx->nr_shards > 1) { + if ((llcx = lookup_llc_ctx(cpuc->llc_id)) && + llcx->nr_shards > 1) { // Then try other shards in the LLC for work stealing u32 shard_idx; - bpf_for(shard_idx, 0, llcx->nr_shards) { + bpf_for(shard_idx, 0, llcx->nr_shards) + { u32 offset = cpuc->id % llcx->nr_shards; - shard_idx = wrap_index(offset + shard_idx, 0, llcx->nr_shards); + shard_idx = wrap_index(offset + shard_idx, 0, + llcx->nr_shards); // TODO: should probably take min vtime to be fair - if (shard_idx < MAX_LLC_SHARDS && shard_idx < llcx->nr_shards) { - u64 shard_dsq = *MEMBER_VPTR(llcx->shard_dsqs, [shard_idx]); - if (shard_dsq != cpuc->llc_dsq && shard_dsq != dsq_id && - scx_bpf_dsq_move_to_local(shard_dsq)) + if (shard_idx < MAX_LLC_SHARDS && + shard_idx < llcx->nr_shards) { + u64 shard_dsq = *MEMBER_VPTR( + llcx->shard_dsqs, [shard_idx]); + if (shard_dsq != cpuc->llc_dsq && + shard_dsq != dsq_id && + scx_bpf_dsq_move_to_local( + shard_dsq)) return; } } @@ -2257,8 +2410,7 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) /* Check if task can still run on current CPU */ /* Insert to LLC DSQ for atomic affinity handling */ - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, p->scx.dsq_vtime, taskc->enq_flags); @@ -2279,8 +2431,7 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) /* Check if task can still run on current CPU */ /* Insert to LLC DSQ for atomic affinity handling */ - scx_bpf_dsq_insert_vtime(p, - cpuc->llc_dsq, + scx_bpf_dsq_insert_vtime(p, cpuc->llc_dsq, taskc->slice_ns, p->scx.dsq_vtime, taskc->enq_flags); @@ -2292,13 +2443,13 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) } } else { if (likely(cpuc && dsq_id != cpuc->mig_dsq && - 
scx_bpf_dsq_move_to_local(cpuc->mig_dsq))) + scx_bpf_dsq_move_to_local(cpuc->mig_dsq))) return; } // Lookup LLC ctx (should never fail at this point) if (unlikely(p2dq_config.llc_shards <= 1 && - !(llcx = lookup_llc_ctx(cpuc->llc_id)))) { + !(llcx = lookup_llc_ctx(cpuc->llc_id)))) { scx_bpf_error("invalid llc id %u", cpuc->llc_id); return; } @@ -2313,10 +2464,10 @@ static void p2dq_dispatch_impl(s32 cpu, struct task_struct *prev) void BPF_STRUCT_OPS(p2dq_set_cpumask, struct task_struct *p, const struct cpumask *cpumask) { - task_ctx *taskc; + task_ctx *taskc; struct cpu_ctx *cpuc; struct llc_ctx *llcx; - bool was_all_cpus, is_all_cpus; + bool was_all_cpus, is_all_cpus; if (!(taskc = lookup_task_ctx(p))) return; @@ -2334,8 +2485,7 @@ void BPF_STRUCT_OPS(p2dq_set_cpumask, struct task_struct *p, // If affinity narrowed from all CPUs to restricted, and task is in // migration DSQ, move it to LLC DSQ to prevent cross-LLC livelock - if (was_all_cpus && !is_all_cpus && - valid_dsq(taskc->dsq_id) && + if (was_all_cpus && !is_all_cpus && valid_dsq(taskc->dsq_id) && (taskc->dsq_id & P2DQ_MIG_DSQ)) { s32 cpu = scx_bpf_task_cpu(p); if (cpu < 0 || cpu >= topo_config.nr_cpus) @@ -2359,21 +2509,22 @@ void BPF_STRUCT_OPS(p2dq_set_cpumask, struct task_struct *p, void BPF_STRUCT_OPS(p2dq_update_idle, s32 cpu, bool idle) { const struct cpumask *idle_cpumask; - struct llc_ctx *llcx; - u64 idle_score; - int ret, priority; - u32 percent_idle; + struct llc_ctx *llcx; + u64 idle_score; + int ret, priority; + u32 percent_idle; idle_cpumask = scx_bpf_get_idle_cpumask(); percent_idle = idle_cpu_percent(idle_cpumask); - saturated = percent_idle < p2dq_config.saturated_percent; + saturated = percent_idle < p2dq_config.saturated_percent; if (saturated) { min_llc_runs_pick2 = min(2, lb_config.min_llc_runs_pick2); } else { - u32 llc_scaler = log2_u32(topo_config.nr_llcs); - min_llc_runs_pick2 = min(log2_u32(percent_idle) + llc_scaler, lb_config.min_llc_runs_pick2); + u32 llc_scaler = log2_u32(topo_config.nr_llcs); + min_llc_runs_pick2 = min(log2_u32(percent_idle) + llc_scaler, + lb_config.min_llc_runs_pick2); } if (!(llcx = lookup_cpu_llc_ctx(cpu))) { @@ -2386,9 +2537,9 @@ void BPF_STRUCT_OPS(p2dq_update_idle, s32 cpu, bool idle) if (idle) { llc_ctx_clear_flag(llcx, LLC_CTX_F_SATURATED); overloaded = false; - } else if (!idle && llcx->cpumask && idle_cpumask && llcx->tmp_cpumask) { - bpf_cpumask_and(llcx->tmp_cpumask, - cast_mask(llcx->cpumask), + } else if (!idle && llcx->cpumask && idle_cpumask && + llcx->tmp_cpumask) { + bpf_cpumask_and(llcx->tmp_cpumask, cast_mask(llcx->cpumask), idle_cpumask); if (llcx->tmp_cpumask && bpf_cpumask_weight(cast_mask(llcx->tmp_cpumask)) == 0) @@ -2410,7 +2561,7 @@ void BPF_STRUCT_OPS(p2dq_update_idle, s32 cpu, bool idle) priority = 1; // Since we use a minheap convert the highest prio to lowest score. 
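/*
 * A minimal plain-C sketch (not part of the patch) of the min-heap
 * score conversion described by the comment above: higher priority
 * must pop first from a min-heap, so the priority, scaled by 1 << 7
 * as in the line below, is subtracted from the current timestamp.
 * idle_cpu_score() is an illustrative name.
 */
#include <stdint.h>

uint64_t idle_cpu_score(uint64_t now_ns, uint64_t priority)
{
	/* Larger priority => smaller score => popped earlier. */
	return now_ns - ((1 << 7) * priority);
}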
- idle_score = scx_bpf_now() - ((1<<7) * (u64)priority); + idle_score = scx_bpf_now() - ((1 << 7) * (u64)priority); if ((ret = arena_spin_lock((void __arena *)&llcx->idle_lock))) return; @@ -2421,18 +2572,19 @@ void BPF_STRUCT_OPS(p2dq_update_idle, s32 cpu, bool idle) return; } -static s32 p2dq_init_task_impl(struct task_struct *p, struct scx_init_task_args *args) +static s32 p2dq_init_task_impl(struct task_struct *p, + struct scx_init_task_args *args) { struct mask_wrapper *wrapper; - struct bpf_cpumask *cpumask; - task_ctx *taskc; - struct cpu_ctx *cpuc; - struct llc_ctx *llcx; - u64 slice_ns; + struct bpf_cpumask *cpumask; + task_ctx *taskc; + struct cpu_ctx *cpuc; + struct llc_ctx *llcx; + u64 slice_ns; - s32 task_cpu = scx_bpf_task_cpu(p); + s32 task_cpu = scx_bpf_task_cpu(p); - taskc = scx_task_alloc(p); + taskc = scx_task_alloc(p); if (!taskc) { scx_bpf_error("task_ctx allocation failure"); return -ENOMEM; @@ -2461,10 +2613,10 @@ static s32 p2dq_init_task_impl(struct task_struct *p, struct scx_init_task_args return -EINVAL; } - slice_ns = scale_by_task_weight(p, - dsq_time_slice(p2dq_config.init_dsq_index)); + slice_ns = scale_by_task_weight( + p, dsq_time_slice(p2dq_config.init_dsq_index)); - taskc->llc_id = cpuc->llc_id; + taskc->llc_id = cpuc->llc_id; taskc->node_id = cpuc->node_id; // Adjust starting index based on niceness @@ -2476,8 +2628,8 @@ static s32 p2dq_init_task_impl(struct task_struct *p, struct scx_init_task_args taskc->dsq_index = p2dq_config.nr_dsqs_per_llc - 1; } taskc->last_dsq_index = taskc->dsq_index; - taskc->slice_ns = slice_ns; - taskc->enq_flags = 0; + taskc->slice_ns = slice_ns; + taskc->enq_flags = 0; if (p->cpus_ptr == &p->cpus_mask && p->nr_cpus_allowed == topo_config.nr_cpus) @@ -2511,11 +2663,67 @@ void BPF_STRUCT_OPS(p2dq_exit_task, struct task_struct *p, scx_task_free(p); } +static int init_cluster(u32 cluster_index) +{ + struct cluster_ctx *clusterx; + u32 cluster_id = cluster_ids[cluster_index]; + int ret; + + clusterx = bpf_map_lookup_elem(&cluster_ctxs, &cluster_id); + if (!clusterx) { + scx_bpf_error("No cluster %u", cluster_id); + return -ENOENT; + } + + clusterx->id = *MEMBER_VPTR(cluster_ids, [cluster_index]); + clusterx->nr_cpus = 0; + clusterx->vtime = 0; + clusterx->load = 0; + clusterx->affn_load = 0; + clusterx->state_flags = 0; + + // Create cluster-local DSQ + clusterx->dsq = clusterx->id | (MAX_CLUSTERS << 8); + ret = scx_bpf_create_dsq(clusterx->dsq, clusterx->node_id); + if (ret) { + scx_bpf_error("failed to create cluster DSQ %llu", + clusterx->dsq); + return -EINVAL; + } + + // Initialize cluster cpumasks + ret = init_cpumask(&clusterx->cpumask); + if (ret) { + scx_bpf_error("failed to create cluster cpumask"); + return ret; + } + + ret = init_cpumask(&clusterx->tmp_cpumask); + if (ret) { + scx_bpf_error("failed to create cluster tmp_cpumask"); + return ret; + } + + ret = init_cpumask(&clusterx->big_cpumask); + if (ret) { + scx_bpf_error("failed to create cluster big cpumask"); + return ret; + } + + ret = init_cpumask(&clusterx->little_cpumask); + if (ret) { + scx_bpf_error("failed to create cluster little cpumask"); + return ret; + } + + return 0; +} + static int init_llc(u32 llc_index) { struct llc_ctx *llcx; - u32 llc_id = llc_ids[llc_index]; - int i, ret; + u32 llc_id = llc_ids[llc_index]; + int i, ret; llcx = bpf_map_lookup_elem(&llc_ctxs, &llc_id); if (!llcx) { @@ -2523,13 +2731,13 @@ static int init_llc(u32 llc_index) return -ENOENT; } - llcx->vtime = 0; - llcx->id = *MEMBER_VPTR(llc_ids, [llc_index]); - llcx->index = 
llc_index; + llcx->vtime = 0; + llcx->id = *MEMBER_VPTR(llc_ids, [llc_index]); + llcx->index = llc_index; llcx->nr_cpus = 0; - llcx->vtime = 0; + llcx->vtime = 0; - ret = llc_create_atqs(llcx); + ret = llc_create_atqs(llcx); if (ret) { return ret; } @@ -2540,14 +2748,14 @@ static int init_llc(u32 llc_index) } llcx->dsq = llcx->id | MAX_LLCS; - ret = scx_bpf_create_dsq(llcx->dsq, llcx->node_id); + ret = scx_bpf_create_dsq(llcx->dsq, llcx->node_id); if (ret) { scx_bpf_error("failed to create DSQ %llu", llcx->dsq); return -EINVAL; } llcx->mig_dsq = llcx->id | P2DQ_MIG_DSQ; - ret = scx_bpf_create_dsq(llcx->mig_dsq, llcx->node_id); + ret = scx_bpf_create_dsq(llcx->mig_dsq, llcx->node_id); if (ret) { scx_bpf_error("failed to create DSQ %llu", llcx->mig_dsq); return -EINVAL; @@ -2588,17 +2796,21 @@ static int init_llc(u32 llc_index) llcx->nr_shards = p2dq_config.llc_shards; if (p2dq_config.llc_shards > 1) { - llcx->nr_shards = min(min(p2dq_config.llc_shards, llcx->nr_cpus), MAX_LLC_SHARDS); + llcx->nr_shards = + min(min(p2dq_config.llc_shards, llcx->nr_cpus), + MAX_LLC_SHARDS); - bpf_for(i, 0, llcx->nr_shards) { + bpf_for(i, 0, llcx->nr_shards) + { u64 shard_dsq = shard_dsq_id(llc_id, i); if (i < MAX_LLC_SHARDS) // verifier llcx->shard_dsqs[i] = shard_dsq; ret = scx_bpf_create_dsq(shard_dsq, llcx->node_id); if (ret) { - scx_bpf_error("failed to create shard DSQ %llu for LLC %u shard %u", - shard_dsq, llc_id, i); + scx_bpf_error( + "failed to create shard DSQ %llu for LLC %u shard %u", + shard_dsq, llc_id, i); return ret; } } @@ -2610,7 +2822,7 @@ static int init_llc(u32 llc_index) static int init_node(u32 node_id) { struct node_ctx *nodec; - int ret; + int ret; nodec = bpf_map_lookup_elem(&node_ctxs, &node_id); if (!nodec) { @@ -2620,7 +2832,7 @@ static int init_node(u32 node_id) nodec->id = node_id; - ret = init_cpumask(&nodec->cpumask); + ret = init_cpumask(&nodec->cpumask); if (ret) { scx_bpf_error("failed to create node cpumask"); return ret; @@ -2641,16 +2853,19 @@ static int init_node(u32 node_id) // Initializes per CPU data structures. static s32 init_cpu(int cpu) { - struct node_ctx *nodec; - struct llc_ctx *llcx; - struct cpu_ctx *cpuc; + struct node_ctx *nodec; + struct llc_ctx *llcx; + struct cluster_ctx *clusterx = NULL; + struct cpu_ctx *cpuc; if (!(cpuc = lookup_cpu_ctx(cpu))) return -ENOENT; - cpuc->id = cpu; - cpuc->llc_id = cpu_llc_ids[cpu]; - cpuc->node_id = cpu_node_ids[cpu]; + cpuc->id = cpu; + cpuc->llc_id = cpu_llc_ids[cpu]; + cpuc->cluster_id = cpu_cluster_ids[cpu]; + cpuc->node_id = cpu_node_ids[cpu]; + // cluster_id will be populated from Rust userspace via cpu_cluster_ids[] if (big_core_ids[cpu] == 1) cpu_ctx_set_flag(cpuc, CPU_CTX_F_IS_BIG); else @@ -2663,13 +2878,30 @@ static s32 init_cpu(int cpu) return -ENOENT; } + // Lookup cluster context if clusters are enabled + if (topo_config.has_clusters && + cpuc->cluster_id < topo_config.nr_clusters) { + clusterx = lookup_cluster_ctx(cpuc->cluster_id); + if (!clusterx) { + scx_bpf_error( + "failed to get cluster ctx for cpu %u cluster %u", + cpu, cpuc->cluster_id); + return -ENOENT; + } + clusterx->nr_cpus += 1; + // Copy for each CPU in cluster (gets overwritten, doesn't matter) + clusterx->llc_id = cpuc->llc_id; + clusterx->node_id = cpuc->node_id; + cpuc->cluster_dsq = clusterx->dsq; + } + // copy for each cpu, doesn't matter if it gets overwritten. 
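/*
 * A minimal plain-C sketch (not part of the patch) of the cluster
 * wiring that init_cpu() gains above: userspace (the Rust side of
 * this patch) fills cpu_cluster_ids[], and the BPF side only touches
 * a cluster context when clusters are enabled and the reported id is
 * in range. The types and names below are reduced stand-ins.
 */
#include <stdint.h>
#include <stddef.h>

struct cluster_stub {
	uint32_t id;
	uint32_t nr_cpus;
	uint64_t dsq;
};

struct cluster_stub *cluster_for_cpu(struct cluster_stub *table,
				     uint32_t nr_clusters, int has_clusters,
				     uint32_t cluster_id)
{
	/* Mirror the guard above: clusters may be absent, and an out-of-range
	 * id coming from userspace must never be dereferenced. */
	if (!has_clusters || cluster_id >= nr_clusters)
		return NULL;
	return &table[cluster_id];
}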
llcx->nr_cpus += 1; - llcx->id = cpu_llc_ids[cpu]; - llcx->node_id = cpu_node_ids[cpu]; - nodec->id = cpu_node_ids[cpu]; - cpuc->mig_atq = llcx->mig_atq; - cpuc->mig_dhq = llcx->mig_dhq; + llcx->id = cpu_llc_ids[cpu]; + llcx->node_id = cpu_node_ids[cpu]; + nodec->id = cpu_node_ids[cpu]; + cpuc->mig_atq = llcx->mig_atq; + cpuc->mig_dhq = llcx->mig_dhq; cpuc->dhq_strand = llcx->dhq_strand; if (cpu_ctx_test_flag(cpuc, CPU_CTX_F_IS_BIG)) { @@ -2681,11 +2913,15 @@ static s32 init_cpu(int cpu) bpf_cpumask_set_cpu(cpu, nodec->big_cpumask); if (llcx->big_cpumask) bpf_cpumask_set_cpu(cpu, llcx->big_cpumask); + if (clusterx && clusterx->big_cpumask) + bpf_cpumask_set_cpu(cpu, clusterx->big_cpumask); bpf_rcu_read_unlock(); } else { bpf_rcu_read_lock(); if (llcx->little_cpumask) bpf_cpumask_set_cpu(cpu, llcx->little_cpumask); + if (clusterx && clusterx->little_cpumask) + bpf_cpumask_set_cpu(cpu, clusterx->little_cpumask); bpf_rcu_read_unlock(); } @@ -2696,10 +2932,16 @@ static s32 init_cpu(int cpu) bpf_cpumask_set_cpu(cpu, nodec->cpumask); if (llcx->cpumask) bpf_cpumask_set_cpu(cpu, llcx->cpumask); + if (clusterx && clusterx->cpumask) + bpf_cpumask_set_cpu(cpu, clusterx->cpumask); bpf_rcu_read_unlock(); - trace("CFG CPU[%d]NODE[%d]LLC[%d] initialized", - cpu, cpuc->node_id, cpuc->llc_id); + if (topo_config.has_clusters) + trace("CFG CPU[%d]NODE[%d]LLC[%d]CLUSTER[%d] initialized", cpu, + cpuc->node_id, cpuc->llc_id, cpuc->cluster_id); + else + trace("CFG CPU[%d]NODE[%d]LLC[%d] initialized", cpu, + cpuc->node_id, cpuc->llc_id); return 0; } @@ -2707,11 +2949,12 @@ static s32 init_cpu(int cpu) static bool load_balance_timer(void) { struct llc_ctx *llcx, *lb_llcx; - int j; - u64 ideal_sum, load_sum = 0, interactive_sum = 0; - u32 llc_id, llc_index, lb_llc_index, lb_llc_id; + int j; + u64 ideal_sum, load_sum = 0, interactive_sum = 0; + u32 llc_id, llc_index, lb_llc_index, lb_llc_id; - bpf_for(llc_index, 0, topo_config.nr_llcs) { + bpf_for(llc_index, 0, topo_config.nr_llcs) + { // verifier if (llc_index >= MAX_LLCS) break; @@ -2722,7 +2965,8 @@ static bool load_balance_timer(void) return false; } - lb_llc_index = (llc_index + llc_lb_offset) % topo_config.nr_llcs; + lb_llc_index = + (llc_index + llc_lb_offset) % topo_config.nr_llcs; if (lb_llc_index < 0 || lb_llc_index >= MAX_LLCS) { scx_bpf_error("failed to lookup lb_llc"); return false; @@ -2735,63 +2979,84 @@ static bool load_balance_timer(void) } /* Use PELT metrics if enabled, otherwise use simple counters */ - u64 llc_load = p2dq_config.pelt_enabled ? llcx->util_avg : llcx->load; - u64 lb_llc_load = p2dq_config.pelt_enabled ? lb_llcx->util_avg : lb_llcx->load; - u64 llc_intr_load = p2dq_config.pelt_enabled ? llcx->intr_util_avg : llcx->intr_load; + u64 llc_load = p2dq_config.pelt_enabled ? llcx->util_avg : + llcx->load; + u64 lb_llc_load = p2dq_config.pelt_enabled ? lb_llcx->util_avg : + lb_llcx->load; + u64 llc_intr_load = p2dq_config.pelt_enabled ? + llcx->intr_util_avg : + llcx->intr_load; load_sum += llc_load; interactive_sum += llc_intr_load; s64 load_imbalance = 0; - if(llc_load > lb_llc_load) - load_imbalance = (100 * (llc_load - lb_llc_load)) / llc_load; + if (llc_load > lb_llc_load) + load_imbalance = + (100 * (llc_load - lb_llc_load)) / llc_load; u32 lb_slack = (lb_config.slack_factor > 0 ? 
- lb_config.slack_factor : LOAD_BALANCE_SLACK); + lb_config.slack_factor : + LOAD_BALANCE_SLACK); if (load_imbalance > lb_slack) llcx->lb_llc_id = lb_llc_id; else llcx->lb_llc_id = MAX_LLCS; - dbg("LB llcx[%u] %llu lb_llcx[%u] %llu imbalance %lli", - llc_id, llc_load, lb_llc_id, lb_llc_load, load_imbalance); + dbg("LB llcx[%u] %llu lb_llcx[%u] %llu imbalance %lli", llc_id, + llc_load, lb_llc_id, lb_llc_load, load_imbalance); } - dbg("LB Total load %llu, Total interactive %llu", - load_sum, interactive_sum); + dbg("LB Total load %llu, Total interactive %llu", load_sum, + interactive_sum); - llc_lb_offset = (llc_lb_offset % (topo_config.nr_llcs - 1)) + 1; + // Only rotate offset if we have more than 2 LLCs + // For 2 LLCs, offset 1 is the only valid value and doesn't need to change + // For 1 LLC, no load balancing between LLCs is needed + if (topo_config.nr_llcs > 2) + llc_lb_offset = (llc_lb_offset % (topo_config.nr_llcs - 1)) + 1; - if (!timeline_config.autoslice || load_sum == 0 || load_sum < interactive_sum) + if (!timeline_config.autoslice || load_sum == 0 || + load_sum < interactive_sum) goto reset_load; if (interactive_sum == 0) { dsq_time_slices[0] = (11 * dsq_time_slices[0]) / 10; - bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) { - dsq_time_slices[j] = dsq_time_slices[0] << j << p2dq_config.dsq_shift; + bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) + { + dsq_time_slices[j] = dsq_time_slices[0] + << j << p2dq_config.dsq_shift; } } else { ideal_sum = (load_sum * p2dq_config.interactive_ratio) / 100; - dbg("LB autoslice ideal/sum %llu/%llu", ideal_sum, interactive_sum); + dbg("LB autoslice ideal/sum %llu/%llu", ideal_sum, + interactive_sum); if (interactive_sum < ideal_sum) { dsq_time_slices[0] = (11 * dsq_time_slices[0]) / 10; - bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) { - dsq_time_slices[j] = dsq_time_slices[0] << j << p2dq_config.dsq_shift; + bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) + { + dsq_time_slices[j] = dsq_time_slices[0] + << j + << p2dq_config.dsq_shift; } } else { - dsq_time_slices[0] = max((10 * dsq_time_slices[0]) / 11, min_slice_ns); - bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) { - dsq_time_slices[j] = dsq_time_slices[0] << j << p2dq_config.dsq_shift; + dsq_time_slices[0] = max((10 * dsq_time_slices[0]) / 11, + min_slice_ns); + bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) + { + dsq_time_slices[j] = dsq_time_slices[0] + << j + << p2dq_config.dsq_shift; } } } - reset_load: - bpf_for(llc_index, 0, topo_config.nr_llcs) { + bpf_for(llc_index, 0, topo_config.nr_llcs) + { llc_id = *MEMBER_VPTR(llc_ids, [llc_index]); if (!(llcx = lookup_llc_ctx(llc_id))) return false; @@ -2801,7 +3066,7 @@ static bool load_balance_timer(void) * weighting. We only reset simple counters for legacy mode. 
*/ if (!p2dq_config.pelt_enabled) { - llcx->load = 0; + llcx->load = 0; llcx->intr_load = 0; llcx->affn_load = 0; } @@ -2809,23 +3074,34 @@ static bool load_balance_timer(void) llcx->last_period_ns = scx_bpf_now(); if (!p2dq_config.pelt_enabled) { - bpf_for(j, 0, p2dq_config.nr_dsqs_per_llc) { + bpf_for(j, 0, p2dq_config.nr_dsqs_per_llc) + { llcx->dsq_load[j] = 0; if (llc_id == 0 && timeline_config.autoslice) { - if (j > 0 && dsq_time_slices[j] < dsq_time_slices[j-1]) { - dsq_time_slices[j] = dsq_time_slices[j-1] << p2dq_config.dsq_shift; + if (j > 0 && + dsq_time_slices[j] < + dsq_time_slices[j - 1]) { + dsq_time_slices[j] = + dsq_time_slices[j - 1] + << p2dq_config.dsq_shift; } - dbg("LB autoslice interactive slice %llu", dsq_time_slices[j]); + dbg("LB autoslice interactive slice %llu", + dsq_time_slices[j]); } } } else { /* Even with PELT, still validate autoslice timings */ if (llc_id == 0 && timeline_config.autoslice) { - bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) { - if (dsq_time_slices[j] < dsq_time_slices[j-1]) { - dsq_time_slices[j] = dsq_time_slices[j-1] << p2dq_config.dsq_shift; + bpf_for(j, 1, p2dq_config.nr_dsqs_per_llc) + { + if (dsq_time_slices[j] < + dsq_time_slices[j - 1]) { + dsq_time_slices[j] = + dsq_time_slices[j - 1] + << p2dq_config.dsq_shift; } - dbg("LB autoslice interactive slice %llu", dsq_time_slices[j]); + dbg("LB autoslice interactive slice %llu", + dsq_time_slices[j]); } } } @@ -2844,7 +3120,6 @@ static bool run_timer_cb(int key) } } - static int timer_cb(void *map, int key, struct timer_wrapper *timerw) { if (timerw->key < 0 || timerw->key > MAX_TIMERS) { @@ -2852,27 +3127,26 @@ static int timer_cb(void *map, int key, struct timer_wrapper *timerw) } struct p2dq_timer *cb_timer = &p2dq_timers[timerw->key]; - bool resched = run_timer_cb(timerw->key); + bool resched = run_timer_cb(timerw->key); if (!resched || !cb_timer || cb_timer->interval_ns == 0) { trace("TIMER timer %d stopped", timerw->key); return 0; } - bpf_timer_start(&timerw->timer, - cb_timer->interval_ns, + bpf_timer_start(&timerw->timer, cb_timer->interval_ns, cb_timer->start_flags); return 0; } - s32 static start_timers(void) { struct timer_wrapper *timerw; - int timer_id, err; + int timer_id, err; - bpf_for(timer_id, 0, MAX_TIMERS) { + bpf_for(timer_id, 0, MAX_TIMERS) + { timerw = bpf_map_lookup_elem(&timer_data, &timer_id); if (!timerw || timer_id < 0 || timer_id > MAX_TIMERS) { scx_bpf_error("Failed to lookup timer"); @@ -2886,7 +3160,8 @@ s32 static start_timers(void) } timerw->key = timer_id; - err = bpf_timer_init(&timerw->timer, &timer_data, new_timer->init_flags); + err = bpf_timer_init(&timerw->timer, &timer_data, + new_timer->init_flags); if (err < 0) { scx_bpf_error("can't happen"); return -ENOENT; @@ -2898,8 +3173,7 @@ s32 static start_timers(void) return -ENOENT; } - err = bpf_timer_start(&timerw->timer, - new_timer->interval_ns, + err = bpf_timer_start(&timerw->timer, new_timer->interval_ns, new_timer->start_flags); if (err < 0) { scx_bpf_error("can't happen"); @@ -2914,8 +3188,8 @@ static s32 p2dq_init_impl() { struct llc_ctx *llcx; struct cpu_ctx *cpuc; - int i, ret; - u64 dsq_id; + int i, ret; + u64 dsq_id; ret = init_cpumask(&all_cpumask); if (ret) { @@ -2934,36 +3208,50 @@ static s32 p2dq_init_impl() } // First we initialize LLCs because DSQs are created at the LLC level. 
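/*
 * A minimal plain-C sketch (not part of the patch) of the bottom-up
 * initialization order that p2dq_init_impl() follows below: LLCs first
 * (DSQs are created at the LLC level), then clusters when the topology
 * reports any, then nodes, then CPUs, which attach to all of the
 * parent contexts. Note, as read from the hunks above, that
 * init_cluster() creates the cluster DSQ before init_cpu() copies a
 * node id into the cluster context, so the DSQ's node argument is
 * whatever the map element held at that point. All names below are
 * illustrative placeholders.
 */
#include <stdint.h>

typedef int (*init_fn)(uint32_t idx);

static int init_level(init_fn fn, uint32_t nr)
{
	for (uint32_t i = 0; i < nr; i++) {
		int ret = fn(i);
		if (ret)
			return ret;
	}
	return 0;
}

int topology_init(init_fn llc, init_fn cluster, init_fn node, init_fn cpu,
		  uint32_t nr_llcs, uint32_t nr_clusters, uint32_t nr_nodes,
		  uint32_t nr_cpus, int has_clusters)
{
	int ret;

	if ((ret = init_level(llc, nr_llcs)))
		return ret;
	if (has_clusters && (ret = init_level(cluster, nr_clusters)))
		return ret;
	if ((ret = init_level(node, nr_nodes)))
		return ret;
	return init_level(cpu, nr_cpus);
}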
- bpf_for(i, 0, topo_config.nr_llcs) { + bpf_for(i, 0, topo_config.nr_llcs) + { ret = init_llc(i); if (ret) return ret; } - bpf_for(i, 0, topo_config.nr_nodes) { + // Initialize clusters if cluster awareness is enabled + if (topo_config.has_clusters) { + bpf_for(i, 0, topo_config.nr_clusters) + { + ret = init_cluster(i); + if (ret) + return ret; + } + } + + bpf_for(i, 0, topo_config.nr_nodes) + { ret = init_node(i); if (ret) return ret; } - bpf_for(i, 0, topo_config.nr_cpus) { + bpf_for(i, 0, topo_config.nr_cpus) + { ret = init_cpu(i); if (ret) return ret; } // Create DSQs for the LLCs - bpf_for(i, 0, topo_config.nr_cpus) { + bpf_for(i, 0, topo_config.nr_cpus) + { if (!(cpuc = lookup_cpu_ctx(i)) || !(llcx = lookup_llc_ctx(cpuc->llc_id))) return -EINVAL; - if (cpuc && - llcx->node_cpumask && + if (cpuc && llcx->node_cpumask && llcx->node_id == cpuc->node_id) { bpf_rcu_read_lock(); if (llcx->node_cpumask) - bpf_cpumask_set_cpu(cpuc->id, llcx->node_cpumask); + bpf_cpumask_set_cpu(cpuc->id, + llcx->node_cpumask); bpf_rcu_read_unlock(); } @@ -2973,10 +3261,10 @@ static s32 p2dq_init_impl() if (p2dq_config.llc_shards > 1 && llcx->nr_shards > 1) { int shard_id = cpuc->core_id % llcx->nr_shards; - if (shard_id >= 0 && - shard_id < MAX_LLC_SHARDS && + if (shard_id >= 0 && shard_id < MAX_LLC_SHARDS && shard_id < llcx->nr_shards) - cpuc->llc_dsq = *MEMBER_VPTR(llcx->shard_dsqs, [shard_id]); + cpuc->llc_dsq = *MEMBER_VPTR( + llcx->shard_dsqs, [shard_id]); } dsq_id = cpu_dsq_id(i); @@ -2987,11 +3275,12 @@ static s32 p2dq_init_impl() return ret; } cpuc->affn_dsq = dsq_id; - cpuc->mig_dsq = llcx->mig_dsq; + cpuc->mig_dsq = llcx->mig_dsq; } if (p2dq_config.cpu_priority) { - bpf_for(i, 0, topo_config.nr_llcs) { + bpf_for(i, 0, topo_config.nr_llcs) + { if (!(llcx = lookup_llc_ctx(i))) return -EINVAL; llcx->idle_cpu_heap = scx_minheap_alloc(llcx->nr_cpus); @@ -3022,17 +3311,19 @@ void BPF_STRUCT_OPS(p2dq_running, struct task_struct *p) p2dq_running_impl(p); } -void BPF_STRUCT_OPS(p2dq_enqueue, struct task_struct *p __arg_trusted, u64 enq_flags) +void BPF_STRUCT_OPS(p2dq_enqueue, struct task_struct *p __arg_trusted, + u64 enq_flags) { struct enqueue_promise pro; async_p2dq_enqueue(&pro, p, enq_flags); complete_p2dq_enqueue(&pro, p); } -void BPF_STRUCT_OPS(p2dq_dequeue, struct task_struct *p __arg_trusted, u64 deq_flags) +void BPF_STRUCT_OPS(p2dq_dequeue, struct task_struct *p __arg_trusted, + u64 deq_flags) { task_ctx *taskc = lookup_task_ctx(p); - int ret; + int ret; ret = scx_atq_cancel(&taskc->common); if (ret) @@ -3046,7 +3337,8 @@ void BPF_STRUCT_OPS(p2dq_dispatch, s32 cpu, struct task_struct *prev) return p2dq_dispatch_impl(cpu, prev); } -s32 BPF_STRUCT_OPS(p2dq_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) +s32 BPF_STRUCT_OPS(p2dq_select_cpu, struct task_struct *p, s32 prev_cpu, + u64 wake_flags) { return p2dq_select_cpu_impl(p, prev_cpu, wake_flags); } @@ -3057,19 +3349,14 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(p2dq_init_task, struct task_struct *p, return p2dq_init_task_impl(p, args); } -SCX_OPS_DEFINE(p2dq, - .select_cpu = (void *)p2dq_select_cpu, - .enqueue = (void *)p2dq_enqueue, - .dequeue = (void *)p2dq_dequeue, - .dispatch = (void *)p2dq_dispatch, - .running = (void *)p2dq_running, - .stopping = (void *)p2dq_stopping, - .set_cpumask = (void *)p2dq_set_cpumask, - .update_idle = (void *)p2dq_update_idle, - .init_task = (void *)p2dq_init_task, - .exit_task = (void *)p2dq_exit_task, - .init = (void *)p2dq_init, - .exit = (void *)p2dq_exit, - .timeout_ms = 25000, - .name = "p2dq"); 
+SCX_OPS_DEFINE(p2dq, .select_cpu = (void *)p2dq_select_cpu, + .enqueue = (void *)p2dq_enqueue, .dequeue = (void *)p2dq_dequeue, + .dispatch = (void *)p2dq_dispatch, + .running = (void *)p2dq_running, + .stopping = (void *)p2dq_stopping, + .set_cpumask = (void *)p2dq_set_cpumask, + .update_idle = (void *)p2dq_update_idle, + .init_task = (void *)p2dq_init_task, + .exit_task = (void *)p2dq_exit_task, .init = (void *)p2dq_init, + .exit = (void *)p2dq_exit, .timeout_ms = 25000, .name = "p2dq"); #endif diff --git a/scheds/rust/scx_p2dq/src/bpf/types.h b/scheds/rust/scx_p2dq/src/bpf/types.h index 6ccc1ea7d3..6368971349 100644 --- a/scheds/rust/scx_p2dq/src/bpf/types.h +++ b/scheds/rust/scx_p2dq/src/bpf/types.h @@ -29,156 +29,213 @@ struct p2dq_timer { }; /* cpu_ctx flag bits */ -#define CPU_CTX_F_INTERACTIVE (1 << 0) -#define CPU_CTX_F_IS_BIG (1 << 1) -#define CPU_CTX_F_NICE_TASK (1 << 2) -#define CPU_CTX_F_CLEAN_AFFN_DSQ (1 << 3) +#define CPU_CTX_F_INTERACTIVE (1 << 0) +#define CPU_CTX_F_IS_BIG (1 << 1) +#define CPU_CTX_F_NICE_TASK (1 << 2) +#define CPU_CTX_F_CLEAN_AFFN_DSQ (1 << 3) /* Helper macros for cpu_ctx flags */ -#define cpu_ctx_set_flag(cpuc, flag) ((cpuc)->flags |= (flag)) -#define cpu_ctx_clear_flag(cpuc, flag) ((cpuc)->flags &= ~(flag)) -#define cpu_ctx_test_flag(cpuc, flag) ((cpuc)->flags & (flag)) +#define cpu_ctx_set_flag(cpuc, flag) ((cpuc)->flags |= (flag)) +#define cpu_ctx_clear_flag(cpuc, flag) ((cpuc)->flags &= ~(flag)) +#define cpu_ctx_test_flag(cpuc, flag) ((cpuc)->flags & (flag)) struct cpu_ctx { - int id; - u32 llc_id; - u64 affn_dsq; - u64 slice_ns; - u32 core_id; - u32 dsq_index; - u32 perf; - u32 flags; /* Bitmask for interactive, is_big, nice_task */ - u64 ran_for; - u32 node_id; - u64 mig_dsq; - u64 llc_dsq; - u64 max_load_dsq; - - scx_atq_t *mig_atq; - scx_dhq_t *mig_dhq; - u64 dhq_strand; /* Which DHQ strand (A or B) for this CPU's LLC */ + int id; + u32 llc_id; + u32 cluster_id; + u64 affn_dsq; + u64 cluster_dsq; + u64 slice_ns; + u32 core_id; + u32 dsq_index; + u32 perf; + u32 flags; /* Bitmask for interactive, is_big, nice_task */ + u64 ran_for; + u32 node_id; + u64 mig_dsq; + u64 llc_dsq; + u64 max_load_dsq; + + scx_atq_t *mig_atq; + scx_dhq_t *mig_dhq; + u64 dhq_strand; /* Which DHQ strand (A or B) for this CPU's LLC */ +}; + +/* cluster_ctx state flag bits */ +#define CLUSTER_CTX_F_SATURATED (1 << 0) + +/* Helper macros for cluster_ctx state flags */ +#define cluster_ctx_set_flag(clusterx, flag) ((clusterx)->state_flags |= (flag)) +#define cluster_ctx_clear_flag(clusterx, flag) \ + ((clusterx)->state_flags &= ~(flag)) +#define cluster_ctx_test_flag(clusterx, flag) ((clusterx)->state_flags & (flag)) + +struct cluster_ctx { + /* Read-mostly fields - grouped together */ + u32 id; + u32 kernel_id; + u32 llc_id; + u32 node_id; + u32 nr_cpus; + u64 dsq; + u64 last_period_ns; + + /* + * Hot atomic field #1: vtime - frequently updated + * Padded to separate cache line from read-mostly fields above + */ + char __pad1[CACHE_LINE_SIZE]; + u64 vtime; + + /* + * Hot atomic fields #2: load counters - frequently updated + * Keep these together on same cache line since they're updated atomically together + */ + char __pad2[CACHE_LINE_SIZE - sizeof(u64)]; + u64 load; + u64 affn_load; + u32 state_flags; /* Bitmask for saturated and other state */ + + /* + * Hot atomic field #3: idle lock - frequently contended in idle CPU selection + * Separate cache line from load counters above + */ + char __pad3[CACHE_LINE_SIZE - 2 * sizeof(u64) - sizeof(u32)]; + arena_spinlock_t 
idle_lock; + + /* + * Read-mostly pointers - grouped together + * Accessed during CPU selection but not updated frequently + */ + char __pad4[CACHE_LINE_SIZE - sizeof(arena_spinlock_t)]; + struct bpf_cpumask __kptr *cpumask; + struct bpf_cpumask __kptr *big_cpumask; + struct bpf_cpumask __kptr *little_cpumask; + struct bpf_cpumask __kptr *tmp_cpumask; + + scx_minheap_t *idle_cpu_heap; }; /* llc_ctx state flag bits */ -#define LLC_CTX_F_SATURATED (1 << 0) +#define LLC_CTX_F_SATURATED (1 << 0) /* Helper macros for llc_ctx state flags */ -#define llc_ctx_set_flag(llcx, flag) ((llcx)->state_flags |= (flag)) -#define llc_ctx_clear_flag(llcx, flag) ((llcx)->state_flags &= ~(flag)) -#define llc_ctx_test_flag(llcx, flag) ((llcx)->state_flags & (flag)) +#define llc_ctx_set_flag(llcx, flag) ((llcx)->state_flags |= (flag)) +#define llc_ctx_clear_flag(llcx, flag) ((llcx)->state_flags &= ~(flag)) +#define llc_ctx_test_flag(llcx, flag) ((llcx)->state_flags & (flag)) struct llc_ctx { /* Read-mostly fields - grouped together */ - u32 id; - u32 nr_cpus; - u32 node_id; - u32 lb_llc_id; - u32 index; - u64 dsq; - u64 mig_dsq; - u64 last_period_ns; - u64 dsq_load[MAX_DSQS_PER_LLC]; + u32 id; + u32 nr_cpus; + u32 node_id; + u32 lb_llc_id; + u32 index; + u64 dsq; + u64 mig_dsq; + u64 last_period_ns; + u64 dsq_load[MAX_DSQS_PER_LLC]; /* CPU sharding related fields */ - u32 nr_shards; - u64 shard_dsqs[MAX_LLC_SHARDS]; + u32 nr_shards; + u64 shard_dsqs[MAX_LLC_SHARDS]; /* * Hot atomic field #1: vtime - frequently updated in p2dq_stopping() * Padded to separate cache line from read-mostly fields above */ - char __pad1[CACHE_LINE_SIZE]; - u64 vtime; + char __pad1[CACHE_LINE_SIZE]; + u64 vtime; /* * Hot atomic fields #2: load counters - frequently updated in p2dq_stopping() * Keep these together on same cache line since they're updated atomically together * Pad to separate from vtime above */ - char __pad2[CACHE_LINE_SIZE - sizeof(u64)]; - u64 load; - u64 affn_load; - u64 intr_load; - u32 state_flags; /* Bitmask for saturated and other state */ + char __pad2[CACHE_LINE_SIZE - sizeof(u64)]; + u64 load; + u64 affn_load; + u64 intr_load; + u32 state_flags; /* Bitmask for saturated and other state */ /* PELT (Per-Entity Load Tracking) aggregate fields */ - u64 util_avg; /* Aggregate utilization average */ - u64 load_avg; /* Aggregate load average */ - u64 intr_util_avg; /* Interactive task utilization average */ - u64 affn_util_avg; /* Affinitized task utilization average */ + u64 util_avg; /* Aggregate utilization average */ + u64 load_avg; /* Aggregate load average */ + u64 intr_util_avg; /* Interactive task utilization average */ + u64 affn_util_avg; /* Affinitized task utilization average */ /* * Hot atomic field #3: idle lock - frequently contended in idle CPU selection * Separate cache line from load counters above */ - char __pad3[CACHE_LINE_SIZE - 7*sizeof(u64) - sizeof(u32)]; - arena_spinlock_t idle_lock; + char __pad3[CACHE_LINE_SIZE - 7 * sizeof(u64) - sizeof(u32)]; + arena_spinlock_t idle_lock; /* * Read-mostly pointers - grouped together * Accessed during CPU selection but not updated frequently */ - char __pad4[CACHE_LINE_SIZE - sizeof(arena_spinlock_t)]; - struct bpf_cpumask __kptr *cpumask; - struct bpf_cpumask __kptr *big_cpumask; - struct bpf_cpumask __kptr *little_cpumask; - struct bpf_cpumask __kptr *node_cpumask; - struct bpf_cpumask __kptr *tmp_cpumask; - - scx_atq_t *mig_atq; - scx_dhq_t *mig_dhq; - u64 dhq_strand; /* Which DHQ strand (A or B) for this LLC */ - scx_minheap_t *idle_cpu_heap; + char 
__pad4[CACHE_LINE_SIZE - sizeof(arena_spinlock_t)]; + struct bpf_cpumask __kptr *cpumask; + struct bpf_cpumask __kptr *big_cpumask; + struct bpf_cpumask __kptr *little_cpumask; + struct bpf_cpumask __kptr *node_cpumask; + struct bpf_cpumask __kptr *tmp_cpumask; + + scx_atq_t *mig_atq; + scx_dhq_t *mig_dhq; + u64 dhq_strand; /* Which DHQ strand (A or B) for this LLC */ + scx_minheap_t *idle_cpu_heap; }; struct node_ctx { - u32 id; - struct bpf_cpumask __kptr *cpumask; - struct bpf_cpumask __kptr *big_cpumask; + u32 id; + struct bpf_cpumask __kptr *cpumask; + struct bpf_cpumask __kptr *big_cpumask; }; /* task_ctx flag bits */ -#define TASK_CTX_F_INTERACTIVE (1 << 0) -#define TASK_CTX_F_WAS_NICE (1 << 1) -#define TASK_CTX_F_IS_KWORKER (1 << 2) -#define TASK_CTX_F_ALL_CPUS (1 << 3) +#define TASK_CTX_F_INTERACTIVE (1 << 0) +#define TASK_CTX_F_WAS_NICE (1 << 1) +#define TASK_CTX_F_IS_KWORKER (1 << 2) +#define TASK_CTX_F_ALL_CPUS (1 << 3) /* Helper macros for task_ctx flags */ -#define task_ctx_set_flag(taskc, flag) ((taskc)->flags |= (flag)) -#define task_ctx_clear_flag(taskc, flag) ((taskc)->flags &= ~(flag)) -#define task_ctx_test_flag(taskc, flag) ((taskc)->flags & (flag)) +#define task_ctx_set_flag(taskc, flag) ((taskc)->flags |= (flag)) +#define task_ctx_clear_flag(taskc, flag) ((taskc)->flags &= ~(flag)) +#define task_ctx_test_flag(taskc, flag) ((taskc)->flags & (flag)) struct task_p2dq { /* * Do NOT change the position of common. It should be at the beginning * of the task_ctx. */ - struct scx_task_common common; - s32 pid; + struct scx_task_common common; + s32 pid; /* * PELT (Per-Entity Load Tracking) fields. * Placed early in the structure (low offset) to help BPF verifier * track arena pointer through complex control flow. */ - u64 pelt_last_update_time; - u32 util_sum; - u32 util_avg; - u32 period_contrib; - - u64 dsq_id; - u64 slice_ns; - int dsq_index; - u32 llc_id; - u32 node_id; - u64 used; - u64 last_dsq_id; - u64 last_run_started; - u64 last_run_at; - u64 llc_runs; /* how many runs on the current LLC */ - u64 enq_flags; - int last_dsq_index; - u32 flags; /* Bitmask for interactive, was_nice, is_kworker, all_cpus */ + u64 pelt_last_update_time; + u32 util_sum; + u32 util_avg; + u32 period_contrib; + + u64 dsq_id; + u64 slice_ns; + int dsq_index; + u32 llc_id; + u32 node_id; + u64 used; + u64 last_dsq_id; + u64 last_run_started; + u64 last_run_at; + u64 llc_runs; /* how many runs on the current LLC */ + u64 enq_flags; + int last_dsq_index; + u32 flags; /* Bitmask for interactive, was_nice, is_kworker, all_cpus */ }; typedef struct task_p2dq __arena task_ctx; @@ -194,52 +251,52 @@ enum enqueue_promise_kind { }; struct enqueue_promise_vtime { - u64 dsq_id; - u64 enq_flags; - u64 slice_ns; - u64 vtime; + u64 dsq_id; + u64 enq_flags; + u64 slice_ns; + u64 vtime; - scx_atq_t *atq; + scx_atq_t *atq; }; struct enqueue_promise_fifo { - u64 dsq_id; - u64 enq_flags; - u64 slice_ns; + u64 dsq_id; + u64 enq_flags; + u64 slice_ns; - scx_atq_t *atq; + scx_atq_t *atq; }; struct enqueue_promise_dhq { - u64 dsq_id; - u64 enq_flags; - u64 slice_ns; - u64 vtime; - u64 strand; + u64 dsq_id; + u64 enq_flags; + u64 slice_ns; + u64 vtime; + u64 strand; - scx_dhq_t *dhq; + scx_dhq_t *dhq; }; /* enqueue_promise flag bits */ -#define ENQUEUE_PROMISE_F_KICK_IDLE (1 << 0) -#define ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE (1 << 1) +#define ENQUEUE_PROMISE_F_KICK_IDLE (1 << 0) +#define ENQUEUE_PROMISE_F_HAS_CLEARED_IDLE (1 << 1) /* Helper macros for enqueue_promise flags */ -#define enqueue_promise_set_flag(pro, 
flag) ((pro)->flags |= (flag)) -#define enqueue_promise_clear_flag(pro, flag) ((pro)->flags &= ~(flag)) -#define enqueue_promise_test_flag(pro, flag) ((pro)->flags & (flag)) +#define enqueue_promise_set_flag(pro, flag) ((pro)->flags |= (flag)) +#define enqueue_promise_clear_flag(pro, flag) ((pro)->flags &= ~(flag)) +#define enqueue_promise_test_flag(pro, flag) ((pro)->flags & (flag)) // This struct is zeroed at the beginning of `async_p2dq_enqueue` and only // relevant fields are set, so assume 0 as default when adding fields. struct enqueue_promise { - enum enqueue_promise_kind kind; + enum enqueue_promise_kind kind; - s32 cpu; - u32 flags; /* Bitmask for kick_idle, has_cleared_idle */ + s32 cpu; + u32 flags; /* Bitmask for kick_idle, has_cleared_idle */ union { - struct enqueue_promise_vtime vtime; - struct enqueue_promise_fifo fifo; - struct enqueue_promise_dhq dhq; + struct enqueue_promise_vtime vtime; + struct enqueue_promise_fifo fifo; + struct enqueue_promise_dhq dhq; }; }; diff --git a/scheds/rust/scx_p2dq/src/lib.rs b/scheds/rust/scx_p2dq/src/lib.rs index d03bb56e93..bc25dd7698 100644 --- a/scheds/rust/scx_p2dq/src/lib.rs +++ b/scheds/rust/scx_p2dq/src/lib.rs @@ -388,9 +388,11 @@ macro_rules! init_open_skel { let rodata = skel.maps.rodata_data.as_mut().unwrap(); rodata.topo_config.nr_cpus = *$crate::NR_CPU_IDS as u32; rodata.topo_config.nr_llcs = $topo.all_llcs.clone().keys().len() as u32; + rodata.topo_config.nr_clusters = $topo.all_clusters.clone().keys().len() as u32; rodata.topo_config.nr_nodes = $topo.nodes.clone().keys().len() as u32; rodata.topo_config.smt_enabled = MaybeUninit::new($topo.smt_enabled); rodata.topo_config.has_little_cores = MaybeUninit::new($topo.has_little_cores()); + rodata.topo_config.has_clusters = MaybeUninit::new(!$topo.all_clusters.is_empty()); // timeline config rodata.timeline_config.min_slice_us = opts.min_slice_us; @@ -460,6 +462,12 @@ macro_rules! init_open_skel { #[macro_export] macro_rules! init_skel { ($skel: expr, $topo: expr) => { + // Populate cluster IDs + for cluster in $topo.all_clusters.values() { + $skel.maps.bss_data.as_mut().unwrap().cluster_ids[cluster.id] = cluster.id as u64; + } + + // Populate CPU data including cluster_id for cpu in $topo.all_cpus.values() { $skel.maps.bss_data.as_mut().unwrap().big_core_ids[cpu.id] = if cpu.core_type == ($crate::CoreType::Big { turbo: true }) { @@ -470,7 +478,18 @@ macro_rules! init_skel { $skel.maps.bss_data.as_mut().unwrap().cpu_core_ids[cpu.id] = cpu.core_id as u32; $skel.maps.bss_data.as_mut().unwrap().cpu_llc_ids[cpu.id] = cpu.llc_id as u64; $skel.maps.bss_data.as_mut().unwrap().cpu_node_ids[cpu.id] = cpu.node_id as u64; + + // Find cluster_id for this CPU by searching through topology + let mut cluster_id = 0u32; + for cluster in $topo.all_clusters.values() { + if cluster.all_cpus.contains_key(&cpu.id) { + cluster_id = cluster.id as u32; + break; + } + } + $skel.maps.bss_data.as_mut().unwrap().cpu_cluster_ids[cpu.id] = cluster_id; } + for llc in $topo.all_llcs.values() { $skel.maps.bss_data.as_mut().unwrap().llc_ids[llc.id] = llc.id as u64; }
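/*
 * A minimal plain-C sketch (not part of the patch) of the padding idea
 * behind cluster_ctx and llc_ctx in types.h above: keep read-mostly
 * fields and each hot, atomically updated field on separate cache
 * lines so cross-CPU updates to vtime/load do not bounce the
 * read-mostly line. The struct below is a reduced stand-in, not the
 * real layout, and 64 is an assumed cache line size.
 */
#include <stdint.h>
#include <stddef.h>

#define SKETCH_CACHE_LINE 64

struct padded_ctx {
	/* Read-mostly fields. */
	uint32_t id;
	uint32_t nr_cpus;
	uint64_t dsq;

	char __pad1[SKETCH_CACHE_LINE];		/* push vtime onto a new line */
	uint64_t vtime;				/* hot: atomically updated */

	char __pad2[SKETCH_CACHE_LINE - sizeof(uint64_t)];
	uint64_t load;				/* hot: atomically updated */
};

/* The pads only help if the hot fields really land on distinct lines. */
_Static_assert(offsetof(struct padded_ctx, vtime) / SKETCH_CACHE_LINE !=
	       offsetof(struct padded_ctx, load) / SKETCH_CACHE_LINE,
	       "vtime and load must not share a cache line");
_Static_assert(offsetof(struct padded_ctx, vtime) / SKETCH_CACHE_LINE !=
	       offsetof(struct padded_ctx, id) / SKETCH_CACHE_LINE,
	       "hot fields must not share a line with read-mostly fields");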