diff --git a/scheds/rust/scx_p2dq/src/bpf/intf.h b/scheds/rust/scx_p2dq/src/bpf/intf.h index ee5b8b4e6..1e2a08e6d 100644 --- a/scheds/rust/scx_p2dq/src/bpf/intf.h +++ b/scheds/rust/scx_p2dq/src/bpf/intf.h @@ -88,6 +88,11 @@ enum stat_idx { P2DQ_STAT_EXEC_BALANCE, P2DQ_STAT_FORK_SAME_LLC, P2DQ_STAT_EXEC_SAME_LLC, + P2DQ_STAT_THERMAL_KICK, + P2DQ_STAT_THERMAL_AVOID, + P2DQ_STAT_EAS_LITTLE_SELECT, + P2DQ_STAT_EAS_BIG_SELECT, + P2DQ_STAT_EAS_FALLBACK, P2DQ_NR_STATS, }; diff --git a/scheds/rust/scx_p2dq/src/bpf/main.bpf.c b/scheds/rust/scx_p2dq/src/bpf/main.bpf.c index f0dc413d7..6ae00f704 100644 --- a/scheds/rust/scx_p2dq/src/bpf/main.bpf.c +++ b/scheds/rust/scx_p2dq/src/bpf/main.bpf.c @@ -137,6 +137,10 @@ const volatile struct { bool pelt_enabled; bool fork_balance; bool exec_balance; + bool enable_eas; + bool thermal_enabled; + u16 small_task_threshold; + u16 large_task_threshold; } p2dq_config = { .sched_mode = MODE_DEFAULT, .nr_dsqs_per_llc = 3, @@ -158,6 +162,10 @@ const volatile struct { .pelt_enabled = true, .fork_balance = true, .exec_balance = true, + .enable_eas = false, + .thermal_enabled = false, + .small_task_threshold = 256, + .large_task_threshold = 768, }; /* Latency priority and preemption configuration */ @@ -187,6 +195,10 @@ u64 cpu_node_ids[MAX_CPUS]; u64 big_core_ids[MAX_CPUS]; u64 dsq_time_slices[MAX_DSQS_PER_LLC]; +/* Energy and capacity per CPU for energy-aware scheduling */ +u16 cpu_energy_cost[MAX_CPUS]; // Energy cost coefficient (0-65535) +u16 cpu_capacity[MAX_CPUS]; // CPU capacity (0-1024) + /* DHQ per LLC pair for migration (MAX_LLCS / 2 DHQs) */ scx_dhq_t *llc_pair_dhqs[MAX_LLCS / 2]; /* Track number of LLCs per NUMA node for strand assignment */ @@ -332,6 +344,13 @@ static __always_inline u32 pelt_decay(u32 val, u32 periods) return val; } +/* Forward declarations for energy-aware scheduling helpers */ +static __always_inline u32 get_cpu_capacity(s32 cpu); +static __always_inline u32 get_cpu_energy_cost(s32 cpu); +static __always_inline u32 get_task_util(struct task_struct *p); +static __always_inline bool prefer_little_core(struct task_struct *p); +static __always_inline bool prefer_big_core(struct task_struct *p); + /* * Update task's PELT metrics based on runtime. * Called when task stops running or starts running (for decay). 
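The next hunk makes the PELT contribution capacity- and frequency-invariant by scaling wall-clock runtime with the CPU's capacity and current cpuperf value. A minimal standalone sketch of that scaling, with an illustrative helper name and example values (not part of the patch):

/* sketch_pelt_scale.c - illustrative only; not part of scx_p2dq */
#include <stdint.h>
#include <stdio.h>

/* Scale wall-clock milliseconds into "work done at max capacity, max freq". */
static uint64_t scale_invariant_ms(uint64_t wall_ms, uint64_t capacity, uint64_t freq)
{
	/* wall_ms * (capacity / 1024) * (freq / 1024) */
	return (wall_ms * capacity * freq) / (1024ULL * 1024ULL);
}

int main(void)
{
	/* 4 ms on a little core (capacity 512) running at half frequency (512) */
	printf("%llu\n", (unsigned long long)scale_invariant_ms(4, 512, 512));   /* 1 */
	/* 4 ms on a big core (capacity 1024) running at full frequency (1024) */
	printf("%llu\n", (unsigned long long)scale_invariant_ms(4, 1024, 1024)); /* 4 */
	return 0;
}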
@@ -339,11 +358,14 @@ static __always_inline u32 pelt_decay(u32 val, u32 periods) * @taskc: Task context to update * @now: Current timestamp in ns * @delta_ns: Runtime delta (0 for decay-only update) + * @task_cpu: CPU the task is running on */ -static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta_ns) +static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta_ns, s32 task_cpu) { u64 elapsed_ns, elapsed_ms; u32 periods, delta_ms; + u32 capacity, freq; + u64 scaled_delta_ms, scaled_period_contrib; if (!p2dq_config.pelt_enabled) return; @@ -378,13 +400,27 @@ static __always_inline void update_task_pelt(task_ctx *taskc, u64 now, u64 delta taskc->util_sum = pelt_decay(taskc->util_sum, periods); } + capacity = get_cpu_capacity(task_cpu); + freq = scx_bpf_cpuperf_cur(task_cpu); + if (freq == 0) + freq = SCX_CPUPERF_ONE; + + /* + * Scale period contribution by capacity and frequency + * This makes the PELT metric represent "work done at max CPU capacity at max freq" + * + * Formula: scaled_time = wall_time * (capacity / 1024) * (freq / 1024) + * = wall_time * capacity * freq / (1024 * 1024) + */ if (taskc->period_contrib > 0) { - taskc->util_sum += taskc->period_contrib; + scaled_period_contrib = (taskc->period_contrib * capacity * freq) / (1024ULL * 1024ULL); + taskc->util_sum += scaled_period_contrib; taskc->period_contrib = 0; } delta_ms = delta_ns / NSEC_PER_MSEC; - taskc->util_sum += delta_ms; + scaled_delta_ms = (delta_ms * capacity * freq) / (1024ULL * 1024ULL); + taskc->util_sum += scaled_delta_ms; if (unlikely(taskc->util_sum > PELT_SUM_MAX)) taskc->util_sum = PELT_SUM_MAX; @@ -692,6 +728,125 @@ static task_ctx *lookup_task_ctx(struct task_struct *p) return taskc; } +/* + * Get CPU capacity (0-1024) + * Used for frequency/capacity-invariant PELT and energy-aware scheduling + */ +static __always_inline u32 get_cpu_capacity(s32 cpu) +{ + if (cpu < 0 || cpu >= MAX_CPUS) + return 1024; + return cpu_capacity[cpu] ? cpu_capacity[cpu] : 1024; +} + +/* + * Get CPU energy cost (lower is more efficient) + */ +static __always_inline u32 get_cpu_energy_cost(s32 cpu) +{ + if (cpu < 0 || cpu >= MAX_CPUS) + return 65535; + return cpu_energy_cost[cpu] ? 
cpu_energy_cost[cpu] : 1024; +} + +/* + * Get task utilization from custom PELT + * Returns util_avg in range 0-1024 + * NOTE: Frequency and capacity invariant after modifications + */ +static __always_inline u32 get_task_util(struct task_struct *p) +{ + task_ctx *taskc; + + taskc = lookup_task_ctx(p); + if (!taskc) + return 0; + + return taskc->util_avg; +} + +/* + * Check if task should prefer little cores based on utilization + */ +static __always_inline bool prefer_little_core(struct task_struct *p) +{ + if (!p2dq_config.enable_eas || !topo_config.has_little_cores) + return false; + + u32 util = get_task_util(p); + return util < p2dq_config.small_task_threshold; +} + +/* + * Check if task should prefer big cores based on utilization + */ +static __always_inline bool prefer_big_core(struct task_struct *p) +{ + if (!p2dq_config.enable_eas || !topo_config.has_little_cores) + return false; + + u32 util = get_task_util(p); + return util > p2dq_config.large_task_threshold; +} + +/* + * Get effective CPU capacity accounting for thermal pressure AND frequency + * Returns capacity in range 0-1024 + */ +static __always_inline u32 get_effective_cpu_capacity(s32 cpu) +{ + struct cpu_ctx *cpuc; + u32 base_capacity, thermal_pressure; + u32 cur_freq; + u64 effective_capacity; + + if (cpu < 0 || cpu >= MAX_CPUS) + return 0; + + cpuc = lookup_cpu_ctx(cpu); + if (!cpuc) + return 0; + + base_capacity = get_cpu_capacity(cpu); + + thermal_pressure = cpuc->perf; + + cur_freq = scx_bpf_cpuperf_cur(cpu); + if (cur_freq == 0) + cur_freq = SCX_CPUPERF_ONE; + + /* + * Effective capacity = (base - thermal) * freq / 1024 + * Combines thermal throttling and frequency scaling + */ + if (thermal_pressure >= base_capacity) + return 0; /* Fully throttled */ + + effective_capacity = (u64)(base_capacity - thermal_pressure) * cur_freq / SCX_CPUPERF_ONE; + + return (u32)effective_capacity; +} + +/* + * Check if CPU is thermally throttled + * Returns true if pressure > 25% capacity loss + */ +static __always_inline bool is_cpu_throttled(s32 cpu) +{ + struct cpu_ctx *cpuc; + + if (cpu < 0 || cpu >= MAX_CPUS) + return false; + + cpuc = lookup_cpu_ctx(cpu); + if (!cpuc) + return false; + + /* Throttled if pressure > 256 (25% of 1024) */ + return cpuc->perf > 256; +} + + struct { __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); __type(key, u32); @@ -901,7 +1056,6 @@ static s32 pick_idle_affinitized_cpu(struct task_struct *p, task_ctx *taskc, } } - // Fallback to anywhere the task can run cpu = bpf_cpumask_any_distribute(p->cpus_ptr); found_cpu: @@ -1006,6 +1160,207 @@ u32 __attribute__((noinline)) find_least_loaded_llc_for_fork(u32 parent_llc_id) return best_id; } +/* + * Pick idle CPU from mask, avoiding thermally throttled CPUs. + * Simpler/faster than full energy-aware selection - used for MODE_PERF/MODE_EFFICIENCY. 
+ */ +static __always_inline s32 pick_idle_thermal_aware(struct bpf_cpumask *mask, + struct task_struct *p) +{ + s32 cpu, best_cpu = -1; + u32 best_capacity = 0; + + if (!mask) + return -1; + + /* First pass: try to find unthrottled idle CPU */ + bpf_for(cpu, 0, topo_config.nr_cpus) { + if (!bpf_cpumask_test_cpu(cpu, cast_mask(mask))) + continue; + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) + continue; + if (is_cpu_throttled(cpu)) + continue; + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + return cpu; + } + + /* Second pass: allow throttled CPUs, prefer least throttled */ + bpf_for(cpu, 0, topo_config.nr_cpus) { + u32 capacity; + + if (mask && !bpf_cpumask_test_cpu(cpu, cast_mask(mask))) + continue; + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) + continue; + if (!scx_bpf_test_and_clear_cpu_idle(cpu)) + continue; + + capacity = get_effective_cpu_capacity(cpu); + if (capacity > best_capacity) { + best_capacity = capacity; + best_cpu = cpu; + } + } + + if (best_cpu >= 0) + stat_inc(P2DQ_STAT_THERMAL_AVOID); + + return best_cpu; +} + +/* + * Select best idle CPU from mask based on: + * - Not thermally throttled (priority 1) + * - High effective capacity (accounts for thermal + freq) + * - Low energy cost + * + * Returns CPU ID or -1 if no suitable CPU found + * Updates best_score with score of selected CPU (higher is better) + */ +static __always_inline s32 select_best_idle_cpu(struct task_struct *p, + struct bpf_cpumask *mask, + u32 *best_score) +{ + s32 cpu, best_cpu = -1; + u32 highest_score = 0; + + if (!mask || !best_score) + return -1; + + *best_score = 0; + + bpf_for(cpu, 0, topo_config.nr_cpus) { + u32 capacity, energy_cost, score; + + if (!bpf_cpumask_test_cpu(cpu, cast_mask(mask))) + continue; + if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) + continue; + if (!scx_bpf_test_and_clear_cpu_idle(cpu)) + continue; + + capacity = get_effective_cpu_capacity(cpu); + energy_cost = get_cpu_energy_cost(cpu); + + /* + * Score formula: prioritize capacity, penalize energy cost + * score = capacity * 10 - (energy_cost / 10) + * Higher score is better + * + * Throttled CPUs (capacity=0) get score=0 + */ + if (is_cpu_throttled(cpu)) { + score = 0; /* Heavily penalize throttled CPUs */ + } else { + /* Multiply capacity by 10 for more weight vs energy cost */ + score = (capacity * 10); + /* Subtract scaled energy cost (divide by 10 to reduce impact) */ + if (energy_cost < score) + score -= (energy_cost / 10); + else + score = 1; + } + + if (score > highest_score) { + highest_score = score; + best_cpu = cpu; + } + } + + *best_score = highest_score; + return best_cpu; +} + +/* + * Pick idle CPU using comprehensive energy-aware scheduling + * Tries preferred core type first, then fallback type + */ +static __always_inline s32 pick_idle_energy_aware(struct task_struct *p, + struct llc_ctx *llcx, + bool *is_idle) +{ + s32 cpu = -1; + u32 pref_score = 0, fallback_score = 0; + + if (!llcx || !is_idle) + return -1; + + /* Determine preferred and fallback cpumasks based on task utilization */ + struct bpf_cpumask *pref_mask = NULL; + struct bpf_cpumask *fallback_mask = NULL; + bool prefer_little = prefer_little_core(p); + bool prefer_big = prefer_big_core(p); + + if (prefer_little) { + pref_mask = llcx->little_cpumask; + fallback_mask = llcx->big_cpumask; + } else if (prefer_big) { + pref_mask = llcx->big_cpumask; + fallback_mask = llcx->little_cpumask; + } else { + /* No strong preference, try both and pick best score */ + s32 little_cpu = -1, big_cpu = -1; + u32 little_score = 0, big_score = 0; + + if 
(llcx->little_cpumask)
+			little_cpu = select_best_idle_cpu(p, llcx->little_cpumask, &little_score);
+		if (llcx->big_cpumask)
+			big_cpu = select_best_idle_cpu(p, llcx->big_cpumask, &big_score);
+
+		/* Pick whichever has better score */
+		if (little_cpu >= 0 && big_cpu >= 0) {
+			if (little_score >= big_score) {
+				*is_idle = true;
+				stat_inc(P2DQ_STAT_EAS_LITTLE_SELECT);
+				return little_cpu;
+			} else {
+				*is_idle = true;
+				stat_inc(P2DQ_STAT_EAS_BIG_SELECT);
+				return big_cpu;
+			}
+		} else if (little_cpu >= 0) {
+			*is_idle = true;
+			stat_inc(P2DQ_STAT_EAS_LITTLE_SELECT);
+			return little_cpu;
+		} else if (big_cpu >= 0) {
+			*is_idle = true;
+			stat_inc(P2DQ_STAT_EAS_BIG_SELECT);
+			return big_cpu;
+		}
+		return -1;
+	}
+
+	/* Try preferred core type first */
+	if (pref_mask) {
+		cpu = select_best_idle_cpu(p, pref_mask, &pref_score);
+		if (cpu >= 0) {
+			*is_idle = true;
+			if (prefer_little)
+				stat_inc(P2DQ_STAT_EAS_LITTLE_SELECT);
+			else
+				stat_inc(P2DQ_STAT_EAS_BIG_SELECT);
+			return cpu;
+		}
+	}
+
+	/* Fallback to opposite core type if preferred not available */
+	if (fallback_mask) {
+		cpu = select_best_idle_cpu(p, fallback_mask, &fallback_score);
+		if (cpu >= 0) {
+			*is_idle = true;
+			stat_inc(P2DQ_STAT_EAS_FALLBACK);
+			if (prefer_little)
+				stat_inc(P2DQ_STAT_EAS_BIG_SELECT);
+			else
+				stat_inc(P2DQ_STAT_EAS_LITTLE_SELECT);
+			return cpu;
+		}
+	}
+
+	return -1;
+}
+
 static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc,
 			 s32 prev_cpu, u64 wake_flags, bool *is_idle)
 {
@@ -1141,39 +1496,66 @@ static s32 pick_idle_cpu(struct task_struct *p, task_ctx *taskc,
 		goto found_cpu;
 	}
 
+	/*
+	 * Energy-aware selection with comprehensive scoring
+	 * Uses effective capacity, energy cost, and thermal awareness
+	 */
+	if (p2dq_config.enable_eas && topo_config.has_little_cores) {
+		cpu = pick_idle_energy_aware(p, llcx, is_idle);
+		if (cpu >= 0)
+			goto found_cpu;
+	}
+
 	if (p2dq_config.sched_mode == MODE_PERF &&
 	    topo_config.has_little_cores &&
 	    llcx->big_cpumask) {
-		cpu = __pick_idle_cpu(llcx->big_cpumask,
-				      SCX_PICK_IDLE_CORE);
-		if (cpu >= 0) {
-			*is_idle = true;
-			goto found_cpu;
-		}
-		if (llcx->big_cpumask) {
-			cpu = __pick_idle_cpu(llcx->big_cpumask, 0);
+		/* Try thermal-aware selection first for big cores if thermal tracking enabled */
+		if (p2dq_config.thermal_enabled) {
+			cpu = pick_idle_thermal_aware(llcx->big_cpumask, p);
 			if (cpu >= 0) {
 				*is_idle = true;
 				goto found_cpu;
 			}
 		}
+		/* Fallback to non-thermal-aware if thermal disabled or no idle big cores */
+		if (llcx->big_cpumask &&
+		    (cpu = __pick_idle_cpu(llcx->big_cpumask,
+					   SCX_PICK_IDLE_CORE)) >= 0) {
+			*is_idle = true;
+			goto found_cpu;
+		}
+		if (llcx->big_cpumask &&
+		    (cpu = __pick_idle_cpu(llcx->big_cpumask,
+					   0)) >= 0) {
+			*is_idle = true;
+			goto found_cpu;
+		}
 	}
 
 	if (p2dq_config.sched_mode == MODE_EFFICIENCY &&
 	    topo_config.has_little_cores && llcx->little_cpumask) {
-		cpu = __pick_idle_cpu(llcx->little_cpumask, SCX_PICK_IDLE_CORE);
-		if (cpu >= 0) {
-			*is_idle = true;
-			goto found_cpu;
-		}
-		if (llcx->little_cpumask) {
-			cpu = __pick_idle_cpu(llcx->little_cpumask, 0);
+		/* Try thermal-aware selection first for little cores if thermal tracking enabled */
+		if (p2dq_config.thermal_enabled) {
+			cpu = pick_idle_thermal_aware(llcx->little_cpumask, p);
 			if (cpu >= 0) {
 				*is_idle = true;
 				goto found_cpu;
 			}
 		}
+		/* Fallback to non-thermal-aware if thermal disabled or no idle little cores */
+		if (llcx->little_cpumask &&
+		    (cpu = __pick_idle_cpu(llcx->little_cpumask,
+					   SCX_PICK_IDLE_CORE)) >= 0) {
+			*is_idle = true;
+			goto found_cpu;
+		}
+		if (llcx->little_cpumask &&
+		    (cpu = __pick_idle_cpu(llcx->little_cpumask,
+					   0)) >= 0) {
+			*is_idle = true;
+			goto found_cpu;
+		}
 	}
 
@@ -1912,7 +2294,7 @@ static int p2dq_running_impl(struct task_struct *p)
 
 	/* Decay PELT metrics when task starts running (0 delta for decay-only) */
 	if (p2dq_config.pelt_enabled)
-		update_task_pelt(taskc, now, 0);
+		update_task_pelt(taskc, now, 0, task_cpu);
 
 	return 0;
 }
@@ -1924,6 +2306,7 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable)
 	struct cpu_ctx *cpuc;
 	u64 used, scaled_used, last_dsq_slice_ns;
 	u64 now = bpf_ktime_get_ns();
+	s32 task_cpu = scx_bpf_task_cpu(p);
 
 	if (unlikely(!(taskc = lookup_task_ctx(p)) ||
 		     !(llcx = lookup_llc_ctx(taskc->llc_id))))
@@ -1940,7 +2323,7 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable)
 	// time. When a nice task was run we need to update the cpu_ctx so that
 	// tasks are no longer enqueued to the local DSQ.
 	if (task_ctx_test_flag(taskc, TASK_CTX_F_WAS_NICE) &&
-	    (cpuc = lookup_cpu_ctx(scx_bpf_task_cpu(p)))) {
+	    (cpuc = lookup_cpu_ctx(task_cpu))) {
 		cpu_ctx_clear_flag(cpuc, CPU_CTX_F_NICE_TASK);
 		task_ctx_clear_flag(taskc, TASK_CTX_F_WAS_NICE);
 	}
@@ -1958,7 +2341,7 @@ void BPF_STRUCT_OPS(p2dq_stopping, struct task_struct *p, bool runnable)
 
 	/* Update PELT metrics if enabled */
 	if (p2dq_config.pelt_enabled) {
-		update_task_pelt(taskc, now, used);
+		update_task_pelt(taskc, now, used, task_cpu);
 		aggregate_pelt_to_llc(llcx, taskc,
 				      task_ctx_test_flag(taskc, TASK_CTX_F_INTERACTIVE),
 				      !task_ctx_test_flag(taskc, TASK_CTX_F_ALL_CPUS));
@@ -3235,6 +3618,50 @@ void BPF_STRUCT_OPS(p2dq_exit, struct scx_exit_info *ei)
 	UEI_RECORD(uei, ei);
 }
+/*
+ * Thermal Pressure Tracking (requires CONFIG_SCHED_HW_PRESSURE=y)
+ *
+ * Thermal tracking lets the scheduler avoid thermally throttled CPUs.
+ * This program has autoload disabled by default and is conditionally
+ * enabled from userspace if the kernel supports the hw_pressure_update tracepoint.
+ *
+ * The '?' suffix makes this program optional - veristat and the verifier
+ * will skip it if the tracepoint doesn't exist in the kernel.
+ * + * Tracepoint: hw_pressure_update + * Fires when kernel detects thermal throttling on a CPU + * + * Arguments: + * cpu: CPU ID experiencing pressure + * hw_pressure: Pressure value (0 = no throttling, 1024 = max capacity lost) + * + * Note: This tracepoint only exists on ARM/ARM64 architectures + */ +#if defined(__aarch64__) || defined(__arm__) +__weak __hidden SEC("tp_btf/hw_pressure_update?") +int BPF_PROG(on_thermal_pressure, u32 cpu, u64 hw_pressure) +{ + struct cpu_ctx *cpuc; + + if (cpu >= MAX_CPUS) + return 0; + + cpuc = lookup_cpu_ctx(cpu); + if (!cpuc) + return 0; + + cpuc->perf = (u32)hw_pressure; + + if (hw_pressure > 512) { + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + stat_inc(P2DQ_STAT_THERMAL_KICK); + } + + return 0; +} +#endif + + #if P2DQ_CREATE_STRUCT_OPS s32 BPF_STRUCT_OPS_SLEEPABLE(p2dq_init) { diff --git a/scheds/rust/scx_p2dq/src/bpf/types.h b/scheds/rust/scx_p2dq/src/bpf/types.h index ac0a9abd1..cd123df62 100644 --- a/scheds/rust/scx_p2dq/src/bpf/types.h +++ b/scheds/rust/scx_p2dq/src/bpf/types.h @@ -46,7 +46,7 @@ struct cpu_ctx { u64 slice_ns; u32 core_id; u32 dsq_index; - u32 perf; + u32 perf; /* Thermal pressure (0-1024, 0=no throttling, 1024=max capacity lost) */ u32 flags; /* Bitmask for interactive, is_big, nice_task */ u64 ran_for; u32 node_id; diff --git a/scheds/rust/scx_p2dq/src/energy.rs b/scheds/rust/scx_p2dq/src/energy.rs new file mode 100644 index 000000000..8c885e0a0 --- /dev/null +++ b/scheds/rust/scx_p2dq/src/energy.rs @@ -0,0 +1,192 @@ +use anyhow::Result; +use scx_utils::{EnergyModel as KernelEnergyModel, Topology}; +use std::collections::BTreeMap; +use tracing::info; + +/// Energy characteristics for a CPU type +#[derive(Debug, Clone)] +pub struct CpuEnergyProfile { + pub capacity: u32, // Relative performance (0-1024) + pub base_power_mw: u32, // Base power consumption (mW) + pub dynamic_power_mw: u32, // Dynamic power at 100% util (mW) + pub efficiency: f32, // Performance per watt +} + +impl CpuEnergyProfile { + /// Calculate energy cost coefficient for placement decisions + /// Returns cost in arbitrary units (higher = less efficient) + pub fn energy_cost(&self) -> u32 { + // Cost = power / capacity (mW per unit of performance) + // Scale to integer for BPF + let total_power = self.base_power_mw + self.dynamic_power_mw; + ((total_power as f64 / self.capacity as f64) * 1024.0) as u32 + } +} + +pub struct EnergyModel { + /// Map from CPU ID to energy profile + cpu_profiles: BTreeMap, + /// Utilization threshold for small tasks (prefer little cores) + pub small_task_threshold: u32, + /// Utilization threshold for large tasks (prefer big cores) + pub large_task_threshold: u32, +} + +impl EnergyModel { + /// Create new energy model from system topology + /// Tries to use kernel energy model first, falls back to heuristics + pub fn new(topo: &Topology) -> Result { + let mut cpu_profiles = BTreeMap::new(); + + // Try to use kernel energy model if available + if let Ok(kernel_em) = KernelEnergyModel::new() { + info!("Using kernel energy model from /sys/kernel/debug/energy_model"); + + for cpu in topo.all_cpus.values() { + let profile = Self::create_profile_from_kernel_em(cpu, &kernel_em); + cpu_profiles.insert(cpu.id, profile); + } + } else { + info!("Kernel energy model not available, using frequency-based estimates"); + + for cpu in topo.all_cpus.values() { + let profile = Self::create_profile_from_heuristics(cpu, topo); + cpu_profiles.insert(cpu.id, profile); + } + } + + // Derive thresholds from actual capacity distribution + let (small_thresh, 
large_thresh) = Self::derive_thresholds(topo); + + Ok(EnergyModel { + cpu_profiles, + small_task_threshold: small_thresh, + large_task_threshold: large_thresh, + }) + } + + /// Derive task size thresholds from CPU capacity distribution + fn derive_thresholds(topo: &Topology) -> (u32, u32) { + // Find min and max capacities + let mut min_cap = u32::MAX; + let mut max_cap = 0u32; + + for cpu in topo.all_cpus.values() { + let cap = cpu.cpu_capacity as u32; + min_cap = min_cap.min(cap); + max_cap = max_cap.max(cap); + } + + // If homogeneous (all cores similar capacity), use percentage-based thresholds + if max_cap - min_cap < 200 { + // Less than ~20% variation + return (256, 768); // 25% and 75% of 1024 + } + + // For big.LITTLE or heterogeneous systems: + // Small task threshold: 25% of little core capacity + // Large task threshold: 75% of big core capacity + let small_thresh = (min_cap / 4).max(128); + let large_thresh = ((max_cap * 3) / 4).min(896); + + (small_thresh, large_thresh) + } + + /// Create energy profile from kernel energy model + fn create_profile_from_kernel_em( + cpu: &scx_utils::Cpu, + kernel_em: &KernelEnergyModel, + ) -> CpuEnergyProfile { + if let Some(pd) = kernel_em.get_pd_by_cpu_id(cpu.id) { + // Use highest performance state (max frequency) for power estimates + if let Some((_, ps)) = pd.perf_table.last_key_value() { + // Kernel provides power in microwatts, convert to milliwatts + let dynamic_power_mw = (ps.power / 1000) as u32; + + // Estimate idle power as ~2-5% of dynamic power + let base_power_mw = (dynamic_power_mw / 30).max(10); + + return CpuEnergyProfile { + capacity: cpu.cpu_capacity as u32, + base_power_mw, + dynamic_power_mw, + efficiency: (cpu.cpu_capacity as f32) / (dynamic_power_mw as f32), + }; + } + } + + // Fallback if we can't find this CPU in the energy model + Self::create_profile_from_heuristics(cpu, &Topology::new().unwrap()) + } + + /// Create energy profile based on CPU characteristics using heuristics + /// Uses frequency and capacity to estimate power consumption + fn create_profile_from_heuristics(cpu: &scx_utils::Cpu, topo: &Topology) -> CpuEnergyProfile { + // Find max capacity in the system to determine core type + let max_capacity = topo + .all_cpus + .values() + .map(|c| c.cpu_capacity) + .max() + .unwrap_or(1024); + + // Determine if this is a big or little core + // Consider it "big" if capacity is >= 78% of max capacity + let is_big_core = cpu.cpu_capacity >= (max_capacity * 78) / 100; + + // Power scales roughly with frequency and voltage + // P ≈ C * V^2 * f, and V ≈ f for modern CPUs + // So P ≈ k * f^3 (simplified) + + let freq_ratio = if cpu.max_freq > 0 { + cpu.max_freq as f64 / 2500000.0 // Normalize to ~2.5GHz baseline + } else { + 1.0 + }; + + let capacity_ratio = cpu.cpu_capacity as f64 / 1024.0; + + if is_big_core { + // Big core - scale power based on frequency + let base_dynamic_power = 3000.0; // 3W baseline for 2.5GHz big core + let dynamic_power_mw = (base_dynamic_power * freq_ratio.powf(2.5)) as u32; + let base_power_mw = (dynamic_power_mw / 60).max(30); // ~1.6-3% of dynamic + + CpuEnergyProfile { + capacity: cpu.cpu_capacity as u32, + base_power_mw, + dynamic_power_mw, + efficiency: (cpu.cpu_capacity as f32) / (dynamic_power_mw as f32), + } + } else { + // Little core - more efficient, lower power + let base_dynamic_power = 1200.0; // 1.2W baseline for little core + let dynamic_power_mw = + (base_dynamic_power * freq_ratio.powf(2.5) * capacity_ratio) as u32; + let base_power_mw = (dynamic_power_mw / 50).max(15); 
// ~2% of dynamic + + CpuEnergyProfile { + capacity: cpu.cpu_capacity as u32, + base_power_mw, + dynamic_power_mw, + efficiency: (cpu.cpu_capacity as f32) / (dynamic_power_mw as f32), + } + } + } + + /// Get energy cost for a CPU + pub fn cpu_energy_cost(&self, cpu: usize) -> u32 { + self.cpu_profiles + .get(&cpu) + .map(|p| p.energy_cost()) + .unwrap_or(1024) + } + + /// Get CPU capacity + pub fn cpu_capacity(&self, cpu: usize) -> u32 { + self.cpu_profiles + .get(&cpu) + .map(|p| p.capacity) + .unwrap_or(1024) + } +} diff --git a/scheds/rust/scx_p2dq/src/lib.rs b/scheds/rust/scx_p2dq/src/lib.rs index f5335ab98..8f6ae47d6 100644 --- a/scheds/rust/scx_p2dq/src/lib.rs +++ b/scheds/rust/scx_p2dq/src/lib.rs @@ -4,6 +4,7 @@ // GNU General Public License version 2. pub mod bpf_intf; pub mod bpf_skel; +pub mod energy; pub use bpf_skel::types; use scx_utils::cli::TopologyArgs; @@ -339,6 +340,13 @@ pub struct SchedulerOpts { #[clap(long, action = clap::ArgAction::SetTrue)] pub wakeup_preemption: bool, + /// Enable Energy-Aware Scheduling (EAS) for big.LITTLE CPUs. + /// Places low-utilization tasks on efficient cores and high-utilization + /// tasks on performance cores. Requires PELT to be enabled. Improves + /// battery life on heterogeneous systems. + #[clap(long, default_value_t = false, action = clap::ArgAction::Set)] + pub enable_eas: bool, + #[clap(flatten, next_help_heading = "Topology Options")] pub topo: TopologyArgs, } @@ -468,6 +476,9 @@ macro_rules! init_open_skel { rodata.p2dq_config.pelt_enabled = MaybeUninit::new(opts.enable_pelt); rodata.p2dq_config.fork_balance = MaybeUninit::new(opts.fork_balance); rodata.p2dq_config.exec_balance = MaybeUninit::new(opts.exec_balance); + rodata.p2dq_config.enable_eas = MaybeUninit::new(opts.enable_eas); + rodata.p2dq_config.small_task_threshold = 256; // 25% utilization + rodata.p2dq_config.large_task_threshold = 768; // 75% utilization // Latency priority config rodata.latency_config.latency_priority_enabled = MaybeUninit::new(opts.latency_priority); @@ -483,7 +494,16 @@ macro_rules! init_open_skel { #[macro_export] macro_rules! init_skel { - ($skel: expr, $topo: expr) => { + ($skel: expr, $topo: expr) => {{ + use $crate::energy::EnergyModel; + + // Initialize energy model for EAS + let energy_model = EnergyModel::new(&$topo).unwrap_or_else(|e| { + eprintln!("Warning: Failed to create energy model: {}", e); + eprintln!("Energy-aware scheduling will use fallback values"); + EnergyModel::new(&$topo).unwrap() // This should not fail + }); + for cpu in $topo.all_cpus.values() { $skel.maps.bss_data.as_mut().unwrap().big_core_ids[cpu.id] = if cpu.core_type == ($crate::CoreType::Big { turbo: true }) { @@ -494,9 +514,15 @@ macro_rules! 
init_skel { $skel.maps.bss_data.as_mut().unwrap().cpu_core_ids[cpu.id] = cpu.core_id as u32; $skel.maps.bss_data.as_mut().unwrap().cpu_llc_ids[cpu.id] = cpu.llc_id as u64; $skel.maps.bss_data.as_mut().unwrap().cpu_node_ids[cpu.id] = cpu.node_id as u64; + + // Populate energy model data + $skel.maps.bss_data.as_mut().unwrap().cpu_capacity[cpu.id] = + energy_model.cpu_capacity(cpu.id) as u16; + $skel.maps.bss_data.as_mut().unwrap().cpu_energy_cost[cpu.id] = + energy_model.cpu_energy_cost(cpu.id) as u16; } for llc in $topo.all_llcs.values() { $skel.maps.bss_data.as_mut().unwrap().llc_ids[llc.id] = llc.id as u64; } - }; + }}; } diff --git a/scheds/rust/scx_p2dq/src/main.rs b/scheds/rust/scx_p2dq/src/main.rs index 8457903d0..7920ff50d 100644 --- a/scheds/rust/scx_p2dq/src/main.rs +++ b/scheds/rust/scx_p2dq/src/main.rs @@ -46,6 +46,9 @@ use bpf_intf::stat_idx_P2DQ_STAT_DIRECT; use bpf_intf::stat_idx_P2DQ_STAT_DISPATCH_PICK2; use bpf_intf::stat_idx_P2DQ_STAT_DSQ_CHANGE; use bpf_intf::stat_idx_P2DQ_STAT_DSQ_SAME; +use bpf_intf::stat_idx_P2DQ_STAT_EAS_BIG_SELECT; +use bpf_intf::stat_idx_P2DQ_STAT_EAS_FALLBACK; +use bpf_intf::stat_idx_P2DQ_STAT_EAS_LITTLE_SELECT; use bpf_intf::stat_idx_P2DQ_STAT_ENQ_CPU; use bpf_intf::stat_idx_P2DQ_STAT_ENQ_INTR; use bpf_intf::stat_idx_P2DQ_STAT_ENQ_LLC; @@ -59,6 +62,8 @@ use bpf_intf::stat_idx_P2DQ_STAT_KEEP; use bpf_intf::stat_idx_P2DQ_STAT_LLC_MIGRATION; use bpf_intf::stat_idx_P2DQ_STAT_NODE_MIGRATION; use bpf_intf::stat_idx_P2DQ_STAT_SELECT_PICK2; +use bpf_intf::stat_idx_P2DQ_STAT_THERMAL_AVOID; +use bpf_intf::stat_idx_P2DQ_STAT_THERMAL_KICK; use bpf_intf::stat_idx_P2DQ_STAT_WAKE_LLC; use bpf_intf::stat_idx_P2DQ_STAT_WAKE_MIG; use bpf_intf::stat_idx_P2DQ_STAT_WAKE_PREV; @@ -153,6 +158,12 @@ impl<'a> Scheduler<'a> { https://github.com/sched-ext/scx/issues/new?labels=scx_p2dq&title=scx_p2dq:%20New%20Issue&assignees=hodgesds&body=Kernel%20version:%20(fill%20me%20out)%0ADistribution:%20(fill%20me%20out)%0AHardware:%20(fill%20me%20out)%0A%0AIssue:%20(fill%20me%20out)" )?; + // Disable autoload for thermal pressure tracepoint by default + // Will be conditionally enabled if kernel supports it + // Note: This tracepoint only exists on ARM/ARM64 architectures + #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] + open_skel.progs.on_thermal_pressure.set_autoload(false); + // Apply hardware-specific optimizations before macro let hw_profile = scx_p2dq::HardwareProfile::detect(); let mut opts_optimized = opts.clone(); @@ -168,6 +179,41 @@ impl<'a> Scheduler<'a> { &hw_profile )?; + // Thermal pressure tracking (ARM/ARM64 only) + #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] + { + let thermal_enabled = std::path::Path::new( + "/sys/kernel/tracing/events/thermal_pressure/hw_pressure_update", + ) + .exists() + || std::path::Path::new( + "/sys/kernel/debug/tracing/events/thermal_pressure/hw_pressure_update", + ) + .exists(); + + if thermal_enabled { + debug!( + "Kernel supports thermal pressure tracking, enabling hw_pressure_update tracepoint" + ); + open_skel.progs.on_thermal_pressure.set_autoload(true); + stats::set_thermal_tracking_enabled(true); + + open_skel + .maps + .rodata_data + .as_mut() + .unwrap() + .p2dq_config + .thermal_enabled = std::mem::MaybeUninit::new(true); + } else { + debug!("Kernel does not support thermal pressure tracking (CONFIG_SCHED_HW_PRESSURE not enabled)"); + } + } + + if opts_optimized.enable_eas { + stats::set_eas_enabled(true); + } + if opts.queued_wakeup { open_skel.struct_ops.p2dq_mut().flags |= 
*compat::SCX_OPS_ALLOW_QUEUED_WAKEUP; } @@ -233,6 +279,11 @@ impl<'a> Scheduler<'a> { exec_balance: stats[stat_idx_P2DQ_STAT_EXEC_BALANCE as usize], fork_same_llc: stats[stat_idx_P2DQ_STAT_FORK_SAME_LLC as usize], exec_same_llc: stats[stat_idx_P2DQ_STAT_EXEC_SAME_LLC as usize], + thermal_kick: stats[stat_idx_P2DQ_STAT_THERMAL_KICK as usize], + thermal_avoid: stats[stat_idx_P2DQ_STAT_THERMAL_AVOID as usize], + eas_little_select: stats[stat_idx_P2DQ_STAT_EAS_LITTLE_SELECT as usize], + eas_big_select: stats[stat_idx_P2DQ_STAT_EAS_BIG_SELECT as usize], + eas_fallback: stats[stat_idx_P2DQ_STAT_EAS_FALLBACK as usize], } } diff --git a/scheds/rust/scx_p2dq/src/stats.rs b/scheds/rust/scx_p2dq/src/stats.rs index 7c1e4dd88..aa44dfafb 100644 --- a/scheds/rust/scx_p2dq/src/stats.rs +++ b/scheds/rust/scx_p2dq/src/stats.rs @@ -11,6 +11,28 @@ use scx_stats_derive::Stats; use serde::Deserialize; use serde::Serialize; +// Global flag to track if thermal pressure tracking is enabled +static THERMAL_TRACKING_ENABLED: AtomicBool = AtomicBool::new(false); + +// Global flag to track if energy-aware scheduling is enabled +static EAS_ENABLED: AtomicBool = AtomicBool::new(false); + +pub fn set_thermal_tracking_enabled(enabled: bool) { + THERMAL_TRACKING_ENABLED.store(enabled, Ordering::Relaxed); +} + +pub fn is_thermal_tracking_enabled() -> bool { + THERMAL_TRACKING_ENABLED.load(Ordering::Relaxed) +} + +pub fn set_eas_enabled(enabled: bool) { + EAS_ENABLED.store(enabled, Ordering::Relaxed); +} + +pub fn is_eas_enabled() -> bool { + EAS_ENABLED.load(Ordering::Relaxed) +} + #[stat_doc] #[derive(Clone, Debug, Default, Serialize, Deserialize, Stats)] #[stat(top)] @@ -59,6 +81,16 @@ pub struct Metrics { pub fork_same_llc: u64, #[stat(desc = "Number of times exec stayed on same LLC")] pub exec_same_llc: u64, + #[stat(desc = "Number of CPU kicks due to thermal pressure")] + pub thermal_kick: u64, + #[stat(desc = "Number of times throttled CPUs were avoided")] + pub thermal_avoid: u64, + #[stat(desc = "Number of times EAS placed task on little core")] + pub eas_little_select: u64, + #[stat(desc = "Number of times EAS placed task on big core")] + pub eas_big_select: u64, + #[stat(desc = "Number of times EAS fell back to non-preferred core type")] + pub eas_fallback: u64, } impl Metrics { @@ -78,8 +110,9 @@ impl Metrics { self.enq_intr, self.enq_mig, )?; - writeln!( - w, + + // Build the stats line conditionally based on thermal tracking availability + let mut stats_line = format!( "\twake prev/llc/mig {}/{}/{}\n\tpick2 select/dispatch {}/{}\n\tmigrations llc/node: {}/{}\n\tfork balance/same {}/{}\n\texec balance/same {}/{}", self.wake_prev, self.wake_llc, @@ -92,7 +125,25 @@ impl Metrics { self.fork_same_llc, self.exec_balance, self.exec_same_llc, - )?; + ); + + // Only show thermal stats if thermal tracking is enabled + if is_thermal_tracking_enabled() { + stats_line.push_str(&format!( + "\n\tthermal kick/avoid {}/{}", + self.thermal_kick, self.thermal_avoid, + )); + } + + // Only show EAS stats if energy-aware scheduling is enabled + if is_eas_enabled() { + stats_line.push_str(&format!( + "\n\tEAS little/big/fallback {}/{}/{}", + self.eas_little_select, self.eas_big_select, self.eas_fallback, + )); + } + + writeln!(w, "{}", stats_line)?; Ok(()) } @@ -120,6 +171,11 @@ impl Metrics { exec_balance: self.exec_balance - rhs.exec_balance, fork_same_llc: self.fork_same_llc - rhs.fork_same_llc, exec_same_llc: self.exec_same_llc - rhs.exec_same_llc, + thermal_kick: self.thermal_kick - rhs.thermal_kick, + thermal_avoid: 
self.thermal_avoid - rhs.thermal_avoid, + eas_little_select: self.eas_little_select - rhs.eas_little_select, + eas_big_select: self.eas_big_select - rhs.eas_big_select, + eas_fallback: self.eas_fallback - rhs.eas_fallback, } } }
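
For reference, the idle-CPU score computed in select_best_idle_cpu() above weighs effective capacity against energy cost and zeroes out throttled CPUs; with the userspace changes above, this path is only exercised when the scheduler is started with the new --enable-eas option. A standalone sketch of that scoring, with an assumed helper name and assumed example capacities and energy costs (not part of the patch):

/* sketch_eas_score.c - illustrative only; not part of scx_p2dq */
#include <stdint.h>
#include <stdio.h>

/* Mirrors the scoring in select_best_idle_cpu(): capacity weighted up,
 * energy cost weighted down, thermally throttled CPUs scored 0. */
static uint32_t idle_cpu_score(uint32_t effective_capacity, uint32_t energy_cost,
			       int throttled)
{
	uint32_t score;

	if (throttled)
		return 0;

	score = effective_capacity * 10;
	if (energy_cost < score)
		score -= energy_cost / 10;
	else
		score = 1;
	return score;
}

int main(void)
{
	/* Assumed example values: big core capacity 1024 / cost 3000, little core 512 / 1500 */
	printf("big unthrottled:    %u\n", idle_cpu_score(1024, 3000, 0)); /* 9940 */
	printf("little unthrottled: %u\n", idle_cpu_score(512, 1500, 0));  /* 4970 */
	printf("big throttled:      %u\n", idle_cpu_score(1024, 3000, 1)); /* 0 */
	return 0;
}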