// strat9_kernel/process/sched_classes/fair.rs
1// SPDX-License-Identifier: MPL-2.0
2
3use super::{CurrentRuntime, SchedClassRq};
4use crate::process::task::Task;
5use alloc::{collections::BTreeMap, sync::Arc};
6
/// Scheduling weight of a nice-0 task; all other weights are scaled relative
/// to this value (same 1024 scale as Linux's `NICE_0_LOAD`).
const WEIGHT_0: u64 = 1024;

/// Base time slice per task in ticks for the CFS fair scheduler.
///
/// At TIMER_HZ=100 (10 ms/tick):
/// BASE_SLICE_TICKS = 1 -> 1 tick = 10 ms per task (matches `quantum_ms: 10`)
///
/// Previously this was mistakenly 10, giving 10 ticks = 100 ms slices and
/// effectively disabling preemption for lightly loaded workloads.
///
/// Derivation: target_ms = 10 ms, tick_ms = 1000 / TIMER_HZ = 10 ms -> 1 tick.
const BASE_SLICE_TICKS: u64 = 1;
19
20/// Performs the nice to weight operation.
21pub const fn nice_to_weight(nice: super::nice::Nice) -> u64 {
22 const FACTOR_NUMERATOR: u64 = 5;
23 const FACTOR_DENOMINATOR: u64 = 4;
24
25 const NICE_TO_WEIGHT: [u64; 40] = const {
26 let mut ret = [0; 40];
27 let mut index = 0;
28 let mut nice = super::nice::NiceValue::MIN.get();
29 while nice <= super::nice::NiceValue::MAX.get() {
30 ret[index] = match nice {
31 0 => WEIGHT_0,
32 nice @ 1.. => {
33 let numerator = FACTOR_DENOMINATOR.pow(nice as u32);
34 let denominator = FACTOR_NUMERATOR.pow(nice as u32);
35 WEIGHT_0 * numerator / denominator
36 }
37 nice => {
38 let numerator = FACTOR_NUMERATOR.pow((-nice) as u32);
39 let denominator = FACTOR_DENOMINATOR.pow((-nice) as u32);
40 WEIGHT_0 * numerator / denominator
41 }
42 };
43 index += 1;
44 nice += 1;
45 }
46 ret
47 };
48
49 NICE_TO_WEIGHT[(nice.value().get() + 20) as usize]
50}
51
/// Per-CPU run queue for the Completely Fair Scheduler.
///
/// Uses two `BTreeMap`s for O(log n) operations without allocation on the
/// fast paths (`pick_next`, `remove`):
///
/// - `entities`: primary map keyed by `(vruntime, task_id)` : the minimum
///   entry is always the next task to schedule.
/// - `by_id`: reverse index mapping `task_id → primary key` : enables O(log n)
///   removal by task ID in `remove()` without scanning `entities`.
///
/// This replaces the previous `BinaryHeap`-based design which used *lazy
/// deletion* (O(n) `remove()` scan, phantom entries, generation counters).
/// The `BTreeMap` approach gives:
///
/// | Operation   | Complexity | Allocates?                              |
/// |-------------|------------|-----------------------------------------|
/// | `enqueue`   | O(log n)   | yes : 2 BTreeMap nodes (wakeup path)    |
/// | `pick_next` | O(log n)   | no : removes 2 nodes                    |
/// | `remove`    | O(log n)   | no : removes 2 nodes                    |
///
/// No phantom entries means no generation counter, no `prune_stale_head()`,
/// and no per-entry liveness checks. The BTreeMap pair is the authoritative
/// record of which tasks are currently on the run queue.
pub struct FairClassRq {
    /// Primary index: `(vruntime, task_id)` → `(Arc<Task>, weight)`.
    /// Ordered so `pop_first()` yields the task with the smallest vruntime.
    /// `task_id` is part of the key to ensure uniqueness when two tasks share
    /// the same vruntime.
    entities: BTreeMap<(u64, u64), (Arc<Task>, u64)>,
    /// Reverse index: `task_id` → primary key.
    /// Allows `remove(task_id)` to locate and delete the `entities` entry in
    /// O(log n) without scanning the primary map.
    by_id: BTreeMap<u64, (u64, u64)>,
    /// Floor used to clamp the vruntime of tasks being enqueued so that
    /// wakers don't gain an unfair head start; refreshed in `update_current`.
    min_vruntime: u64,
    /// Sum of the weights of all enqueued tasks. The currently-running task
    /// (removed from the maps by `pick_next`) is not included.
    total_weight: u64,
    /// Number of enqueued tasks; excludes the currently-running task.
    runnable_count: usize,
}
89
90impl FairClassRq {
91 /// Creates a new instance.
92 pub fn new() -> Self {
93 Self {
94 entities: BTreeMap::new(),
95 by_id: BTreeMap::new(),
96 min_vruntime: 0,
97 total_weight: 0,
98 runnable_count: 0,
99 }
100 }
101
102 /// Total scheduling period in ticks.
103 ///
104 /// `BASE_SLICE_TICKS * (nr_runnable + 1)` : each runnable task gets at
105 /// least one full `BASE_SLICE_TICKS` per round. `+1` accounts for the
106 /// currently-running task that is not counted in `runnable_count`.
107 fn period(&self) -> u64 {
108 let count = (self.runnable_count + 1) as u64;
109 (BASE_SLICE_TICKS * count).max(BASE_SLICE_TICKS)
110 }
111
112 /// Virtual-time slice: the vruntime budget for the current task.
113 fn vtime_slice(&self) -> u64 {
114 self.period() / (self.runnable_count + 1) as u64
115 }
116
117 /// Wall-clock time slice scaled by `cur_weight` relative to total weight.
118 fn time_slice(&self, cur_weight: u64) -> u64 {
119 let denom = self.total_weight + cur_weight;
120 if denom == 0 {
121 return self.period();
122 }
123 self.period() * cur_weight / denom
124 }
125}
126
impl SchedClassRq for FairClassRq {
    /// Enqueues a task onto the run queue.
    ///
    /// Clamps `vruntime` to `min_vruntime` so waking tasks do not receive an
    /// unfair head start over tasks that have been waiting. O(log n);
    /// allocates two BTreeMap nodes.
    ///
    /// Tasks whose policy is not `SchedPolicy::Fair` are silently ignored.
    // NOTE(review): confirm callers never route non-Fair tasks here, since
    // the silent drop would otherwise lose a task.
    fn enqueue(&mut self, task: Arc<Task>) {
        if let super::SchedPolicy::Fair(nice) = task.sched_policy() {
            let task_id = task.id.as_u64();

            // Guard against double-enqueue: `by_id` is the authoritative
            // membership record. This should not occur in normal operation.
            if self.by_id.contains_key(&task_id) {
                return;
            }

            let weight = nice_to_weight(nice);
            let mut vruntime = task.vruntime();
            if vruntime < self.min_vruntime {
                vruntime = self.min_vruntime;
            }
            // Write the clamped value back so the task and the rq key agree.
            task.set_vruntime(vruntime);
            // Keep the task-side `fair_on_rq` flag consistent with external
            // observers. The generation return value is unused in this design.
            task.fair_prepare_enqueue();

            let key = (vruntime, task_id);
            self.entities.insert(key, (task, weight));
            self.by_id.insert(task_id, key);
            self.total_weight += weight;
            self.runnable_count += 1;
        }
    }

    /// Returns the number of tasks currently on the run queue
    /// (excluding the currently-running task).
    fn len(&self) -> usize {
        self.runnable_count
    }

    /// Picks the next task to run: the one with the smallest vruntime.
    ///
    /// O(log n), allocation-free. The picked task is removed from both maps
    /// and deducted from the weight/count accounting; `min_vruntime` is left
    /// untouched until the next `update_current`.
    fn pick_next(&mut self) -> Option<Arc<Task>> {
        let ((_, task_id), (task, weight)) = self.entities.pop_first()?;
        self.by_id.remove(&task_id);
        task.fair_mark_dequeued();
        self.total_weight = self.total_weight.saturating_sub(weight);
        self.runnable_count = self.runnable_count.saturating_sub(1);
        Some(task)
    }

    /// Updates the vruntime of the currently-running task and decides whether
    /// it should be preempted.
    ///
    /// Returns `true` if the task has exhausted its time slice or its vruntime
    /// has overtaken the leftmost task's vruntime by more than `vtime_slice`.
    fn update_current(&mut self, rt: &CurrentRuntime, task: &Task, is_yield: bool) -> bool {
        // An explicit yield always gives up the CPU, regardless of policy.
        if is_yield {
            return true;
        }
        if let super::SchedPolicy::Fair(nice) = task.sched_policy() {
            let weight = nice_to_weight(nice);
            // Weighted virtual time: heavier tasks accrue vruntime more
            // slowly (delta * WEIGHT_0 / weight). The zero check is
            // defensive; the nice-to-weight table should never yield 0.
            let delta_vruntime = if weight == 0 {
                0
            } else {
                rt.delta_ticks * WEIGHT_0 / weight
            };
            let vruntime = task.vruntime() + delta_vruntime;
            task.set_vruntime(vruntime);

            // The leftmost entry is O(log n) to peek on a BTreeMap.
            let leftmost_vruntime = self.entities.keys().next().map(|&(v, _)| v);
            // NOTE(review): taking the min with the running task's vruntime
            // can move `min_vruntime` *backward*; classic CFS keeps it
            // monotonically non-decreasing — confirm this is intentional.
            self.min_vruntime = match leftmost_vruntime {
                Some(lv) => vruntime.min(lv),
                None => vruntime,
            };

            // No other runnable task : keep running.
            if leftmost_vruntime.is_none() {
                return false;
            }

            // Preempt when either the wall-clock slice is spent or the task
            // has run more than one virtual slice past the queue minimum.
            rt.period_delta_ticks > self.time_slice(weight)
                || vruntime > self.min_vruntime + self.vtime_slice()
        } else {
            // Non-Fair policies are not managed here; never request preemption.
            false
        }
    }

    /// Removes the task identified by `task_id` from the run queue.
    ///
    /// Uses the `by_id` reverse index for O(log n) lookup, then removes both
    /// entries. Allocation-free. Returns `true` if the task was present.
    fn remove(&mut self, task_id: crate::process::TaskId) -> bool {
        let Some(key) = self.by_id.remove(&task_id.as_u64()) else {
            return false;
        };
        if let Some((task, weight)) = self.entities.remove(&key) {
            task.fair_invalidate_rq_entry();
            self.total_weight = self.total_weight.saturating_sub(weight);
            self.runnable_count = self.runnable_count.saturating_sub(1);
            true
        } else {
            // `by_id` and `entities` are kept in sync on every mutation path;
            // reaching here indicates a bug in this module.
            debug_assert!(
                false,
                "FairClassRq: by_id/entities out of sync for task {:?}",
                task_id
            );
            false
        }
    }
}