// SPDX-License-Identifier: MPL-2.0
// strat9_kernel/process/sched_classes/fair.rs

use alloc::{collections::BTreeMap, sync::Arc};

use super::{CurrentRuntime, SchedClassRq};
use crate::process::task::Task;

7const WEIGHT_0: u64 = 1024;
8
9/// Base time slice per task in ticks for the CFS fair scheduler.
10///
11/// At TIMER_HZ=100 (10 ms/tick):
12///   BASE_SLICE_TICKS = 1 -> 1 tick = 10 ms per task (matches `quantum_ms: 10`)
13///
14/// Previously this was mistakenly 10, giving 10 ticks = 100 ms slices and
15/// effectively disabling preemption for lightly loaded workloads.
16///
17/// Derivation: target_ms = 10 ms, tick_ms = 1000 / TIMER_HZ = 10 ms -> 1 tick.
18const BASE_SLICE_TICKS: u64 = 1;
19
20/// Performs the nice to weight operation.
21pub const fn nice_to_weight(nice: super::nice::Nice) -> u64 {
22    const FACTOR_NUMERATOR: u64 = 5;
23    const FACTOR_DENOMINATOR: u64 = 4;
24
25    const NICE_TO_WEIGHT: [u64; 40] = const {
26        let mut ret = [0; 40];
27        let mut index = 0;
28        let mut nice = super::nice::NiceValue::MIN.get();
29        while nice <= super::nice::NiceValue::MAX.get() {
30            ret[index] = match nice {
31                0 => WEIGHT_0,
32                nice @ 1.. => {
33                    let numerator = FACTOR_DENOMINATOR.pow(nice as u32);
34                    let denominator = FACTOR_NUMERATOR.pow(nice as u32);
35                    WEIGHT_0 * numerator / denominator
36                }
37                nice => {
38                    let numerator = FACTOR_NUMERATOR.pow((-nice) as u32);
39                    let denominator = FACTOR_DENOMINATOR.pow((-nice) as u32);
40                    WEIGHT_0 * numerator / denominator
41                }
42            };
43            index += 1;
44            nice += 1;
45        }
46        ret
47    };
48
49    NICE_TO_WEIGHT[(nice.value().get() + 20) as usize]
50}
51
52/// Per-CPU run queue for the Completely Fair Scheduler.
53///
54/// Uses two `BTreeMap`s for O(log n) operations without allocation on the
55/// fast paths (`pick_next`, `remove`):
56///
57/// - `entities`: primary map keyed by `(vruntime, task_id)` : the minimum
58///   entry is always the next task to schedule.
59/// - `by_id`: reverse index mapping `task_id → primary key` : enables O(log n)
60///   removal by task ID in `remove()` without scanning `entities`.
61///
62/// This replaces the previous `BinaryHeap`-based design which used *lazy
63/// deletion* (O(n) `remove()` scan, phantom entries, generation counters).
64/// The `BTreeMap` approach gives:
65///
66/// | Operation   | Complexity | Allocates?                              |
67/// |-------------|------------|-----------------------------------------|
68/// | `enqueue`   | O(log n)   | yes : 2 BTreeMap nodes (wakeup path)    |
69/// | `pick_next` | O(log n)   | no  : removes 2 nodes                   |
70/// | `remove`    | O(log n)   | no  : removes 2 nodes                   |
71///
72/// No phantom entries means no generation counter, no `prune_stale_head()`,
73/// and no per-entry liveness checks.  The BTreeMap pair is the authoritative
74/// record of which tasks are currently on the run queue.
75pub struct FairClassRq {
76    /// Primary index: `(vruntime, task_id)` → `(Arc<Task>, weight)`.
77    /// Ordered so `pop_first()` yields the task with the smallest vruntime.
78    /// `task_id` is part of the key to ensure uniqueness when two tasks share
79    /// the same vruntime.
80    entities: BTreeMap<(u64, u64), (Arc<Task>, u64)>,
81    /// Reverse index: `task_id` → primary key.
82    /// Allows `remove(task_id)` to locate and delete the `entities` entry in
83    /// O(log n) without scanning the primary map.
84    by_id: BTreeMap<u64, (u64, u64)>,
85    min_vruntime: u64,
86    total_weight: u64,
87    runnable_count: usize,
88}
89
90impl FairClassRq {
91    /// Creates a new instance.
92    pub fn new() -> Self {
93        Self {
94            entities: BTreeMap::new(),
95            by_id: BTreeMap::new(),
96            min_vruntime: 0,
97            total_weight: 0,
98            runnable_count: 0,
99        }
100    }
101
102    /// Total scheduling period in ticks.
103    ///
104    /// `BASE_SLICE_TICKS * (nr_runnable + 1)` : each runnable task gets at
105    /// least one full `BASE_SLICE_TICKS` per round.  `+1` accounts for the
106    /// currently-running task that is not counted in `runnable_count`.
107    fn period(&self) -> u64 {
108        let count = (self.runnable_count + 1) as u64;
109        (BASE_SLICE_TICKS * count).max(BASE_SLICE_TICKS)
110    }
111
112    /// Virtual-time slice: the vruntime budget for the current task.
113    fn vtime_slice(&self) -> u64 {
114        self.period() / (self.runnable_count + 1) as u64
115    }
116
117    /// Wall-clock time slice scaled by `cur_weight` relative to total weight.
118    fn time_slice(&self, cur_weight: u64) -> u64 {
119        let denom = self.total_weight + cur_weight;
120        if denom == 0 {
121            return self.period();
122        }
123        self.period() * cur_weight / denom
124    }
125}
126
127impl SchedClassRq for FairClassRq {
128    /// Enqueues a task onto the run queue.
129    ///
130    /// Clamps `vruntime` to `min_vruntime` so waking tasks do not receive an
131    /// unfair head start over tasks that have been waiting.  O(log n);
132    /// allocates two BTreeMap nodes.
133    fn enqueue(&mut self, task: Arc<Task>) {
134        if let super::SchedPolicy::Fair(nice) = task.sched_policy() {
135            let task_id = task.id.as_u64();
136
137            // Guard against double-enqueue: `by_id` is the authoritative
138            // membership record.  This should not occur in normal operation.
139            if self.by_id.contains_key(&task_id) {
140                return;
141            }
142
143            let weight = nice_to_weight(nice);
144            let mut vruntime = task.vruntime();
145            if vruntime < self.min_vruntime {
146                vruntime = self.min_vruntime;
147            }
148            task.set_vruntime(vruntime);
149            // Keep the task-side `fair_on_rq` flag consistent with external
150            // observers.  The generation return value is unused in this design.
151            task.fair_prepare_enqueue();
152
153            let key = (vruntime, task_id);
154            self.entities.insert(key, (task, weight));
155            self.by_id.insert(task_id, key);
156            self.total_weight += weight;
157            self.runnable_count += 1;
158        }
159    }
160
161    /// Returns the number of tasks currently on the run queue.
162    fn len(&self) -> usize {
163        self.runnable_count
164    }
165
166    /// Picks the next task to run: the one with the smallest vruntime.
167    ///
168    /// O(log n), allocation-free.
169    fn pick_next(&mut self) -> Option<Arc<Task>> {
170        let ((_, task_id), (task, weight)) = self.entities.pop_first()?;
171        self.by_id.remove(&task_id);
172        task.fair_mark_dequeued();
173        self.total_weight = self.total_weight.saturating_sub(weight);
174        self.runnable_count = self.runnable_count.saturating_sub(1);
175        Some(task)
176    }
177
178    /// Updates the vruntime of the currently-running task and decides whether
179    /// it should be preempted.
180    ///
181    /// Returns `true` if the task has exhausted its time slice or its vruntime
182    /// has overtaken the leftmost task's vruntime by more than `vtime_slice`.
183    fn update_current(&mut self, rt: &CurrentRuntime, task: &Task, is_yield: bool) -> bool {
184        if is_yield {
185            return true;
186        }
187        if let super::SchedPolicy::Fair(nice) = task.sched_policy() {
188            let weight = nice_to_weight(nice);
189            let delta_vruntime = if weight == 0 {
190                0
191            } else {
192                rt.delta_ticks * WEIGHT_0 / weight
193            };
194            let vruntime = task.vruntime() + delta_vruntime;
195            task.set_vruntime(vruntime);
196
197            // The leftmost entry is O(log n) to peek on a BTreeMap.
198            let leftmost_vruntime = self.entities.keys().next().map(|&(v, _)| v);
199            self.min_vruntime = match leftmost_vruntime {
200                Some(lv) => vruntime.min(lv),
201                None => vruntime,
202            };
203
204            // No other runnable task : keep running.
205            if leftmost_vruntime.is_none() {
206                return false;
207            }
208
209            rt.period_delta_ticks > self.time_slice(weight)
210                || vruntime > self.min_vruntime + self.vtime_slice()
211        } else {
212            false
213        }
214    }
215
216    /// Removes the task identified by `task_id` from the run queue.
217    ///
218    /// Uses the `by_id` reverse index for O(log n) lookup, then removes both
219    /// entries.  Allocation-free.  Returns `true` if the task was present.
220    fn remove(&mut self, task_id: crate::process::TaskId) -> bool {
221        let Some(key) = self.by_id.remove(&task_id.as_u64()) else {
222            return false;
223        };
224        if let Some((task, weight)) = self.entities.remove(&key) {
225            task.fair_invalidate_rq_entry();
226            self.total_weight = self.total_weight.saturating_sub(weight);
227            self.runnable_count = self.runnable_count.saturating_sub(1);
228            true
229        } else {
230            // `by_id` and `entities` are kept in sync on every mutation path;
231            // reaching here indicates a bug in this module.
232            debug_assert!(
233                false,
234                "FairClassRq: by_id/entities out of sync for task {:?}",
235                task_id
236            );
237            false
238        }
239    }
240}