
strat9_kernel/process/scheduler.rs

//! Scheduler implementation
//!
//! Implements a per-CPU round-robin scheduler for Strat9-OS with support for
//! cooperative and preemptive multitasking.
//!
//! ## Preemption design
//!
//! The timer interrupt (100Hz) calls `maybe_preempt()` which picks the next
//! task and performs a context switch. Interrupts are disabled while the
//! scheduler lock is held to prevent deadlock on single-core systems:
//!
//! - `yield_task()`: CLI → lock → pick next → TSS/CR3 → unlock → switch_context → restore IF
//! - Timer handler: CPU already cleared IF → lock → pick next → TSS/CR3 → unlock → switch_context
//!
//! Each task has its own 16KB kernel stack. Callee-saved registers are
//! pushed/popped by `switch_context()`. `CpuContext` only stores `saved_rsp`.
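//!
//! A minimal sketch of the `yield_task()` ordering above (illustrative only:
//! `pick_and_prepare_switch()` is a placeholder for the lock-holding section,
//! and the real call site also hands FPU/XCR0 state to `switch_context()`):
//!
//! ```ignore
//! let flags = save_flags_and_cli();   // 1) CLI before taking the scheduler lock
//! // 2) lock → pick next → program TSS/CR3 → build SwitchTarget → unlock
//! let target: Option<SwitchTarget> = pick_and_prepare_switch();
//! if let Some(t) = target {
//!     // 3) switch stacks only after the lock has been released
//!     unsafe { switch_context(t.old_rsp_ptr, t.new_rsp_ptr) };
//! }
//! restore_flags(flags);               // 4) restore IF last
//! ```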
//!
//! TODO(v3 scheduler):
//! - API stabilization before adding more features:
//!   - freeze scheduler command syntax.
//!   - add a small machine-friendly output format (key=value) for scripts/debug.
//! - observability v2:
//!   - per-class latency/wait histograms.
//!   - one structured dump format (instead of free-form text logs) for top/debug.
//! - targeted scheduler tests (high priority):
//!   - config validation/reject paths (class/policy map).
//!   - ready-task migration on class-table updates.
//!   - SMP steal/preempt non-regression.
//! - only then: CPU affinity (first truly useful advanced scheduler feature).
//!
//! Legacy backlog:
//! - class registry v2:
//!   - dynamic add/remove/reorder with validation and safe reject path.
//!   - policy->class mapping as runtime registry (not only static enum mapping).
//! - atomic class-table migration:
//!   - RCU/STW swap + migration of queued tasks across classes.
//!   - preserve per-task accounting (vruntime, rt budget, wake deadlines).
//! - balancing v2:
//!   - dedicated balancer module, per-class steal policy, CPU affinity masks.
//!   - NUMA-aware placement (future) and stronger anti-thrashing controls.
//! - SMP hardening:
//!   - explicit lock hierarchy doc + assertions.
//!   - improved resched IPI batching/coalescing policy tuning.
//! - observability v2:
//!   - latency/wait-time histograms per class + structured trace dump.
//!   - shell/top integration over stable snapshot API.
//! - tests:
//!   - deterministic migration/policy-remap/SMP-steal suites.
//!   - fairness/starvation long-run regression in test ISO.
//!
//! Optimization roadmap (stability-first, incremental):
//! 1) Lock contention reduction (highest ROI, low risk)
//!    - keep scheduler critical sections minimal: compute decisions under lock,
//!      execute expensive side effects (IPI, signal delivery, cleanup) after unlock.
//!    - split hot paths into tiny helpers with explicit "lock held / lock free" contract.
//!    - add/track contention counters in every try_lock fallback path.
//! 2) Wakeup path scalability (only after strong guards)
//!    - re-introduce deadline index behind a runtime feature flag (default OFF).
//!    - enforce single writer API for wake deadlines (no direct field stores in syscalls).
//!    - add strict invariants:
//!      - if task has deadline != 0, index contains task exactly once.
//!      - on wake/kill/exit/resume, deadline is removed from index and field cleared.
//!    - keep safe fallback scan path available and switchable at runtime.
//! 3) Scheduler observability for regressions
//!    - keep stable key=value output for scripts (`scheduler metrics kv`, `scheduler dump kv`).
//!    - expose blocked-task ids and per-cpu preempt causes to diagnose stalls quickly.
//!    - include boot-phase and lock-miss counters in all dump modes.
//! 4) Balancing/pick optimizations
//!    - tune steal hysteresis/cooldown with metrics, avoid ping-pong migration.
//!    - avoid counting idle task as runnable load for CPU selection.
//!    - add bounded per-tick work budgets to prevent long interrupt latency tails.
//! 5) Safety rails before each optimization lands
//!    - ship each optimization in one isolated patchset with rollback switch.
//!    - validate with targeted scenarios:
//!      - boot + shell responsiveness,
//!      - timeout-heavy workload (poll/futex/nanosleep),
//!      - SMP preempt/steal stress.
//!    - if any regression appears, disable feature first, debug second.

use super::task::{Pid, Task, TaskId, TaskPriority, TaskState, Tid};
use crate::{
    arch::x86_64::{apic, percpu, restore_flags, save_flags_and_cli, timer, timer::NS_PER_TICK},
    sync::SpinLock,
};
use alloc::{collections::BTreeMap, sync::Arc, vec::Vec};
use core::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use spin::RwLock as SpinRwLock;

/// Per-CPU scheduler tick counters used for CPU usage estimation.
///
/// - `CPU_TOTAL_TICKS[cpu]`: all timer ticks observed on `cpu`.
/// - `CPU_IDLE_TICKS[cpu]`: ticks where the idle task was running on `cpu`.
///
/// CPU usage over a time window:
/// `usage = 1 - (delta_idle / delta_total)`.
static CPU_TOTAL_TICKS: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_IDLE_TICKS: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_RT_RUNTIME_TICKS: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_FAIR_RUNTIME_TICKS: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_SWITCH_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_PREEMPT_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_STEAL_IN_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_STEAL_OUT_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_TRY_LOCK_FAIL_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static RESCHED_IPI_PENDING: [AtomicBool; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicBool::new(false) }; crate::arch::x86_64::percpu::MAX_CPUS];
static IPI_SEND_TRACE_BUDGET: AtomicU64 = AtomicU64::new(64);
/// Lock-free per-CPU hint: request a local preemption as soon as maybe_preempt
/// can observe scheduler state. Written from IRQ paths without touching
/// `GLOBAL_SCHED_STATE`, consumed under scheduler lock in `maybe_preempt`.
static FORCE_RESCHED_HINT: [AtomicBool; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicBool::new(false) }; crate::arch::x86_64::percpu::MAX_CPUS];
static LAST_STEAL_TICK: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
/// One-shot flag per CPU: set to true after the first preemption is logged.
/// Prevents flooding the serial port with a preempt trace on every tick.
pub(crate) static FIRST_PREEMPT_LOGGED: [AtomicBool; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicBool::new(false) }; crate::arch::x86_64::percpu::MAX_CPUS];

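// Work-stealing tuning knobs (intent inferred from their use in the balancing path):
// a sibling CPU is only considered when the runnable imbalance reaches
// STEAL_IMBALANCE_MIN, and steals are rate-limited per CPU to one every
// STEAL_COOLDOWN_TICKS ticks (tracked via LAST_STEAL_TICK).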
const STEAL_IMBALANCE_MIN: usize = 2;
const STEAL_COOLDOWN_TICKS: u64 = 2;

/// Returns the number of active CPUs, clamped to `1..=MAX_CPUS`.
#[inline]
fn active_cpu_count() -> usize {
    crate::arch::x86_64::smp::cpu_count()
        .max(1)
        .min(crate::arch::x86_64::percpu::MAX_CPUS)
}

/// Returns `true` if `cpu` is a valid CPU index.
#[inline]
fn cpu_is_valid(cpu: usize) -> bool {
    cpu < crate::arch::x86_64::percpu::MAX_CPUS
}

#[derive(Clone, Copy)]
pub struct CpuUsageSnapshot {
    pub cpu_count: usize,
    pub total_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub idle_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
}

#[derive(Clone, Copy)]
pub struct SchedulerMetricsSnapshot {
    pub cpu_count: usize,
    pub rt_runtime_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub fair_runtime_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub idle_runtime_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub switch_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub preempt_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub steal_in_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub steal_out_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub try_lock_fail_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
}

#[derive(Clone, Copy)]
pub struct SchedulerStateSnapshot {
    pub initialized: bool,
    pub boot_phase: u8,
    pub cpu_count: usize,
    pub pick_order: [crate::process::sched::SchedClassId; 3],
    pub steal_order: [crate::process::sched::SchedClassId; 2],
    pub blocked_tasks: usize,
    pub current_task: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub rq_rt: [usize; crate::arch::x86_64::percpu::MAX_CPUS],
    pub rq_fair: [usize; crate::arch::x86_64::percpu::MAX_CPUS],
    pub rq_idle: [usize; crate::arch::x86_64::percpu::MAX_CPUS],
    pub need_resched: [bool; crate::arch::x86_64::percpu::MAX_CPUS],
}

/// Takes a point-in-time snapshot of the per-CPU total and idle tick counters.
pub fn cpu_usage_snapshot() -> CpuUsageSnapshot {
    let cpu_count = active_cpu_count();
    let mut total_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut idle_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    for i in 0..cpu_count {
        total_ticks[i] = CPU_TOTAL_TICKS[i].load(Ordering::Relaxed);
        idle_ticks[i] = CPU_IDLE_TICKS[i].load(Ordering::Relaxed);
    }
    CpuUsageSnapshot {
        cpu_count,
        total_ticks,
        idle_ticks,
    }
}
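
// Example of the usage formula documented above: `usage = 1 - delta_idle / delta_total`
// over a window bounded by two snapshots. `cpu_usage_percent` is an illustrative helper
// only (not part of the scheduler API); consumers such as top may compute this themselves.
#[allow(dead_code)]
fn cpu_usage_percent(before: &CpuUsageSnapshot, after: &CpuUsageSnapshot, cpu: usize) -> u64 {
    if cpu >= after.cpu_count {
        return 0;
    }
    let delta_total = after.total_ticks[cpu].saturating_sub(before.total_ticks[cpu]);
    let delta_idle = after.idle_ticks[cpu].saturating_sub(before.idle_ticks[cpu]);
    if delta_total == 0 {
        return 0;
    }
    // usage = 1 - idle/total, expressed as an integer percentage
    delta_total.saturating_sub(delta_idle) * 100 / delta_total
}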

/// Takes a point-in-time snapshot of the per-CPU scheduler metrics counters.
pub fn scheduler_metrics_snapshot() -> SchedulerMetricsSnapshot {
    let cpu_count = active_cpu_count();
    let mut rt_runtime_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut fair_runtime_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut idle_runtime_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut switch_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut preempt_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut steal_in_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut steal_out_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut try_lock_fail_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    for i in 0..cpu_count {
        rt_runtime_ticks[i] = CPU_RT_RUNTIME_TICKS[i].load(Ordering::Relaxed);
        fair_runtime_ticks[i] = CPU_FAIR_RUNTIME_TICKS[i].load(Ordering::Relaxed);
        idle_runtime_ticks[i] = CPU_IDLE_TICKS[i].load(Ordering::Relaxed);
        switch_count[i] = CPU_SWITCH_COUNT[i].load(Ordering::Relaxed);
        preempt_count[i] = CPU_PREEMPT_COUNT[i].load(Ordering::Relaxed);
        steal_in_count[i] = CPU_STEAL_IN_COUNT[i].load(Ordering::Relaxed);
        steal_out_count[i] = CPU_STEAL_OUT_COUNT[i].load(Ordering::Relaxed);
        try_lock_fail_count[i] = CPU_TRY_LOCK_FAIL_COUNT[i].load(Ordering::Relaxed);
    }
    SchedulerMetricsSnapshot {
        cpu_count,
        rt_runtime_ticks,
        fair_runtime_ticks,
        idle_runtime_ticks,
        switch_count,
        preempt_count,
        steal_in_count,
        steal_out_count,
        try_lock_fail_count,
    }
}

/// Resets all per-CPU scheduler metrics counters to zero.
pub fn reset_scheduler_metrics() {
    let cpu_count = active_cpu_count();
    for i in 0..cpu_count {
        CPU_RT_RUNTIME_TICKS[i].store(0, Ordering::Relaxed);
        CPU_FAIR_RUNTIME_TICKS[i].store(0, Ordering::Relaxed);
        CPU_IDLE_TICKS[i].store(0, Ordering::Relaxed);
        CPU_SWITCH_COUNT[i].store(0, Ordering::Relaxed);
        CPU_PREEMPT_COUNT[i].store(0, Ordering::Relaxed);
        CPU_STEAL_IN_COUNT[i].store(0, Ordering::Relaxed);
        CPU_STEAL_OUT_COUNT[i].store(0, Ordering::Relaxed);
        CPU_TRY_LOCK_FAIL_COUNT[i].store(0, Ordering::Relaxed);
    }
}

/// Records a scheduler `try_lock` failure observed on `cpu`.
#[inline]
pub(crate) fn note_try_lock_fail_on_cpu(cpu: usize) {
    if cpu_is_valid(cpu) {
        CPU_TRY_LOCK_FAIL_COUNT[cpu].fetch_add(1, Ordering::Relaxed);
    }
}

/// Records a scheduler `try_lock` failure on the current CPU.
#[inline]
pub fn note_try_lock_fail() {
    note_try_lock_fail_on_cpu(current_cpu_index());
}

// ========== Cross-CPU IPI helpers ==========

/// Send a reschedule IPI to `cpu_index`.
/// No-op if APIC is not initialized, or if `cpu_index` is the current CPU
/// (the caller already handles the local-CPU case via `yield_cpu`).
fn send_resched_ipi_to_cpu(cpu_index: usize) {
    if !cpu_is_valid(cpu_index) {
        return;
    }
    if !apic::is_initialized() {
        return;
    }
    let my_cpu = current_cpu_index();
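    // Decrement the one-shot trace budget: only the first 64 IPI sends are traced,
    // after which `fetch_update` fails (checked_sub on zero) and tracing stops.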
    let should_trace = IPI_SEND_TRACE_BUDGET
        .fetch_update(Ordering::AcqRel, Ordering::Relaxed, |budget| {
            budget.checked_sub(1)
        })
        .is_ok();
    if let Some(target_apic) = percpu::apic_id_by_cpu_index(cpu_index) {
        if let Some(my_apic) = percpu::apic_id_by_cpu_index(my_cpu) {
            if target_apic != my_apic {
                if RESCHED_IPI_PENDING[cpu_index].swap(true, Ordering::AcqRel) {
                    if should_trace {
                        crate::e9_println!(
                            "[ipi-send-skip] from_cpu={} to_cpu={} to_apic={:#x} pending=1",
                            my_cpu,
                            cpu_index,
                            target_apic
                        );
                    }
                    return;
                }
                if should_trace {
                    crate::e9_println!(
                        "[ipi-send] from_cpu={} from_apic={:#x} to_cpu={} to_apic={:#x}",
                        my_cpu,
                        my_apic,
                        cpu_index,
                        target_apic
                    );
                }
                apic::send_resched_ipi(target_apic);
            }
        }
    }
}

/// Request a local force-reschedule hint for `cpu`.
#[inline]
pub(crate) fn request_force_resched_hint(cpu: usize) {
    if cpu_is_valid(cpu) {
        FORCE_RESCHED_HINT[cpu].store(true, Ordering::Release);
    }
}

/// Consume and clear the local force-reschedule hint for `cpu`.
#[inline]
pub(crate) fn take_force_resched_hint(cpu: usize) -> bool {
    if cpu_is_valid(cpu) {
        FORCE_RESCHED_HINT[cpu].swap(false, Ordering::AcqRel)
    } else {
        false
    }
}

/// Global scheduler state (cold path: fork, exit, wake, block).
///
/// Lock order: acquire `GLOBAL_SCHED_STATE` before `LOCAL_SCHEDULERS[n]`
/// when both are needed. Never hold a LOCAL lock and then block-acquire GLOBAL.
///
/// When both `GLOBAL_SCHED_STATE` and `SCHED_IDENTITY` are needed, always take
/// `GLOBAL_SCHED_STATE` first, then `SCHED_IDENTITY`. Paths that reversed this
/// order deadlocked with `kill_task` / `try_reap_child_locked` during shutdown.
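///
/// Sketch of the ordering contract (illustrative; assumes the crate's RAII lock guards):
///
/// ```ignore
/// let mut global = GLOBAL_SCHED_STATE.lock();   // GLOBAL first ...
/// let mut local = LOCAL_SCHEDULERS[cpu].lock(); // ... then LOCAL
/// // compute decisions while holding both locks
/// drop(local);                                  // release LOCAL before GLOBAL
/// drop(global);
/// ```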
pub(crate) static GLOBAL_SCHED_STATE: SpinLock<Option<GlobalSchedState>> = SpinLock::new(None);

/// Returns the scheduler lock address for deadlock tracing.
pub fn debug_scheduler_lock_addr() -> usize {
    &GLOBAL_SCHED_STATE as *const _ as usize
}

/// Global tick counter (safe to increment from interrupt context)
static TICK_COUNT: AtomicU64 = AtomicU64::new(0);
/// Verbose scheduler trace switch.
static SCHED_VERBOSE: AtomicBool = AtomicBool::new(false);

/// Emits a scheduler trace message when verbose tracing is enabled.
#[inline]
fn sched_trace(args: core::fmt::Arguments<'_>) {
    if SCHED_VERBOSE.load(Ordering::Relaxed) {
        log::debug!("[sched] {}", args);
    }
}

/// Information needed to perform a context switch after releasing the lock.
pub(super) struct SwitchTarget {
    pub(super) old_rsp_ptr: *mut u64,
    pub(super) new_rsp_ptr: *const u64,
    pub(super) old_fpu_ptr: *mut u8,
    pub(super) new_fpu_ptr: *const u8,
    pub(super) old_xcr0: u64,
    pub(super) new_xcr0: u64,
}

// SAFETY: The pointers in SwitchTarget point into Arc<Task> objects
// that are kept alive by the scheduler. The scheduler lock ensures
// exclusive access when computing these pointers.
unsafe impl Send for SwitchTarget {}

/// Result of a non-blocking wait on child exit.
pub enum WaitChildResult {
    Reaped {
        child: TaskId,
        pid: Pid,
        status: i32,
    },
    NoChildren,
    StillRunning,
}

/// Returns the index of the CPU this code is currently executing on.
fn current_cpu_index() -> usize {
    crate::arch::x86_64::percpu::current_cpu_index()
}

struct PerCpuClassRqSet {
    real_time: crate::process::sched::real_time::RealTimeClassRq,
    fair: crate::process::sched::fair::FairClassRq,
    idle: crate::process::sched::idle::IdleClassRq,
}

impl PerCpuClassRqSet {
    /// Creates an empty run-queue set with one queue per scheduling class.
    fn new() -> Self {
        Self {
            real_time: crate::process::sched::real_time::RealTimeClassRq::new(),
            fair: crate::process::sched::fair::FairClassRq::new(),
            idle: crate::process::sched::idle::IdleClassRq::new(),
        }
    }

    /// Enqueues `task` on the run queue of `class`.
    fn enqueue(&mut self, class: crate::process::sched::SchedClassId, task: Arc<Task>) {
        use crate::process::sched::SchedClassRq;
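        // Single-byte progress markers written to I/O port 0xE9 (QEMU/Bochs debug
        // console): 'q' on entry, 'f'/'r'/'i' for the selected class, 'Q' on exit.
        // Useful for tracing enqueue paths very early, before the logger is usable.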
        unsafe { core::arch::asm!("mov al, 'q'; out 0xe9, al", out("al") _) };
        match class {
            crate::process::sched::SchedClassId::Fair => {
                unsafe { core::arch::asm!("mov al, 'f'; out 0xe9, al", out("al") _) };
                self.fair.enqueue(task);
            }
            crate::process::sched::SchedClassId::RealTime => {
                unsafe { core::arch::asm!("mov al, 'r'; out 0xe9, al", out("al") _) };
                self.real_time.enqueue(task);
            }
            crate::process::sched::SchedClassId::Idle => {
                unsafe { core::arch::asm!("mov al, 'i'; out 0xe9, al", out("al") _) };
                self.idle.enqueue(task);
            }
        }
        unsafe { core::arch::asm!("mov al, 'Q'; out 0xe9, al", out("al") _) };
    }

    /// Returns the number of tasks currently queued in `class`.
    fn len_by_class(&self, class: crate::process::sched::SchedClassId) -> usize {
        use crate::process::sched::SchedClassRq;
        match class {
            crate::process::sched::SchedClassId::Fair => self.fair.len(),
            crate::process::sched::SchedClassId::RealTime => self.real_time.len(),
            crate::process::sched::SchedClassId::Idle => self.idle.len(),
        }
    }

    /// Returns the number of runnable tasks (real-time + fair), excluding the idle class.
    fn runnable_len(&self) -> usize {
        self.len_by_class(crate::process::sched::SchedClassId::RealTime)
            + self.len_by_class(crate::process::sched::SchedClassId::Fair)
    }

    /// Picks the next task from the run queue of `class`, if any.
    fn pick_next_by_class(
        &mut self,
        class: crate::process::sched::SchedClassId,
    ) -> Option<Arc<Task>> {
        use crate::process::sched::SchedClassRq;
        match class {
            crate::process::sched::SchedClassId::Fair => self.fair.pick_next(),
            crate::process::sched::SchedClassId::RealTime => self.real_time.pick_next(),
            crate::process::sched::SchedClassId::Idle => self.idle.pick_next(),
        }
    }

    /// Picks the next task to run, trying classes in the table's pick order.
    fn pick_next(&mut self, table: &crate::process::sched::SchedClassTable) -> Option<Arc<Task>> {
        for class in table.pick_order().iter().copied() {
            if let Some(task) = self.pick_next_by_class(class) {
                return Some(task);
            }
        }
        None
    }

    /// Updates runtime accounting for the currently running task and returns
    /// whether it should be preempted.
    fn update_current(
        &mut self,
        rt: &crate::process::sched::CurrentRuntime,
        task: &Task,
        is_yield: bool,
        table: &crate::process::sched::SchedClassTable,
    ) -> bool {
        use crate::process::sched::SchedClassRq;
        let should_preempt = match table.class_for_task(task) {
            crate::process::sched::SchedClassId::Fair => {
                self.fair.update_current(rt, task, is_yield)
            }
            crate::process::sched::SchedClassId::RealTime => {
                self.real_time.update_current(rt, task, is_yield)
            }
            crate::process::sched::SchedClassId::Idle => {
                self.idle.update_current(rt, task, is_yield)
            }
        };
        // Always preempt idle task if there are other tasks ready
        let any_ready = !self.real_time.is_empty() || !self.fair.is_empty();
        should_preempt
            || (table.class_for_task(task) == crate::process::sched::SchedClassId::Idle
                && any_ready)
    }

    /// Removes `task_id` from whichever class run queue contains it.
    fn remove(&mut self, task_id: crate::process::TaskId) -> bool {
        use crate::process::sched::SchedClassRq;
        self.real_time.remove(task_id) || self.fair.remove(task_id) || self.idle.remove(task_id)
    }

    /// Picks a candidate task for work stealing, trying classes in the table's steal order.
    fn steal_candidate(
        &mut self,
        table: &crate::process::sched::SchedClassTable,
    ) -> Option<Arc<Task>> {
        for class in table.steal_order().iter().copied() {
            if let Some(task) = self.pick_next_by_class(class) {
                return Some(task);
            }
        }
        None
    }
}

/// Per-CPU scheduler state
struct SchedulerCpu {
    /// Multi-class priority queues
    class_rqs: PerCpuClassRqSet,
    /// Currently running task
    current_task: Option<Arc<Task>>,
    /// Current runtime accounting
    current_runtime: crate::process::sched::CurrentRuntime,
    /// Idle task to run when no other tasks are ready
    idle_task: Arc<Task>,
    /// Task that was just preempted and needs to be re-queued
    task_to_requeue: Option<Arc<Task>>,
    /// Task that is dying or blocked, to drop outside the scheduler lock
    task_to_drop: Option<Arc<Task>>,
    /// Flag indicating if the current task's time slice has expired
    need_resched: bool,
    /// Local copy of the class table for hot-path use without GLOBAL lock.
    /// Updated atomically when the global class table changes.
    class_table: crate::process::sched::SchedClassTable,
}

/// Per-CPU local scheduler locks (hot path: timer tick, preemption, yield).
///
/// Lock order: LOCAL before nothing; never hold two LOCAL locks simultaneously
/// (use `try_lock` when touching a sibling CPU).
#[allow(dead_code)]
pub(crate) static LOCAL_SCHEDULERS: [SpinLock<Option<SchedulerCpu>>;
    crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { SpinLock::new(None) }; crate::arch::x86_64::percpu::MAX_CPUS];

/// Blocked tasks registry (hot path: block/wake).
///
/// This lock is **independent** of `GLOBAL_SCHED_STATE`. The block and wake
/// paths acquire only this lock + the target CPU's `LOCAL_SCHEDULERS[cpu]`
/// lock, avoiding contention with cold-path operations (fork, exit, kill).
///
/// Lock order: BLOCKED_TASKS before LOCAL (never the reverse).
pub(crate) static BLOCKED_TASKS: SpinLock<BTreeMap<TaskId, Arc<Task>>> =
    SpinLock::new(BTreeMap::new());

/// Identity maps (cold path: PID/TID lookups, process groups, sessions, parent/child).
///
/// Separate from `GLOBAL_SCHED_STATE` so that identity lookups (getpid, getpgid,
/// setpgid, etc.) never contend with fork/exit/zombie management.
///
/// Upgraded to an `RwLock` so that concurrent readers (`getpid`, `gettid`,
/// `getppid`, `getpgid`, `getsid`, `get_task_by_pid`) do not serialize on
/// each other. Only writers (`fork`, `exit`, `setpgid`, `setsid`, `kill`)
/// acquire the exclusive write lock.
///
/// Lock order: SCHED_IDENTITY before LOCAL (never the reverse).
/// SCHED_IDENTITY and BLOCKED_TASKS are independent: never hold both.
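///
/// Typical access pattern, illustratively (placeholder variables, spin `RwLock` API):
///
/// ```ignore
/// // getpid-style lookup: shared read lock, runs concurrently with other readers.
/// let task_id = SCHED_IDENTITY.read().pid_to_task.get(&pid).copied();
///
/// // fork-style update: exclusive write lock.
/// let mut id = SCHED_IDENTITY.write();
/// id.pid_to_task.insert(child_pid, child_task_id);
/// id.parent_of.insert(child_task_id, parent_task_id);
/// ```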
pub(crate) static SCHED_IDENTITY: SpinRwLock<SchedIdentity> = SpinRwLock::new(SchedIdentity::new());

/// Identity maps for the scheduler: PID/TID routing, process groups,
/// session membership, and parent/child relationships.
///
/// Lives behind the `SCHED_IDENTITY` lock, separate from `GLOBAL_SCHED_STATE`
/// so that syscall lookups (`getpid`, `getpgid`, `setpgid`, `setsid`, etc.)
/// never contend with fork/exit or block/wake paths.
pub struct SchedIdentity {
    /// Map userspace PID -> internal TaskId (process leader in current model).
    pub pid_to_task: BTreeMap<Pid, TaskId>,
    /// Map userspace TID -> internal TaskId (fast thread lookup).
    pub tid_to_task: BTreeMap<Tid, TaskId>,
    /// Map PID -> process group id.
    pub pid_to_pgid: BTreeMap<Pid, Pid>,
    /// Map PID -> session id.
    pub pid_to_sid: BTreeMap<Pid, Pid>,
    /// Group membership index: pgid -> task ids.
    pub pgid_members: BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
    /// Session membership index: sid -> task ids.
    pub sid_members: BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
    /// Parent relationship: child -> parent
    pub parent_of: BTreeMap<TaskId, TaskId>,
    /// Children list: parent -> children
    pub children_of: BTreeMap<TaskId, alloc::vec::Vec<TaskId>>,
}

impl SchedIdentity {
    /// Creates a new empty identity registry.
    pub const fn new() -> Self {
        Self {
            pid_to_task: BTreeMap::new(),
            tid_to_task: BTreeMap::new(),
            pid_to_pgid: BTreeMap::new(),
            pid_to_sid: BTreeMap::new(),
            pgid_members: BTreeMap::new(),
            sid_members: BTreeMap::new(),
            parent_of: BTreeMap::new(),
            children_of: BTreeMap::new(),
        }
    }
}

/// Global task registry (cold path: fork, exit, all_tasks scan).
///
/// Lock order: acquire GLOBAL_SCHED_STATE before LOCAL when both are needed.
/// Per-CPU runqueues and current-task tracking live in `LOCAL_SCHEDULERS`.
/// Blocked tasks are tracked in `BLOCKED_TASKS` (separate lock).
/// Identity maps (PID/TID, pgid, sid, parent/child) are in `SCHED_IDENTITY` (separate lock).
/// This struct holds only data that is accessed by cold paths (fork, exit,
/// all_tasks scan, zombie management, wake deadlines) and is protected by the
/// `GLOBAL_SCHED_STATE` lock.
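///
/// The wake-deadline maps below are meant to be updated through a single writer,
/// as described in the module-level roadmap. Illustrative sketch (`state`, `task_id`
/// and `deadline` are placeholders):
///
/// ```ignore
/// // Remove any stale entry first so a task appears in the index at most once.
/// if let Some(old) = state.wake_deadline_of.remove(&task_id) {
///     if let Some(ids) = state.wake_deadlines.get_mut(&old) {
///         ids.retain(|t| *t != task_id);
///         if ids.is_empty() { state.wake_deadlines.remove(&old); }
///     }
/// }
/// if deadline != 0 {
///     state.wake_deadlines.entry(deadline).or_default().push(task_id);
///     state.wake_deadline_of.insert(task_id, deadline);
/// }
/// ```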
pub struct GlobalSchedState {
    /// All tasks in the system (for lookup by TaskId)
    pub(crate) all_tasks: BTreeMap<TaskId, Arc<Task>>,
    /// Flat task snapshot used by IRQ-safe scans such as `tick_all_timers`.
    ///
    /// Unlike `BTreeMap::iter()`, iterating a contiguous `Vec<Arc<Task>>` avoids
    /// tree-walk pointer chasing in hard-IRQ context. The authoritative storage
    /// remains `all_tasks`; this mirror is updated under the same scheduler lock.
    pub(crate) all_tasks_scan: Vec<Arc<Task>>,
    /// Map TaskId -> CPU index (for wake/resume routing)
    task_cpu: BTreeMap<TaskId, usize>,
    /// Deadline -> task ids map for sleeping tasks (ordered wakeups).
    #[allow(dead_code)]
    wake_deadlines: BTreeMap<u64, alloc::vec::Vec<TaskId>>,
    /// Task -> deadline reverse index.
    #[allow(dead_code)]
    wake_deadline_of: BTreeMap<TaskId, u64>,
    /// Zombie exit statuses: child -> (exit_code, pid)
    zombies: BTreeMap<TaskId, (i32, Pid)>,
    /// Scheduler class table (pick order, steal order, class metadata)
    class_table: crate::process::sched::SchedClassTable,
}

/// Validates that a task's saved context looks sane before switching to it.
fn validate_task_context(task: &Arc<Task>) -> Result<(), &'static str> {
    let ptr = Arc::as_ptr(task);
    if ptr.is_null() {
        return Err("null Task pointer in Arc");
    }
    let saved_rsp = unsafe { (*task.context.get()).saved_rsp };
    let stack_base = task.kernel_stack.virt_base.as_u64();
    let stack_top = stack_base.saturating_add(task.kernel_stack.size as u64);

    if saved_rsp < stack_base || saved_rsp.saturating_add(56) > stack_top {
        return Err("saved_rsp outside kernel stack bounds");
    }

    // For our switch frame layout, return IP is at [saved_rsp + 48].
    // Use read_unaligned: saved_rsp is only guaranteed to be within the stack
    // bounds, not necessarily aligned to 8 bytes at this offset.
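    // Presumed frame layout (consistent with six callee-saved GPRs being pushed by
    // `switch_context()`): [saved_rsp + 0..48) holds the saved registers (6 x 8 bytes)
    // and [saved_rsp + 48] holds the return RIP pushed by the call instruction.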
    let ret_ip = unsafe { core::ptr::read_unaligned((saved_rsp + 48) as *const u64) };
    if ret_ip == 0 {
        return Err("null return IP in switch frame");
    }

    Ok(())
}

mod core_impl;
pub mod perf_counters;
mod runtime_ops;
mod task_ops;
mod timer_ops;

pub use runtime_ops::*;
pub use task_ops::*;
pub use timer_ops::*;