use super::task::{Pid, Task, TaskId, TaskPriority, TaskState, Tid};
use crate::{
    arch::x86_64::{apic, percpu, restore_flags, save_flags_and_cli, timer, timer::NS_PER_TICK},
    sync::SpinLock,
};
use alloc::{collections::BTreeMap, sync::Arc, vec::Vec};
use core::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use spin::RwLock as SpinRwLock;

static CPU_TOTAL_TICKS: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_IDLE_TICKS: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_RT_RUNTIME_TICKS: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_FAIR_RUNTIME_TICKS: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_SWITCH_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_PREEMPT_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_STEAL_IN_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_STEAL_OUT_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static CPU_TRY_LOCK_FAIL_COUNT: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
static RESCHED_IPI_PENDING: [AtomicBool; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicBool::new(false) }; crate::arch::x86_64::percpu::MAX_CPUS];
static IPI_SEND_TRACE_BUDGET: AtomicU64 = AtomicU64::new(64);
static FORCE_RESCHED_HINT: [AtomicBool; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicBool::new(false) }; crate::arch::x86_64::percpu::MAX_CPUS];
static LAST_STEAL_TICK: [AtomicU64; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicU64::new(0) }; crate::arch::x86_64::percpu::MAX_CPUS];
pub(crate) static FIRST_PREEMPT_LOGGED: [AtomicBool; crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { AtomicBool::new(false) }; crate::arch::x86_64::percpu::MAX_CPUS];

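// Work-stealing tuning knobs. The intent is inferred from the names rather
// than spelled out in this file: a peer's run queue must be longer than ours
// by at least `STEAL_IMBALANCE_MIN` before we steal from it, and a CPU waits
// `STEAL_COOLDOWN_TICKS` ticks between steal attempts (see `LAST_STEAL_TICK`).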
const STEAL_IMBALANCE_MIN: usize = 2;
const STEAL_COOLDOWN_TICKS: u64 = 2;

#[inline]
fn active_cpu_count() -> usize {
    crate::arch::x86_64::smp::cpu_count()
        .max(1)
        .min(crate::arch::x86_64::percpu::MAX_CPUS)
}

#[inline]
fn cpu_is_valid(cpu: usize) -> bool {
    cpu < crate::arch::x86_64::percpu::MAX_CPUS
}

#[derive(Clone, Copy)]
pub struct CpuUsageSnapshot {
    pub cpu_count: usize,
    pub total_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub idle_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
}

#[derive(Clone, Copy)]
pub struct SchedulerMetricsSnapshot {
    pub cpu_count: usize,
    pub rt_runtime_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub fair_runtime_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub idle_runtime_ticks: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub switch_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub preempt_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub steal_in_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub steal_out_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub try_lock_fail_count: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
}

#[derive(Clone, Copy)]
pub struct SchedulerStateSnapshot {
    pub initialized: bool,
    pub boot_phase: u8,
    pub cpu_count: usize,
    pub pick_order: [crate::process::sched::SchedClassId; 3],
    pub steal_order: [crate::process::sched::SchedClassId; 2],
    pub blocked_tasks: usize,
    pub current_task: [u64; crate::arch::x86_64::percpu::MAX_CPUS],
    pub rq_rt: [usize; crate::arch::x86_64::percpu::MAX_CPUS],
    pub rq_fair: [usize; crate::arch::x86_64::percpu::MAX_CPUS],
    pub rq_idle: [usize; crate::arch::x86_64::percpu::MAX_CPUS],
    pub need_resched: [bool; crate::arch::x86_64::percpu::MAX_CPUS],
}

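/// Copies the per-CPU total/idle tick counters into a `CpuUsageSnapshot`.
/// Loads are `Relaxed`, so the snapshot is a best-effort view rather than a
/// single atomic point in time.
///
/// Usage sketch (illustrative only; the delta/percentage math below is not
/// part of this module):
///
/// ```ignore
/// let a = cpu_usage_snapshot();
/// // ... let some ticks elapse ...
/// let b = cpu_usage_snapshot();
/// for cpu in 0..b.cpu_count {
///     let total = b.total_ticks[cpu].saturating_sub(a.total_ticks[cpu]);
///     let idle = b.idle_ticks[cpu].saturating_sub(a.idle_ticks[cpu]);
///     // Busy share in percent, guarding against an empty interval.
///     let busy_pct = if total > 0 { total.saturating_sub(idle) * 100 / total } else { 0 };
/// }
/// ```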
pub fn cpu_usage_snapshot() -> CpuUsageSnapshot {
    let cpu_count = active_cpu_count();
    let mut total_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut idle_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    for i in 0..cpu_count {
        total_ticks[i] = CPU_TOTAL_TICKS[i].load(Ordering::Relaxed);
        idle_ticks[i] = CPU_IDLE_TICKS[i].load(Ordering::Relaxed);
    }
    CpuUsageSnapshot {
        cpu_count,
        total_ticks,
        idle_ticks,
    }
}

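/// Copies the per-CPU scheduler counters (per-class runtime, context
/// switches, preemptions, steals, failed try-locks) into one snapshot,
/// again with `Relaxed` loads.
///
/// Sketch of a derived metric (illustrative only):
///
/// ```ignore
/// let m = scheduler_metrics_snapshot();
/// for cpu in 0..m.cpu_count {
///     // Fraction of context switches that were forced preemptions.
///     let s = m.switch_count[cpu];
///     let preempt_pct = if s > 0 { m.preempt_count[cpu] * 100 / s } else { 0 };
/// }
/// ```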
pub fn scheduler_metrics_snapshot() -> SchedulerMetricsSnapshot {
    let cpu_count = active_cpu_count();
    let mut rt_runtime_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut fair_runtime_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut idle_runtime_ticks = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut switch_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut preempt_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut steal_in_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut steal_out_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    let mut try_lock_fail_count = [0u64; crate::arch::x86_64::percpu::MAX_CPUS];
    for i in 0..cpu_count {
        rt_runtime_ticks[i] = CPU_RT_RUNTIME_TICKS[i].load(Ordering::Relaxed);
        fair_runtime_ticks[i] = CPU_FAIR_RUNTIME_TICKS[i].load(Ordering::Relaxed);
        idle_runtime_ticks[i] = CPU_IDLE_TICKS[i].load(Ordering::Relaxed);
        switch_count[i] = CPU_SWITCH_COUNT[i].load(Ordering::Relaxed);
        preempt_count[i] = CPU_PREEMPT_COUNT[i].load(Ordering::Relaxed);
        steal_in_count[i] = CPU_STEAL_IN_COUNT[i].load(Ordering::Relaxed);
        steal_out_count[i] = CPU_STEAL_OUT_COUNT[i].load(Ordering::Relaxed);
        try_lock_fail_count[i] = CPU_TRY_LOCK_FAIL_COUNT[i].load(Ordering::Relaxed);
    }
    SchedulerMetricsSnapshot {
        cpu_count,
        rt_runtime_ticks,
        fair_runtime_ticks,
        idle_runtime_ticks,
        switch_count,
        preempt_count,
        steal_in_count,
        steal_out_count,
        try_lock_fail_count,
    }
}

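/// Zeroes the per-CPU scheduler counters (`CPU_TOTAL_TICKS` is not reset).
/// The stores are `Relaxed` and not atomic as a group, so increments racing
/// with the reset may survive it; treat the counters as approximate.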
pub fn reset_scheduler_metrics() {
    let cpu_count = active_cpu_count();
    for i in 0..cpu_count {
        CPU_RT_RUNTIME_TICKS[i].store(0, Ordering::Relaxed);
        CPU_FAIR_RUNTIME_TICKS[i].store(0, Ordering::Relaxed);
        CPU_IDLE_TICKS[i].store(0, Ordering::Relaxed);
        CPU_SWITCH_COUNT[i].store(0, Ordering::Relaxed);
        CPU_PREEMPT_COUNT[i].store(0, Ordering::Relaxed);
        CPU_STEAL_IN_COUNT[i].store(0, Ordering::Relaxed);
        CPU_STEAL_OUT_COUNT[i].store(0, Ordering::Relaxed);
        CPU_TRY_LOCK_FAIL_COUNT[i].store(0, Ordering::Relaxed);
    }
}

/// Counts a failed scheduler `try_lock` on `cpu` (out-of-range indices are
/// ignored).
#[inline]
pub(crate) fn note_try_lock_fail_on_cpu(cpu: usize) {
    if cpu_is_valid(cpu) {
        CPU_TRY_LOCK_FAIL_COUNT[cpu].fetch_add(1, Ordering::Relaxed);
    }
}

/// Counts a failed scheduler `try_lock` attributed to the current CPU.
#[inline]
pub fn note_try_lock_fail() {
    note_try_lock_fail_on_cpu(current_cpu_index());
}

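// Sends a reschedule IPI to `cpu_index`. Two damping mechanisms apply:
// `RESCHED_IPI_PENDING` collapses duplicate sends while one is already
// outstanding (the flag is expected to be cleared on the receive path, which
// lives outside this file), and `IPI_SEND_TRACE_BUDGET` caps how many sends
// get traced to the debug console. Sends to the local CPU are skipped.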
fn send_resched_ipi_to_cpu(cpu_index: usize) {
    if !cpu_is_valid(cpu_index) {
        return;
    }
    if !apic::is_initialized() {
        return;
    }
    let my_cpu = current_cpu_index();
    let should_trace = IPI_SEND_TRACE_BUDGET
        .fetch_update(Ordering::AcqRel, Ordering::Relaxed, |budget| {
            budget.checked_sub(1)
        })
        .is_ok();
    if let Some(target_apic) = percpu::apic_id_by_cpu_index(cpu_index) {
        if let Some(my_apic) = percpu::apic_id_by_cpu_index(my_cpu) {
            if target_apic != my_apic {
                if RESCHED_IPI_PENDING[cpu_index].swap(true, Ordering::AcqRel) {
                    if should_trace {
                        crate::e9_println!(
                            "[ipi-send-skip] from_cpu={} to_cpu={} to_apic={:#x} pending=1",
                            my_cpu,
                            cpu_index,
                            target_apic
                        );
                    }
                    return;
                }
                if should_trace {
                    crate::e9_println!(
                        "[ipi-send] from_cpu={} from_apic={:#x} to_cpu={} to_apic={:#x}",
                        my_cpu,
                        my_apic,
                        cpu_index,
                        target_apic
                    );
                }
                apic::send_resched_ipi(target_apic);
            }
        }
    }
}

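// Cross-CPU "force reschedule" hints: `request_force_resched_hint` publishes
// with a `Release` store, and `take_force_resched_hint` consumes with an
// `AcqRel` swap, so each hint is observed at most once.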
#[inline]
pub(crate) fn request_force_resched_hint(cpu: usize) {
    if cpu_is_valid(cpu) {
        FORCE_RESCHED_HINT[cpu].store(true, Ordering::Release);
    }
}

#[inline]
pub(crate) fn take_force_resched_hint(cpu: usize) -> bool {
    if cpu_is_valid(cpu) {
        FORCE_RESCHED_HINT[cpu].swap(false, Ordering::AcqRel)
    } else {
        false
    }
}

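// Global scheduler state shared by every CPU; the per-CPU run queues live in
// `LOCAL_SCHEDULERS` further down. Holds `None` until scheduler init runs.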
pub(crate) static GLOBAL_SCHED_STATE: SpinLock<Option<GlobalSchedState>> = SpinLock::new(None);

/// Address of the global scheduler lock, exposed for debug output.
pub fn debug_scheduler_lock_addr() -> usize {
    &GLOBAL_SCHED_STATE as *const _ as usize
}

static TICK_COUNT: AtomicU64 = AtomicU64::new(0);
static SCHED_VERBOSE: AtomicBool = AtomicBool::new(false);

/// Emits a scheduler trace line when `SCHED_VERBOSE` is set.
#[inline]
fn sched_trace(args: core::fmt::Arguments<'_>) {
    if SCHED_VERBOSE.load(Ordering::Relaxed) {
        log::debug!("[sched] {}", args);
    }
}

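// Raw-pointer bundle handed to the low-level context-switch path: where to
// save the outgoing task's stack pointer and FPU area, where to load the
// incoming task's from, and the XCR0 value to install for each side.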
pub(super) struct SwitchTarget {
    pub(super) old_rsp_ptr: *mut u64,
    pub(super) new_rsp_ptr: *const u64,
    pub(super) old_fpu_ptr: *mut u8,
    pub(super) new_fpu_ptr: *const u8,
    pub(super) old_xcr0: u64,
    pub(super) new_xcr0: u64,
}

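// SAFETY: asserted rather than proven here. The raw pointers refer to
// task-owned context and FPU areas, and the switch path is assumed to be the
// only code dereferencing them while a `SwitchTarget` is in flight.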
unsafe impl Send for SwitchTarget {}

pub enum WaitChildResult {
    Reaped {
        child: TaskId,
        pid: Pid,
        status: i32,
    },
    NoChildren,
    StillRunning,
}

fn current_cpu_index() -> usize {
    crate::arch::x86_64::percpu::current_cpu_index()
}

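// One run queue per scheduling class for a single CPU. Pick and steal
// decisions walk the class order supplied by `SchedClassTable` (see
// `pick_order` / `steal_order`) rather than hard-coding a priority.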
struct PerCpuClassRqSet {
    real_time: crate::process::sched::real_time::RealTimeClassRq,
    fair: crate::process::sched::fair::FairClassRq,
    idle: crate::process::sched::idle::IdleClassRq,
}

impl PerCpuClassRqSet {
    fn new() -> Self {
        Self {
            real_time: crate::process::sched::real_time::RealTimeClassRq::new(),
            fair: crate::process::sched::fair::FairClassRq::new(),
            idle: crate::process::sched::idle::IdleClassRq::new(),
        }
    }

    fn enqueue(&mut self, class: crate::process::sched::SchedClassId, task: Arc<Task>) {
        use crate::process::sched::SchedClassRq;
        // Single-byte breadcrumbs on I/O port 0xE9 (QEMU/Bochs debug console):
        // 'q' on entry, 'f'/'r'/'i' for the class taken, 'Q' on exit.
        unsafe { core::arch::asm!("mov al, 'q'; out 0xe9, al", out("al") _) };
        match class {
            crate::process::sched::SchedClassId::Fair => {
                unsafe { core::arch::asm!("mov al, 'f'; out 0xe9, al", out("al") _) };
                self.fair.enqueue(task);
            }
            crate::process::sched::SchedClassId::RealTime => {
                unsafe { core::arch::asm!("mov al, 'r'; out 0xe9, al", out("al") _) };
                self.real_time.enqueue(task);
            }
            crate::process::sched::SchedClassId::Idle => {
                unsafe { core::arch::asm!("mov al, 'i'; out 0xe9, al", out("al") _) };
                self.idle.enqueue(task);
            }
        }
        unsafe { core::arch::asm!("mov al, 'Q'; out 0xe9, al", out("al") _) };
    }

    fn len_by_class(&self, class: crate::process::sched::SchedClassId) -> usize {
        use crate::process::sched::SchedClassRq;
        match class {
            crate::process::sched::SchedClassId::Fair => self.fair.len(),
            crate::process::sched::SchedClassId::RealTime => self.real_time.len(),
            crate::process::sched::SchedClassId::Idle => self.idle.len(),
        }
    }

    /// Runnable tasks excluding the idle class.
    fn runnable_len(&self) -> usize {
        self.len_by_class(crate::process::sched::SchedClassId::RealTime)
            + self.len_by_class(crate::process::sched::SchedClassId::Fair)
    }

    fn pick_next_by_class(
        &mut self,
        class: crate::process::sched::SchedClassId,
    ) -> Option<Arc<Task>> {
        use crate::process::sched::SchedClassRq;
        match class {
            crate::process::sched::SchedClassId::Fair => self.fair.pick_next(),
            crate::process::sched::SchedClassId::RealTime => self.real_time.pick_next(),
            crate::process::sched::SchedClassId::Idle => self.idle.pick_next(),
        }
    }

    fn pick_next(&mut self, table: &crate::process::sched::SchedClassTable) -> Option<Arc<Task>> {
        for class in table.pick_order().iter().copied() {
            if let Some(task) = self.pick_next_by_class(class) {
                return Some(task);
            }
        }
        None
    }

    fn update_current(
        &mut self,
        rt: &crate::process::sched::CurrentRuntime,
        task: &Task,
        is_yield: bool,
        table: &crate::process::sched::SchedClassTable,
    ) -> bool {
        use crate::process::sched::SchedClassRq;
        let should_preempt = match table.class_for_task(task) {
            crate::process::sched::SchedClassId::Fair => {
                self.fair.update_current(rt, task, is_yield)
            }
            crate::process::sched::SchedClassId::RealTime => {
                self.real_time.update_current(rt, task, is_yield)
            }
            crate::process::sched::SchedClassId::Idle => {
                self.idle.update_current(rt, task, is_yield)
            }
        };
        let any_ready = !self.real_time.is_empty() || !self.fair.is_empty();
        // Preempt if the class asked for it, or if the idle task is running
        // while real work is queued.
        should_preempt
            || (table.class_for_task(task) == crate::process::sched::SchedClassId::Idle
                && any_ready)
    }

    fn remove(&mut self, task_id: crate::process::TaskId) -> bool {
        use crate::process::sched::SchedClassRq;
        self.real_time.remove(task_id) || self.fair.remove(task_id) || self.idle.remove(task_id)
    }

    /// Pops a task that another CPU may take, walking the table's steal order.
    fn steal_candidate(
        &mut self,
        table: &crate::process::sched::SchedClassTable,
    ) -> Option<Arc<Task>> {
        for class in table.steal_order().iter().copied() {
            if let Some(task) = self.pick_next_by_class(class) {
                return Some(task);
            }
        }
        None
    }
}

struct SchedulerCpu {
    /// Per-class run queues for this CPU.
    class_rqs: PerCpuClassRqSet,
    /// Task currently running on this CPU, if any.
    current_task: Option<Arc<Task>>,
    /// Runtime accounting for the current task.
    current_runtime: crate::process::sched::CurrentRuntime,
    /// This CPU's idle task, run when nothing else is runnable.
    idle_task: Arc<Task>,
    /// Task to put back on a run queue once the pending switch completes.
    task_to_requeue: Option<Arc<Task>>,
    /// Task whose reference is released once the pending switch completes.
    task_to_drop: Option<Arc<Task>>,
    /// Set when this CPU should reschedule at the next opportunity.
    need_resched: bool,
    /// This CPU's view of the scheduling-class table.
    class_table: crate::process::sched::SchedClassTable,
}

#[allow(dead_code)]
pub(crate) static LOCAL_SCHEDULERS: [SpinLock<Option<SchedulerCpu>>;
    crate::arch::x86_64::percpu::MAX_CPUS] =
    [const { SpinLock::new(None) }; crate::arch::x86_64::percpu::MAX_CPUS];

/// Blocked (non-runnable) tasks, keyed by task id.
pub(crate) static BLOCKED_TASKS: SpinLock<BTreeMap<TaskId, Arc<Task>>> =
    SpinLock::new(BTreeMap::new());

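// Process-identity bookkeeping: PID/TID lookup, process-group and session
// membership, and the parent/child tree. Kept behind a read-write lock,
// presumably because lookups far outnumber updates.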
pub(crate) static SCHED_IDENTITY: SpinRwLock<SchedIdentity> = SpinRwLock::new(SchedIdentity::new());

pub struct SchedIdentity {
    /// PID -> owning task.
    pub pid_to_task: BTreeMap<Pid, TaskId>,
    /// TID -> task.
    pub tid_to_task: BTreeMap<Tid, TaskId>,
    /// PID -> process-group id.
    pub pid_to_pgid: BTreeMap<Pid, Pid>,
    /// PID -> session id.
    pub pid_to_sid: BTreeMap<Pid, Pid>,
    /// Process-group id -> member tasks.
    pub pgid_members: BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
    /// Session id -> member tasks.
    pub sid_members: BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
    /// Child task -> parent task.
    pub parent_of: BTreeMap<TaskId, TaskId>,
    /// Parent task -> child tasks.
    pub children_of: BTreeMap<TaskId, alloc::vec::Vec<TaskId>>,
}

impl SchedIdentity {
    pub const fn new() -> Self {
        Self {
            pid_to_task: BTreeMap::new(),
            tid_to_task: BTreeMap::new(),
            pid_to_pgid: BTreeMap::new(),
            pid_to_sid: BTreeMap::new(),
            pgid_members: BTreeMap::new(),
            sid_members: BTreeMap::new(),
            parent_of: BTreeMap::new(),
            children_of: BTreeMap::new(),
        }
    }
}

pub struct GlobalSchedState {
    /// Every live task, keyed by task id.
    pub(crate) all_tasks: BTreeMap<TaskId, Arc<Task>>,
    /// Flat copy of `all_tasks` used for scanning without walking the map.
    pub(crate) all_tasks_scan: Vec<Arc<Task>>,
    /// The CPU each task was last placed on.
    task_cpu: BTreeMap<TaskId, usize>,
    #[allow(dead_code)]
    wake_deadlines: BTreeMap<u64, alloc::vec::Vec<TaskId>>,
    #[allow(dead_code)]
    wake_deadline_of: BTreeMap<TaskId, u64>,
    /// Exited-but-unreaped tasks: task id -> (exit status, pid).
    zombies: BTreeMap<TaskId, (i32, Pid)>,
    class_table: crate::process::sched::SchedClassTable,
}

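// Sanity-checks a task's saved switch frame before switching to it. The +48
// and +56 offsets below encode an assumed frame layout of six saved 64-bit
// registers followed by the return address (7 * 8 = 56 bytes, return IP at
// byte offset 48); they must track the context-switch stub.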
fn validate_task_context(task: &Arc<Task>) -> Result<(), &'static str> {
    let ptr = Arc::as_ptr(task);
    if ptr.is_null() {
        return Err("null Task pointer in Arc");
    }
    let saved_rsp = unsafe { (*task.context.get()).saved_rsp };
    let stack_base = task.kernel_stack.virt_base.as_u64();
    let stack_top = stack_base.saturating_add(task.kernel_stack.size as u64);

    if saved_rsp < stack_base || saved_rsp.saturating_add(56) > stack_top {
        return Err("saved_rsp outside kernel stack bounds");
    }

    let ret_ip = unsafe { core::ptr::read_unaligned((saved_rsp + 48) as *const u64) };
    if ret_ip == 0 {
        return Err("null return IP in switch frame");
    }

    Ok(())
}

mod core_impl;
pub mod perf_counters;
mod runtime_ops;
mod task_ops;
mod timer_ops;

pub use runtime_ops::*;
pub use task_ops::*;
pub use timer_ops::*;