strat9_kernel/process/scheduler/core_impl.rs

use super::{runtime_ops::idle_task_main, *};

/// Create a `SchedulerCpu` for the given CPU index (creates its idle task).
pub(super) fn create_cpu_scheduler(cpu_idx: usize) -> SchedulerCpu {
    crate::serial_println!(
        "[trace][sched] create_cpu_scheduler cpu={} create idle begin",
        cpu_idx
    );
    let idle_task = Task::new_kernel_task(idle_task_main, "idle", TaskPriority::Idle)
        .expect("Failed to create idle task");
    crate::serial_println!(
        "[trace][sched] create_cpu_scheduler cpu={} create idle done id={}",
        cpu_idx,
        idle_task.id.as_u64()
    );
    idle_task.set_sched_policy(crate::process::sched::SchedPolicy::Idle);
    let mut class_rqs = PerCpuClassRqSet::new();
    class_rqs.enqueue(crate::process::sched::SchedClassId::Idle, idle_task.clone());
    SchedulerCpu {
        class_rqs,
        current_task: None,
        current_runtime: crate::process::sched::CurrentRuntime::new(),
        idle_task,
        task_to_requeue: None,
        task_to_drop: None,
        need_resched: false,
        class_table: crate::process::sched::SchedClassTable::default(),
    }
}

impl GlobalSchedState {
    /// Create a new global scheduler state (no per-CPU runqueues; those live in `LOCAL_SCHEDULERS`).
    pub fn new() -> Self {
        crate::serial_println!("[trace][sched] GlobalSchedState::new enter");
        GlobalSchedState {
            all_tasks: BTreeMap::new(),
            all_tasks_scan: Vec::new(),
            task_cpu: BTreeMap::new(),
            wake_deadlines: BTreeMap::new(),
            wake_deadline_of: BTreeMap::new(),
            zombies: BTreeMap::new(),
            class_table: crate::process::sched::SchedClassTable::default(),
        }
    }

    /// Add `task_id` to the membership list for `key`, ignoring duplicates.
    pub(crate) fn member_add(
        map: &mut BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
        key: Pid,
        task_id: TaskId,
    ) {
        let members = map.entry(key).or_default();
        if !members.iter().any(|id| *id == task_id) {
            members.push(task_id);
        }
    }

    /// Remove `task_id` from the membership list for `key`, dropping the map
    /// entry once it becomes empty.
    pub(crate) fn member_remove(
        map: &mut BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
        key: Pid,
        task_id: TaskId,
    ) {
        let mut clear = false;
        if let Some(members) = map.get_mut(&key) {
            members.retain(|id| *id != task_id);
            clear = members.is_empty();
        }
        if clear {
            map.remove(&key);
        }
    }
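
    // Illustrative sketch of the invariant these two helpers maintain on a
    // `Pid -> Vec<TaskId>` membership map (`pgid` and `tid` are assumed values,
    // not defined here):
    //
    //     let mut pgid_members: BTreeMap<Pid, alloc::vec::Vec<TaskId>> = BTreeMap::new();
    //     GlobalSchedState::member_add(&mut pgid_members, pgid, tid);
    //     GlobalSchedState::member_add(&mut pgid_members, pgid, tid); // idempotent: no duplicate entry
    //     GlobalSchedState::member_remove(&mut pgid_members, pgid, tid);
    //     assert!(!pgid_members.contains_key(&pgid)); // the empty Vec is dropped, not left behind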

    /// Record the task's PID-to-PGID and PID-to-SID mappings plus its
    /// process-group and session membership; the caller holds the identity lock.
    pub(crate) fn register_identity_locked(identity: &mut SchedIdentity, task: &Arc<Task>) {
        let task_id = task.id;
        let pid = task.pid;
        let pgid = task.pgid.load(Ordering::Relaxed);
        let sid = task.sid.load(Ordering::Relaxed);
        crate::serial_println!(
            "[trace][sched] register_identity enter tid={} pid={} pgid={} sid={}",
            task_id.as_u64(),
            pid,
            pgid,
            sid
        );
        identity.pid_to_pgid.insert(pid, pgid);
        crate::serial_println!(
            "[trace][sched] register_identity pid_to_pgid inserted pid={}",
            pid
        );
        identity.pid_to_sid.insert(pid, sid);
        crate::serial_println!(
            "[trace][sched] register_identity pid_to_sid inserted pid={}",
            pid
        );
        Self::member_add(&mut identity.pgid_members, pgid, task_id);
        Self::member_add(&mut identity.sid_members, sid, task_id);
        crate::serial_println!(
            "[trace][sched] register_identity done tid={}",
            task_id.as_u64()
        );
    }

    /// Remove the task's PID/TID lookups and its group/session membership from
    /// `SchedIdentity`; the caller holds the identity lock.
    pub(crate) fn unregister_identity_locked(
        identity: &mut SchedIdentity,
        task_id: TaskId,
        pid: Pid,
        tid: Tid,
    ) {
        identity.pid_to_task.remove(&pid);
        identity.tid_to_task.remove(&tid);
        if let Some(pgid) = identity.pid_to_pgid.remove(&pid) {
            Self::member_remove(&mut identity.pgid_members, pgid, task_id);
        }
        if let Some(sid) = identity.pid_to_sid.remove(&pid) {
            Self::member_remove(&mut identity.sid_members, sid, task_id);
        }
    }

    /// Add a task to the scheduler, placing it on the least-loaded CPU.
    /// Returns the CPU index to send a resched IPI to when the task landed on a
    /// remote CPU.
    pub fn add_task(&mut self, task: Arc<Task>) -> Option<usize> {
        let cpu_index = self.select_cpu_for_task();
        self.add_task_on_cpu(task, cpu_index)
    }

    /// Add a task and record the parent/child link in the identity maps.
    pub fn add_task_with_parent(&mut self, task: Arc<Task>, parent: TaskId) -> Option<usize> {
        let child = task.id;
        let cpu_index = self.select_cpu_for_task();
        let ipi = self.add_task_on_cpu(task, cpu_index);
        {
            let mut identity = SCHED_IDENTITY.write();
            identity.parent_of.insert(child, parent);
            identity.children_of.entry(parent).or_default().push(child);
        }
        ipi
    }

    /// Register `task` in the global maps and enqueue it on `cpu_index`'s local
    /// runqueue. Returns `Some(cpu_index)` when the target CPU is remote and
    /// the caller must send it a resched IPI.
    fn add_task_on_cpu(&mut self, task: Arc<Task>, cpu_index: usize) -> Option<usize> {
        let task_id = task.id;
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu enter tid={} cpu={}",
            task_id.as_u64(),
            cpu_index
        );
        task.set_state(TaskState::Ready);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu state ready tid={}",
            task_id.as_u64()
        );

        crate::serial_println!(
            "[trace][sched] add_task_on_cpu before clone tid={} all_tasks_len={}",
            task_id.as_u64(),
            self.all_tasks.len()
        );
        let task_clone = task.clone();
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu before all_tasks.insert tid={}",
            task_id.as_u64()
        );
        self.insert_all_task_locked(task_id, task_clone);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu all_tasks inserted tid={}",
            task_id.as_u64()
        );
        self.task_cpu.insert(task_id, cpu_index);
        task.home_cpu
            .store(cpu_index, core::sync::atomic::Ordering::Relaxed);
        crate::serial_println!(
            "[trace][sched] add_task_on_cpu task_cpu inserted tid={}",
            task_id.as_u64()
        );
        {
            let mut identity = SCHED_IDENTITY.write();
            identity.pid_to_task.insert(task.pid, task_id);
            crate::serial_println!(
                "[trace][sched] add_task_on_cpu pid map inserted tid={}",
                task_id.as_u64()
            );
            identity.tid_to_task.insert(task.tid, task_id);
            crate::serial_println!(
                "[trace][sched] add_task_on_cpu tid map inserted tid={}",
                task_id.as_u64()
            );
            Self::register_identity_locked(&mut identity, &task);
            crate::serial_println!(
                "[trace][sched] add_task_on_cpu identity registered tid={}",
                task_id.as_u64()
            );
        }
        {
            let class = self.class_table.class_for_task(&task);
            if let Some(ref mut local_cpu) = *LOCAL_SCHEDULERS[cpu_index].lock() {
                local_cpu.class_rqs.enqueue(class, task);
                local_cpu.need_resched = true;
                crate::serial_println!(
                    "[trace][sched] add_task_on_cpu enqueued tid={} cpu={}",
                    task_id.as_u64(),
                    cpu_index
                );
            }
        }
        sched_trace(format_args!(
            "enqueue task={} cpu={}",
            task_id.as_u64(),
            cpu_index
        ));
        if cpu_index != current_cpu_index() {
            Some(cpu_index)
        } else {
            None
        }
    }
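
    // Illustrative sketch of the caller-side contract for the IPI hint returned
    // above (hypothetical call site; `send_resched_ipi` is an assumed name, not
    // an API defined in this file):
    //
    //     let ipi_target = {
    //         let mut guard = GLOBAL_SCHED_STATE.lock();
    //         guard.as_mut().and_then(|sched| sched.add_task(task))
    //     };
    //     // Send the IPI only after the scheduler lock has been released.
    //     if let Some(cpu) = ipi_target {
    //         send_resched_ipi(cpu);
    //     }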

    pub(super) fn insert_all_task_locked(&mut self, task_id: TaskId, task: Arc<Task>) {
        assert_eq!(
            task.id,
            task_id,
            "scheduler corruption: insert_all_task_locked task.id={} != task_id={}",
            task.id.as_u64(),
            task_id.as_u64()
        );
        if self.all_tasks.contains_key(&task_id) {
            unsafe {
                core::arch::asm!("mov al, 'D'; out 0xe9, al", out("al") _);
            }
            crate::serial_force_println!(
                "[RACE] insert_all_task_locked: duplicate tid={} all_tasks={} all_tasks_scan={}",
                task_id.as_u64(),
                self.all_tasks.len(),
                self.all_tasks_scan.len()
            );
            panic!(
                "scheduler corruption: duplicate insert_all_task_locked tid={}",
                task_id.as_u64()
            );
        }
        self.all_tasks.insert(task_id, task.clone());
        self.all_tasks_scan.push(task);
        // Race/corruption diagnostic: all_tasks and all_tasks_scan must stay in sync.
        let bt_len = self.all_tasks.len();
        let scan_len = self.all_tasks_scan.len();
        if bt_len != scan_len {
            unsafe {
                core::arch::asm!("mov al, 'X'; out 0xe9, al", out("al") _);
            }
            crate::serial_force_println!(
                "[RACE] insert_all_task_locked: all_tasks={} != all_tasks_scan={} tid={}",
                bt_len,
                scan_len,
                task_id.as_u64()
            );
            panic!(
                "scheduler corruption: insert_all_task_locked len mismatch all_tasks={} all_tasks_scan={} tid={}",
                bt_len,
                scan_len,
                task_id.as_u64()
            );
        }
    }

    pub(super) fn remove_all_task_locked(&mut self, task_id: TaskId) -> Option<Arc<Task>> {
        let removed = self.all_tasks.remove(&task_id);
        if removed.is_some() {
            if let Some(idx) = self
                .all_tasks_scan
                .iter()
                .position(|task| task.id == task_id)
            {
                self.all_tasks_scan.swap_remove(idx);
            } else {
                unsafe { core::arch::asm!("mov al, 'Z'; out 0xe9, al", out("al") _) };
                crate::serial_force_println!(
                    "[RACE] remove_all_task_locked: tid={} in all_tasks but NOT in all_tasks_scan",
                    task_id.as_u64()
                );
                panic!(
                    "scheduler corruption: remove_all_task_locked missing scan entry tid={}",
                    task_id.as_u64()
                );
            }
        }
        let bt_len = self.all_tasks.len();
        let scan_len = self.all_tasks_scan.len();
        if bt_len != scan_len {
            unsafe {
                core::arch::asm!("mov al, 'X'; out 0xe9, al", out("al") _);
            }
            crate::serial_force_println!(
                "[RACE] remove_all_task_locked: all_tasks={} != all_tasks_scan={} tid={}",
                bt_len,
                scan_len,
                task_id.as_u64()
            );
            panic!(
                "scheduler corruption: remove_all_task_locked len mismatch all_tasks={} all_tasks_scan={} tid={}",
                bt_len,
                scan_len,
                task_id.as_u64()
            );
        }
        removed
    }

    /// Clear the task's wake deadline; returns `false` if the task is unknown.
    /// Caller holds the scheduler lock.
    pub fn clear_task_wake_deadline_locked(&mut self, id: TaskId) -> bool {
        if let Some(task) = self.all_tasks.get(&id) {
            task.wake_deadline_ns.store(0, Ordering::Relaxed);
            true
        } else {
            false
        }
    }

    /// Set the task's wake deadline in nanoseconds; a `deadline` of 0 clears it.
    /// Returns `false` if the task is unknown. Caller holds the scheduler lock.
    pub fn set_task_wake_deadline_locked(&mut self, id: TaskId, deadline: u64) -> bool {
        if deadline == 0 {
            return self.clear_task_wake_deadline_locked(id);
        }
        if let Some(task) = self.all_tasks.get(&id) {
            task.wake_deadline_ns.store(deadline, Ordering::Relaxed);
            true
        } else {
            false
        }
    }
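
    // Illustrative sketch of the zero-means-clear convention (hypothetical
    // caller implementing a sleep timeout; `now_ns` and `timeout_ns` are
    // assumed values, not APIs from this file):
    //
    //     let deadline = if timeout_ns == 0 { 0 } else { now_ns + timeout_ns };
    //     // deadline == 0 falls through to clear_task_wake_deadline_locked
    //     sched.set_task_wake_deadline_locked(task_id, deadline);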

    /// Wake the task with the given id (fallback wake path).
    ///
    /// Returns `(was_woken, ipi_cpu)`. The caller must send a resched IPI to
    /// `ipi_cpu` after releasing the scheduler lock.
    ///
    /// NOTE: `blocked_tasks` lives in the separate `BLOCKED_TASKS` lock.
    /// This method now only handles the fallback path (task not yet blocked,
    /// set `wake_pending`). The primary wake path is in `wake_task()`.
    pub fn wake_task_locked(&mut self, id: TaskId) -> (bool, Option<usize>) {
        self.clear_task_wake_deadline_locked(id);
        // Fallback: task is not yet in BLOCKED_TASKS (still transitioning to
        // Blocked). Set wake_pending so block_current_task will skip blocking.
        if let Some(task) = self.all_tasks.get(&id) {
            task.wake_pending
                .store(true, core::sync::atomic::Ordering::Release);
            (true, None)
        } else {
            (false, None)
        }
    }
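
    // Illustrative sketch of the wake_pending handshake this fallback relies on.
    // The sleeper side lives in `block_current_task` (not in this file); its
    // shape below is an assumption for illustration only:
    //
    //     // waker (above):   task.wake_pending.store(true, Release);
    //     // sleeper (sketch of block_current_task):
    //     if task.wake_pending.swap(false, Ordering::Acquire) {
    //         // a wake raced ahead of us: skip the transition to Blocked
    //     } else {
    //         // move the task into BLOCKED_TASKS and yield
    //     }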

    /// Try to reap a zombie child of `parent`.
    /// If `target` is `Some(tid)`, only reaps that child; otherwise, reaps any child.
    /// Returns `WaitChildResult` indicating the outcome.
    /// Must be called with the scheduler lock held.
    pub fn try_reap_child_locked(
        &mut self,
        parent: TaskId,
        target: Option<TaskId>,
    ) -> WaitChildResult {
        // First, check children under SCHED_IDENTITY lock.
        let target_is_child = {
            let identity = SCHED_IDENTITY.read();
            let Some(children_view) = identity.children_of.get(&parent) else {
                return WaitChildResult::NoChildren;
            };

            if children_view.is_empty() {
                return WaitChildResult::NoChildren;
            }

            if let Some(target_id) = target {
                children_view.iter().any(|&id| id == target_id)
            } else {
                true
            }
        };

        if !target_is_child {
            return WaitChildResult::NoChildren;
        }

        // Find the zombie child; re-check children under SCHED_IDENTITY.
        let zombie = {
            let identity = SCHED_IDENTITY.read();
            let children = match identity.children_of.get(&parent) {
                Some(c) => c.clone(),
                None => return WaitChildResult::NoChildren,
            };
            children
                .iter()
                .copied()
                .find(|id| target.map_or(true, |t| t == *id) && self.zombies.contains_key(id))
        };

        if let Some(child) = zombie {
            let (status, child_pid) = self.zombies.remove(&child).unwrap_or((0, 0));
            // Remove from all_tasks now so that pick_next_task (if it races with
            // reaping) will see was_registered=false and skip cleanup_task_resources.
            let reaped_task = self.remove_all_task_locked(child);
            if let Some(task) = reaped_task.as_ref() {
                super::task_ops::cleanup_task_resources(task);
            }
            let child_tid = reaped_task.as_ref().map(|t| t.tid);
            if child_pid != 0 {
                if let Some(tid) = child_tid {
                    let mut identity = SCHED_IDENTITY.write();
                    Self::unregister_identity_locked(&mut identity, child, child_pid, tid);
                }
            }
            {
                let mut identity = SCHED_IDENTITY.write();
                if let Some(children) = identity.children_of.get_mut(&parent) {
                    children.retain(|&id| id != child);
                    if children.is_empty() {
                        identity.children_of.remove(&parent);
                    }
                }
                identity.parent_of.remove(&child);
            }
            return WaitChildResult::Reaped {
                child,
                pid: child_pid,
                status,
            };
        }

        WaitChildResult::StillRunning
    }
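
    // Illustrative sketch of how the three `WaitChildResult` outcomes are meant
    // to be consumed (hypothetical wait()-style caller; the surrounding loop and
    // blocking step are assumptions, not code from this file):
    //
    //     match sched.try_reap_child_locked(parent, target) {
    //         WaitChildResult::Reaped { pid, status, .. } => { /* report (pid, status) */ }
    //         WaitChildResult::NoChildren => { /* nothing to wait for: return an error */ }
    //         WaitChildResult::StillRunning => { /* release the lock, block, retry later */ }
    //     }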

    /// Select the least-loaded CPU for a newly created task.
    ///
    /// Uses **blocking** `LOCAL_SCHEDULERS[i].lock()` for each CPU while
    /// `GLOBAL_SCHED_STATE` is already held by the caller.  This is safe because the
    /// hot-path only ever does `try_lock_no_irqsave` on `GLOBAL_SCHED_STATE` (so it
    /// cannot deadlock with us), but it may briefly stall behind a timer tick
    /// that holds a LOCAL lock.  The stall is bounded by one tick period.
    fn select_cpu_for_task(&self) -> usize {
        // Early boot: before the first real task is running, keep all new tasks
        // on the BSP. Spreading init/shell/status across CPUs at this point can
        // strand boot-critical work on AP scheduler instances that have not yet
        // entered their steady-state scheduling loop.
        let n = active_cpu_count();
        let all_idle = (0..n).all(|i| {
            LOCAL_SCHEDULERS[i]
                .lock()
                .as_ref()
                .map(|cpu| cpu.current_task.is_none())
                .unwrap_or(true)
        });
        if all_idle {
            crate::serial_println!("[trace][sched] select_cpu_for_task early-boot best=0");
            return 0;
        }
        let mut best = 0usize;
        let mut best_load = usize::MAX;
        for idx in 0..n {
            let load = {
                let guard = LOCAL_SCHEDULERS[idx].lock();
                if let Some(ref cpu) = *guard {
                    let mut l = cpu.class_rqs.runnable_len();
                    if let Some(current) = cpu.current_task.as_ref() {
                        if self.class_table.class_for_task(current)
                            != crate::process::sched::SchedClassId::Idle
                        {
                            l += 1;
                        }
                    }
                    l
                } else {
                    0
                }
            };
            if load < best_load {
                best = idx;
                best_load = load;
            }
        }
        crate::serial_println!(
            "[trace][sched] select_cpu_for_task best={} load={}",
            best,
            best_load
        );
        best
    }
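
    // Worked example of the load metric above (made-up numbers):
    //
    //     CPU0: runnable_len = 2, current task non-idle -> load 3
    //     CPU1: runnable_len = 3, current task idle     -> load 3
    //     CPU2: runnable_len = 1, current task non-idle -> load 2   (selected)
    //
    // Ties keep the lower CPU index, because only a strictly smaller load
    // replaces `best`.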

    /// Re-enqueue every Ready task after the class table changes, so each task
    /// lands in the runqueue of its (possibly new) scheduling class.
    pub fn migrate_ready_tasks_for_new_class_table(&mut self) {
        let mut ready: Vec<(TaskId, Arc<Task>, usize)> = Vec::new();
        for (id, task) in self.all_tasks.iter() {
            let state = task.get_state();
            if state != TaskState::Ready {
                continue;
            }
            let cpu = self.task_cpu.get(id).copied().unwrap_or(0);
            ready.push((*id, task.clone(), cpu));
        }

        for (id, task, cpu_idx) in ready {
            let mut guard = LOCAL_SCHEDULERS[cpu_idx].lock();
            let Some(ref mut cpu) = *guard else {
                continue;
            };
            if cpu.class_rqs.remove(id) {
                let class = self.class_table.class_for_task(&task);
                cpu.class_rqs.enqueue(class, task);
                cpu.need_resched = true;
            }
        }
    }
}

//  Per-CPU hot-path helpers
//
// These functions operate primarily on `SchedulerCpu` (acquired via
// `LOCAL_SCHEDULERS[cpu_index]`).  Most never touch the global scheduler
// (`GLOBAL_SCHED_STATE`) lock.  The one exception is `steal_task_local`, which does a
// **non-blocking** `GLOBAL_SCHED_STATE.try_lock_no_irqsave()` to update `task_cpu`
// after a successful steal.  This is an intentional lock-order inversion (LOCAL held,
// then GLOBAL attempted) that is safe because the try-lock never blocks; if
// GLOBAL_SCHED_STATE is contended, we simply skip stealing.
//
// Lock order for steal: own LOCAL held → try_lock GLOBAL → try_lock sibling
// LOCALs.  We never blocking-wait, so no deadlock is possible.
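//
// Illustrative sketch of the try-lock discipline described above (shape only;
// the real code is `steal_task_local` below):
//
//     // our own LOCAL lock is already held by the caller
//     let mut global = GLOBAL_SCHED_STATE.try_lock_no_irqsave()?;    // never blocks
//     let sibling = LOCAL_SCHEDULERS[victim].try_lock_no_irqsave()?; // never blocks
//     // If either try-lock fails we return None and skip stealing on this pass,
//     // so the inverted acquisition order can never deadlock.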

/// Steal a task from the busiest sibling CPU using per-CPU LOCAL locks.
///
/// Called with `cpu` borrowed from `LOCAL_SCHEDULERS[cpu_index]` (our own
/// LOCAL lock already held). Uses `try_lock_no_irqsave` on sibling entries;
/// if a sibling or the global scheduler state is contended, we skip stealing
/// rather than waiting.
pub(super) fn steal_task_local(cpu: &mut SchedulerCpu, cpu_index: usize) -> Option<Arc<Task>> {
    let now_tick = TICK_COUNT.load(Ordering::Relaxed);
    if now_tick < LAST_STEAL_TICK[cpu_index].load(Ordering::Relaxed) + STEAL_COOLDOWN_TICKS {
        return None;
    }

    // Best-effort only: if a cold path is holding the global scheduler, skip
    // stealing instead of blocking the hot path.
    let mut scheduler = GLOBAL_SCHED_STATE.try_lock_no_irqsave()?;
    let sched = scheduler.as_mut()?;

    let n = active_cpu_count();
    let my_load = cpu.class_rqs.runnable_len();

    let mut best_cpu = None;
    let mut best_load = 0usize;

    for i in 0..n {
        if i == cpu_index {
            continue;
        }
        // try_lock_no_irqsave: returns immediately if contended (no deadlock).
        if let Some(guard) = LOCAL_SCHEDULERS[i].try_lock_no_irqsave() {
            if let Some(ref sib) = *guard {
                let load = sib.class_rqs.runnable_len();
                if load > best_load {
                    best_load = load;
                    best_cpu = Some(i);
                }
            }
        }
    }

    if best_load < my_load.saturating_add(STEAL_IMBALANCE_MIN) {
        return None;
    }
    let steal_from = best_cpu?;

    // Re-acquire the sibling lock to perform the steal.
    if let Some(mut guard) = LOCAL_SCHEDULERS[steal_from].try_lock_no_irqsave() {
        if let Some(ref mut sib) = *guard {
            if sib.class_rqs.runnable_len() < 2 {
                return None;
            }
            if let Some(task) = sib.class_rqs.steal_candidate(&sib.class_table) {
                sched.task_cpu.insert(task.id, cpu_index);
                task.home_cpu
                    .store(cpu_index, core::sync::atomic::Ordering::Relaxed);
                if cpu_is_valid(cpu_index) {
                    CPU_STEAL_IN_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
                }
                if cpu_is_valid(steal_from) {
                    CPU_STEAL_OUT_COUNT[steal_from].fetch_add(1, Ordering::Relaxed);
                }
                LAST_STEAL_TICK[cpu_index].store(now_tick, Ordering::Relaxed);
                return Some(task);
            }
        }
    }
    None
}

/// Pick the next task using only per-CPU LOCAL state.
///
/// Handles current task disposition (re-queue, drop-for-cleanup, or ignore if
/// Blocked), then picks from the local class_rqs, falls back to work-stealing,
/// and finally returns the idle task.
///
/// **Dead tasks**: if the current task is Dead, it goes into `task_to_drop`.
/// Global map cleanup (`all_tasks`, `task_cpu`, etc.) must have been performed
/// by the caller (e.g., `exit_current_task`) BEFORE reaching this point.
pub(super) fn pick_next_task_local(cpu: &mut SchedulerCpu, cpu_index: usize) -> Arc<Task> {
    // Step 1: dispose of the current task.
    if let Some(task) = cpu.current_task.take() {
        match task.get_state() {
            TaskState::Running => {
                task.set_state(TaskState::Ready);
                if !Arc::ptr_eq(&task, &cpu.idle_task) {
                    // Defer re-queue to finish_switch (not yet safe to enqueue;
                    // another CPU could steal it before our context is saved).
                    cpu.task_to_requeue = Some(task);
                }
            }
            TaskState::Dead => {
                // Global maps already cleaned by exit_current_task / kill_task.
                // Defer the Arc drop so KernelStack::drop → buddy_alloc runs
                // outside any lock.
                cpu.task_to_drop = Some(task);
            }
            TaskState::Blocked | TaskState::Ready => {
                // Blocked: moved to blocked_tasks by block_current_task; do nothing.
                // Ready: shouldn't normally occur for current_task; safe to ignore.
            }
        }
    }

    // Step 2: pick from local class_rqs.
    let next = if let Some(next) = cpu.class_rqs.pick_next(&cpu.class_table) {
        next
    } else if let Some(stolen) = steal_task_local(cpu, cpu_index) {
        // Step 3: try work-stealing from a sibling CPU.
        stolen
    } else {
        // Step 4: idle fallback.
        cpu.idle_task.clone()
    };

    next.set_state(TaskState::Running);
    cpu.current_task = Some(next.clone());
    cpu.current_runtime = crate::process::sched::CurrentRuntime::new();
    next
}

/// Prepare a LOCAL-only context switch.
///
/// Updates the TSS, SYSCALL RSP, and CR3 for the next task. Returns the
/// raw pointer pair needed by `do_switch_context`.
///
/// Returns `None` if there is no task to switch to (same task or invalid context).
pub(super) fn yield_cpu_local(cpu: &mut SchedulerCpu, cpu_index: usize) -> Option<SwitchTarget> {
    let current = cpu.current_task.as_ref()?.clone();

    let next = pick_next_task_local(cpu, cpu_index);

    if Arc::ptr_eq(&current, &next) {
        return None;
    }
    if cpu_is_valid(cpu_index) {
        CPU_SWITCH_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
    }

    if let Err(e) = validate_task_context(&next) {
        let bad_rsp = unsafe { (*next.context.get()).saved_rsp };
        let stk_base = next.kernel_stack.virt_base.as_u64();
        let stk_top = stk_base + next.kernel_stack.size as u64;
        crate::serial_println!(
            "[sched-local] WARN: invalid ctx task='{}' id={} cpu={}: {} \
             rsp={:#x} stack=[{:#x}..{:#x}] : restoring current",
            next.name,
            next.id.as_u64(),
            cpu_index,
            e,
            bad_rsp,
            stk_base,
            stk_top,
        );

        // Restore invariants: undo what pick_next_task_local mutated.
        let is_idle = Arc::ptr_eq(&next, &cpu.idle_task);
        drop(cpu.task_to_drop.take());
        if let Some(prev) = cpu.task_to_requeue.take() {
            prev.set_state(TaskState::Running);
            cpu.current_task = Some(prev);
        } else {
            current.set_state(TaskState::Running);
            cpu.current_task = Some(current.clone());
        }
        if !is_idle {
            next.set_state(TaskState::Ready);
            let class = cpu.class_table.class_for_task(&next);
            cpu.class_rqs.enqueue(class, next);
        }
        return None;
    }

    // Update TSS.rsp0 and SYSCALL kernel RSP for the new task.
    let stack_top = next.kernel_stack.virt_base.as_u64() + next.kernel_stack.size as u64;
    crate::arch::x86_64::tss::set_kernel_stack(x86_64::VirtAddr::new(stack_top));
    crate::arch::x86_64::syscall::set_kernel_rsp(stack_top);

    // Switch CR3 if the new task has a different address space.
    // SAFETY: The new task's address space has a valid PML4 with the kernel half mapped.
    unsafe {
        next.process.address_space_arc().switch_to();
    }

    Some(SwitchTarget {
        old_rsp_ptr: unsafe { &raw mut (*current.context.get()).saved_rsp },
        new_rsp_ptr: unsafe { &raw const (*next.context.get()).saved_rsp },
        old_fpu_ptr: current.fpu_state.get() as *mut u8,
        new_fpu_ptr: next.fpu_state.get() as *const u8,
        old_xcr0: current
            .xcr0_mask
            .load(core::sync::atomic::Ordering::Relaxed),
        new_xcr0: next.xcr0_mask.load(core::sync::atomic::Ordering::Relaxed),
    })
}
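
// Illustrative sketch of the intended call sequence around a switch
// (hypothetical caller; `do_switch_context` and `finish_switch` are named in the
// doc comments above but defined elsewhere, and the exact call site is an
// assumption):
//
//     if let Some(target) = yield_cpu_local(cpu, cpu_index) {
//         // swaps stacks, FPU state, and XCR0 using the raw pointers in `target`
//         do_switch_context(target);
//     }
//     // On the resume path, finish_switch re-enqueues the previous task and
//     // drops any dead task via drain_post_switch_local (see below).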

/// Post-switch cleanup using only LOCAL state: re-enqueue the previous task
/// and optionally extract the task-to-drop for deferred deallocation.
pub(super) fn drain_post_switch_local(
    cpu: &mut SchedulerCpu,
    take_drop: bool,
) -> Option<Arc<Task>> {
    let task_to_drop = if take_drop {
        cpu.task_to_drop.take()
    } else {
        None
    };
    if let Some(task) = cpu.task_to_requeue.take() {
        let class = cpu.class_table.class_for_task(&task);
        cpu.class_rqs.enqueue(class, task);
    }
    task_to_drop
}