//! Scheduler core implementation (strat9_kernel/process/scheduler/core_impl.rs).
1use super::{runtime_ops::idle_task_main, *};
2
3impl Scheduler {
4    /// Create a new scheduler instance
5    pub fn new(cpu_count: usize) -> Self {
6        crate::serial_println!(
7            "[trace][sched] Scheduler::new enter cpu_count={}",
8            cpu_count
9        );
10        let mut cpus = alloc::vec::Vec::new();
11        for cpu_idx in 0..cpu_count {
12            crate::serial_println!(
13                "[trace][sched] Scheduler::new cpu={} create idle begin",
14                cpu_idx
15            );
16            let idle_task = Task::new_kernel_task(idle_task_main, "idle", TaskPriority::Idle)
17                .expect("Failed to create idle task");
18            crate::serial_println!(
19                "[trace][sched] Scheduler::new cpu={} create idle done id={}",
20                cpu_idx,
21                idle_task.id.as_u64()
22            );
23            idle_task.set_sched_policy(crate::process::sched::SchedPolicy::Idle);
24            let mut class_rqs = PerCpuClassRqSet::new();
25            class_rqs.enqueue(crate::process::sched::SchedClassId::Idle, idle_task.clone());
26            cpus.push(SchedulerCpu {
27                class_rqs,
28                current_task: None,
29                current_runtime: crate::process::sched::CurrentRuntime::new(),
30                idle_task,
31                task_to_requeue: None,
32                task_to_drop: None,
33                need_resched: false,
34            });
35            crate::serial_println!("[trace][sched] Scheduler::new cpu={} push done", cpu_idx);
36        }
37        crate::serial_println!(
38            "[trace][sched] Scheduler::new cpus ready len={}",
39            cpus.len()
40        );
41
42        Scheduler {
43            cpus,
44            blocked_tasks: BTreeMap::new(),
45            all_tasks: BTreeMap::new(),
46            task_cpu: BTreeMap::new(),
47            pid_to_task: BTreeMap::new(),
48            tid_to_task: BTreeMap::new(),
49            pid_to_pgid: BTreeMap::new(),
50            pid_to_sid: BTreeMap::new(),
51            pgid_members: BTreeMap::new(),
52            sid_members: BTreeMap::new(),
53            wake_deadlines: BTreeMap::new(),
54            wake_deadline_of: BTreeMap::new(),
55            parent_of: BTreeMap::new(),
56            children_of: BTreeMap::new(),
57            zombies: BTreeMap::new(),
58            class_table: crate::process::sched::SchedClassTable::default(),
59        }
60    }
61
62    /// Performs the member add operation.
63    pub(crate) fn member_add(
64        map: &mut BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
65        key: Pid,
66        task_id: TaskId,
67    ) {
68        crate::serial_println!(
69            "[trace][sched] member_add enter key={} tid={}",
70            key,
71            task_id.as_u64()
72        );
73        let members = map.entry(key).or_default();
74        crate::serial_println!(
75            "[trace][sched] member_add entry key={} len={}",
76            key,
77            members.len()
78        );
79        if !members.iter().any(|id| *id == task_id) {
80            members.push(task_id);
81            crate::serial_println!(
82                "[trace][sched] member_add pushed key={} tid={}",
83                key,
84                task_id.as_u64()
85            );
86        }
87    }
88
89    /// Performs the member remove operation.
90    pub(crate) fn member_remove(
91        map: &mut BTreeMap<Pid, alloc::vec::Vec<TaskId>>,
92        key: Pid,
93        task_id: TaskId,
94    ) {
95        let mut clear = false;
96        if let Some(members) = map.get_mut(&key) {
97            members.retain(|id| *id != task_id);
98            clear = members.is_empty();
99        }
100        if clear {
101            map.remove(&key);
102        }
103    }
104
105    /// Performs the register identity locked operation.
106    pub(crate) fn register_identity_locked(&mut self, task: &Arc<Task>) {
107        let task_id = task.id;
108        let pid = task.pid;
109        let pgid = task.pgid.load(Ordering::Relaxed);
110        let sid = task.sid.load(Ordering::Relaxed);
111        crate::serial_println!(
112            "[trace][sched] register_identity enter tid={} pid={} pgid={} sid={}",
113            task_id.as_u64(),
114            pid,
115            pgid,
116            sid
117        );
118        self.pid_to_pgid.insert(pid, pgid);
119        crate::serial_println!(
120            "[trace][sched] register_identity pid_to_pgid inserted pid={}",
121            pid
122        );
123        self.pid_to_sid.insert(pid, sid);
124        crate::serial_println!(
125            "[trace][sched] register_identity pid_to_sid inserted pid={}",
126            pid
127        );
128        Self::member_add(&mut self.pgid_members, pgid, task_id);
129        Self::member_add(&mut self.sid_members, sid, task_id);
130        crate::serial_println!(
131            "[trace][sched] register_identity done tid={}",
132            task_id.as_u64()
133        );
134    }
135
136    /// Performs the unregister identity locked operation.
137    pub(crate) fn unregister_identity_locked(&mut self, task_id: TaskId, pid: Pid, tid: Tid) {
138        self.pid_to_task.remove(&pid);
139        self.tid_to_task.remove(&tid);
140        if let Some(pgid) = self.pid_to_pgid.remove(&pid) {
141            Self::member_remove(&mut self.pgid_members, pgid, task_id);
142        }
143        if let Some(sid) = self.pid_to_sid.remove(&pid) {
144            Self::member_remove(&mut self.sid_members, sid, task_id);
145        }
146    }
147
148    /// Add a task to the scheduler
149    pub fn add_task(&mut self, task: Arc<Task>) -> Option<usize> {
150        let cpu_index = self.select_cpu_for_task();
151        self.add_task_on_cpu(task, cpu_index)
152    }
153
154    /// Performs the add task with parent operation.
155    pub fn add_task_with_parent(&mut self, task: Arc<Task>, parent: TaskId) -> Option<usize> {
156        let child = task.id;
157        let cpu_index = self.select_cpu_for_task();
158        let ipi = self.add_task_on_cpu(task, cpu_index);
159        self.parent_of.insert(child, parent);
160        self.children_of.entry(parent).or_default().push(child);
161        ipi
162    }
163
164    /// Performs the add task on cpu operation.
165    fn add_task_on_cpu(&mut self, task: Arc<Task>, cpu_index: usize) -> Option<usize> {
166        let task_id = task.id;
167        crate::serial_println!(
168            "[trace][sched] add_task_on_cpu enter tid={} cpu={}",
169            task_id.as_u64(),
170            cpu_index
171        );
172        // SAFETY: We have exclusive access via the scheduler lock
173        unsafe {
174            *task.state.get() = TaskState::Ready;
175        }
176        crate::serial_println!(
177            "[trace][sched] add_task_on_cpu state ready tid={}",
178            task_id.as_u64()
179        );
180
181        crate::serial_println!(
182            "[trace][sched] add_task_on_cpu before clone tid={} all_tasks_len={}",
183            task_id.as_u64(),
184            self.all_tasks.len()
185        );
186        let task_clone = task.clone();
187        crate::serial_println!(
188            "[trace][sched] add_task_on_cpu before all_tasks.insert tid={}",
189            task_id.as_u64()
190        );
191        self.all_tasks.insert(task_id, task_clone);
192        crate::serial_println!(
193            "[trace][sched] add_task_on_cpu all_tasks inserted tid={}",
194            task_id.as_u64()
195        );
196        self.task_cpu.insert(task_id, cpu_index);
197        crate::serial_println!(
198            "[trace][sched] add_task_on_cpu task_cpu inserted tid={}",
199            task_id.as_u64()
200        );
201        self.pid_to_task.insert(task.pid, task_id);
202        crate::serial_println!(
203            "[trace][sched] add_task_on_cpu pid map inserted tid={}",
204            task_id.as_u64()
205        );
206        self.tid_to_task.insert(task.tid, task_id);
207        crate::serial_println!(
208            "[trace][sched] add_task_on_cpu tid map inserted tid={}",
209            task_id.as_u64()
210        );
211        self.register_identity_locked(&task);
212        crate::serial_println!(
213            "[trace][sched] add_task_on_cpu identity registered tid={}",
214            task_id.as_u64()
215        );
216        if let Some(cpu) = self.cpus.get_mut(cpu_index) {
217            let class = self.class_table.class_for_task(&task);
218            cpu.class_rqs.enqueue(class, task);
219            cpu.need_resched = true;
220            crate::serial_println!(
221                "[trace][sched] add_task_on_cpu enqueued tid={} cpu={}",
222                task_id.as_u64(),
223                cpu_index
224            );
225        }
226        sched_trace(format_args!(
227            "enqueue task={} cpu={}",
228            task_id.as_u64(),
229            cpu_index
230        ));
231        if cpu_index != current_cpu_index() {
232            Some(cpu_index)
233        } else {
234            None
235        }
236    }
237
238    /// Performs the clear task wake deadline locked operation.
239    pub fn clear_task_wake_deadline_locked(&mut self, id: TaskId) -> bool {
240        if let Some(task) = self.all_tasks.get(&id) {
241            task.wake_deadline_ns.store(0, Ordering::Relaxed);
242            true
243        } else {
244            false
245        }
246    }
247
248    /// Sets task wake deadline locked.
249    pub fn set_task_wake_deadline_locked(&mut self, id: TaskId, deadline: u64) -> bool {
250        if deadline == 0 {
251            return self.clear_task_wake_deadline_locked(id);
252        }
253        if let Some(task) = self.all_tasks.get(&id) {
254            task.wake_deadline_ns.store(deadline, Ordering::Relaxed);
255            true
256        } else {
257            false
258        }
259    }
260
261    /// Performs the wake task locked operation.
262    ///
263    /// Returns `(was_woken, ipi_cpu)`. The caller must send a resched IPI to
264    /// `ipi_cpu` after releasing the scheduler lock.
265    pub fn wake_task_locked(&mut self, id: TaskId) -> (bool, Option<usize>) {
266        self.clear_task_wake_deadline_locked(id);
267        if let Some(task) = self.blocked_tasks.remove(&id) {
268            // SAFETY: scheduler lock held.
269            unsafe {
270                *task.state.get() = TaskState::Ready;
271            }
272            let cpu_index = self.task_cpu.get(&id).copied().unwrap_or(0);
273            if let Some(cpu) = self.cpus.get_mut(cpu_index) {
274                let class = self.class_table.class_for_task(&task);
275                cpu.class_rqs.enqueue(class, task);
276                cpu.need_resched = true;
277            }
278            let ipi = if cpu_index != current_cpu_index() {
279                Some(cpu_index)
280            } else {
281                None
282            };
283            (true, ipi)
284        } else if let Some(task) = self.all_tasks.get(&id) {
285            task.wake_pending
286                .store(true, core::sync::atomic::Ordering::Release);
287            (true, None)
288        } else {
289            (false, None)
290        }
291    }
292
293    /// Attempts to reap child locked.
294    pub fn try_reap_child_locked(
295        &mut self,
296        parent: TaskId,
297        target: Option<TaskId>,
298    ) -> WaitChildResult {
299        let Some(children_view) = self.children_of.get(&parent) else {
300            return WaitChildResult::NoChildren;
301        };
302
303        if children_view.is_empty() {
304            return WaitChildResult::NoChildren;
305        }
306
307        if let Some(target_id) = target {
308            if !children_view.iter().any(|&id| id == target_id) {
309                return WaitChildResult::NoChildren;
310            }
311        }
312
313        let zombie = children_view
314            .iter()
315            .copied()
316            .find(|id| target.map_or(true, |t| t == *id) && self.zombies.contains_key(id));
317
318        if let Some(child) = zombie {
319            let (status, child_pid) = self.zombies.remove(&child).unwrap_or((0, 0));
320            // Remove from all_tasks now so that pick_next_task (if it races with
321            // reaping) will see was_registered=false and skip cleanup_task_resources.
322            let reaped_task = self.all_tasks.remove(&child);
323            let child_tid = reaped_task.as_ref().map(|t| t.tid);
324            if child_pid != 0 {
325                if let Some(tid) = child_tid {
326                    self.unregister_identity_locked(child, child_pid, tid);
327                }
328            }
329            if let Some(children) = self.children_of.get_mut(&parent) {
330                children.retain(|&id| id != child);
331                if children.is_empty() {
332                    self.children_of.remove(&parent);
333                }
334            }
335            self.parent_of.remove(&child);
336            return WaitChildResult::Reaped {
337                child,
338                pid: child_pid,
339                status,
340            };
341        }
342
343        WaitChildResult::StillRunning
344    }
345
    /// Pick the next task to run on `cpu_index`.
    ///
    /// 1. Re-queues the current Running task (round-robin).
    ///    The **idle task is never re-queued** — it is always available
    ///    as the last-resort fallback.  Keeping it out of the ready queue
    ///    ensures the queue becomes truly empty when there is no real work,
    ///    which lets work-stealing kick in on other CPUs.
    /// 2. Pops from the local ready queue.
    /// 3. Falls back to **work-stealing** from the busiest other CPU.
    /// 4. Falls back to the per-CPU idle task.
    ///
    /// Caller must hold the scheduler lock. On return the chosen task's
    /// state is `Running`, it is installed as `current_task` for
    /// `cpu_index`, and runtime accounting has been reset.
    pub fn pick_next_task(&mut self, cpu_index: usize) -> Arc<Task> {
        // Step 1: take current task and re-queue it if still alive.
        let current_task = self.cpus[cpu_index].current_task.take();
        if let Some(task) = current_task {
            // SAFETY: scheduler lock held; we have exclusive access to state.
            let task_state = unsafe { *task.state.get() };
            if task_state == TaskState::Running {
                unsafe {
                    *task.state.get() = TaskState::Ready;
                }
                // Never re-queue the per-CPU idle task.  It lives in
                // `cpus[cpu_index].idle_task` and is cloned as a fallback
                // below.  Putting it in the ready queue prevents it from
                // ever being empty, which breaks work-stealing.
                if !Arc::ptr_eq(&task, &self.cpus[cpu_index].idle_task) {
                    // DO NOT push to ready_queue yet!
                    // Another CPU could steal it before its context is saved.
                    self.cpus[cpu_index].task_to_requeue = Some(task);
                }
            } else if task_state == TaskState::Dead {
                let task_id = task.id;
                let task_pid = task.pid;
                let task_tid = task.tid;

                // Remove from global maps. `try_reap_child_locked` may have
                // already removed the task; in that case `remove` is a no-op
                // and we must skip `cleanup_task_resources` to avoid running
                // port/silo teardown twice.
                let was_registered = self.all_tasks.remove(&task_id).is_some();
                self.task_cpu.remove(&task_id);
                self.unregister_identity_locked(task_id, task_pid, task_tid);

                if was_registered {
                    // SAFETY: scheduler lock held, task is no longer accessible
                    super::task_ops::cleanup_task_resources(&task);
                }

                // Defer the actual Arc drop to finish_switch()
                self.cpus[cpu_index].task_to_drop = Some(task);
            } else {
                // Blocked task: leave bookkeeping intact; blocked_tasks holds the ref.
                // Dropping the Arc here is fine; the blocked map keeps the task alive.
            }
        }

        // Step 2: local queue, then work-steal, then idle.
        let next_task = if let Some(task) =
            self.cpus[cpu_index].class_rqs.pick_next(&self.class_table)
        {
            task
        } else if let Some(task) = self.steal_task(cpu_index, TICK_COUNT.load(Ordering::Relaxed)) {
            task
        } else {
            self.cpus[cpu_index].idle_task.clone()
        };

        // SAFETY: scheduler lock held.
        unsafe {
            *next_task.state.get() = TaskState::Running;
        }
        // Strong count is captured only for the trace line below — useful
        // when hunting Arc refcount leaks in scheduler paths.
        let cloned = next_task.clone();
        let strong_after = Arc::strong_count(&cloned);
        self.cpus[cpu_index].current_task = Some(cloned);
        // Reset the runtime accounting for the new task
        self.cpus[cpu_index].current_runtime = crate::process::sched::CurrentRuntime::new();
        sched_trace(format_args!(
            "cpu={} pick_next task={} policy={:?} strong={}",
            cpu_index,
            next_task.id.as_u64(),
            next_task.sched_policy(),
            strong_after,
        ));
        next_task
    }
430
431    /// Try to steal one task from the most-loaded other CPU.
432    ///
433    /// We steal from the **back** of the source queue (the task added most
434    /// recently — least likely to have warm cache data on that CPU).
435    /// We only steal when the source has ≥ 2 tasks, so it keeps at least one.
436    pub fn steal_task(&mut self, dst_cpu: usize, now_tick: u64) -> Option<Arc<Task>> {
437        if !cpu_is_valid(dst_cpu) || dst_cpu >= self.cpus.len() {
438            return None;
439        }
440        let last = LAST_STEAL_TICK[dst_cpu].load(Ordering::Relaxed);
441        if now_tick.saturating_sub(last) < STEAL_COOLDOWN_TICKS {
442            return None;
443        }
444
445        let dst_load = {
446            let cpu = &self.cpus[dst_cpu];
447            cpu.class_rqs.runnable_len() + usize::from(cpu.current_task.is_some())
448        };
449
450        // Find the busiest CPU that isn't ourselves.
451        let (best_src, best_src_load) = (0..self.cpus.len())
452            .filter(|&i| i != dst_cpu)
453            .map(|i| {
454                let cpu = &self.cpus[i];
455                let load = cpu.class_rqs.runnable_len() + usize::from(cpu.current_task.is_some());
456                (i, load)
457            })
458            .max_by_key(|(_, load)| *load)?;
459
460        if best_src_load < dst_load.saturating_add(STEAL_IMBALANCE_MIN) {
461            return None;
462        }
463
464        // Only steal if source will still have work left.
465        if self.cpus[best_src].class_rqs.runnable_len() < 2 {
466            return None;
467        }
468
469        let task = self.cpus[best_src]
470            .class_rqs
471            .steal_candidate(&self.class_table)?;
472        // Update the task->CPU mapping so wake/resume route correctly.
473        self.task_cpu.insert(task.id, dst_cpu);
474        log::trace!(
475            "WS: CPU {} stole task {:?} from CPU {} (src had {} tasks)",
476            dst_cpu,
477            task.id,
478            best_src,
479            self.cpus[best_src].class_rqs.runnable_len() + 1
480        );
481        sched_trace(format_args!(
482            "work-steal dst_cpu={} src_cpu={} task={}",
483            dst_cpu,
484            best_src,
485            task.id.as_u64()
486        ));
487        if cpu_is_valid(dst_cpu) {
488            CPU_STEAL_IN_COUNT[dst_cpu].fetch_add(1, Ordering::Relaxed);
489        }
490        if cpu_is_valid(best_src) {
491            CPU_STEAL_OUT_COUNT[best_src].fetch_add(1, Ordering::Relaxed);
492        }
493        LAST_STEAL_TICK[dst_cpu].store(now_tick, Ordering::Relaxed);
494        Some(task)
495    }
496
    /// Prepare a context switch: pick next task, update TSS and CR3,
    /// return raw pointers for `switch_context()`.
    ///
    /// Returns `None` if there's nothing to switch to (same task selected,
    /// or no current task), or if the picked task's saved context failed
    /// validation — in which case all scheduler state mutated by
    /// `pick_next_task` is rolled back (see the recovery block below).
    ///
    /// Caller must hold the scheduler lock.
    pub(super) fn yield_cpu(&mut self, cpu_index: usize) -> Option<SwitchTarget> {
        if cpu_index >= self.cpus.len() {
            return None;
        }
        // Must have a current task to yield from
        let current_ref = self.cpus[cpu_index].current_task.as_ref()?;
        // Sanity-check the Arc refcount before cloning: a zero or
        // absurdly large count indicates memory corruption, and cloning
        // a corrupt Arc would make it worse.
        let strong_before = Arc::strong_count(current_ref);
        if strong_before == 0 || strong_before > (isize::MAX as usize) {
            log::error!(
                "[sched] CORRUPT Arc in yield_cpu before clone! cpu={} strong={:#x} task={} ptr={:p}",
                cpu_index, strong_before,
                current_ref.id.as_u64(),
                Arc::as_ptr(current_ref) as *const u8,
            );
            return None;
        }
        let current = current_ref.clone();

        // Pick the next task
        let next = self.pick_next_task(cpu_index);

        // Don't switch to ourselves
        if Arc::ptr_eq(&current, &next) {
            return None;
        }
        if cpu_is_valid(cpu_index) {
            CPU_SWITCH_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
        }

        if let Err(e) = validate_task_context(&next) {
            // Do NOT panic here — the scheduler lock is held, which would
            // deadlock the panic hook (current_task_clone → SCHEDULER.lock()).
            let bad_rsp = unsafe { (*next.context.get()).saved_rsp };
            let stk_base = next.kernel_stack.virt_base.as_u64();
            let stk_top = stk_base + next.kernel_stack.size as u64;
            crate::serial_println!(
                "[sched] WARN: invalid ctx for task '{}' (id={}) cpu={}: {} \
                 rsp={:#x} stack=[{:#x}..{:#x}] - restoring current task",
                next.name,
                next.id.as_u64(),
                cpu_index,
                e,
                bad_rsp,
                stk_base,
                stk_top,
            );
            log::error!(
                "[sched] invalid context for task '{}' (id={:?}) cpu={}: {} \
                 rsp={:#x} stack=[{:#x}..{:#x}]",
                next.name,
                next.id,
                cpu_index,
                e,
                bad_rsp,
                stk_base,
                stk_top,
            );

            // ── Restore scheduler invariants ────────────────────────────────────────
            // `pick_next_task` already mutated state before we reached this point:
            //   1. Took `current_task`, set its state to Ready, placed it in
            //      `task_to_requeue` (non-idle tasks only).
            //   2. Set `current_task = Some(next_clone)` with next.state = Running.
            //
            // We are NOT performing the actual context switch. Both mutations must be
            // undone so the truly running task stays scheduled and no Arc reference
            // is silently lost (which would lead to corrupted saved_rsp on the next
            // timer tick when the scheduler tries to save context for the wrong task).

            // (a) Is `next` the per-CPU idle fallback (accessed via clone, never in
            //     class_rqs)?  We need this before any borrow of self.cpus.
            let is_idle_fallback = Arc::ptr_eq(&next, &self.cpus[cpu_index].idle_task);

            // (b) If pick_next_task moved a Dead task into task_to_drop, we are NOT
            //     performing the switch so finish_switch() will never run.  Drop the
            //     Arc here (inside the lock) as a last resort.  cleanup_task_resources
            //     was already called by pick_next_task, so the only allocation cost is
            //     the final Arc refcount decrement + deallocation of the Task struct.
            drop(self.cpus[cpu_index].task_to_drop.take());

            // (c) Restore the old running task as current_task.
            //     Non-idle current was placed in task_to_requeue by pick_next_task.
            //     Idle current was not (it is accessed via idle_task.clone()), so use
            //     the `current` Arc cloned at the start of yield_cpu.
            if let Some(prev) = self.cpus[cpu_index].task_to_requeue.take() {
                // SAFETY: scheduler lock held; exclusive access to task state.
                unsafe {
                    *prev.state.get() = TaskState::Running;
                }
                self.cpus[cpu_index].current_task = Some(prev);
            } else {
                // SAFETY: scheduler lock held.
                unsafe {
                    *current.state.get() = TaskState::Running;
                }
                self.cpus[cpu_index].current_task = Some(current.clone());
            }

            // (d) If `next` came from the class run queue (non-idle), re-enqueue it
            //     so it can be retried on the next tick.  Its context may be
            //     transiently invalid (e.g. stack not yet committed by a new task);
            //     it will be validated again when next picked.
            if !is_idle_fallback {
                // SAFETY: scheduler lock held.
                unsafe {
                    *next.state.get() = TaskState::Ready;
                }
                let class = self.class_table.class_for_task(&next);
                if let Some(cpu) = self.cpus.get_mut(cpu_index) {
                    cpu.class_rqs.enqueue(class, next);
                }
            }
            // For the idle fallback: state was set to Running by pick_next_task but the
            // idle task lives in idle_task.clone() not in class_rqs, so no re-enqueue
            // is needed.  The next pick_next_task will reset its state correctly.

            return None;
        }

        // Update TSS.rsp0 for the new task (needed for Ring 3 -> Ring 0 transitions)
        let stack_top = next.kernel_stack.virt_base.as_u64() + next.kernel_stack.size as u64;
        crate::arch::x86_64::tss::set_kernel_stack(x86_64::VirtAddr::new(stack_top));

        // Update SYSCALL kernel RSP for the new task
        crate::arch::x86_64::syscall::set_kernel_rsp(stack_top);

        // Switch CR3 if the new task has a different address space
        // SAFETY: The new task's address space has a valid PML4 with the kernel half mapped.
        unsafe {
            (*next.process.address_space.get()).switch_to();
        }

        Some(SwitchTarget {
            old_rsp_ptr: unsafe { &raw mut (*current.context.get()).saved_rsp },
            new_rsp_ptr: unsafe { &raw const (*next.context.get()).saved_rsp },
            old_fpu_ptr: current.fpu_state.get() as *mut u8,
            new_fpu_ptr: next.fpu_state.get() as *const u8,
            old_xcr0: current
                .xcr0_mask
                .load(core::sync::atomic::Ordering::Relaxed),
            new_xcr0: next.xcr0_mask.load(core::sync::atomic::Ordering::Relaxed),
        })
    }
645
646    /// Performs the select cpu for task operation.
647    fn select_cpu_for_task(&self) -> usize {
648        if (self as *const Self).is_null() {
649            return 0;
650        }
651        let mut best = 0usize;
652        let mut best_load = usize::MAX;
653        for (idx, cpu) in self.cpus.iter().enumerate() {
654            let mut load = cpu.class_rqs.runnable_len();
655            if let Some(current) = cpu.current_task.as_ref() {
656                if self.class_table.class_for_task(current)
657                    != crate::process::sched::SchedClassId::Idle
658                {
659                    load += 1;
660                }
661            }
662            if load < best_load {
663                best = idx;
664                best_load = load;
665            }
666        }
667        crate::serial_println!(
668            "[trace][sched] select_cpu_for_task best={} load={}",
669            best,
670            best_load
671        );
672        best
673    }
674
675    /// Performs the migrate ready tasks for new class table operation.
676    pub fn migrate_ready_tasks_for_new_class_table(&mut self) {
677        let mut ready: Vec<(TaskId, Arc<Task>, usize)> = Vec::new();
678        for (id, task) in self.all_tasks.iter() {
679            // SAFETY: scheduler lock is held by caller; task state is synchronized by scheduler.
680            let state = unsafe { *task.state.get() };
681            if state != TaskState::Ready {
682                continue;
683            }
684            let cpu = self.task_cpu.get(id).copied().unwrap_or(0);
685            ready.push((*id, task.clone(), cpu));
686        }
687
688        for (id, task, cpu_idx) in ready {
689            let Some(cpu) = self.cpus.get_mut(cpu_idx) else {
690                continue;
691            };
692            if cpu.class_rqs.remove(id) {
693                let class = self.class_table.class_for_task(&task);
694                cpu.class_rqs.enqueue(class, task);
695                cpu.need_resched = true;
696            }
697        }
698    }
699}