strat9_kernel/process/scheduler/runtime_ops.rs

use super::*;

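// One-shot diagnostic budget for `finish_interrupt_switch`: each traced call
// decrements the counter via `checked_sub`, and once it reaches zero the
// `[ifs-*]` E9 traces fall silent so steady-state preemption stays quiet.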
static FINISH_INTERRUPT_TRACE_BUDGET: core::sync::atomic::AtomicU32 =
    core::sync::atomic::AtomicU32::new(32);

/// Initialize the scheduler
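///
/// Boot-flow sketch (the exact call sites live in `kernel_main` / AP bring-up,
/// not in this file; `first_task` is a placeholder `Arc<Task>`):
///
/// ```ignore
/// init_scheduler();        // BSP: build global + per-CPU scheduler state
/// add_task(first_task);    // queue the initial runnable task(s)
/// schedule();              // pick the first task and jump to it; never returns
/// ```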
pub fn init_scheduler() {
    // Build scheduler state only for CPUs that are actually online. Using the
    // registered per-CPU count here can strand runnable tasks on AP slots that
    // never reached the scheduler gate.
    let cpu_count = crate::arch::x86_64::smp::cpu_count().max(1);
    crate::serial_println!(
        "[trace][sched] init_scheduler enter cpu_count={}",
        cpu_count
    );
    // Build the scheduler outside the global scheduler lock to avoid
    // lock-order inversions (`GLOBAL_SCHED_STATE -> allocator`) during task/stack
    // allocation in `GlobalSchedState::new`.
    let new_sched = GlobalSchedState::new();
    // Initialize per-CPU scheduler state for each active CPU.
    for i in 0..cpu_count {
        let cpu_sched = super::core_impl::create_cpu_scheduler(i);
        *LOCAL_SCHEDULERS[i].lock() = Some(cpu_sched);
    }
    crate::serial_println!("[trace][sched] init_scheduler new() done");

    // Race/corruption diagnostic: register scheduler lock for E9 LOCK-A/LOCK-R traces.
    crate::sync::debug_set_trace_lock_addr(debug_scheduler_lock_addr());

    let mut scheduler = GLOBAL_SCHED_STATE.lock();
    *scheduler = Some(new_sched);
    drop(scheduler); // Release the lock

    // Only initialize legacy PIT if APIC timer is not active
    if !timer::is_apic_timer_active() {
        timer::init_pit(100); // 100Hz = 10ms interval for quantum
        log::info!("Scheduler: using legacy PIT timer (100Hz)");
    } else {
        log::info!("Scheduler: using APIC timer (100Hz)");
    }
    crate::serial_println!("[trace][sched] init_scheduler exit");
}

/// Add a task to the scheduler
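///
/// Minimal call-site sketch (the task handle here is a placeholder; real
/// callers obtain it from task/process creation):
///
/// ```ignore
/// let task: Arc<Task> = new_kernel_task; // placeholder Arc<Task>
/// add_task(task); // may send a resched IPI to the CPU that received the task
/// ```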
pub fn add_task(task: Arc<Task>) {
    let tid = task.id;
    crate::serial_force_println!(
        "[trace][sched] add_task enter tid={} name={}",
        tid.as_u64(),
        task.name
    );
    crate::serial_force_println!(
        "[trace][sched] lock addrs sched={:#x} slab={:#x} buddy={:#x}",
        crate::process::scheduler::debug_scheduler_lock_addr(),
        crate::memory::heap::debug_slab_lock_addr(),
        crate::memory::buddy::debug_buddy_lock_addr()
    );
    let mut spins = 0usize;
    let mut scheduler = loop {
        if let Some(guard) = GLOBAL_SCHED_STATE.try_lock() {
            break guard;
        }
        spins = spins.saturating_add(1);
        if spins == 2_000_000 {
            crate::serial_force_println!(
                "[trace][sched] add_task waiting lock tid={} owner_cpu={}",
                tid.as_u64(),
                GLOBAL_SCHED_STATE.owner_cpu()
            );
            spins = 0;
        }
        core::hint::spin_loop();
    };
    crate::serial_force_println!("[trace][sched] add_task lock acquired tid={}", tid.as_u64());
    let ipi_to_cpu = if let Some(ref mut sched) = *scheduler {
        crate::serial_force_println!(
            "[trace][sched] add_task scheduler present tid={}",
            tid.as_u64()
        );
        let ipi = sched.add_task(task);
        crate::serial_force_println!("[trace][sched] add_task done tid={}", tid.as_u64());
        ipi
    } else {
        crate::serial_force_println!(
            "[trace][sched] add_task scheduler missing tid={}",
            tid.as_u64()
        );
        None
    };
    drop(scheduler);
    if let Some(ci) = ipi_to_cpu {
        send_resched_ipi_to_cpu(ci);
    }
}

/// Add a task and register a parent/child relation.
pub fn add_task_with_parent(task: Arc<Task>, parent: TaskId) {
    let ipi_to_cpu = {
        let mut scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref mut sched) = *scheduler {
            sched.add_task_with_parent(task, parent)
        } else {
            None
        }
    };
    if let Some(ci) = ipi_to_cpu {
        send_resched_ipi_to_cpu(ci);
    }
}

/// Start the scheduler (called from kernel_main)
///
/// Picks the first task and starts running it. Never returns.
pub fn schedule() -> ! {
    let cpu_index = current_cpu_index();
    schedule_on_cpu(cpu_index)
}

/// Start scheduling on the given CPU: pick its first task and jump to it. Never returns.
pub fn schedule_on_cpu(cpu_index: usize) -> ! {
    crate::e9_println!("BD-ENTER cpu={}", cpu_index);
    // Disable interrupts for the entire critical section.
    //
    // On the BSP, IF may be 1 (interrupts were enabled in Phase 9).
    // Without CLI, a timer interrupt between `pick_next_task` (which sets
    // `current_task`) and `restore_first_task` would let `maybe_preempt()`
    // call `switch_context` on the *init stack*, corrupting the task's
    // `saved_rsp` and creating an infinite loop.
    //
    // APs already arrive here with IF=0 (from the trampoline), but the
    // explicit CLI makes the contract clear for all callers.
    //
    // Interrupts are re-enabled by the RFLAGS seed (0x202, IF=1) stored in
    // each task's bootstrap interrupt frame, so no explicit sti() is needed.
    crate::arch::x86_64::cli();

    // APs may arrive here before the BSP has called init_scheduler().
    // Spin-wait (releasing the lock each iteration) until the scheduler
    // is initialized, then pick the first task.
    let mut wait_iters: u64 = 0;
    let first_task = loop {
        let scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref _sched) = *scheduler {
            if wait_iters > 0 {
                crate::e9_println!("BD first_task cpu={} waited={}", cpu_index, wait_iters);
            }
            drop(scheduler);
            // Pick first task via LOCAL per-CPU state.
            let idx = if cpu_index < active_cpu_count() {
                cpu_index
            } else {
                0
            };
            let mut local = LOCAL_SCHEDULERS[idx].lock();
            if let Some(ref mut cpu) = *local {
                break super::core_impl::pick_next_task_local(cpu, idx);
            }
            // LOCAL not ready yet: drop and spin.
            drop(local);
            wait_iters = wait_iters.saturating_add(1);
            if wait_iters == 1 || (wait_iters % 1_000_000 == 0 && wait_iters > 0) {
                crate::e9_println!("BD-WAIT-LOCAL cpu={} iters={}", cpu_index, wait_iters);
            }
            core::hint::spin_loop();
            continue;
        }
        // Drop lock before spinning so the BSP can initialize the scheduler.
        drop(scheduler);
        wait_iters = wait_iters.saturating_add(1);
        if wait_iters == 1 || (wait_iters % 1_000_000 == 0 && wait_iters > 0) {
            crate::e9_println!("BD-WAIT cpu={} iters={}", cpu_index, wait_iters);
        }
        core::hint::spin_loop();
    }; // Lock is released here before jumping to first task
    super::task_ops::flush_deferred_silo_cleanups();

    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu first_task cpu={} tid={} name={} rsp={:#x} kstack=[{:#x}..{:#x}]",
        cpu_index,
        first_task.id.as_u64(),
        first_task.name,
        unsafe { (*first_task.context.get()).saved_rsp },
        first_task.kernel_stack.virt_base.as_u64(),
        first_task.kernel_stack.virt_base.as_u64() + first_task.kernel_stack.size as u64,
    );

    // Set TSS.rsp0 and SYSCALL kernel RSP for the first task
    {
        let stack_top =
            first_task.kernel_stack.virt_base.as_u64() + first_task.kernel_stack.size as u64;
        crate::arch::x86_64::tss::set_kernel_stack(x86_64::VirtAddr::new(stack_top));
        crate::arch::x86_64::syscall::set_kernel_rsp(stack_top);
        crate::serial_force_println!(
            "[trace][sched] schedule_on_cpu stacks set cpu={} rsp0={:#x}",
            cpu_index,
            stack_top
        );
    }

    // Switch to the first task's address space (no-op for kernel tasks)
    // SAFETY: The first task's address space is valid (kernel AS at boot).
    if let Err(e) = validate_task_context(&first_task) {
        panic!(
            "scheduler: invalid first task '{}' (id={:?}): {}",
            first_task.name, first_task.id, e
        );
    }
    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu first_task ctx valid cpu={} tid={}",
        cpu_index,
        first_task.id.as_u64()
    );
    unsafe {
        first_task.process.address_space_arc().switch_to();
    }
    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu switch_to done cpu={} tid={}",
        cpu_index,
        first_task.id.as_u64()
    );

    // Jump to the first task (never returns)
    // SAFETY: The context was set up by CpuContext::new with a valid stack frame.
    // Interrupts are disabled; the trampoline's `sti` re-enables them.
    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu restore_first_task cpu={} tid={}",
        cpu_index,
        first_task.id.as_u64()
    );
    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu calling do_restore_first_task cpu={} tid={} rsp={:#x}",
        cpu_index,
        first_task.id.as_u64(),
        unsafe { (*first_task.context.get()).saved_rsp }
    );
    unsafe {
        // Pass the stack frame pointer (saved_rsp points TO the frame, not the context struct)
        let frame_ptr = (*first_task.context.get()).saved_rsp as *const u64;
        crate::process::task::do_restore_first_task(
            frame_ptr,
            first_task.fpu_state.get() as *const u8,
            first_task
                .xcr0_mask
                .load(core::sync::atomic::Ordering::Relaxed),
        );
    }
}

/// Called immediately after a context switch completes (in the new task's context).
/// This safely re-queues the previously running task now that its state is fully saved.
///
/// Mirrors Redox's `switch_finish_hook`: minimal work, no serial output (avoids lock contention).
pub fn finish_switch() {
    let _perf = super::perf_counters::PerfScope::new(
        &super::perf_counters::CTX_SWITCH_TSC,
        &super::perf_counters::CTX_SWITCH_COUNT,
    );
    let cpu_index = current_cpu_index();
    let mut task_to_drop = None;
    {
        // Use the LOCAL lock: no spinning on GLOBAL_SCHED_STATE needed.
        let mut spins = 0usize;
        let mut guard = loop {
            if let Some(g) = LOCAL_SCHEDULERS[cpu_index].try_lock_no_irqsave() {
                break g;
            }
            spins = spins.saturating_add(1);
            if spins % 1_000_000 == 0 {
                unsafe { core::arch::asm!("mov al, 'W'; out 0xe9, al", out("al") _) };
            }
            core::hint::spin_loop();
        };
        if let Some(ref mut cpu) = *guard {
            // Activate the address space for the current task on this CPU.
            if let Some(ref task) = cpu.current_task {
                unsafe { task.process.address_space_arc().switch_to() };
            }
            task_to_drop = super::core_impl::drain_post_switch_local(cpu, true);
        }
    }

    core::sync::atomic::fence(core::sync::atomic::Ordering::SeqCst);
    super::task_ops::flush_deferred_silo_cleanups();
    drop(task_to_drop);
}

/// Finalize a preemption-driven switch once the raw timer stub has already
/// moved onto the next task's kernel stack.
///
/// This mirrors Redox's `switch_finish_hook` and Maestro's `switch_finish`:
/// requeue/drop of the old task happens only after the architectural switch is
/// complete, so another CPU cannot steal the old task while its FPU/stack state
/// is still in flight.
pub fn finish_interrupt_switch() {
    let _perf = super::perf_counters::PerfScope::new(
        &super::perf_counters::CTX_SWITCH_TSC,
        &super::perf_counters::CTX_SWITCH_COUNT,
    );
    let cpu_index = current_cpu_index();
    let should_trace = FINISH_INTERRUPT_TRACE_BUDGET
        .fetch_update(
            core::sync::atomic::Ordering::AcqRel,
            core::sync::atomic::Ordering::Relaxed,
            |budget| budget.checked_sub(1),
        )
        .is_ok();
    let entry_rsp0 = crate::arch::x86_64::tss::kernel_stack_for(cpu_index)
        .map(|addr| addr.as_u64())
        .unwrap_or(0);
    if should_trace {
        crate::e9_println!("[ifs-enter] cpu={} rsp0={:#x}", cpu_index, entry_rsp0);
    }

    // Spin until this CPU's LOCAL scheduler lock is available (it is released by
    // maybe_preempt_from_interrupt before returning to the assembly stub), so there
    // is no spinning on GLOBAL_SCHED_STATE. We must not block with IRQs enabled
    // because we are still inside the timer interrupt handler; try_lock_no_irqsave
    // requires IRQs already disabled, which is guaranteed here.
    //
    // The spin is bounded: if the lock is not released within MAX_IFS_SPINS
    // iterations (a few seconds on a recent CPU), something is fundamentally
    // broken (holder deadlocked, or lock corruption). Panic so we get a stack
    // trace instead of a silent hang.
    const MAX_IFS_SPINS: usize = 50_000_000;
    let mut task_to_drop = None;
    let mut spins = 0usize;
    loop {
        if let Some(mut guard) = LOCAL_SCHEDULERS[cpu_index].try_lock_no_irqsave() {
            if let Some(ref mut cpu) = *guard {
                // REQUEUE OLD TASK FIRST (while current AS is still active/stable)
                task_to_drop = super::core_impl::drain_post_switch_local(cpu, false);

                // NOW SWITCH TO NEW ADDRESS SPACE
                if let Some(ref task) = cpu.current_task {
                    let task_stack_top =
                        task.kernel_stack.virt_base.as_u64() + task.kernel_stack.size as u64;
                    if should_trace {
                        crate::e9_println!(
                            "[ifs-task] cpu={} tid={} rsp0={:#x} expected={:#x}",
                            cpu_index,
                            task.id.as_u64(),
                            entry_rsp0,
                            task_stack_top
                        );
                    }
                    if should_trace && entry_rsp0 != 0 && entry_rsp0 != task_stack_top {
                        crate::e9_println!(
                            "[ifs-rsp0-mismatch] cpu={} tid={} rsp0={:#x} expected={:#x}",
                            cpu_index,
                            task.id.as_u64(),
                            entry_rsp0,
                            task_stack_top
                        );
                    }
                    unsafe { task.process.address_space_arc().switch_to() };
                }
            }
            break;
        }
        spins = spins.saturating_add(1);
        if spins >= MAX_IFS_SPINS {
            crate::e9_println!(
                "[BUG] finish_interrupt_switch: LOCAL lock not released after {} spins, cpu={}",
                spins,
                cpu_index
            );
            panic!(
                "finish_interrupt_switch: LOCAL lock stuck after {} spins on cpu {}",
                spins, cpu_index
            );
        }
        core::hint::spin_loop();
    }
    let _ = task_to_drop;
}

/// Yield the current task to allow other tasks to run (cooperative).
///
/// Disables interrupts around the scheduler lock to prevent deadlock
/// with the timer handler's `maybe_preempt()`.
///
/// Returns immediately (no-op) if preemption is disabled on this CPU.
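///
/// Cooperative-wait sketch (hypothetical caller; `ready()` stands in for any
/// kernel-side condition check):
///
/// ```ignore
/// while !ready() {
///     yield_task(); // no-op while a PreemptGuard is held
/// }
/// ```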
pub fn yield_task() {
    // Respect the preemption guard: if a `PreemptGuard` is held, do nothing.
    if !percpu::is_preemptible() {
        return;
    }
    let _perf = super::perf_counters::PerfScope::new(
        &super::perf_counters::SCHED_YIELD_TSC,
        &super::perf_counters::SCHED_YIELD_COUNT,
    );

    // Save RFLAGS and disable interrupts to prevent timer from
    // trying to lock the scheduler while we hold it
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();

    let switch_target = {
        let mut local = LOCAL_SCHEDULERS[cpu_index].lock();
        if let Some(ref mut cpu) = *local {
            super::core_impl::yield_cpu_local(cpu, cpu_index)
        } else {
            None
        }
    }; // Lock released here, before the actual context switch

    if let Some(ref target) = switch_target {
        // SAFETY: Pointers are valid (they point into Arc<Task> contexts
        // kept alive by the scheduler). Interrupts are disabled.
        unsafe {
            crate::process::task::do_switch_context(target);
        }
        finish_switch();
    }

    restore_flags(saved_flags);
}

/// Force a context switch away from the current task, unconditionally.
///
/// Unlike [`yield_task`], this function **ignores the preemption guard**.
/// It must only be called from [`super::task_ops::exit_current_task`] after:
///
/// 1. The task has been marked [`TaskState::Dead`].
/// 2. All scheduler locks have been released.
/// 3. No spinlock-guarded per-CPU data is being accessed by this task.
///
/// At that point the preempt_count is irrelevant: the task will never run
/// again, so bypassing the guard is both safe and necessary to prevent the
/// dead task from spinning in a `hlt()` loop.
pub fn yield_dead_task() {
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();

    let switch_target = {
        let mut local = LOCAL_SCHEDULERS[cpu_index].lock();
        if let Some(ref mut cpu) = *local {
            super::core_impl::yield_cpu_local(cpu, cpu_index)
        } else {
            None
        }
    }; // Lock released before the context switch.

    if let Some(ref target) = switch_target {
        // SAFETY: Pointers are valid (Arc<Task> contexts kept alive by the
        // scheduler).  Interrupts are disabled via save_flags_and_cli().
        unsafe {
            crate::process::task::do_switch_context(target);
        }
        finish_switch();
    }

    restore_flags(saved_flags);
}

#[inline]
fn interrupt_frame_fits(task: &Arc<Task>, rsp: u64) -> bool {
    let stack_base = task.kernel_stack.virt_base.as_u64();
    let stack_top = stack_base + task.kernel_stack.size as u64;
    let frame_size = core::mem::size_of::<crate::syscall::SyscallFrame>() as u64;
    rsp >= stack_base && rsp.saturating_add(frame_size) <= stack_top
}

/// Called from the timer interrupt handler (or a resched IPI) to potentially
/// preempt the current task.
///
/// This is safe to call from interrupt context because:
/// 1. IF is already cleared by the CPU when entering the interrupt.
/// 2. We use `try_lock()` - if the scheduler is already locked
///    (e.g., `yield_task()` is in progress), we simply skip preemption
///    for this tick.
/// 3. We honour the `PreemptGuard`: if preemption is disabled, we return.
pub fn maybe_preempt() {
    let _perf = super::perf_counters::PerfScope::new(
        &super::perf_counters::SCHED_PREEMPT_TSC,
        &super::perf_counters::SCHED_PREEMPT_COUNT,
    );
    let cpu_index = current_cpu_index();
    if cpu_is_valid(cpu_index) {
        RESCHED_IPI_PENDING[cpu_index].store(false, Ordering::Release);
    }

    // Honour the preemption guard - never preempt a section that asked for it.
    if !percpu::is_preemptible() {
        return;
    }

    // Use the per-CPU LOCAL lock: never blocked by another CPU's cold-path
    // operations (fork, exit, wake) that hold GLOBAL_SCHED_STATE.
    let switch_target = {
        let mut guard = match LOCAL_SCHEDULERS[cpu_index].try_lock_no_irqsave() {
            Some(g) => g,
            None => {
                note_try_lock_fail_on_cpu(cpu_index);
                return;
            }
        };
        let cpu = match guard.as_mut() {
            Some(c) => c,
            None => return,
        };
        if take_force_resched_hint(cpu_index) {
            cpu.need_resched = true;
        }
        if cpu.current_task.is_none() || !cpu.need_resched {
            return;
        }
        if let Some(current) = cpu.current_task.as_ref() {
            sched_trace(format_args!(
                "cpu={} preempt request task={} rt_delta={}",
                cpu_index,
                current.id.as_u64(),
                cpu.current_runtime.period_delta_ticks
            ));
        }
        cpu.need_resched = false;
        super::core_impl::yield_cpu_local(cpu, cpu_index)
    }; // LOCAL lock released here

    if let Some(ref target) = switch_target {
        if cpu_is_valid(cpu_index) {
            // One-shot per-CPU: trace the very first real preemption.
            // NOTE: do NOT acquire GLOBAL_SCHED_STATE here: we are between the lock
            // release (end of the block above) and do_switch_context. A
            // nested try_lock in this window re-enters the guardian (CLI +
            // CAS) on a CPU that is about to switch stacks, producing a
            // spurious second "locked_raw=true" observation in finish_switch
            // diagnostics and, if the lock happens to be free, a redundant
            // owner_cpu store on the wrong context.
            if !FIRST_PREEMPT_LOGGED[cpu_index].swap(true, Ordering::Relaxed) {
                let _preempt_n = CPU_PREEMPT_COUNT[cpu_index].load(Ordering::Relaxed);
            }
            CPU_PREEMPT_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
        }
        unsafe {
            crate::process::task::do_switch_context(target);
        }
        finish_switch();
    }
}

/// Interrupt-aware preemption path.
///
/// Unlike the legacy `ret`-based scheduler path, the full interrupted user
/// context is already materialized as a `SyscallFrame` on the current kernel
/// stack. This lets us save the outgoing task immediately, select the next
/// runnable task under the scheduler lock, and return an `iretq`-compatible
/// frame pointer for the raw timer stub.
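///
/// Consumption sketch (how a raw timer stub might use the returned decision;
/// the real stub lives in the arch interrupt code, and the field semantics are
/// taken from this function's body):
///
/// ```ignore
/// if let Some(d) = maybe_preempt_from_interrupt(cpu, frame) {
///     // d.next_rsp == 0  => no stack pivot; just save/restore d.old_fpu / d.new_fpu
///     // d.next_rsp != 0  => save FPU to d.old_fpu, load d.new_fpu, pivot to the
///     //                     frame at d.next_rsp, then call finish_interrupt_switch()
/// }
/// ```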
pub fn maybe_preempt_from_interrupt(
    cpu_index: usize,
    current_frame: &mut crate::syscall::SyscallFrame,
) -> Option<crate::arch::x86_64::idt::InterruptReturnDecision> {
    if cpu_is_valid(cpu_index) {
        RESCHED_IPI_PENDING[cpu_index].store(false, Ordering::Release);
    }

    if !percpu::is_preemptible() {
        return None;
    }

    let current_frame_rsp = current_frame as *mut crate::syscall::SyscallFrame as u64;
    let mut _task_to_drop: Option<Arc<Task>> = None;

    let decision = {
        // Use the per-CPU LOCAL lock: not blocked by cold-path global operations.
        let mut guard = match LOCAL_SCHEDULERS[cpu_index].try_lock_no_irqsave() {
            Some(g) => g,
            None => {
                note_try_lock_fail_on_cpu(cpu_index);
                return None;
            }
        };
        let cpu = match guard.as_mut() {
            Some(c) => c,
            None => return None,
        };

        if take_force_resched_hint(cpu_index) {
            cpu.need_resched = true;
        }
        if cpu.current_task.is_none() || !cpu.need_resched {
            return None;
        }

        unsafe { core::arch::asm!("mov al, '1'; out 0xe9, al", out("al") _) };
        let current = match cpu.current_task.as_ref() {
            Some(t) => t.clone(),
            None => {
                unsafe { core::arch::asm!("mov al, 'X'; out 0xe9, al", out("al") _) };
                return None;
            }
        };
        unsafe { core::arch::asm!("mov al, '2'; out 0xe9, al", out("al") _) };
        current.set_resume_kind(crate::process::task::ResumeKind::IretFrame);
        current.set_interrupt_rsp(current_frame_rsp);

        let next = super::core_impl::pick_next_task_local(cpu, cpu_index);

        if Arc::ptr_eq(&current, &next) {
            cpu.need_resched = false;
            _task_to_drop = cpu.task_to_drop.take();
            // No context switch: return current task's FPU area for save/restore.
            let current_fpu = current.fpu_state.get() as *mut u8;
            Some(crate::arch::x86_64::idt::InterruptReturnDecision {
                next_rsp: 0,
                old_fpu: current_fpu,
                new_fpu: current_fpu,
            })
        } else {
            let mut next_rsp = next.interrupt_rsp();
            if next.resume_kind() == crate::process::task::ResumeKind::RetFrame {
                // All tasks (kernel and ELF user tasks) start their first execution
                // in Ring 0 via the task_entry_trampoline. We must seed a kernel
                // interrupt frame so that the interrupt return path (iretq) can
                // safely jump to the trampoline.
                next.seed_kernel_interrupt_frame_from_context();
                next_rsp = next.interrupt_rsp();
            }
            let fits = interrupt_frame_fits(&next, next_rsp);
            if next_rsp == 0 || !fits {
                unsafe { core::arch::asm!("mov al, 'A'; out 0xe9, al", out("al") _) };
                let is_idle_fallback = Arc::ptr_eq(&next, &cpu.idle_task);
                _task_to_drop = cpu.task_to_drop.take();

                if let Some(prev) = cpu.task_to_requeue.take() {
                    prev.set_state(TaskState::Running);
                    cpu.current_task = Some(prev);
                } else {
                    current.set_state(TaskState::Running);
                    cpu.current_task = Some(current.clone());
                }

                if !is_idle_fallback {
                    next.set_state(TaskState::Ready);
                    let class = cpu.class_table.class_for_task(&next);
                    cpu.class_rqs.enqueue(class, next);
                }
                // Abort switch: return current task's FPU area for save/restore.
                let current_fpu = current.fpu_state.get() as *mut u8;
                return Some(crate::arch::x86_64::idt::InterruptReturnDecision {
                    next_rsp: 0,
                    old_fpu: current_fpu,
                    new_fpu: current_fpu,
                });
            } else {
                next.set_resume_kind(crate::process::task::ResumeKind::IretFrame);
                cpu.need_resched = false;
                // Do NOT drain `task_to_requeue` / `task_to_drop` here.
                // The raw timer stub still has to save the outgoing FPU state and
                // pivot onto the next task's stack. Defer that finalization to
                // `finish_interrupt_switch()` on the new stack.

                let stack_top =
                    next.kernel_stack.virt_base.as_u64() + next.kernel_stack.size as u64;

                crate::arch::x86_64::tss::set_kernel_stack(x86_64::VirtAddr::new(stack_top));
                crate::arch::x86_64::syscall::set_kernel_rsp(stack_top);

                let old_fpu = current.fpu_state.get() as *mut u8;
                let new_fpu = next.fpu_state.get() as *const u8;

                Some(crate::arch::x86_64::idt::InterruptReturnDecision {
                    next_rsp,
                    old_fpu,
                    new_fpu,
                })
            }
        }
    }; // LOCAL lock released here

    if decision.is_some() && cpu_is_valid(cpu_index) {
        CPU_PREEMPT_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
    }

    decision
}

/// Enable or disable verbose scheduler tracing.
pub fn set_verbose(enabled: bool) {
    SCHED_VERBOSE.store(enabled, Ordering::Relaxed);
    log::info!(
        "[sched][trace] verbose={}",
        if enabled { "on" } else { "off" }
    );
}

/// Return current verbose tracing state.
pub fn verbose_enabled() -> bool {
    SCHED_VERBOSE.load(Ordering::Relaxed)
}

/// Return the scheduler class-table currently in use.
pub fn class_table() -> crate::process::sched::SchedClassTable {
    let saved_flags = save_flags_and_cli();
    let out = {
        let scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref sched) = *scheduler {
            sched.class_table
        } else {
            crate::process::sched::SchedClassTable::default()
        }
    };
    restore_flags(saved_flags);
    out
}

/// Configure scheduler class pick/steal order at runtime.
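///
/// Round-trip sketch (re-applies the table currently in use; both functions are
/// defined in this module):
///
/// ```ignore
/// let table = class_table();
/// let ok = configure_class_table(table);
/// // `ok` is false if the table fails validation or the scheduler is not
/// // initialized yet.
/// ```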
pub fn configure_class_table(table: crate::process::sched::SchedClassTable) -> bool {
    if !table.validate() {
        return false;
    }
    let saved_flags = save_flags_and_cli();
    let mut ipi_targets = [false; crate::arch::x86_64::percpu::MAX_CPUS];
    let my_cpu = current_cpu_index();
    let applied = {
        let mut scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref mut sched) = *scheduler {
            let prev = sched.class_table;
            sched.class_table = table;
            let n = active_cpu_count();
            // Propagate the new class table to every LOCAL and set need_resched
            // in a single pass to avoid locking each LOCAL multiple times.
            for cpu_idx in 0..n {
                if let Some(ref mut local_cpu) = *LOCAL_SCHEDULERS[cpu_idx].lock() {
                    local_cpu.class_table = table;
                    local_cpu.need_resched = true;
                }
                if cpu_idx != my_cpu && cpu_is_valid(cpu_idx) {
                    ipi_targets[cpu_idx] = true;
                }
            }
            if prev.policy_map() != sched.class_table.policy_map() {
                sched.migrate_ready_tasks_for_new_class_table();
            }
            true
        } else {
            false
        }
    };
    restore_flags(saved_flags);
    for (cpu, send) in ipi_targets.iter().copied().enumerate() {
        if send {
            send_resched_ipi_to_cpu(cpu);
        }
    }
    applied
}

/// Dump per-cpu scheduler queues for tracing/debug.
pub fn log_state(label: &str) {
    let saved_flags = save_flags_and_cli();
    let scheduler = GLOBAL_SCHED_STATE.lock();
    if let Some(ref sched) = *scheduler {
        let pick = sched.class_table.pick_order();
        let steal = sched.class_table.steal_order();
        log::info!(
            "[sched][state] label={} class_table.pick=[{},{},{}] class_table.steal=[{},{}]",
            label,
            pick[0].as_str(),
            pick[1].as_str(),
            pick[2].as_str(),
            steal[0].as_str(),
            steal[1].as_str()
        );
        let n = active_cpu_count();
        for cpu_id in 0..n {
            use crate::process::sched::SchedClassRq;
            let local_guard = LOCAL_SCHEDULERS[cpu_id].lock();
            if let Some(ref cpu) = *local_guard {
                let current = cpu
                    .current_task
                    .as_ref()
                    .map(|t| t.id.as_u64())
                    .unwrap_or(u64::MAX);
                let blocked_len = super::BLOCKED_TASKS.lock().len();
                log::info!(
                    "[sched][state] label={} cpu={} current={} rq_rt={} rq_fair={} rq_idle={} blocked={} need_resched={}",
                    label,
                    cpu_id,
                    current,
                    cpu.class_rqs.real_time.len(),
                    cpu.class_rqs.fair.len(),
                    cpu.class_rqs.idle.len(),
                    blocked_len,
                    cpu.need_resched
                );
            }
        }
    }
    drop(scheduler);
    restore_flags(saved_flags);
}

/// Structured scheduler state snapshot for shell/top/debug tooling.
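///
/// Consumer sketch (field names match the snapshot built below; the logging
/// macro is a stand-in for whatever the shell/top tool actually uses):
///
/// ```ignore
/// let s = state_snapshot();
/// for cpu in 0..s.cpu_count {
///     log::info!("cpu{}: cur={} rt={} fair={} idle={} resched={}",
///         cpu, s.current_task[cpu], s.rq_rt[cpu], s.rq_fair[cpu],
///         s.rq_idle[cpu], s.need_resched[cpu]);
/// }
/// ```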
pub fn state_snapshot() -> SchedulerStateSnapshot {
    let mut out = SchedulerStateSnapshot {
        initialized: false,
        boot_phase: 0,
        cpu_count: 0,
        pick_order: [
            crate::process::sched::SchedClassId::RealTime,
            crate::process::sched::SchedClassId::Fair,
            crate::process::sched::SchedClassId::Idle,
        ],
        steal_order: [
            crate::process::sched::SchedClassId::Fair,
            crate::process::sched::SchedClassId::RealTime,
        ],
        blocked_tasks: 0,
        current_task: [u64::MAX; crate::arch::x86_64::percpu::MAX_CPUS],
        rq_rt: [0; crate::arch::x86_64::percpu::MAX_CPUS],
        rq_fair: [0; crate::arch::x86_64::percpu::MAX_CPUS],
        rq_idle: [0; crate::arch::x86_64::percpu::MAX_CPUS],
        need_resched: [false; crate::arch::x86_64::percpu::MAX_CPUS],
    };

    let saved_flags = save_flags_and_cli();
    {
        let scheduler = GLOBAL_SCHED_STATE.lock();
        if let Some(ref sched) = *scheduler {
            use crate::process::sched::SchedClassRq;
            let cpu_count = active_cpu_count().min(crate::arch::x86_64::percpu::MAX_CPUS);
            out.initialized = true;
            out.boot_phase = if cpu_count > 0 { 2 } else { 1 };
            out.cpu_count = cpu_count;
            out.pick_order = *sched.class_table.pick_order();
            out.steal_order = *sched.class_table.steal_order();
            out.blocked_tasks = super::BLOCKED_TASKS.lock().len();
            for i in 0..cpu_count {
                let local_guard = LOCAL_SCHEDULERS[i].lock();
                if let Some(ref cpu) = *local_guard {
                    out.current_task[i] = cpu
                        .current_task
                        .as_ref()
                        .map(|t| t.id.as_u64())
                        .unwrap_or(u64::MAX);
                    out.rq_rt[i] = cpu.class_rqs.real_time.len();
                    out.rq_fair[i] = cpu.class_rqs.fair.len();
                    out.rq_idle[i] = cpu.class_rqs.idle.len();
                    out.need_resched[i] = cpu.need_resched;
                }
            }
        }
    }
    restore_flags(saved_flags);
    out
}

/// The main function for the idle task
pub(super) extern "C" fn idle_task_main() -> ! {
    let cpu = crate::arch::x86_64::percpu::current_cpu_index();
    crate::serial_force_println!("[trace][sched] idle_task_main start cpu={}", cpu);
    loop {
        // Be explicit on SMP: never rely on inherited IF state.
        // If IF=0, HLT can deadlock that CPU forever.
        crate::arch::x86_64::sti();

        // Halt until next interrupt (saves power, timer will wake us)
        crate::arch::x86_64::hlt();
    }
}