//! strat9_kernel/process/scheduler/runtime_ops.rs
//!
//! Runtime entry points for the per-CPU scheduler: init, task admission,
//! first-task bring-up, yield/preempt paths, and debug/introspection helpers.

use super::*;

3/// Initialize the scheduler
4pub fn init_scheduler() {
5    let cpu_count = percpu::cpu_count().max(1);
6    crate::serial_println!(
7        "[trace][sched] init_scheduler enter cpu_count={}",
8        cpu_count
9    );
10    // Build the scheduler outside the global scheduler lock to avoid
11    // lock-order inversions (`SCHEDULER -> allocator`) during task/stack
12    // allocation in `Scheduler::new`.
13    let new_sched = Scheduler::new(cpu_count);
14    crate::serial_println!("[trace][sched] init_scheduler new() done");
15
16    let mut scheduler = SCHEDULER.lock();
17    *scheduler = Some(new_sched);
18    drop(scheduler); // Release the lock
19
20    // Only initialize legacy PIT if APIC timer is not active
21    if !timer::is_apic_timer_active() {
22        timer::init_pit(100); // 100Hz = 10ms interval for quantum
23        log::info!("Scheduler: using legacy PIT timer (100Hz)");
24    } else {
25        log::info!("Scheduler: using APIC timer (100Hz)");
26    }
27    crate::serial_println!("[trace][sched] init_scheduler exit");
28}
29
/// Add a task to the scheduler
///
/// Acquires the global scheduler lock via a diagnostic spin loop: after
/// roughly 2M failed `try_lock` attempts it logs the lock-owner CPU on the
/// serial port (so a lock-up is visible instead of a silent hang) and keeps
/// spinning. If the scheduler placed the task on a remote CPU, the resched
/// IPI is sent only *after* the lock has been dropped.
pub fn add_task(task: Arc<Task>) {
    let tid = task.id;
    crate::serial_force_println!(
        "[trace][sched] add_task enter tid={} name={}",
        tid.as_u64(),
        task.name
    );
    // Log the lock addresses up front so a later hang on serial output can
    // be correlated with the scheduler/slab/buddy locks in a debugger.
    crate::serial_force_println!(
        "[trace][sched] lock addrs sched={:#x} slab={:#x} buddy={:#x}",
        crate::process::scheduler::debug_scheduler_lock_addr(),
        crate::memory::heap::debug_slab_lock_addr(),
        crate::memory::buddy::debug_buddy_lock_addr()
    );
    // Spin on try_lock (instead of blocking in lock()) with a periodic
    // "still waiting" diagnostic.
    let mut spins = 0usize;
    let mut scheduler = loop {
        if let Some(guard) = SCHEDULER.try_lock() {
            break guard;
        }
        spins = spins.saturating_add(1);
        if spins == 2_000_000 {
            crate::serial_force_println!(
                "[trace][sched] add_task waiting lock tid={} owner_cpu={}",
                tid.as_u64(),
                SCHEDULER.owner_cpu()
            );
            spins = 0; // reset so the warning repeats periodically
        }
        core::hint::spin_loop();
    };
    crate::serial_force_println!("[trace][sched] add_task lock acquired tid={}", tid.as_u64());
    // `Scheduler::add_task` returns the CPU index to kick when the task
    // landed on a remote CPU's runqueue.
    let ipi_to_cpu = if let Some(ref mut sched) = *scheduler {
        crate::serial_force_println!(
            "[trace][sched] add_task scheduler present tid={}",
            tid.as_u64()
        );
        let ipi = sched.add_task(task);
        crate::serial_force_println!("[trace][sched] add_task done tid={}", tid.as_u64());
        ipi
    } else {
        // Scheduler not initialized yet; the task is not enqueued (the Arc
        // is simply dropped when this function returns).
        crate::serial_force_println!(
            "[trace][sched] add_task scheduler missing tid={}",
            tid.as_u64()
        );
        None
    };
    drop(scheduler);
    // Send the IPI only after releasing the lock so the target CPU's
    // interrupt handler does not contend on SCHEDULER while we hold it.
    if let Some(ci) = ipi_to_cpu {
        send_resched_ipi_to_cpu(ci);
    }
}
81
82/// Add a task and register a parent/child relation.
83pub fn add_task_with_parent(task: Arc<Task>, parent: TaskId) {
84    let ipi_to_cpu = {
85        let mut scheduler = SCHEDULER.lock();
86        if let Some(ref mut sched) = *scheduler {
87            sched.add_task_with_parent(task, parent)
88        } else {
89            None
90        }
91    };
92    if let Some(ci) = ipi_to_cpu {
93        send_resched_ipi_to_cpu(ci);
94    }
95}
96
97/// Start the scheduler (called from kernel_main)
98///
99/// Picks the first task and starts running it. Never returns.
100pub fn schedule() -> ! {
101    let cpu_index = current_cpu_index();
102    schedule_on_cpu(cpu_index)
103}
104
/// Performs the schedule on cpu operation.
///
/// Spin-waits until the BSP has initialized the global scheduler, picks the
/// first task for `cpu_index` (clamped to the scheduler's CPU count), sets
/// up the TSS/SYSCALL kernel stacks for it, switches to its address space,
/// and jumps into it via `do_restore_first_task`. Never returns.
pub fn schedule_on_cpu(cpu_index: usize) -> ! {
    // Disable interrupts for the entire critical section.
    //
    // On the BSP, IF may be 1 (interrupts were enabled in Phase 9).
    // Without CLI, a timer interrupt between `pick_next_task` (which sets
    // `current_task`) and `restore_first_task` would let `maybe_preempt()`
    // call `switch_context` on the *init stack*, corrupting the task's
    // `saved_rsp` and creating an infinite loop.
    //
    // APs already arrive here with IF=0 (from the trampoline), but the
    // explicit CLI makes the contract clear for all callers.
    //
    // `task_entry_trampoline` executes `sti` when the first task starts,
    // so interrupts are re-enabled at exactly the right moment.
    crate::arch::x86_64::cli();

    // APs may arrive here before the BSP has called init_scheduler().
    // Spin-wait (releasing the lock each iteration) until the scheduler
    // is initialized, then pick the first task.
    let first_task = loop {
        let mut scheduler = SCHEDULER.lock();
        if let Some(ref mut sched) = *scheduler {
            // A CPU index beyond the scheduler's CPU array falls back to
            // CPU 0's runqueue rather than indexing out of bounds.
            let idx = if cpu_index < sched.cpus.len() {
                cpu_index
            } else {
                0
            };
            break sched.pick_next_task(idx);
        }
        // Drop lock before spinning so the BSP can initialize the scheduler.
        drop(scheduler);
        core::hint::spin_loop();
    }; // Lock is released here before jumping to first task
    super::task_ops::flush_deferred_silo_cleanups();

    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu first_task cpu={} tid={} name={} rsp={:#x} kstack=[{:#x}..{:#x}]",
        cpu_index,
        first_task.id.as_u64(),
        first_task.name,
        unsafe { (*first_task.context.get()).saved_rsp },
        first_task.kernel_stack.virt_base.as_u64(),
        first_task.kernel_stack.virt_base.as_u64() + first_task.kernel_stack.size as u64,
    );

    // Set TSS.rsp0 and SYSCALL kernel RSP for the first task
    {
        // Kernel stacks grow down: the entry RSP is base + size.
        let stack_top =
            first_task.kernel_stack.virt_base.as_u64() + first_task.kernel_stack.size as u64;
        crate::arch::x86_64::tss::set_kernel_stack(x86_64::VirtAddr::new(stack_top));
        crate::arch::x86_64::syscall::set_kernel_rsp(stack_top);
        crate::serial_force_println!(
            "[trace][sched] schedule_on_cpu stacks set cpu={} rsp0={:#x}",
            cpu_index,
            stack_top
        );
    }

    // Switch to the first task's address space (no-op for kernel tasks)
    // SAFETY: The first task's address space is valid (kernel AS at boot).
    if let Err(e) = validate_task_context(&first_task) {
        panic!(
            "scheduler: invalid first task '{}' (id={:?}): {}",
            first_task.name, first_task.id, e
        );
    }
    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu first_task ctx valid cpu={} tid={}",
        cpu_index,
        first_task.id.as_u64()
    );
    unsafe {
        (*first_task.process.address_space.get()).switch_to();
    }
    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu switch_to done cpu={} tid={}",
        cpu_index,
        first_task.id.as_u64()
    );

    // Jump to the first task (never returns)
    // SAFETY: The context was set up by CpuContext::new with a valid stack frame.
    // Interrupts are disabled; the trampoline's `sti` re-enables them.
    crate::serial_force_println!(
        "[trace][sched] schedule_on_cpu restore_first_task cpu={} tid={}",
        cpu_index,
        first_task.id.as_u64()
    );
    unsafe {
        crate::process::task::do_restore_first_task(
            &raw const (*first_task.context.get()).saved_rsp,
            first_task.fpu_state.get() as *const u8,
            first_task
                .xcr0_mask
                .load(core::sync::atomic::Ordering::Relaxed),
        );
    }
}
204
/// Called immediately after a context switch completes (in the new task's context).
/// This safely re-queues the previously running task now that its state is fully saved.
///
/// Uses the same diagnostic try_lock spin as `add_task`, drains this CPU's
/// `task_to_drop` / `task_to_requeue` slots under the lock, and drops the
/// previous task's `Arc` only after the lock is released.
pub fn finish_switch() {
    let cpu_index = current_cpu_index();
    crate::serial_force_println!("[trace][sched] finish_switch enter cpu={}", cpu_index);
    let mut task_to_drop = None;
    {
        crate::serial_force_println!(
            "[trace][sched] finish_switch before lock cpu={} sched_lock={:#x}",
            cpu_index,
            crate::process::scheduler::debug_scheduler_lock_addr()
        );
        // Spin on try_lock with a periodic "still waiting" diagnostic so a
        // lock-up here is visible on serial instead of hanging silently.
        let mut spins = 0usize;
        let mut scheduler = loop {
            if let Some(guard) = SCHEDULER.try_lock() {
                break guard;
            }
            spins = spins.saturating_add(1);
            if spins == 2_000_000 {
                crate::serial_force_println!(
                    "[trace][sched] finish_switch waiting lock cpu={} owner_cpu={}",
                    cpu_index,
                    SCHEDULER.owner_cpu()
                );
                spins = 0;
            }
            core::hint::spin_loop();
        };
        crate::serial_force_println!(
            "[trace][sched] finish_switch lock acquired cpu={}",
            cpu_index
        );
        if let Some(ref mut sched) = *scheduler {
            // Take both slots in one pass, then re-borrow for the enqueue:
            // `class_for_task` needs `sched` while the first `get_mut`
            // borrow of `cpu` is gone.
            let mut requeue_task = None;
            if let Some(cpu) = sched.cpus.get_mut(cpu_index) {
                task_to_drop = cpu.task_to_drop.take();
                requeue_task = cpu.task_to_requeue.take();
            }
            if let Some(task) = requeue_task {
                crate::serial_force_println!(
                    "[trace][sched] finish_switch requeue cpu={} tid={}",
                    cpu_index,
                    task.id.as_u64()
                );
                let class = sched.class_table.class_for_task(&task);
                if let Some(cpu) = sched.cpus.get_mut(cpu_index) {
                    cpu.class_rqs.enqueue(class, task);
                }
            }
        }
    }
    crate::serial_force_println!(
        "[trace][sched] finish_switch after lock cpu={} drop={}",
        cpu_index,
        task_to_drop.is_some()
    );
    super::task_ops::flush_deferred_silo_cleanups();

    // Drop the previous task outside the scheduler lock (if it was the last ref).
    // This is safe because we are fully switched to the new task's stack and CR3.
    drop(task_to_drop);

    // Temporary safety mode: skip FS.base restore in finish_switch.
    // This avoids cloning current_task() on a path that currently trips
    // an Arc refcount invariant under heavy early context-switch churn.
}
271
/// Yield the current task to allow other tasks to run (cooperative).
///
/// Disables interrupts around the scheduler lock to prevent deadlock
/// with the timer handler's `maybe_preempt()`.
///
/// Returns immediately (no-op) if preemption is disabled on this CPU.
pub fn yield_task() {
    // Respect the preemption guard: if a `PreemptGuard` is held, do nothing.
    if !percpu::is_preemptible() {
        return;
    }

    // Save RFLAGS and disable interrupts to prevent timer from
    // trying to lock the scheduler while we hold it
    let saved_flags = save_flags_and_cli();
    let cpu_index = current_cpu_index();

    // Ask the scheduler for a switch target; `None` means keep running
    // the current task (or the scheduler isn't initialized yet).
    let switch_target = {
        let mut scheduler = SCHEDULER.lock();
        if let Some(ref mut sched) = *scheduler {
            if cpu_index < sched.cpus.len() {
                sched.yield_cpu(cpu_index)
            } else {
                None
            }
        } else {
            None
        }
    }; // Lock released here, before the actual context switch

    if let Some(ref target) = switch_target {
        // SAFETY: Pointers are valid (they point into Arc<Task> contexts
        // kept alive by the scheduler). Interrupts are disabled.
        unsafe {
            crate::process::task::do_switch_context(target);
        }
        // We resume here when this task is scheduled again; finish_switch
        // re-queues/drops the task we switched away from.
        finish_switch();
    }

    // Restore the caller's IF state (may re-enable interrupts).
    restore_flags(saved_flags);
}
313
/// Called from the timer interrupt handler (or a resched IPI) to potentially
/// preempt the current task.
///
/// This is safe to call from interrupt context because:
/// 1. IF is already cleared by the CPU when entering the interrupt.
/// 2. We use `try_lock()` - if the scheduler is already locked
///    (e.g., `yield_task()` is in progress), we simply skip preemption
///    for this tick.
/// 3. We honour the `PreemptGuard`: if preemption is disabled, we return.
pub fn maybe_preempt() {
    let cpu_index = current_cpu_index();
    // Acknowledge a pending resched IPI for this CPU before deciding
    // whether to actually preempt.
    if cpu_is_valid(cpu_index) {
        RESCHED_IPI_PENDING[cpu_index].store(false, Ordering::Release);
    }

    // Honour the preemption guard - never preempt a section that asked for it.
    if !percpu::is_preemptible() {
        return;
    }

    // Try to lock the scheduler. If it's already locked (yield_task in
    // progress), just skip this tick - we'll preempt on the next one.
    let switch_target = {
        let mut scheduler = match SCHEDULER.try_lock() {
            Some(guard) => guard,
            None => {
                note_try_lock_fail_on_cpu(cpu_index);
                return;
            } // Lock contended, skip this tick
        };

        if let Some(ref mut sched) = *scheduler {
            // Skip if no task is running yet (during early boot)
            // (the early `return`s below drop the guard automatically).
            let cpu = match sched.cpus.get_mut(cpu_index) {
                Some(cpu) => cpu,
                None => return,
            };
            if cpu.current_task.is_none() {
                return;
            }
            // Only preempt when the scheduler explicitly requested it.
            if !cpu.need_resched {
                return;
            }
            if let Some(current) = cpu.current_task.as_ref() {
                sched_trace(format_args!(
                    "cpu={} preempt request task={} rt_delta={}",
                    cpu_index,
                    current.id.as_u64(),
                    cpu.current_runtime.period_delta_ticks
                ));
            }
            // Consume the request, then pick the next task to run.
            cpu.need_resched = false;
            sched.yield_cpu(cpu_index)
        } else {
            None
        }
    }; // Lock released here

    if let Some(ref target) = switch_target {
        if cpu_is_valid(cpu_index) {
            CPU_PREEMPT_COUNT[cpu_index].fetch_add(1, Ordering::Relaxed);
        }
        unsafe {
            crate::process::task::do_switch_context(target);
        }
        // Runs when this task is scheduled back in.
        finish_switch();
    }
}
382
383/// Enable or disable verbose scheduler tracing.
384pub fn set_verbose(enabled: bool) {
385    SCHED_VERBOSE.store(enabled, Ordering::Relaxed);
386    log::info!(
387        "[sched][trace] verbose={}",
388        if enabled { "on" } else { "off" }
389    );
390}
391
/// Return current verbose tracing state.
///
/// Relaxed load is sufficient: this flag only gates trace output, so no
/// ordering with other memory operations is required.
pub fn verbose_enabled() -> bool {
    SCHED_VERBOSE.load(Ordering::Relaxed)
}
396
397/// Return the scheduler class-table currently in use.
398pub fn class_table() -> crate::process::sched::SchedClassTable {
399    let saved_flags = save_flags_and_cli();
400    let out = {
401        let scheduler = SCHEDULER.lock();
402        if let Some(ref sched) = *scheduler {
403            sched.class_table
404        } else {
405            crate::process::sched::SchedClassTable::default()
406        }
407    };
408    restore_flags(saved_flags);
409    out
410}
411
412/// Configure scheduler class pick/steal order at runtime.
413pub fn configure_class_table(table: crate::process::sched::SchedClassTable) -> bool {
414    if !table.validate() {
415        return false;
416    }
417    let saved_flags = save_flags_and_cli();
418    let mut ipi_targets = [false; crate::arch::x86_64::percpu::MAX_CPUS];
419    let my_cpu = current_cpu_index();
420    let applied = {
421        let mut scheduler = SCHEDULER.lock();
422        if let Some(ref mut sched) = *scheduler {
423            let prev = sched.class_table;
424            sched.class_table = table;
425            if prev.policy_map() != sched.class_table.policy_map() {
426                sched.migrate_ready_tasks_for_new_class_table();
427            }
428            for (cpu_idx, cpu) in sched.cpus.iter_mut().enumerate() {
429                cpu.need_resched = true;
430                if cpu_idx != my_cpu && cpu_is_valid(cpu_idx) {
431                    ipi_targets[cpu_idx] = true;
432                }
433            }
434            true
435        } else {
436            false
437        }
438    };
439    restore_flags(saved_flags);
440    for (cpu, send) in ipi_targets.iter().copied().enumerate() {
441        if send {
442            send_resched_ipi_to_cpu(cpu);
443        }
444    }
445    applied
446}
447
448/// Dump per-cpu scheduler queues for tracing/debug.
449pub fn log_state(label: &str) {
450    let saved_flags = save_flags_and_cli();
451    let scheduler = SCHEDULER.lock();
452    if let Some(ref sched) = *scheduler {
453        let pick = sched.class_table.pick_order();
454        let steal = sched.class_table.steal_order();
455        log::info!(
456            "[sched][state] label={} class_table.pick=[{},{},{}] class_table.steal=[{},{}]",
457            label,
458            pick[0].as_str(),
459            pick[1].as_str(),
460            pick[2].as_str(),
461            steal[0].as_str(),
462            steal[1].as_str()
463        );
464        for (cpu_id, cpu) in sched.cpus.iter().enumerate() {
465            use crate::process::sched::SchedClassRq;
466            let current = cpu
467                .current_task
468                .as_ref()
469                .map(|t| t.id.as_u64())
470                .unwrap_or(u64::MAX);
471            log::info!(
472                "[sched][state] label={} cpu={} current={} rq_rt={} rq_fair={} rq_idle={} blocked={} need_resched={}",
473                label,
474                cpu_id,
475                current,
476                cpu.class_rqs.real_time.len(),
477                cpu.class_rqs.fair.len(),
478                cpu.class_rqs.idle.len(),
479                sched.blocked_tasks.len(),
480                cpu.need_resched
481            );
482        }
483    }
484    drop(scheduler);
485    restore_flags(saved_flags);
486}
487
488/// Structured scheduler state snapshot for shell/top/debug tooling.
489pub fn state_snapshot() -> SchedulerStateSnapshot {
490    let mut out = SchedulerStateSnapshot {
491        initialized: false,
492        boot_phase: 0,
493        cpu_count: 0,
494        pick_order: [
495            crate::process::sched::SchedClassId::RealTime,
496            crate::process::sched::SchedClassId::Fair,
497            crate::process::sched::SchedClassId::Idle,
498        ],
499        steal_order: [
500            crate::process::sched::SchedClassId::Fair,
501            crate::process::sched::SchedClassId::RealTime,
502        ],
503        blocked_tasks: 0,
504        current_task: [u64::MAX; crate::arch::x86_64::percpu::MAX_CPUS],
505        rq_rt: [0; crate::arch::x86_64::percpu::MAX_CPUS],
506        rq_fair: [0; crate::arch::x86_64::percpu::MAX_CPUS],
507        rq_idle: [0; crate::arch::x86_64::percpu::MAX_CPUS],
508        need_resched: [false; crate::arch::x86_64::percpu::MAX_CPUS],
509    };
510
511    let saved_flags = save_flags_and_cli();
512    {
513        let scheduler = SCHEDULER.lock();
514        if let Some(ref sched) = *scheduler {
515            use crate::process::sched::SchedClassRq;
516            let cpu_count = sched.cpus.len().min(crate::arch::x86_64::percpu::MAX_CPUS);
517            out.initialized = true;
518            out.boot_phase = if cpu_count > 0 { 2 } else { 1 };
519            out.cpu_count = cpu_count;
520            out.pick_order = *sched.class_table.pick_order();
521            out.steal_order = *sched.class_table.steal_order();
522            out.blocked_tasks = sched.blocked_tasks.len();
523            for i in 0..cpu_count {
524                let cpu = &sched.cpus[i];
525                out.current_task[i] = cpu
526                    .current_task
527                    .as_ref()
528                    .map(|t| t.id.as_u64())
529                    .unwrap_or(u64::MAX);
530                out.rq_rt[i] = cpu.class_rqs.real_time.len();
531                out.rq_fair[i] = cpu.class_rqs.fair.len();
532                out.rq_idle[i] = cpu.class_rqs.idle.len();
533                out.need_resched[i] = cpu.need_resched;
534            }
535        }
536    }
537    restore_flags(saved_flags);
538    out
539}
540
541/// The main function for the idle task
542pub(super) extern "C" fn idle_task_main() -> ! {
543    log::info!("[sched][idle] started");
544    loop {
545        // Be explicit on SMP: never rely on inherited IF state.
546        // If IF=0, HLT can deadlock that CPU forever.
547        crate::arch::x86_64::sti();
548
549        // Halt until next interrupt (saves power, timer will wake us)
550        crate::arch::x86_64::hlt();
551    }
552}